# Visualize how learning rate and momentum affect the gradient-descent steps
# taken to minimize a simple 1-D loss surface.
import numpy as np
import matplotlib.pyplot as plt
import torch  # bug fix: torch is used below but was never imported


def function(x):
    """Objective (loss) surface: a quartic with two unequal local minima."""
    return x**4 + x**3 - 5*x**2
    # return x**6 + x**5 - 5*x**4  # alternative, steeper surface to try


def optimize_and_plot(lr=0.01, momentum=0.0):
    """Run 10 steps of SGD (optionally with momentum) from x = 2.0 and plot.

    Args:
        lr: learning rate (step size).
        momentum: momentum coefficient; 0.0 disables the velocity buffer.

    Shows a figure with the objective curve and the optimizer trajectory.
    """
    x = torch.tensor(2.0, requires_grad=True)
    buffer = torch.zeros_like(x.data)  # velocity accumulator for momentum
    values = []
    for i in range(10):
        y = function(x)
        values.append((x.clone(), y.clone()))
        y.backward()
        d_p = x.grad.data
        if momentum != 0:
            # v <- momentum * v + grad; step along v (PyTorch SGD convention)
            buffer.mul_(momentum).add_(d_p)
            d_p = buffer
        x.data.add_(d_p, alpha=-lr)  # in-place update, outside autograd
        x.grad.zero_()  # grads accumulate by default; reset every step

    # Plot the surface and the recorded trajectory. Note `x` is rebound to a
    # NumPy grid here; the tensor history lives in `values`.
    x = np.arange(-3, 2, 0.001)
    y = function(x)
    plt.figure(figsize=(10, 5))
    plt.plot([v[0].detach().numpy() for v in values],
             [v[1].detach().numpy() for v in values],
             'r-X', linewidth=2, markersize=7)
    for i in range(10):
        plt.text(values[i][0] + 0.1, values[i][1], f'step {i}',
                 fontdict={'color': 'r'})
    plt.plot(x, y, linewidth=2)
    plt.grid()
    plt.tick_params(axis='both', which='major', labelsize=12)
    # Fixed label: the surface is a quartic, not a square function.
    plt.legend(['Optimizer steps', 'Objective function'])
    plt.show()


# Try a first learning rate value
lr0 = 0.01  # also try 0.1 and 0.09
optimize_and_plot(lr=lr0)

# Try a first value for momentum
mom0 = 0.3  # also try 0.94
optimize_and_plot(momentum=mom0)