The fastai library
simplifies training fast and accurate neural nets using modern best
practices. See the fastai website to get started. The library is based
on research into deep learning best practices undertaken at
fast.ai, and includes “out of the box” support for
vision, text, tabular, and
collab (collaborative filtering) models.
Interesting posts about NN from scratch using R:
To be able to give examples of optimizer steps, we will need some steppers, like the following:
library(magrittr)
library(fastai)
#' Create a tensor with `val` and a gradient of `grad` for testing.
#'
#' @param val Numeric value(s) passed to `tensor()` and cast with `float()`.
#' @param grad Optional numeric gradient; when `NULL`, `val / 10` is used.
#' @return The float tensor built from `val`, with its `grad` field set.
tst_param <- function(val, grad = NULL) {
  res <- tensor(val) %>% float()
  # Default gradient is a tenth of the value when none is supplied.
  grad <- if (is.null(grad)) tensor(val / 10) else tensor(grad)
  res$grad <- grad %>% float()
  res
}

p <- tst_param(1., 0.1)
# Inspect the freshly created test parameter.
p
# tensor(1.)

# sgd_step: the parameter moves from 1 to 0.9, the gradient is untouched.
sgd_step(p, 1.)
p
# tensor(0.9000)
p$grad
# tensor(0.1000)

# weight_decay: the parameter itself is decayed.
p <- tst_param(1., 0.1)
weight_decay(p, 1., 0.1)
p
# tensor(0.9000)

# l2_reg: the gradient is modified instead (0.1 becomes 0.2).
p <- tst_param(1., 0.1)
l2_reg(p, 1., 0.1)
p$grad
# tensor(0.2000)
This method will loop over all param groups, then all parameters for
which grad is not NULL and call each function
in stepper, passing it the parameter p with the
hyper-parameters in the corresponding dict in hypers.
# A single stepper: one SGD step over four test parameters.
params <- L(lapply(0:3, function(x) tst_param(x)))
opt <- Optimizer(params, sgd_step, lr = 0.1)
opt$step()
str(params$items)
# List of 4
#  $ :tensor(0.)
#  $ :tensor(0.9900)
#  $ :tensor(1.9800)
#  $ :tensor(2.9700)

# Several steppers are passed as a list; each receives the hyper-parameters.
params <- L(lapply(0:3, function(x) tst_param(x)))
opt <- Optimizer(params, list(weight_decay, sgd_step), lr = 0.1, wd = 0.1)
opt$step()
str(params$items)
# List of 4
#  $ :tensor(0.)
#  $ :tensor(0.9800)
#  $ :tensor(1.9600)
#  $ :tensor(2.9400)

# A parameter whose grad is NULL is skipped: the last value stays at 3.
params <- L(lapply(0:3, function(x) tst_param(x)))
opt <- Optimizer(params, sgd_step, lr = 0.1)
try(params[3]$grad <- NULL, silent = TRUE)
params[3]$grad
opt$step()
str(params$items)
# List of 4
#  $ :tensor(0.)
#  $ :tensor(0.9900)
#  $ :tensor(1.9800)
#  $ :tensor(3.)

# Two parameter groups; the first group's hyper-parameter is overridden
# with 0.01, so its second parameter only moves to 0.9990.
params <- L(lapply(0:3, function(x) tst_param(x)))
opt <- Optimizer(list(params[0:1], params[2:3]), sgd_step, lr = 0.1)
opt$hypers$items[[1]][[1]] <- 0.01
opt$step()
str(params$items)
# List of 4
#  $ :tensor(0.)
#  $ :tensor(0.9990)
#  $ :tensor(1.9800)
#  $ :tensor(2.9700)

# zero_grad() leaves the parameter values themselves unchanged.
params <- L(lapply(0:3, function(x) tst_param(x)))
opt <- Optimizer(params, list(weight_decay, sgd_step), lr = 0.1, wd = 0.1)
opt$zero_grad()
str(params$items)
# List of 4
#  $ :tensor(0.)
#  $ :tensor(1.)
#  $ :tensor(2.)
#  $ :tensor(3.)
Keeps track of the avg grads of p in state
with mom.
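dampening = FALSE gives the classical formula for
momentum in SGD:
$$\text{new\_val} = \text{old\_val} \cdot \text{mom} + \text{grad}$$
whereas dampening = TRUE makes it an exponential moving
average:
$$\text{new\_val} = \text{old\_val} \cdot \text{mom} + \text{grad} \cdot (1 - \text{mom})$$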
# Test parameter with values 1..3 and gradients 4..6.
p <- tst_param(c(1, 2, 3), c(4, 5, 6))

# Without dampening: reference value is the raw gradient.
state <- average_grad(p, mom = 0.9, dampening = FALSE, grad_avg = NULL)
p$grad
# tensor([4., 5., 6.])

# With dampening: reference values for the first and second averages.
state <- average_grad(p, mom = 0.9, dampening = TRUE)
p$grad * 0.1
# tensor([0.4000, 0.5000, 0.6000])
p$grad * (0.1 * 0.9 + 0.1)
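# tensor([0.7600, 0.9500, 1.1400])
dampening = FALSE gives the classical formula for
momentum in SGD:
$$\text{new\_val} = \text{old\_val} \cdot \text{sqr\_mom} + \text{grad}^2$$
whereas dampening = TRUE makes it an exponential moving
average:
$$\text{new\_val} = \text{old\_val} \cdot \text{sqr\_mom} + \text{grad}^2 \cdot (1 - \text{sqr\_mom})$$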
# Test parameter with values 1..3 and gradients 4..6.
p <- tst_param(c(1, 2, 3), c(4, 5, 6))

# Without dampening: reference values for the first and second averages.
state <- average_sqr_grad(p, sqr_mom = 0.99, dampening = FALSE)
p$grad$pow(2)
# tensor([16., 25., 36.])
p$grad$pow(2) * 1.99
# tensor([31.8400, 49.7500, 71.6400])

# Default call: reference value for the first average.
state <- average_sqr_grad(p, sqr_mom = 0.99)
p$grad$pow(2) * 1e-2
# tensor([0.1600, 0.2500, 0.3600])

# Reference value for the second average.
state <- average_sqr_grad(p, sqr_mom = 0.99)
p$grad$pow(2) * (0.01 * 0.99 + 0.01)
# tensor([0.3184, 0.4975, 0.7164])

# Fresh optimizer used for the freeze_to() call that follows.
params <- L(lapply(0:3, function(x) tst_param(x)))
opt <- Optimizer(params, sgd_step, lr = 0.1)
opt$freeze_to(1L)
An Optimizer for SGD with lr,
mom and params.
Optional weight decay of wd is applied, as true weight
decay (decay the weights directly) if decouple_wd = TRUE
else as L2 regularization (add the decay to the
gradients).
# Plain SGD.
params <- L(lapply(0:3, function(x) tst_param(x)))
opt <- SGD(params, lr = 0.1)
opt$step()
str(params$items)
# List of 4
#  $ :tensor(0.)
#  $ :tensor(0.9900)
#  $ :tensor(1.9800)
#  $ :tensor(2.9700)

# SGD with momentum: the first step gives the same values as plain SGD.
params <- L(lapply(0:3, function(x) tst_param(x)))
opt <- SGD(params, lr = 0.1, mom = 0.9)
opt$step()
str(params$items)
# List of 4
#  $ :tensor(0.)
#  $ :tensor(0.9900)
#  $ :tensor(1.9800)
#  $ :tensor(2.9700)
Test weight decay, notice how we can see that L2
regularization is different from weight decay even for simple SGD with
momentum.
params <- L(lapply(0:3, function(x) tst_param(x)))
# Weight decay
opt <- SGD(params, lr = 0.1, mom = 0.9, wd = 0.1)
opt$step()
str(params$items)
# List of 4
#  $ :tensor(0.)
#  $ :tensor(0.9800)
#  $ :tensor(1.9600)
#  $ :tensor(2.9400)

params <- L(lapply(0:3, function(x) tst_param(x)))
# L2 reg
opt <- SGD(params, lr = 0.1, mom = 0.9, wd = 0.1, decouple_wd = FALSE)
opt$step()
str(params$items)
# List of 4
#  $ :tensor(0.)
#  $ :tensor(0.9800)
#  $ :tensor(1.9600)
#  $ :tensor(2.9400)
An Optimizer for RMSProp with
lr, sqr_mom, mom and
params.
RMSProp was introduced by Geoffrey Hinton in his course.
What is named sqr_mom here is the alpha in the
course. Optional weight decay of wd is applied, as true
weight decay (decay the weights directly) if
decouple_wd = TRUE else as L2 regularization (add the decay
to the gradients).
# RMSProp without momentum: take two steps, then compare against the
# hand-computed reference tensor.
params <- tst_param(1:3, c(0.1, 0.2, 0.3))
opt <- RMSProp(params, lr = 0.1)
opt$step()
opt$step()
step <- (-0.1 * 0.1) / (sqrt((0.01 * 0.99 + 0.01) * 0.1^2) + 1e-8)
params
# tensor([-0.7089, 0.2911, 1.2911])
tensor(c(step, 1 + step, 2 + step))
# tensor([-0.7089, 0.2911, 1.2911])

# RMSProp with momentum: same check with the momentum term in the numerator.
params <- tst_param(1:3, c(0.1, 0.2, 0.3))
opt <- RMSProp(params, lr = 0.1, mom = 0.9)
opt$step()
opt$step()
step <- (-0.1 * (0.1 + 0.9 * 0.1)) / (sqrt((0.01 * 0.99 + 0.01) * 0.1^2) + 1e-8)
params
# tensor([-1.3469, -0.3469, 0.6531])
tensor(c(step, 1 + step, 2 + step))
# tensor([-1.3469, -0.3469, 0.6531])
An Optimizer for Adam with lr, mom,
sqr_mom, eps and params.
Adam was introduced by Diederik P. Kingma and Jimmy Ba in Adam: A
Method for Stochastic Optimization. For consistency across optimizers,
we renamed beta1 and beta2 in the paper to
mom and sqr_mom. Note that our defaults also
differ from the paper (0.99 for sqr_mom or
beta2, 1e-5 for eps). Those values seem to be
better from our experiments in a wide range of situations.
Optional weight decay of wd is applied, as true weight
decay (decay the weights directly) if decouple_wd=TRUE else
as L2 regularization (add the decay to the gradients).
# Adam without weight decay: compare the parameters with a hand-computed
# reference after one step and again after two steps.
params <- tst_param(1:3, c(0.1, 0.2, 0.3))
opt <- Adam(params, lr = 0.1, wd = 0)
opt$step()
step <- (-0.1 * 0.1) / (sqrt(0.1^2) + 1e-8)
params
# tensor([0.9000, 1.9000, 2.9000])
tensor(c(1 + step, 2 + step, 3 + step))
# tensor([0.9000, 1.9000, 2.9000])

opt$step()
params
# tensor([0.8000, 1.8000, 2.8000])
tensor(c(1 + 2 * step, 2 + 2 * step, 3 + 2 * step))
# tensor([0.8000, 1.8000, 2.8000])
# Build the data for the curve plotted below, for steps s = 5..500.
beta <- 0.99
r_inf <- 2 / (1 - beta) - 1
steps <- 5:500
# Arithmetic is vectorized in R, so no lapply()/as.numeric() round trip.
beta_pow <- beta^steps
rs <- r_inf - 2 * steps * beta_pow / (1 - beta_pow)
v <- sqrt(((rs - 4) * (rs - 2) * r_inf) / ((r_inf - 4) * (r_inf - 2) * rs))
# seq_along() is safe even if `v` were empty, unlike 1:length(v).
df_high <- data.frame(x = seq_along(v), y = v)
library(highcharter)
hchart(df_high,'line', hcaes(x,y))
An Optimizer for QHAdam with lr, mom,
sqr_mom, nus, eps and
params.
# QHAdam: compare the parameters with a hand-computed reference after one
# step and again after two steps.
params <- tst_param(c(1:3), c(0.1, 0.2, 0.3))
opt <- QHAdam(params, lr = 0.1)
opt$step()
numerator <- -0.1 * (((1 - 0.7) * 0.1) + (0.7 * 0.1))
denominator <- sqrt(((1 - 1.0) * 0.1^2) + (1.0 * 0.1^2)) + 1e-8
step <- numerator / denominator
params
tensor(c(1 + step, 2 + step, 3 + step))
# tensor([0.9000, 1.9000, 2.9000])
# tensor([0.9000, 1.9000, 2.9000])
opt$step()
params
tensor(c(1 + 2 * step, 2 + 2 * step, 3 + 2 * step))
# tensor([0.8000, 1.8000, 2.8000])
# tensor([0.8000, 1.8000, 2.8000])
An Optimizer for Adam with lr,
mom, sqr_mom, eps and
params.
The LARS optimizer was first introduced in Large Batch Training of
Convolutional Networks then refined in its LARC variant (original LARS
is with clip=FALSE). A learning rate is computed for each
individual layer with a certain trust_coefficient, then
clipped to be always less than lr.
Optional weight decay of wd is applied, as true weight
decay (decay the weights directly) if decouple_wd = TRUE
else as L2 regularization (add the decay to the
gradients).
# Default call: local learning rates are clipped at lr.
params <- list(
  tst_param(1:3, c(0.1, 0.2, 0.3)),
  tst_param(1:3, c(0.01, 0.02, 0.03))
)
opt <- Larc(params, lr = 0.1)
opt$step()
# First param local lr is 0.02 < lr so it's not clipped
opt$state[params[[1]]]['local_lr']$local_lr
# tensor(0.0200)
# Second param local lr is 0.2 > lr so it's clipped
opt$state[params[[2]]]['local_lr']$local_lr
# [1] 0.1

# With clip = FALSE the second local lr stays at 0.2.
params <- list(
  tst_param(1:3, c(0.1, 0.2, 0.3)),
  tst_param(1:3, c(0.01, 0.02, 0.03))
)
opt <- Larc(params, lr = 0.1, clip = FALSE)
opt$step()
# No clipping
opt$state[params[[1]]]['local_lr']$local_lr
# tensor(0.0200)
opt$state[params[[2]]]['local_lr']$local_lr
# tensor(0.2000)
An Optimizer for LAMB with lr,
mom, sqr_mom, eps and
params.
LAMB was introduced in Large Batch Optimization for
Deep Learning: Training BERT in 76 minutes. Intuitively, it’s LARC
applied to Adam. As in Adam, we renamed beta1 and beta2 in the paper to
mom and sqr_mom. Note that our defaults also differ from the paper (0.99
for sqr_mom or beta2, 1e-5 for
eps). Those values seem to be better from our experiments
in a wide range of situations.
Optional weight decay of wd is applied, as true weight
decay (decay the weights directly) if decouple_wd=TRUE else
as L2 regularization (add the decay to the gradients).
# Lamb: parameters after a single step.
params <- tst_param(1:3, c(0.1, 0.2, 0.3))
opt <- Lamb(params, lr = 0.1)
opt$step()
params
# tensor([0.7840, 1.7840, 2.7840])

# Lookahead wrapped around SGD; keep a copy of the initial weights and the
# gradient so the expected values can be computed by hand.
params <- tst_param(1:3, c(0.1, 0.2, 0.3))
p <- params$data$clone()
g <- tensor(c(0.1, 0.2, 0.3))
opt <- Lookahead(SGD(params, lr = 0.1))
for (i in seq_len(5)) {
  opt$step()
}
# first 5 steps are normal SGD steps
params
p - g * 0.5
# tensor([0.9500, 1.9000, 2.8500])
# tensor([0.9500, 1.9000, 2.8500])
# Since k=6, sixth step is a moving average of the 6 SGD steps with the initial weight
opt$step()
params
p * 0.5 + (p - g * 0.6) * 0.5
# tensor([0.9700, 1.9400, 2.9100])
# tensor([0.9700, 1.9400, 2.9100])