#AUTOGENERATED! DO NOT EDIT! File to edit: dev/12_optimizer.ipynb (unless otherwise specified).
__all__ = ['Optimizer', 'sgd_step', 'weight_decay', 'l2_reg', 'average_grad', 'average_sqr_grad', 'momentum_step',
'SGD', 'rms_prop_step', 'RMSProp', 'step_stat', 'debias', 'adam_step', 'Adam', 'radam_step', 'RAdam',
'qhadam_step', 'QHAdam', 'larc_layer_lr', 'larc_step', 'Larc', 'lamb_step', 'Lamb', 'Lookahead',
'detuplify_pg', 'set_item_pg', 'pytorch_hp_map', 'OptimWrapper']
#Cell
from .torch_basics import *
from .test import *
#Cell
class _BaseOptimizer():
"Common functionality between `Optimizer` and `OptimWrapper`"
def all_params(self, n=slice(None), with_grad=False):
res = L((p,pg,self.state[p],hyper) for pg,hyper in zip(self.param_groups[n],self.hypers[n]) for p in pg)
return L(o for o in res if o[0].grad is not None) if with_grad else res
def _set_require_grad(self, rg, p,pg,state,h): p.requires_grad_(rg or state.get('force_train', False))
def freeze_to(self, n):
self.frozen_idx = n if n >= 0 else len(self.param_groups) + n
if self.frozen_idx >= len(self.param_groups):
warn(f"Freezing {self.frozen_idx} groups; model has {len(self.param_groups)}; whole model is frozen.")
for o in self.all_params(slice(n, None)): self._set_require_grad(True, *o)
for o in self.all_params(slice(None, n)): self._set_require_grad(False, *o)
def freeze(self):
assert(len(self.param_groups)>1)
self.freeze_to(-1)
def unfreeze(self): self.freeze_to(0)
def set_hypers(self, **kwargs): L(kwargs.items()).starmap(self.set_hyper)
def _set_hyper(self, k, v):
for v_,h in zip(v, self.hypers): h[k] = v_
def set_hyper(self, k, v):
if isinstance(v, slice):
if v.start: v = even_mults(v.start, v.stop, len(self.param_groups))
else: v = [v.stop/10]*(len(self.param_groups)-1) + [v.stop]
v = L(v, use_list=None)
if len(v)==1: v = v*len(self.param_groups)
assert len(v) == len(self.hypers), f"Trying to set {len(v)} values for {k} but there are {len(self.param_groups)} parameter groups."
self._set_hyper(k, v)
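# Example (illustrative sketch, not executed here; `opt` is an assumed optimizer with
# three parameter groups): `set_hyper` accepts a single value, one value per group,
# or a slice for discriminative values.
#   opt.set_hyper('lr', 1e-3)                # same lr for every group
#   opt.set_hyper('lr', [1e-5, 1e-4, 1e-3])  # one lr per group
#   opt.set_hyper('lr', slice(1e-3))         # last group 1e-3, earlier groups 1e-3/10
#   opt.set_hyper('lr', slice(1e-5, 1e-3))   # log-uniform spread from 1e-5 to 1e-3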
#Cell
class Optimizer(_BaseOptimizer):
"Base optimizer class for the fastai library, updating `params` with `steppers`"
_keep_on_clear = ['force_train', 'do_wd']
def __init__(self, params, steppers, stats=None, train_bn=True, **defaults):
params = L(params)
self.steppers,self.stats,self.state,self.train_bn = L(steppers),L(stats),defaultdict(dict),train_bn
defaults = merge(*self.stats.attrgot('defaults'), *self.steppers.attrgot('defaults'), defaults)
self.param_groups = L(L(p) for p in params) if isinstance(params[0], (L,list)) else L([params])
#self.step_func = compose(*steppers)
self.hypers = L({} for _ in range_of(self.param_groups))
self.set_hypers(**defaults)
self.frozen_idx = 0
def zero_grad(self):
for p,*_ in self.all_params(with_grad=True):
p.grad.detach_()
p.grad.zero_()
def step(self):
for p,pg,state,hyper in self.all_params(with_grad=True):
for stat in self.stats: state = stat(state, p, **hyper)
for step in self.steppers: step(p, **{**state, **hyper})
self.state[p] = state
def clear_state(self):
for p,pg,state,hyper in self.all_params():
self.state[p] = {k: state[k] for k in self._keep_on_clear if k in state}
def state_dict(self):
state = [self.state[p] for p,*_ in self.all_params()]
return {'state': state, 'hypers': self.hypers}
def load_state_dict(self, sd):
assert len(sd["hypers"]) == len(self.param_groups)
assert len(sd["state"]) == sum([len(pg) for pg in self.param_groups])
self.hypers = sd['hypers']
self.state = {p: s for p,s in zip(self.all_params().itemgot(0), sd['state'])}
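# Example (illustrative sketch, not executed here; the tensor below is an assumed toy
# parameter): `step` runs each stat to update the per-parameter state, then each stepper
# to update the parameter itself.
#   p = torch.nn.Parameter(torch.randn(4))
#   p.grad = torch.ones_like(p)
#   opt = Optimizer([p], sgd_step, lr=0.1)
#   opt.step()            # p.data decreased by lr*grad
#   opt.zero_grad()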
#Cell
def sgd_step(p, lr, **kwargs):
p.data.add_(-lr, p.grad.data)
return p
#Cell
def weight_decay(p, lr, wd, do_wd=True, **kwargs):
"Weight decay as decaying `p` with `lr*wd`"
if do_wd and wd!=0: p.data.mul_(1 - lr*wd)
return p
weight_decay.defaults = dict(wd=0.)
#Cell
def l2_reg(p, lr, wd, do_wd=True, **kwargs):
"L2 regularization as adding `wd*p` to `p.grad`"
if do_wd and wd!=0: p.grad.data.add_(wd, p.data)
return p
l2_reg.defaults = dict(wd=0.)
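# The two regularizers above apply the penalty in different places (a sketch of the maths):
#   weight_decay: p    <- p * (1 - lr*wd)   (decoupled decay, as in AdamW)
#   l2_reg:       grad <- grad + wd*p       (classic L2 penalty folded into the gradient)
# The two coincide for plain SGD but differ for adaptive methods such as Adam.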
#Cell
def average_grad(state, p, mom, dampening=False, **kwargs):
"Keeps track of the avg grads of `p` in `state` with `mom`."
if 'grad_avg' not in state: state['grad_avg'] = torch.zeros_like(p.grad.data)
damp = 1-mom if dampening else 1.
state['grad_avg'].mul_(mom).add_(damp, p.grad.data)
return state
average_grad.defaults = dict(mom=0.9)
#Cell
def average_sqr_grad(state, p, sqr_mom, dampening=True, **kwargs):
    "Keeps track of the avg of squared grads of `p` in `state` with `sqr_mom`."
    if 'sqr_avg' not in state: state['sqr_avg'] = torch.zeros_like(p.grad.data)
damp = 1-sqr_mom if dampening else 1.
state['sqr_avg'].mul_(sqr_mom).addcmul_(damp, p.grad.data, p.grad.data)
return state
average_sqr_grad.defaults = dict(sqr_mom=0.99)
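# Example (illustrative sketch, not executed here; `p` is an assumed parameter whose
# `.grad` is populated): stats take and return the per-parameter `state` dict that
# `Optimizer.step` threads through.
#   state = {}
#   state = average_grad(state, p, mom=0.9)           # creates/updates state['grad_avg']
#   state = average_sqr_grad(state, p, sqr_mom=0.99)  # creates/updates state['sqr_avg']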
#Cell
def momentum_step(p, lr, grad_avg, **kwargs):
"Step for SGD with momentum with `lr`"
p.data.add_(-lr, grad_avg)
return p
#Cell
def SGD(params, lr, mom=0., wd=0., decouple_wd=True):
"A `Optimizer` for SGD with `lr` and `mom` and `params`"
steppers = [weight_decay] if decouple_wd else [l2_reg]
steppers.append(sgd_step if mom==0 else momentum_step)
if mom == 0.: return Optimizer(params, steppers, lr=lr, wd=wd)
else: return Optimizer(params, steppers, stats=average_grad, lr=lr, mom=mom, wd=wd)
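# Example (illustrative sketch, not executed here; `model` and `xb` are an assumed
# module and input batch):
#   opt = SGD(model.parameters(), lr=0.1, mom=0.9, wd=1e-2)
#   loss = model(xb).mean()
#   loss.backward()
#   opt.step(); opt.zero_grad()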
#Cell
def rms_prop_step(p, lr, sqr_avg, eps, grad_avg=None, **kwargs):
"Step for SGD with momentum with `lr`"
denom = sqr_avg.sqrt().add_(eps)
p.data.addcdiv_(-lr, (grad_avg if grad_avg is not None else p.grad), denom)
return p
rms_prop_step.defaults = dict(eps=1e-8)
#Cell
def RMSProp(params, lr, sqr_mom=0.99, mom=0., wd=0., decouple_wd=True):
"A `Optimizer` for RMSProp with `lr`, `sqr_mom`, `mom` and `params`"
steppers = [weight_decay] if decouple_wd else [l2_reg]
steppers.append(rms_prop_step)
stats = [average_sqr_grad] if mom==0. else [average_grad, average_sqr_grad]
return Optimizer(params, steppers, stats=stats, lr=lr, mom=mom, sqr_mom=sqr_mom, wd=wd)
#Cell
def step_stat(state, p, **kwargs):
"Register the number of steps done in `state` for `p`"
if 'step' not in state: state['step'] = 0
state['step'] += 1
return state
#Cell
def debias(mom, damp, step): return damp * (1 - mom**step) / (1-mom)
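# With `damp = 1-mom` (the value the steppers below pass in), `debias` reduces to the
# usual Adam bias-correction factor 1 - mom**step, e.g. debias(0.9, 0.1, 1) == 0.1.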
#Cell
def adam_step(p, lr, mom, step, sqr_mom, grad_avg, sqr_avg, eps, **kwargs):
"Step for Adam with `lr` on `p`"
debias1 = debias(mom, 1-mom, step)
debias2 = debias(sqr_mom, 1-sqr_mom, step)
p.data.addcdiv_(-lr / debias1, grad_avg, (sqr_avg/debias2).sqrt() + eps)
return p
adam_step._defaults = dict(eps=1e-5)
#Cell
def Adam(params, lr, mom=0.9, sqr_mom=0.99, eps=1e-5, wd=0., decouple_wd=True):
"A `Optimizer` for Adam with `lr`, `mom`, `sqr_mom`, `eps` and `params`"
steppers = [weight_decay] if decouple_wd else [l2_reg]
steppers.append(adam_step)
stats = [partial(average_grad, dampening=True), average_sqr_grad, step_stat]
return Optimizer(params, steppers, stats=stats, lr=lr, mom=mom, sqr_mom=sqr_mom, eps=eps, wd=wd)
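# Example (illustrative sketch, not executed here; `model` is an assumed module): the
# default `decouple_wd=True` gives AdamW-style decay; pass `decouple_wd=False` for
# classic L2 regularization.
#   opt = Adam(model.parameters(), lr=1e-3, wd=1e-2)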
#Cell
def radam_step(p, lr, mom, step, sqr_mom, grad_avg, sqr_avg, eps, **kwargs):
"Step for RAdam with `lr` on `p`"
debias1 = debias(mom, 1-mom, step)
debias2 = debias(sqr_mom, 1-sqr_mom, step)
r_inf = 2/(1-sqr_mom) - 1
r = r_inf - 2*step*sqr_mom**step/(1-sqr_mom**step)
if r > 4:
v = math.sqrt(((r-4) * (r-2) * r_inf)/((r_inf-4)*(r_inf-2)*r))
p.data.addcdiv_(-lr*v / debias1, grad_avg, (sqr_avg/debias2).sqrt() + eps)
else: p.data.add_(-lr / debias1, grad_avg)
return p
radam_step._defaults = dict(eps=1e-5)
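# Note on the step above: `r` approximates the length of the SMA used by RAdam; early in
# training (r <= 4) the variance-rectification term `v` is not defined, so the update
# falls back to plain debiased momentum SGD (the `else` branch).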
#Cell
def RAdam(params, lr, mom=0.9, sqr_mom=0.99, eps=1e-5, wd=0., decouple_wd=True):
"A `Optimizer` for Adam with `lr`, `mom`, `sqr_mom`, `eps` and `params`"
steppers = [weight_decay] if decouple_wd else [l2_reg]
steppers.append(radam_step)
stats = [partial(average_grad, dampening=True), average_sqr_grad, step_stat]
return Optimizer(params, steppers, stats=stats, lr=lr, mom=mom, sqr_mom=sqr_mom, eps=eps, wd=wd)
#Cell
def qhadam_step(p, lr, mom, sqr_mom, sqr_avg, nu_1, nu_2, step, grad_avg, eps, **kwargs):
    "Step for QHAdam with `lr` on `p`"
debias1 = debias(mom, 1-mom, step)
debias2 = debias(sqr_mom, 1-sqr_mom, step)
p.data.addcdiv_(-lr, ((1-nu_1) * p.grad.data) + (nu_1 * (grad_avg / debias1)),
(((1 - nu_2) * (p.grad.data)**2) + (nu_2 * (sqr_avg / debias2))).sqrt() + eps)
return p
qhadam_step._defaults = dict(eps=1e-8)
#Cell
def QHAdam(params, lr, mom=0.999, sqr_mom=0.999, nu_1=0.7, nu_2 = 1.0, eps=1e-8, wd=0., decouple_wd=True):
"An `Optimizer` for Adam with `lr`, `mom`, `sqr_mom`, `nus`, eps` and `params`"
steppers = [weight_decay] if decouple_wd else [l2_reg]
steppers.append(qhadam_step)
stats = [partial(average_grad, dampening=True), partial(average_sqr_grad, dampening=True), step_stat]
    return Optimizer(params, steppers, stats=stats, lr=lr, nu_1=nu_1, nu_2=nu_2,
mom=mom, sqr_mom=sqr_mom, eps=eps, wd=wd)
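# Note: with `nu_1 = nu_2 = 1` the update in `qhadam_step` reduces exactly to `adam_step`;
# the defaults here (`nu_1=0.7`, `nu_2=1.0`) only blend the raw gradient into the numerator.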
#Cell
def larc_layer_lr(state, p, lr, trust_coeff, wd, eps, clip=True, **kwargs):
"Computes the local lr before weight decay is applied"
p_norm,g_norm = torch.norm(p.data),torch.norm(p.grad.data)
local_lr = lr*trust_coeff * (p_norm) / (g_norm + p_norm * wd + eps)
state['local_lr'] = min(lr, local_lr) if clip else local_lr
return state
larc_layer_lr.defaults = dict(trust_coeff=0.02, wd=0., eps=1e-8)
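# The layer-wise learning rate stored above is (a sketch of the maths):
#   local_lr = lr * trust_coeff * ||p|| / (||grad|| + wd * ||p|| + eps)
# optionally clipped at `lr` (`clip=True`, LARC); `clip=False` gives the unclipped
# LARS-style behaviour.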
#Cell
def larc_step(p, local_lr, grad_avg=None, **kwargs):
    "Step for LARC with `local_lr` on `p`"
    p.data.add_(-local_lr, p.grad.data if grad_avg is None else grad_avg)
    return p
#Cell
def Larc(params, lr, mom=0.9, clip=True, trust_coeff=0.02, eps=1e-8, wd=0., decouple_wd=True):
"A `Optimizer` for Adam with `lr`, `mom`, `sqr_mom`, `eps` and `params`"
steppers = [weight_decay] if decouple_wd else [l2_reg]
steppers.append(larc_step)
stats = [] if mom==0. else [average_grad]
stats.append(partial(larc_layer_lr, clip=clip))
return Optimizer(params, steppers, stats=stats, lr=lr, mom=mom, trust_coeff=trust_coeff, eps=eps, wd=wd)
#Cell
def lamb_step(p, lr, mom, step, sqr_mom, grad_avg, sqr_avg, eps, **kwargs):
"Step for LAMB with `lr` on `p`"
debias1 = debias(mom, 1-mom, step)
debias2 = debias(sqr_mom, 1-sqr_mom, step)
r1 = p.data.pow(2).mean().sqrt()
step = (grad_avg/debias1) / ((sqr_avg/debias2).sqrt()+eps)
r2 = step.pow(2).mean().sqrt()
q = 1 if r1 == 0 or r2 == 0 else min(r1/r2,10)
p.data.add_(-lr * q, step)
return p
lamb_step._defaults = dict(eps=1e-6, wd=0.)
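# Note on the step above: `q` is the LAMB trust ratio, the RMS norm of the weights divided
# by the RMS norm of the Adam-style update (capped at 10, and 1 when either norm is 0), so
# each layer's step is rescaled by the relative size of its weights.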
#Cell
def Lamb(params, lr, mom=0.9, sqr_mom=0.99, eps=1e-5, wd=0., decouple_wd=True):
"A `Optimizer` for Adam with `lr`, `mom`, `sqr_mom`, `eps` and `params`"
steppers = [weight_decay] if decouple_wd else [l2_reg]
steppers.append(lamb_step)
stats = [partial(average_grad, dampening=True), average_sqr_grad, step_stat]
return Optimizer(params, steppers, stats=stats, lr=lr, mom=mom, sqr_mom=sqr_mom, eps=eps, wd=wd)
#Cell
class Lookahead(Optimizer, GetAttr):
"Wrap `opt` in a lookahead optimizer"
_default='opt'
def __init__(self, opt, k=6, alpha=0.5):
store_attr(self, 'opt,k,alpha')
self._init_state()
def step(self):
if self.slow_weights is None: self._copy_weights()
self.opt.step()
self.count += 1
if self.count%self.k != 0: return
for slow_pg,fast_pg in zip(self.slow_weights,self.param_groups):
for slow_p,fast_p in zip(slow_pg,fast_pg):
slow_p.data.add_(self.alpha, fast_p.data-slow_p.data)
fast_p.data.copy_(slow_p.data)
def clear_state(self):
self.opt.clear_state()
self._init_state()
def state_dict(self):
state = self.opt.state_dict()
state.update({'count': self.count, 'slow_weights': self.slow_weights})
return state
def load_state_dict(self, sd):
self.count = sd.pop('count')
self.slow_weights = sd.pop('slow_weights')
self.opt.load_state_dict(sd)
def _init_state(self): self.count,self.slow_weights = 0,None
def _copy_weights(self): self.slow_weights = L(L(p.clone().detach() for p in pg) for pg in self.param_groups)
@property
def param_groups(self): return self.opt.param_groups
@param_groups.setter
def param_groups(self, v): self.opt.param_groups = v
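# Example (illustrative sketch, not executed here; `model` is an assumed module):
# `Lookahead` wraps any `Optimizer`, moves the slow weights a fraction `alpha` toward the
# fast weights every `k` steps, and copies them back.
#   opt = Lookahead(Adam(model.parameters(), lr=1e-3), k=6, alpha=0.5)
#   opt.step(); opt.zero_grad()   # unknown attributes are delegated to the wrapped opt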
#Cell
def detuplify_pg(d):
res = {}
for k,v in d.items():
if k == 'params': continue
if is_listy(v): res.update(**{f'{k}__{i}': v_ for i,v_ in enumerate(v)})
else: res[k] = v
return res
#Cell
def set_item_pg(pg, k, v):
if '__' not in k: pg[k] = v
else:
name,idx = k.split('__')
pg[name] = tuple(v if i==int(idx) else pg[name][i] for i in range_of(pg[name]))
return pg
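# Example (illustrative sketch, not executed here): PyTorch packs Adam's betas into a
# tuple, which these two helpers flatten and restore.
#   detuplify_pg({'lr': 1e-3, 'betas': (0.9, 0.99)})
#   #=> {'lr': 0.001, 'betas__0': 0.9, 'betas__1': 0.99}
#   set_item_pg({'betas': (0.9, 0.99)}, 'betas__1', 0.999)
#   #=> {'betas': (0.9, 0.999)}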
#Cell
pytorch_hp_map = {'momentum': 'mom', 'weight_decay': 'wd', 'alpha': 'sqr_mom', 'betas__0': 'mom', 'betas__1': 'sqr_mom'}
#Cell
class OptimWrapper(_BaseOptimizer, GetAttr):
_xtra=['zero_grad', 'step', 'state_dict', 'load_state_dict']
_default='opt'
def __init__(self, opt, hp_map=None):
self.opt = opt
if hp_map is None: hp_map = pytorch_hp_map
self.fwd_map = {k: hp_map[k] if k in hp_map else k for k in detuplify_pg(opt.param_groups[0]).keys()}
self.bwd_map = {v:k for k,v in self.fwd_map.items()}
self.state = defaultdict(dict, {})
self.frozen_idx = 0
@property
def param_groups(self): return [pg['params'] for pg in self.opt.param_groups]
@param_groups.setter
def param_groups(self, v):
for pg,v_ in zip(self.opt.param_groups,v): pg['params'] = v_
@property
def hypers(self):
return [{self.fwd_map[k]:v for k,v in detuplify_pg(pg).items() if k != 'params'} for pg in self.opt.param_groups]
def _set_hyper(self, k, v):
for pg,v_ in zip(self.opt.param_groups,v): pg = set_item_pg(pg, self.bwd_map[k], v_)
def clear_state(self): self.opt.state = defaultdict(dict, {})
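# Example (illustrative sketch, not executed here; `model` is an assumed module): wrapping
# a PyTorch optimizer so the fastai hyper-parameter names (`lr`, `mom`, `sqr_mom`, `wd`, ...)
# can be read and set on it.
#   opt = OptimWrapper(torch.optim.Adam(model.parameters(), lr=1e-3))
#   opt.hypers                         # e.g. [{'lr': 0.001, 'mom': 0.9, 'sqr_mom': 0.999, ...}]
#   opt.set_hypers(lr=1e-2, mom=0.95)  # maps back onto 'lr' and 'betas' of the wrapped opt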