Commit 9a951216 authored by Sergey Edunov's avatar Sergey Edunov
Browse files

Adjust weight decay by the current learning rate to make it work correctly during annealing

parent e4c935aa
......@@ -96,7 +96,7 @@ class Adam(Optimizer):
step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
if group['weight_decay'] != 0:
p.data.add_(-group['weight_decay'], p.data)
p.data.add_(-group['weight_decay'] * group['lr'], p.data)
p.data.addcdiv_(-step_size, exp_avg, denom)
......
......@@ -44,7 +44,7 @@ class NAG(Optimizer):
buf = param_state['momentum_buffer']
if weight_decay != 0:
p.data.mul_(1 - weight_decay)
p.data.mul_(1 - lr * weight_decay)
p.data.add_(momentum * momentum * lr_correct, buf)
p.data.add_(-(1 + momentum) * lr, d_p)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment