Commit 9a951216 authored by Sergey Edunov's avatar Sergey Edunov
Browse files

Adjust weight decay by the current learning rate to make it work correctly during annealing

parent e4c935aa
......@@ -96,7 +96,7 @@ class Adam(Optimizer):
step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
if group['weight_decay'] != 0:
p.data.add_(-group['weight_decay'], p.data)
p.data.add_(-group['weight_decay'] * group['lr'], p.data)
p.data.addcdiv_(-step_size, exp_avg, denom)
......
......@@ -44,7 +44,7 @@ class NAG(Optimizer):
buf = param_state['momentum_buffer']
if weight_decay != 0:
p.data.mul_(1 - weight_decay)
p.data.mul_(1 - lr * weight_decay)
p.data.add_(momentum * momentum * lr_correct, buf)
p.data.add_(-(1 + momentum) * lr, d_p)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment