Unverified commit b37110e3, authored by Farhad Ramezanghorbani, committed by GitHub

AdamW implementation minor fix (#261)

parent 48eeb3dc
@@ -65,6 +65,12 @@ class AdamW(Optimizer):
             for p in group['params']:
                 if p.grad is None:
                     continue
+
+                # Perform stepweight decay
+                # p.data.mul_(1 - group['lr'] * group['weight_decay'])  # AdamW
+                p.data.mul_(1 - group['weight_decay'])  # Neurochem
+
+                # Perform optimization step
                 grad = p.grad.data
                 if grad.is_sparse:
                     raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
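For reference, the commented-out line above is the decoupled weight decay of the AdamW paper, in which the shrink factor scales with the learning rate; the line kept in this commit applies a fixed multiplicative decay each step (the Neurochem convention). A minimal sketch of the two variants, using illustrative tensor and hyperparameter values rather than anything from this repository:

```python
import torch

p = torch.randn(3)              # illustrative parameter tensor
lr, weight_decay = 1e-3, 1e-4   # illustrative hyperparameter values

# AdamW-style decoupled decay: the shrink factor scales with the learning rate.
# p.mul_(1 - lr * weight_decay)

# Decay used in this commit: a fixed shrink factor per step, independent of lr.
p.mul_(1 - weight_decay)
```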
@@ -89,6 +95,8 @@ class AdamW(Optimizer):
                 beta1, beta2 = group['betas']

                 state['step'] += 1
+                bias_correction1 = 1 - beta1 ** state['step']
+                bias_correction2 = 1 - beta2 ** state['step']

                 # Decay the first and second moment running average coefficient
                 exp_avg.mul_(beta1).add_(1 - beta1, grad)
@@ -97,17 +105,12 @@ class AdamW(Optimizer):
                     # Maintains the maximum of all 2nd moment running avg. till now
                     torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                     # Use the max. for normalizing running avg. of gradient
-                    denom = max_exp_avg_sq.sqrt().add_(group['eps'])
+                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                 else:
-                    denom = exp_avg_sq.sqrt().add_(group['eps'])
+                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

-                bias_correction1 = 1 - beta1 ** state['step']
-                bias_correction2 = 1 - beta2 ** state['step']
-                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
+                step_size = group['lr'] / bias_correction1

                 p.data.addcdiv_(-step_size, exp_avg, denom)

-                if group['weight_decay'] != 0:
-                    p.data.add_(-group['weight_decay'], p.data)
-
         return loss
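Taken together, the remaining hunks move the bias correction out of the combined `step_size` factor: `bias_correction2` now rescales the second-moment denominator directly and `bias_correction1` divides the learning rate, while the old additive weight-decay block at the end is dropped in favour of the multiplicative decay above. A rough, self-contained sketch of the resulting single-parameter update (amsgrad branch omitted; all names and values below are illustrative, not taken from the repository):

```python
import math
import torch

# Illustrative state and hyperparameters for a single parameter tensor.
p = torch.randn(3)
grad = torch.randn(3)
exp_avg = torch.zeros(3)
exp_avg_sq = torch.zeros(3)
lr, eps, weight_decay = 1e-3, 1e-8, 1e-4
beta1, beta2 = 0.9, 0.999
step = 1

# Multiplicative weight decay before the optimization step (first hunk).
p.mul_(1 - weight_decay)

# Biased first and second moment estimates.
exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

# Bias corrections (second hunk).
bias_correction1 = 1 - beta1 ** step
bias_correction2 = 1 - beta2 ** step

# Bias-corrected denominator and step size (third hunk).
denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps)
step_size = lr / bias_correction1

# Parameter update: p <- p - step_size * exp_avg / denom
p.addcdiv_(exp_avg, denom, value=-step_size)
```

With `weight_decay = 0` this reduces to a plain bias-corrected Adam step; the old and new formulations of the update differ only in where `eps` enters the denominator, since lr·√bc2/bc1 · m/(√v + √bc2·eps) equals (lr/bc1) · m/(√v/√bc2 + eps).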