Change AdamW from TorchANI to PyTorch (#464)

* switch AdamW from torchani to pytorch
* fix __init__

Co-authored-by: Farhad Ramezanghorbani <farhadrgh@users.noreply.github.com>

Change AdamW from TorchANI to PyTorch (#464)
* switch AdamW from torchani to pytorch
* fix __init__

Co-authored-by: Farhad Ramezanghorbani <farhadrgh@users.noreply.github.com>
14b9a395 · Rocco Meli · GitHub · de9020d1 · 14b9a395 · 14b9a395
Unverified commit 14b9a395, authored May 11, 2020 by Rocco Meli; committed by GitHub on May 11, 2020.
6 changed files
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -65,14 +65,6 @@ ASE Interface
 .. automodule:: torchani.ase
 .. autoclass:: torchani.ase.Calculator

-
-TorchANI Optimizater
-====================
-
-.. automodule:: torchani.optim
-.. autoclass:: torchani.optim.AdamW
-
-
 Units
 =====


--- a/examples/nnp_training.py
+++ b/examples/nnp_training.py
@@ -178,7 +178,7 @@ model = torchani.nn.Sequential(aev_computer, nn).to(device)
 # .. _Decoupled Weight Decay Regularization:
 #   https://arxiv.org/abs/1711.05101

-AdamW = torchani.optim.AdamW([
+AdamW = torch.optim.AdamW([
    # H networks
    {'params': [H_network[0].weight]},
    {'params': [H_network[2].weight], 'weight_decay': 0.00001},

--- a/examples/nnp_training_force.py
+++ b/examples/nnp_training_force.py
@@ -135,7 +135,7 @@ model = torchani.nn.Sequential(aev_computer, nn).to(device)
 # Here we will use Adam with weight decay for the weights and Stochastic Gradient
 # Descent for biases.

-AdamW = torchani.optim.AdamW([
+AdamW = torch.optim.AdamW([
    # H networks
    {'params': [H_network[0].weight]},
    {'params': [H_network[2].weight], 'weight_decay': 0.00001},

--- a/torchani/__init__.py
+++ b/torchani/__init__.py
@@ -29,7 +29,6 @@ from .aev import AEVComputer
 from . import utils
 from . import neurochem
 from . import models
-from . import optim
 from . import units
 from pkg_resources import get_distribution, DistributionNotFound

@@ -40,7 +39,7 @@ except DistributionNotFound:
    pass

 __all__ = ['AEVComputer', 'EnergyShifter', 'ANIModel', 'Ensemble', 'SpeciesConverter',
-           'utils', 'neurochem', 'models', 'optim', 'units']
+           'utils', 'neurochem', 'models', 'units']

 try:
    from . import ase  # noqa: F401

--- a/torchani/neurochem/__init__.py
+++ b/torchani/neurochem/__init__.py
@@ -14,7 +14,7 @@ import sys
 from ..nn import ANIModel, Ensemble, Gaussian, Sequential
 from ..utils import EnergyShifter, ChemicalSymbolsToInts
 from ..aev import AEVComputer
-from ..optim import AdamW
+from torch.optim import AdamW
 from collections import OrderedDict
 from torchani.units import hartree2kcalmol


--- a/torchani/optim.py
+++ b/torchani/optim.py
-"""AdamW implementation"""
-import math
-import torch
-from torch.optim.optimizer import Optimizer
-
-
-# Copied and modified from: https://github.com/pytorch/pytorch/pull/4429
-class AdamW(Optimizer):
-    r"""Implements AdamW algorithm.
-
-    It has been proposed in `Decoupled Weight Decay Regularization`_.
-
-    Arguments:
-        params (iterable): iterable of parameters to optimize or dicts defining
-            parameter groups
-        lr (float, optional): learning rate (default: 1e-3)
-        betas (Tuple[float, float], optional): coefficients used for computing
-            running averages of gradient and its square (default: (0.9, 0.999))
-        eps (float, optional): term added to the denominator to improve
-            numerical stability (default: 1e-8)
-        weight_decay (float, optional): weight decay factor (default: 0)
-        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
-            algorithm from the paper `On the Convergence of Adam and Beyond`_
-            (default: False)
-
-    .. _Adam\: A Method for Stochastic Optimization:
-        https://arxiv.org/abs/1412.6980
-    .. _Decoupled Weight Decay Regularization:
-        https://arxiv.org/abs/1711.05101
-    .. _On the Convergence of Adam and Beyond:
-        https://openreview.net/forum?id=ryQu7f-RZ
-    """
-
-    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
-                 weight_decay=0, amsgrad=False):
-        if not 0.0 <= lr:
-            raise ValueError("Invalid learning rate: {}".format(lr))
-        if not 0.0 <= eps:
-            raise ValueError("Invalid epsilon value: {}".format(eps))
-        if not 0.0 <= betas[0] < 1.0:
-            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
-        if not 0.0 <= betas[1] < 1.0:
-            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
-        defaults = dict(lr=lr, betas=betas, eps=eps,
-                        weight_decay=weight_decay, amsgrad=amsgrad)
-        super(AdamW, self).__init__(params, defaults)
-
-    def __setstate__(self, state):
-        super(AdamW, self).__setstate__(state)
-        for group in self.param_groups:
-            group.setdefault('amsgrad', False)
-
-    def step(self, closure=None):
-        """Performs a single optimization step.
-
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-        loss = None
-        if closure is not None:
-            loss = closure()
-
-        for group in self.param_groups:
-            for p in group['params']:
-                if p.grad is None:
-                    continue
-
-                # Perform stepweight decay
-                # p.data.mul_(1 - group['lr'] * group['weight_decay'])  # AdamW
-                p.data.mul_(1 - group['weight_decay'])  # Neurochem
-
-                # Perform optimization step
-                grad = p.grad.data
-                if grad.is_sparse:
-                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
-                amsgrad = group['amsgrad']
-
-                state = self.state[p]
-
-                # State initialization
-                if len(state) == 0:
-                    state['step'] = 0
-                    # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
-                    # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
-                    if amsgrad:
-                        # Maintains max of all exp. moving avg. of sq. grad. values
-                        state['max_exp_avg_sq'] = torch.zeros_like(p.data)
-
-                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
-                if amsgrad:
-                    max_exp_avg_sq = state['max_exp_avg_sq']
-                beta1, beta2 = group['betas']
-
-                state['step'] += 1
-                bias_correction1 = 1 - beta1 ** state['step']
-                bias_correction2 = 1 - beta2 ** state['step']
-
-                # Decay the first and second moment running average coefficient
-                exp_avg.mul_(beta1).add_(1 - beta1, grad)
-                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
-                if amsgrad:
-                    # Maintains the maximum of all 2nd moment running avg. till now
-                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
-                    # Use the max. for normalizing running avg. of gradient
-                    denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
-                else:
-                    denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
-
-                step_size = group['lr'] / bias_correction1
-
-                p.data.addcdiv_(-step_size, exp_avg, denom)
-
-        return loss