Unverified Commit 923c8af4 authored by Gao, Xiang, committed by GitHub

Python2 Inference Support (#171)

parent 1b2faf43
queue:
  name: Hosted Ubuntu 1604
  timeoutInMinutes: 30
trigger:
  batch: true
  branches:
    include:
    - master
variables:
  python.version: '2.7'
steps:
- task: UsePythonVersion@0
  displayName: 'Use Python $(python.version)'
  inputs:
    versionSpec: '$(python.version)'
- script: 'azure/install_dependencies.sh && pip install .'
  displayName: 'Install dependencies'
- script: 'python2 examples/energy_force.py'
  displayName: Energy and Force Example
- script: 'python2 examples/ase_interface.py'
  displayName: ASE Interface Example
@@ -16,6 +16,7 @@ calculator.
###############################################################################
# To begin with, let's first import the modules we will use:
from __future__ import print_function
from ase.lattice.cubic import Diamond
from ase.md.langevin import Langevin
from ase.optimize import BFGS
......
@@ -9,6 +9,7 @@ TorchANI and can be used directly.
###############################################################################
# To begin with, let's first import the modules we will use:
from __future__ import print_function
import torch
import torchani
......
@@ -27,12 +27,14 @@ at :attr:`torchani.ignite`, and more at :attr:`torchani.utils`.
from .utils import EnergyShifter
from .nn import ANIModel, Ensemble
from .aev import AEVComputer
from . import ignite
from . import utils
from . import neurochem
from . import data
from . import models
from pkg_resources import get_distribution, DistributionNotFound
import sys
if sys.version_info[0] > 2:
    from . import ignite
    from . import data
try:
    __version__ = get_distribution(__name__).version
......
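The hunk above registers the training-only submodules (ignite, data) only on Python 3, so the rest of the package stays importable for inference under Python 2. A minimal sketch of what this means from the user side (hedged; based only on the imports shown in this hunk):

import sys
import torchani            # imports on both Python 2 and Python 3
if sys.version_info[0] > 2:
    # training helpers are only attached to the package on Python 3
    from torchani import ignite, data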
import math
if not hasattr(math, 'inf'):
    math.inf = float('inf')
import torch
import itertools
from . import _six # noqa:F401
import math
from . import utils
......
@@ -5,6 +5,7 @@
https://wiki.fysik.dtu.dk/ase
"""
from __future__ import absolute_import
import math
import torch
import ase.neighborlist
@@ -60,7 +61,7 @@ class NeighborList:
dtype=coordinates.dtype)
cell = torch.tensor(self.cell, device=coordinates.device,
dtype=coordinates.dtype)
D += shift @ cell
D += torch.mm(shift, cell)
d = D.norm(2, -1)
neighbor_species1 = []
neighbor_distances1 = []
......
# -*- coding: utf-8 -*-
"""Helpers for working with ignite."""
from __future__ import absolute_import
import torch
from . import utils
from torch.nn.modules.loss import _Loss
from ignite.metrics.metric import Metric
from ignite.metrics import RootMeanSquaredError
from ignite.metrics import Metric, RootMeanSquaredError
from ignite.contrib.metrics.regression import MaximumAbsoluteError
......
@@ -11,14 +11,16 @@ import itertools
import ignite
import math
import timeit
from collections.abc import Mapping
from . import _six # noqa:F401
import collections
import sys
from ..nn import ANIModel, Ensemble, Gaussian
from ..utils import EnergyShifter, ChemicalSymbolsToInts
from ..aev import AEVComputer
from ..ignite import Container, MSELoss, TransformedLoss, RMSEMetric, MAEMetric
class Constants(Mapping):
class Constants(collections.abc.Mapping):
"""NeuroChem constants. Objects of this class can be used as arguments
to :class:`torchani.AEVComputer`, like ``torchani.AEVComputer(**consts)``.
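As this docstring notes, a Constants object unpacks directly into the AEV computer. A minimal sketch of that pattern (the constants file name below is hypothetical):

consts = Constants('rHCNO.params')      # hypothetical constants file
aev_computer = AEVComputer(**consts)    # same pattern Trainer uses below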
@@ -259,7 +261,7 @@ def load_model_ensemble(species, prefix, count):
return Ensemble(models)
class BuiltinsAbstract:
class BuiltinsAbstract(object):
"""Base class for loading ANI neural network from configuration files.
Arguments:
@@ -377,408 +379,415 @@ def hartree2kcal(x):
return 627.509 * x
from ..data import BatchedANIDataset # noqa: E402
from ..data import AEVCacheLoader # noqa: E402
if sys.version_info[0] > 2:
    from ..data import BatchedANIDataset  # noqa: E402
    from ..data import AEVCacheLoader  # noqa: E402
class Trainer:
"""Train with NeuroChem training configurations.
class Trainer:
"""Train with NeuroChem training configurations.
Arguments:
filename (str): Input file name
device (:class:`torch.device`): device to train the model
tqdm (bool): whether to enable tqdm
tensorboard (str): Directory to store the tensorboard log file; set to
``None`` to disable tensorboardX.
aev_caching (bool): Whether to use AEV caching.
checkpoint_name (str): Name of the checkpoint file; checkpoints will be
stored in the network directory with this file name.
"""
Arguments:
filename (str): Input file name
device (:class:`torch.device`): device to train the model
tqdm (bool): whether to enable tqdm
tensorboard (str): Directory to store the tensorboard log file; set to
``None`` to disable tensorboardX.
aev_caching (bool): Whether to use AEV caching.
checkpoint_name (str): Name of the checkpoint file; checkpoints
will be stored in the network directory with this file name.
"""
def __init__(self, filename, device=torch.device('cuda'), tqdm=False,
tensorboard=None, aev_caching=False,
checkpoint_name='model.pt'):
self.filename = filename
self.device = device
self.aev_caching = aev_caching
self.checkpoint_name = checkpoint_name
if tqdm:
import tqdm
self.tqdm = tqdm.tqdm
else:
self.tqdm = None
if tensorboard is not None:
import tensorboardX
self.tensorboard = tensorboardX.SummaryWriter(log_dir=tensorboard)
self.training_eval_every = 20
else:
self.tensorboard = None
with open(filename, 'r') as f:
if filename.endswith('.yaml') or filename.endswith('.yml'):
network_setup, params = self._parse_yaml(f)
def __init__(self, filename, device=torch.device('cuda'), tqdm=False,
tensorboard=None, aev_caching=False,
checkpoint_name='model.pt'):
self.filename = filename
self.device = device
self.aev_caching = aev_caching
self.checkpoint_name = checkpoint_name
if tqdm:
import tqdm
self.tqdm = tqdm.tqdm
else:
network_setup, params = self._parse(f.read())
self._construct(network_setup, params)
def _parse(self, txt):
parser = lark.Lark(r'''
identifier : CNAME
outer_assign : identifier "=" value
params : outer_assign *
inner_assign : identifier "=" value ";"
input_size : "inputsize" "=" INT ";"
layer : "layer" "[" inner_assign * "]"
atom_type : WORD
atom_net : "atom_net" atom_type "$" layer * "$"
network_setup: "network_setup" "{" input_size atom_net * "}"
start: params network_setup params
value : SIGNED_INT
| SIGNED_FLOAT
| STRING_VALUE
STRING_VALUE : ("_"|"-"|"."|"/"|LETTER)("_"|"-"|"."|"/"|LETTER|DIGIT)*
%import common.SIGNED_NUMBER
%import common.LETTER
%import common.WORD
%import common.DIGIT
%import common.INT
%import common.SIGNED_INT
%import common.SIGNED_FLOAT
%import common.CNAME
%import common.WS
%ignore WS
%ignore /!.*/
''')
tree = parser.parse(txt)
class TreeExec(lark.Transformer):
def identifier(self, v):
v = v[0].value
return v
self.tqdm = None
if tensorboard is not None:
import tensorboardX
self.tensorboard = tensorboardX.SummaryWriter(
log_dir=tensorboard)
self.training_eval_every = 20
else:
self.tensorboard = None
def value(self, v):
if len(v) == 1:
v = v[0]
if v.type == 'STRING_VALUE':
v = v.value
elif v.type == 'SIGNED_INT' or v.type == 'INT':
v = int(v.value)
elif v.type == 'SIGNED_FLOAT' or v.type == 'FLOAT':
v = float(v.value)
else:
raise ValueError('unexpected type')
with open(filename, 'r') as f:
if filename.endswith('.yaml') or filename.endswith('.yml'):
network_setup, params = self._parse_yaml(f)
else:
raise ValueError('length of value can only be 1 or 2')
return v
def outer_assign(self, v):
name = v[0]
value = v[1]
return name, value
network_setup, params = self._parse(f.read())
self._construct(network_setup, params)
def _parse(self, txt):
parser = lark.Lark(r'''
identifier : CNAME
outer_assign : identifier "=" value
params : outer_assign *
inner_assign : identifier "=" value ";"
input_size : "inputsize" "=" INT ";"
layer : "layer" "[" inner_assign * "]"
atom_type : WORD
atom_net : "atom_net" atom_type "$" layer * "$"
network_setup: "network_setup" "{" input_size atom_net * "}"
start: params network_setup params
value : SIGNED_INT
| SIGNED_FLOAT
| STRING_VALUE
STRING_VALUE : ("_"|"-"|"."|"/"|LETTER)("_"|"-"|"."|"/"|LETTER|DIGIT)*
%import common.SIGNED_NUMBER
%import common.LETTER
%import common.WORD
%import common.DIGIT
%import common.INT
%import common.SIGNED_INT
%import common.SIGNED_FLOAT
%import common.CNAME
%import common.WS
%ignore WS
%ignore /!.*/
''') # noqa: E501
tree = parser.parse(txt)
class TreeExec(lark.Transformer):
def identifier(self, v):
v = v[0].value
return v
def value(self, v):
if len(v) == 1:
v = v[0]
if v.type == 'STRING_VALUE':
v = v.value
elif v.type == 'SIGNED_INT' or v.type == 'INT':
v = int(v.value)
elif v.type == 'SIGNED_FLOAT' or v.type == 'FLOAT':
v = float(v.value)
else:
raise ValueError('unexpected type')
else:
raise ValueError('length of value can only be 1 or 2')
return v
inner_assign = outer_assign
def outer_assign(self, v):
name = v[0]
value = v[1]
return name, value
def params(self, v):
return v
inner_assign = outer_assign
def network_setup(self, v):
input_size = int(v[0])
atomic_nets = dict(v[1:])
return input_size, atomic_nets
def params(self, v):
return v
def layer(self, v):
return dict(v)
def network_setup(self, v):
input_size = int(v[0])
atomic_nets = dict(v[1:])
return input_size, atomic_nets
def atom_net(self, v):
atom_type = v[0]
layers = v[1:]
return atom_type, layers
def layer(self, v):
return dict(v)
def atom_type(self, v):
return v[0].value
def atom_net(self, v):
atom_type = v[0]
layers = v[1:]
return atom_type, layers
def start(self, v):
network_setup = v[1]
del v[1]
return network_setup, dict(itertools.chain(*v))
def atom_type(self, v):
return v[0].value
def input_size(self, v):
return v[0].value
def start(self, v):
network_setup = v[1]
del v[1]
return network_setup, dict(itertools.chain(*v))
return TreeExec().transform(tree)
def input_size(self, v):
return v[0].value
def _parse_yaml(self, f):
import yaml
params = yaml.safe_load(f)
network_setup = params['network_setup']
del params['network_setup']
network_setup = (network_setup['inputsize'], network_setup['atom_net'])
return network_setup, params
return TreeExec().transform(tree)
def _construct(self, network_setup, params):
dir_ = os.path.dirname(os.path.abspath(self.filename))
def _parse_yaml(self, f):
import yaml
params = yaml.safe_load(f)
network_setup = params['network_setup']
del params['network_setup']
network_setup = (network_setup['inputsize'],
network_setup['atom_net'])
return network_setup, params
# delete ignored params
def del_if_exists(key):
if key in params:
del params[key]
def _construct(self, network_setup, params):
dir_ = os.path.dirname(os.path.abspath(self.filename))
def assert_param(key, value):
if key in params and params[key] != value:
raise NotImplementedError(key + ' not supported yet')
del params[key]
del_if_exists('gpuid')
del_if_exists('nkde')
del_if_exists('fmult')
del_if_exists('cmult')
del_if_exists('decrate')
del_if_exists('mu')
assert_param('pbc', 0)
assert_param('force', 0)
assert_param('energy', 1)
assert_param('moment', 'ADAM')
assert_param('runtype', 'ANNP_CREATE_HDNN_AND_TRAIN')
assert_param('adptlrn', 'OFF')
assert_param('tmax', 0)
assert_param('nmax', 0)
assert_param('ntwshr', 0)
# load parameters
self.const_file = os.path.join(dir_, params['sflparamsfile'])
self.consts = Constants(self.const_file)
self.aev_computer = AEVComputer(**self.consts)
del params['sflparamsfile']
self.sae_file = os.path.join(dir_, params['atomEnergyFile'])
self.shift_energy = load_sae(self.sae_file)
del params['atomEnergyFile']
network_dir = os.path.join(dir_, params['ntwkStoreDir'])
if not os.path.exists(network_dir):
os.makedirs(network_dir)
self.model_checkpoint = os.path.join(network_dir, self.checkpoint_name)
del params['ntwkStoreDir']
self.max_nonimprove = params['tolr']
del params['tolr']
self.init_lr = params['eta']
del params['eta']
self.lr_decay = params['emult']
del params['emult']
self.min_lr = params['tcrit']
del params['tcrit']
self.training_batch_size = params['tbtchsz']
del params['tbtchsz']
self.validation_batch_size = params['vbtchsz']
del params['vbtchsz']
# construct networks
input_size, network_setup = network_setup
if input_size != self.aev_computer.aev_length():
raise ValueError('AEV size and input size do not match')
l2reg = []
atomic_nets = {}
for atom_type in network_setup:
layers = network_setup[atom_type]
modules = []
i = input_size
for layer in layers:
o = layer['nodes']
del layer['nodes']
if layer['type'] != 0:
raise ValueError('Unsupported layer type')
del layer['type']
module = torch.nn.Linear(i, o)
modules.append(module)
activation = _get_activation(layer['activation'])
if activation is not None:
modules.append(activation)
del layer['activation']
if 'l2norm' in layer:
if layer['l2norm'] == 1:
# NB: The "L2" implemented in NeuroChem is actually not
# L2 but weight decay. The difference of these two is:
# https://arxiv.org/pdf/1711.05101.pdf
# There is a pull request on github/pytorch
# implementing AdamW, etc.:
# https://github.com/pytorch/pytorch/pull/4429
# There is no plan to support the "L2" settings in
# input file before AdamW get merged into pytorch.
raise NotImplementedError('L2 not supported yet')
del layer['l2norm']
del layer['l2valu']
if layer:
raise ValueError('unrecognized parameter in layer setup')
i = o
atomic_nets[atom_type] = torch.nn.Sequential(*modules)
self.model = ANIModel([atomic_nets[s] for s in self.consts.species])
if self.aev_caching:
self.nnp = self.model
else:
self.nnp = torch.nn.Sequential(self.aev_computer, self.model)
self.container = Container({'energies': self.nnp}).to(self.device)
# losses
def l2():
return sum([c * (m.weight ** 2).sum() for c, m in l2reg])
self.mse_loss = TransformedLoss(MSELoss('energies'),
lambda x: x + l2())
self.exp_loss = TransformedLoss(
MSELoss('energies'),
lambda x: 0.5 * (torch.exp(2 * x) - 1) + l2())
if params:
raise ValueError('unrecognized parameter')
self.global_epoch = 0
self.global_iteration = 0
self.best_validation_rmse = math.inf
def evaluate(self, dataset):
"""Evaluate on given dataset to compute RMSE and MAE."""
evaluator = ignite.engine.create_supervised_evaluator(
self.container,
metrics={
'RMSE': RMSEMetric('energies'),
'MAE': MAEMetric('energies'),
}
)
evaluator.run(dataset)
metrics = evaluator.state.metrics
return hartree2kcal(metrics['RMSE']), hartree2kcal(metrics['MAE'])
# delete ignored params
def del_if_exists(key):
if key in params:
del params[key]
def load_data(self, training_path, validation_path):
"""Load training and validation dataset from file.
def assert_param(key, value):
if key in params and params[key] != value:
raise NotImplementedError(key + ' not supported yet')
del params[key]
If AEV caching is enabled, the arguments are paths to the cache
directories; otherwise they should be paths to the datasets.
"""
if self.aev_caching:
self.training_set = AEVCacheLoader(training_path)
self.validation_set = AEVCacheLoader(validation_path)
else:
self.training_set = BatchedANIDataset(
training_path, self.consts.species_to_tensor,
self.training_batch_size, device=self.device,
transform=[self.shift_energy.subtract_from_dataset])
self.validation_set = BatchedANIDataset(
validation_path, self.consts.species_to_tensor,
self.validation_batch_size, device=self.device,
transform=[self.shift_energy.subtract_from_dataset])
def run(self):
"""Run the training"""
start = timeit.default_timer()
def decorate(trainer):
@trainer.on(ignite.engine.Events.STARTED)
def initialize(trainer):
trainer.state.no_improve_count = 0
trainer.state.epoch += self.global_epoch
trainer.state.iteration += self.global_iteration
@trainer.on(ignite.engine.Events.COMPLETED)
def finalize(trainer):
self.global_epoch = trainer.state.epoch
self.global_iteration = trainer.state.iteration
if self.tqdm is not None:
@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def init_tqdm(trainer):
trainer.state.tqdm = self.tqdm(
total=len(self.training_set), desc='epoch')
del_if_exists('gpuid')
del_if_exists('nkde')
del_if_exists('fmult')
del_if_exists('cmult')
del_if_exists('decrate')
del_if_exists('mu')
assert_param('pbc', 0)
assert_param('force', 0)
assert_param('energy', 1)
assert_param('moment', 'ADAM')
assert_param('runtype', 'ANNP_CREATE_HDNN_AND_TRAIN')
assert_param('adptlrn', 'OFF')
assert_param('tmax', 0)
assert_param('nmax', 0)
assert_param('ntwshr', 0)
# load parameters
self.const_file = os.path.join(dir_, params['sflparamsfile'])
self.consts = Constants(self.const_file)
self.aev_computer = AEVComputer(**self.consts)
del params['sflparamsfile']
self.sae_file = os.path.join(dir_, params['atomEnergyFile'])
self.shift_energy = load_sae(self.sae_file)
del params['atomEnergyFile']
network_dir = os.path.join(dir_, params['ntwkStoreDir'])
if not os.path.exists(network_dir):
os.makedirs(network_dir)
self.model_checkpoint = os.path.join(network_dir,
self.checkpoint_name)
del params['ntwkStoreDir']
self.max_nonimprove = params['tolr']
del params['tolr']
self.init_lr = params['eta']
del params['eta']
self.lr_decay = params['emult']
del params['emult']
self.min_lr = params['tcrit']
del params['tcrit']
self.training_batch_size = params['tbtchsz']
del params['tbtchsz']
self.validation_batch_size = params['vbtchsz']
del params['vbtchsz']
# construct networks
input_size, network_setup = network_setup
if input_size != self.aev_computer.aev_length():
raise ValueError('AEV size and input size do not match')
l2reg = []
atomic_nets = {}
for atom_type in network_setup:
layers = network_setup[atom_type]
modules = []
i = input_size
for layer in layers:
o = layer['nodes']
del layer['nodes']
if layer['type'] != 0:
raise ValueError('Unsupported layer type')
del layer['type']
module = torch.nn.Linear(i, o)
modules.append(module)
activation = _get_activation(layer['activation'])
if activation is not None:
modules.append(activation)
del layer['activation']
if 'l2norm' in layer:
if layer['l2norm'] == 1:
# NB: The "L2" implemented in NeuroChem is actually
# not L2 but weight decay. The difference of these
# two is:
# https://arxiv.org/pdf/1711.05101.pdf
# There is a pull request on github/pytorch
# implementing AdamW, etc.:
# https://github.com/pytorch/pytorch/pull/4429
# There is no plan to support the "L2" settings in
# input file before AdamW get merged into pytorch.
raise NotImplementedError('L2 not supported yet')
del layer['l2norm']
del layer['l2valu']
if layer:
raise ValueError(
'unrecognized parameter in layer setup')
i = o
atomic_nets[atom_type] = torch.nn.Sequential(*modules)
self.model = ANIModel([atomic_nets[s]
for s in self.consts.species])
if self.aev_caching:
self.nnp = self.model
else:
self.nnp = torch.nn.Sequential(self.aev_computer, self.model)
self.container = Container({'energies': self.nnp}).to(self.device)
# losses
def l2():
return sum([c * (m.weight ** 2).sum() for c, m in l2reg])
self.mse_loss = TransformedLoss(MSELoss('energies'),
lambda x: x + l2())
self.exp_loss = TransformedLoss(
MSELoss('energies'),
lambda x: 0.5 * (torch.exp(2 * x) - 1) + l2())
if params:
raise ValueError('unrecognized parameter')
self.global_epoch = 0
self.global_iteration = 0
self.best_validation_rmse = math.inf
def evaluate(self, dataset):
"""Evaluate on given dataset to compute RMSE and MAE."""
evaluator = ignite.engine.create_supervised_evaluator(
self.container,
metrics={
'RMSE': RMSEMetric('energies'),
'MAE': MAEMetric('energies'),
}
)
evaluator.run(dataset)
metrics = evaluator.state.metrics
return hartree2kcal(metrics['RMSE']), hartree2kcal(metrics['MAE'])
def load_data(self, training_path, validation_path):
"""Load training and validation dataset from file.
If AEV caching is enabled, the arguments are paths to the cache
directories; otherwise they should be paths to the datasets.
"""
if self.aev_caching:
self.training_set = AEVCacheLoader(training_path)
self.validation_set = AEVCacheLoader(validation_path)
else:
self.training_set = BatchedANIDataset(
training_path, self.consts.species_to_tensor,
self.training_batch_size, device=self.device,
transform=[self.shift_energy.subtract_from_dataset])
self.validation_set = BatchedANIDataset(
validation_path, self.consts.species_to_tensor,
self.validation_batch_size, device=self.device,
transform=[self.shift_energy.subtract_from_dataset])
def run(self):
"""Run the training"""
start = timeit.default_timer()
def decorate(trainer):
@trainer.on(ignite.engine.Events.STARTED)
def initialize(trainer):
trainer.state.no_improve_count = 0
trainer.state.epoch += self.global_epoch
trainer.state.iteration += self.global_iteration
@trainer.on(ignite.engine.Events.ITERATION_COMPLETED)
def update_tqdm(trainer):
trainer.state.tqdm.update(1)
@trainer.on(ignite.engine.Events.COMPLETED)
def finalize(trainer):
self.global_epoch = trainer.state.epoch
self.global_iteration = trainer.state.iteration
@trainer.on(ignite.engine.Events.EPOCH_COMPLETED)
def finalize_tqdm(trainer):
trainer.state.tqdm.close()
if self.tqdm is not None:
@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def init_tqdm(trainer):
trainer.state.tqdm = self.tqdm(
total=len(self.training_set), desc='epoch')
@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def validation_and_checkpoint(trainer):
trainer.state.rmse, trainer.state.mae = \
self.evaluate(self.validation_set)
if trainer.state.rmse < self.best_validation_rmse:
trainer.state.no_improve_count = 0
self.best_validation_rmse = trainer.state.rmse
torch.save(self.model.state_dict(), self.model_checkpoint)
else:
trainer.state.no_improve_count += 1
@trainer.on(ignite.engine.Events.ITERATION_COMPLETED)
def update_tqdm(trainer):
trainer.state.tqdm.update(1)
if trainer.state.no_improve_count > self.max_nonimprove:
trainer.terminate()
@trainer.on(ignite.engine.Events.EPOCH_COMPLETED)
def finalize_tqdm(trainer):
trainer.state.tqdm.close()
if self.tensorboard is not None:
@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def log_per_epoch(trainer):
elapsed = round(timeit.default_timer() - start, 2)
epoch = trainer.state.epoch
self.tensorboard.add_scalar('time_vs_epoch', elapsed,
epoch)
self.tensorboard.add_scalar('learning_rate_vs_epoch', lr,
epoch)
self.tensorboard.add_scalar('validation_rmse_vs_epoch',
trainer.state.rmse, epoch)
self.tensorboard.add_scalar('validation_mae_vs_epoch',
trainer.state.mae, epoch)
self.tensorboard.add_scalar(
'best_validation_rmse_vs_epoch',
self.best_validation_rmse, epoch)
self.tensorboard.add_scalar('no_improve_count_vs_epoch',
trainer.state.no_improve_count,
epoch)
# compute training RMSE and MAE
if epoch % self.training_eval_every == 1:
training_rmse, training_mae = \
self.evaluate(self.training_set)
self.tensorboard.add_scalar('training_rmse_vs_epoch',
training_rmse, epoch)
self.tensorboard.add_scalar('training_mae_vs_epoch',
training_mae, epoch)
@trainer.on(ignite.engine.Events.ITERATION_COMPLETED)
def log_loss(trainer):
iteration = trainer.state.iteration
loss = trainer.state.output
self.tensorboard.add_scalar('loss_vs_iteration',
loss, iteration)
lr = self.init_lr
# train using MSE loss first until the validation MAE decreases
# below 1 kcal/mol
optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
trainer = ignite.engine.create_supervised_trainer(
self.container, optimizer, self.mse_loss)
decorate(trainer)
@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def terminate_if_smaller_enough(trainer):
if trainer.state.mae < 1.0:
trainer.terminate()
trainer.run(self.training_set, max_epochs=math.inf)
while lr > self.min_lr:
def validation_and_checkpoint(trainer):
trainer.state.rmse, trainer.state.mae = \
self.evaluate(self.validation_set)
if trainer.state.rmse < self.best_validation_rmse:
trainer.state.no_improve_count = 0
self.best_validation_rmse = trainer.state.rmse
torch.save(self.model.state_dict(),
self.model_checkpoint)
else:
trainer.state.no_improve_count += 1
if trainer.state.no_improve_count > self.max_nonimprove:
trainer.terminate()
if self.tensorboard is not None:
@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def log_per_epoch(trainer):
elapsed = round(timeit.default_timer() - start, 2)
epoch = trainer.state.epoch
self.tensorboard.add_scalar('time_vs_epoch', elapsed,
epoch)
self.tensorboard.add_scalar('learning_rate_vs_epoch',
lr, epoch)
self.tensorboard.add_scalar('validation_rmse_vs_epoch',
trainer.state.rmse, epoch)
self.tensorboard.add_scalar('validation_mae_vs_epoch',
trainer.state.mae, epoch)
self.tensorboard.add_scalar(
'best_validation_rmse_vs_epoch',
self.best_validation_rmse, epoch)
self.tensorboard.add_scalar(
'no_improve_count_vs_epoch',
trainer.state.no_improve_count, epoch)
# compute training RMSE and MAE
if epoch % self.training_eval_every == 1:
training_rmse, training_mae = \
self.evaluate(self.training_set)
self.tensorboard.add_scalar(
'training_rmse_vs_epoch', training_rmse, epoch)
self.tensorboard.add_scalar(
'training_mae_vs_epoch', training_mae, epoch)
@trainer.on(ignite.engine.Events.ITERATION_COMPLETED)
def log_loss(trainer):
iteration = trainer.state.iteration
loss = trainer.state.output
self.tensorboard.add_scalar('loss_vs_iteration',
loss, iteration)
lr = self.init_lr
# train using MSE loss first until the validation MAE decreases
# below 1 kcal/mol
optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
trainer = ignite.engine.create_supervised_trainer(
self.container, optimizer, self.exp_loss)
self.container, optimizer, self.mse_loss)
decorate(trainer)
@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def terminate_if_smaller_enough(trainer):
if trainer.state.mae < 1.0:
trainer.terminate()
trainer.run(self.training_set, max_epochs=math.inf)
lr *= self.lr_decay
while lr > self.min_lr:
optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
trainer = ignite.engine.create_supervised_trainer(
self.container, optimizer, self.exp_loss)
decorate(trainer)
trainer.run(self.training_set, max_epochs=math.inf)
lr *= self.lr_decay
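For reference, a minimal usage sketch of the Trainer defined above (hedged; the input and dataset paths are hypothetical, and only constructor arguments and methods shown in this diff are used):

trainer = Trainer('inputtrain.ipt', device=torch.device('cuda'),
                  tqdm=True, aev_caching=False)               # hypothetical input file
trainer.load_data('training_data.h5', 'validation_data.h5')   # hypothetical paths
trainer.run()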
__all__ = ['Constants', 'load_sae', 'load_model', 'load_model_ensemble',
......
import collections
if not hasattr(collections, 'abc'):
    collections.abc = collections
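This shim module is imported purely for its side effects (see the `from . import _six  # noqa:F401` lines in the aev and neurochem hunks above). A minimal sketch of the pattern inside a consuming module (hedged; it restates only what those hunks already do):

from . import _six  # noqa: F401  (patches math.inf and collections.abc on Python 2)
import math
import collections
math.inf                   # defined even on Python 2 once the shim has run
collections.abc.Mapping    # resolves on Python 2 because abc is aliased to collections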