Unverified Commit b546adb8 authored by Gao, Xiang's avatar Gao, Xiang Committed by GitHub
Browse files

AEV Cache, part2 (#89)

parent bfc04ac8
...@@ -21,3 +21,5 @@ benchmark_xyz ...@@ -21,3 +21,5 @@ benchmark_xyz
/*.params /*.params
/*.dat /*.dat
/tmp /tmp
*_cache
...@@ -37,4 +37,5 @@ steps: ...@@ -37,4 +37,5 @@ steps:
Docs: Docs:
image: '${{BuildTorchANI}}' image: '${{BuildTorchANI}}'
commands: commands:
- sphinx-build docs build - find . -name '*.pt' -delete
- sphinx-build -D plot_gallery=0 docs build
...@@ -15,6 +15,7 @@ Datasets ...@@ -15,6 +15,7 @@ Datasets
.. automodule:: torchani.data .. automodule:: torchani.data
.. autoclass:: torchani.data.BatchedANIDataset .. autoclass:: torchani.data.BatchedANIDataset
.. autoclass:: torchani.data.AEVCacheLoader
.. automodule:: torchani.data.cache_aev .. automodule:: torchani.data.cache_aev
......
...@@ -2,8 +2,6 @@ ...@@ -2,8 +2,6 @@
Welcome to TorchANI's documentation! Welcome to TorchANI's documentation!
==================================== ====================================
Precompute AEVs to Improve Training Performance
.. automodule:: torchani .. automodule:: torchani
.. toctree:: .. toctree::
...@@ -18,6 +16,7 @@ Precompute AEVs to Improve Training Performance ...@@ -18,6 +16,7 @@ Precompute AEVs to Improve Training Performance
examples/energy_force examples/energy_force
examples/nnp_training examples/nnp_training
examples/cache_aev
examples/neurochem_trainer examples/neurochem_trainer
.. toctree:: .. toctree::
......
# -*- coding: utf-8 -*-
"""
Use Disk Cache of AEV to Boost Training
=======================================
In the previous :ref:`training-example` example, AEVs are computed every time
they are needed. This is not very efficient because the AEVs never actually
change during training. If one has a good SSD, it would be beneficial to cache
these AEVs. This example shows how to use a disk cache to boost training.
"""
###############################################################################
# Most of the code in this example is a line-by-line copy of
# :ref:`training-example`.
import torch
import ignite
import torchani
import tqdm
import timeit
import tensorboardX
import os
import sys
# training and validation set
# Locate the directory of this script; ``__file__`` is undefined in some
# execution environments, in which case the working directory is used.
try:
    script_file = os.path.realpath(__file__)
except NameError:
    path = os.getcwd()
else:
    path = os.path.dirname(script_file)
# The same dataset file is used for both training and validation here.
dataset_rel = '../dataset/ani_gdb_s01.h5'
training_path = os.path.join(path, dataset_rel)
validation_path = os.path.join(path, dataset_rel)
# checkpoint file to save model when validation RMSE improves
model_checkpoint = 'model.pt'
# max epochs to run the training
max_epochs = 20
# Compute the training RMSE once every this many epochs. Since the training
# set is usually huge and the loss function does not directly give us the
# RMSE, we need to check the training RMSE to detect overfitting.
training_rmse_every = 5
# device to run the training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# batch size
batch_size = 1024
# log directory for tensorboardX
log = 'runs'
###############################################################################
# Here, there is no need to manually construct an AEV computer and an energy
# shifter, but we do need to generate a disk cache for the datasets.
const_file = os.path.join(path, '../torchani/resources/ani-1x_dft_x8ens/rHCNO-5.2R_16-3.5A_a4-8.params') # noqa: E501
sae_file = os.path.join(path, '../torchani/resources/ani-1x_dft_x8ens/sae_linfit.dat') # noqa: E501
training_cache = './training_cache'
validation_cache = './validation_cache'
# If the cache directories already exist, we assume the data has already been
# cached and skip the generation step.
# Generate each missing cache, training first and then validation. SAE
# subtraction is enabled (the positional ``True``) for both.
for cache_dir, dataset_file in ((training_cache, training_path),
                                (validation_cache, validation_path)):
    if os.path.exists(cache_dir):
        continue
    torchani.data.cache_aev(cache_dir, dataset_file, batch_size, device,
                            const_file, True, sae_file)
###############################################################################
# The codes that define the network are also the same
def atomic():
    """Build one network: Linear layers 384 -> 128 -> 128 -> 64 -> 1.

    Every linear layer except the last one is followed by ``CELU(0.1)``.
    """
    widths = (384, 128, 128, 64, 1)
    layers = []
    for n_in, n_out in zip(widths[:-1], widths[1:]):
        layers.append(torch.nn.Linear(n_in, n_out))
        layers.append(torch.nn.CELU(0.1))
    # The output layer has no activation, so drop the trailing CELU.
    layers.pop()
    return torch.nn.Sequential(*layers)
# Four networks built by ``atomic()`` are combined into one ANIModel.
nn = torchani.ANIModel([atomic() for _ in range(4)])
print(nn)
# Resume from an existing checkpoint; otherwise write the freshly
# initialized weights so that the checkpoint file always exists.
if not os.path.isfile(model_checkpoint):
    torch.save(nn.state_dict(), model_checkpoint)
else:
    nn.load_state_dict(torch.load(model_checkpoint))
class Flatten(torch.nn.Module):
    """Pass the first element of the input through and flatten the second."""

    def forward(self, x):
        first = x[0]
        second = x[1]
        return first, second.flatten()
###############################################################################
# Except that here we do not include the AEV computer in our pipeline, because
# the cache loader will load the precomputed AEVs from disk.
model = torch.nn.Sequential(nn, Flatten()).to(device)
###############################################################################
# This part is also a line by line copy
writer = tensorboardX.SummaryWriter(log_dir=log)
###############################################################################
# Here we don't need to construct :class:`torchani.data.BatchedANIDataset`
# object, but instead an object of :class:`torchani.data.AEVCacheLoader`
training = torchani.data.AEVCacheLoader(training_cache)
validation = torchani.data.AEVCacheLoader(validation_cache)
###############################################################################
# The rest of the code is again the same
# NOTE: ``training`` has already been constructed earlier in this script.
# Building a second AEVCacheLoader for the same cache would only start a
# redundant loader process, so it is not re-created here.
container = torchani.ignite.Container({'energies': model})
optimizer = torch.optim.Adam(model.parameters())
trainer = ignite.engine.create_supervised_trainer(
    container, optimizer, torchani.ignite.MSELoss('energies'))
evaluator = ignite.engine.create_supervised_evaluator(container, metrics={
    'RMSE': torchani.ignite.RMSEMetric('energies')
})
@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def init_tqdm(trainer):
    """Attach a fresh progress bar to the trainer state at epoch start."""
    progress = tqdm.tqdm(total=len(training), file=sys.stdout, desc='epoch')
    trainer.state.tqdm = progress
@trainer.on(ignite.engine.Events.ITERATION_COMPLETED)
def update_tqdm(trainer):
    """Advance the progress bar by one step after every iteration."""
    progress = trainer.state.tqdm
    progress.update(1)
@trainer.on(ignite.engine.Events.EPOCH_COMPLETED)
def finalize_tqdm(trainer):
    """Close the progress bar once the epoch has completed."""
    progress = trainer.state.tqdm
    progress.close()
def hartree2kcal(x):
    """Scale ``x`` by the Hartree to kcal/mol factor used here (627.509)."""
    kcal_per_hartree = 627.509
    return kcal_per_hartree * x
@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def validation_and_checkpoint(trainer):
    """Log RMSE values and checkpoint the model at the start of each epoch.

    The validation RMSE is logged every epoch. The training RMSE is logged
    only when ``epoch % training_rmse_every == 1``. The state dict of ``nn``
    is then written to ``model_checkpoint``.
    """
    epoch = trainer.state.epoch

    def evaluate(dataset, name):
        # A new evaluator engine is created for every evaluation run.
        rmse_metric = {'RMSE': torchani.ignite.RMSEMetric('energies')}
        runner = ignite.engine.create_supervised_evaluator(
            container, metrics=rmse_metric)
        runner.run(dataset)
        # The RMSE is converted with hartree2kcal before being logged.
        rmse_kcal = hartree2kcal(runner.state.metrics['RMSE'])
        writer.add_scalar(name, rmse_kcal, epoch)

    # compute validation RMSE
    evaluate(validation, 'validation_rmse_vs_epoch')

    # compute training RMSE
    if epoch % training_rmse_every == 1:
        evaluate(training, 'training_rmse_vs_epoch')

    # checkpoint model
    torch.save(nn.state_dict(), model_checkpoint)
start = timeit.default_timer()
@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def log_time(trainer):
    """Log the seconds elapsed since ``start``, rounded to two decimals."""
    now = timeit.default_timer()
    elapsed = round(now - start, 2)
    writer.add_scalar('time_vs_epoch', elapsed, trainer.state.epoch)
@trainer.on(ignite.engine.Events.ITERATION_COMPLETED)
def log_loss(trainer):
    """Log the trainer output of the finished iteration to tensorboard."""
    state = trainer.state
    writer.add_scalar('loss_vs_iteration', state.output, state.iteration)
trainer.run(training, max_epochs)
###############################################################################
# In the end, we explicitly close the loaders' processes. If the loading
# processes are not closed, they would prevent the whole program from
# terminating. A loading process is closed automatically when its
# :class:`torchani.data.AEVCacheLoader` object is garbage collected, but since
# our cache loader objects are in global scope here, they won't be garbage
# collected, so we need to terminate these processes manually.
# Terminate the loader processes explicitly, training loader first.
for loader in (training, validation):
    loader.__del__()
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
.. _training-example:
Train Your Own Neural Network Potential Train Your Own Neural Network Potential
======================================= =======================================
...@@ -83,8 +85,15 @@ def atomic(): ...@@ -83,8 +85,15 @@ def atomic():
return model return model
model = torchani.ANIModel([atomic() for _ in range(4)]) nn = torchani.ANIModel([atomic() for _ in range(4)])
print(model) print(nn)
###############################################################################
# If checkpoint from previous training exists, then load it.
if os.path.isfile(model_checkpoint):
nn.load_state_dict(torch.load(model_checkpoint))
else:
torch.save(nn.state_dict(), model_checkpoint)
############################################################################### ###############################################################################
...@@ -97,15 +106,7 @@ class Flatten(torch.nn.Module): ...@@ -97,15 +106,7 @@ class Flatten(torch.nn.Module):
return x[0], x[1].flatten() return x[0], x[1].flatten()
model = torch.nn.Sequential(aev_computer, model, Flatten()) model = torch.nn.Sequential(aev_computer, nn, Flatten()).to(device)
###############################################################################
# If checkpoint from previous training exists, then load it.
if os.path.isfile(model_checkpoint):
model.load_state_dict(torch.load(model_checkpoint))
else:
torch.save(model.state_dict(), model_checkpoint)
model.to(device)
############################################################################### ###############################################################################
...@@ -208,6 +209,9 @@ def validation_and_checkpoint(trainer): ...@@ -208,6 +209,9 @@ def validation_and_checkpoint(trainer):
if trainer.state.epoch % training_rmse_every == 1: if trainer.state.epoch % training_rmse_every == 1:
evaluate(training, 'training_rmse_vs_epoch') evaluate(training, 'training_rmse_vs_epoch')
# checkpoint model
torch.save(nn.state_dict(), model_checkpoint)
############################################################################### ###############################################################################
# Also some to log elapsed time: # Also some to log elapsed time:
......
...@@ -6,9 +6,11 @@ from os.path import join, isfile, isdir ...@@ -6,9 +6,11 @@ from os.path import join, isfile, isdir
import os import os
from ._pyanitools import anidataloader from ._pyanitools import anidataloader
import torch import torch
from .. import utils from .. import utils, neurochem, aev
import pickle import pickle
default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
def chunk_counts(counts, split): def chunk_counts(counts, split):
split = [x + 1 for x in split] + [None] split = [x + 1 for x in split] + [None]
...@@ -131,7 +133,7 @@ class BatchedANIDataset(Dataset): ...@@ -131,7 +133,7 @@ class BatchedANIDataset(Dataset):
def __init__(self, path, species_tensor_converter, batch_size, def __init__(self, path, species_tensor_converter, batch_size,
shuffle=True, properties=['energies'], transform=(), shuffle=True, properties=['energies'], transform=(),
dtype=torch.get_default_dtype(), device=torch.device('cpu')): dtype=torch.get_default_dtype(), device=default_device):
super(BatchedANIDataset, self).__init__() super(BatchedANIDataset, self).__init__()
self.properties = properties self.properties = properties
self.device = device self.device = device
...@@ -256,7 +258,7 @@ class AEVCacheLoader: ...@@ -256,7 +258,7 @@ class AEVCacheLoader:
self.in_memory_size = in_memory_size self.in_memory_size = in_memory_size
if len(self.dataset) < in_memory_size: if len(self.dataset) < in_memory_size:
self.in_memory_size = len(self.dataset) self.in_memory_size = len(self.dataset)
for i in range(in_memory_size): for i in range(self.in_memory_size):
self.index_queue.put(i) self.index_queue.put(i)
self.loader = torch.multiprocessing.Process( self.loader = torch.multiprocessing.Process(
target=_disk_cache_loader, target=_disk_cache_loader,
...@@ -289,5 +291,52 @@ class AEVCacheLoader: ...@@ -289,5 +291,52 @@ class AEVCacheLoader:
def __del__(self): def __del__(self):
self.loader.terminate() self.loader.terminate()
def __len__(self):
__all__ = ['BatchedANIDataset', 'AEVCacheLoader'] return len(self.dataset)
builtin = neurochem.Builtins()
def cache_aev(output, dataset_path, batchsize, device=default_device,
              constfile=builtin.const_file, subtract_sae=False,
              sae_file=builtin.sae_file, enable_tqdm=True, **kwargs):
    """Compute the AEVs of a dataset and store them in a cache directory.

    The batched dataset object is pickled to ``<output>/dataset`` and the
    AEVs computed for batch ``i`` are pickled to ``<output>/<i>``.

    Arguments:
        output: Directory that receives the cache; created if missing.
        dataset_path: Dataset path forwarded to ``BatchedANIDataset``.
        batchsize: Batch size forwarded to ``BatchedANIDataset``.
        device: Device used to compute the AEVs; anything accepted by
            ``torch.device``.
        constfile: Constants file passed to ``neurochem.Constants``.
        subtract_sae: If true, the energy shifter loaded from ``sae_file``
            is applied to the dataset as a transform.
        sae_file: File passed to ``neurochem.load_sae``; only read when
            ``subtract_sae`` is true.
        enable_tqdm: Whether to display a ``tqdm`` progress bar.
        **kwargs: Extra keyword arguments forwarded to
            ``BatchedANIDataset``.
    """
    # exist_ok avoids the race between checking for the directory and
    # creating it when several processes build caches concurrently.
    os.makedirs(output, exist_ok=True)

    device = torch.device(device)
    consts = neurochem.Constants(constfile)
    aev_computer = aev.AEVComputer(**consts).to(device)

    if subtract_sae:
        energy_shifter = neurochem.load_sae(sae_file)
        transform = (energy_shifter.subtract_from_dataset,)
    else:
        transform = ()

    dataset = BatchedANIDataset(
        dataset_path, consts.species_to_tensor, batchsize,
        device=device, transform=transform, **kwargs
    )

    # dump out the dataset
    with open(os.path.join(output, 'dataset'), 'wb') as f:
        pickle.dump(dataset, f)

    if enable_tqdm:
        # imported lazily so tqdm is only required when the bar is shown
        import tqdm
        indices = tqdm.trange(len(dataset))
    else:
        indices = range(len(dataset))

    for i in indices:
        input_, _ = dataset[i]
        aevs = []
        for chunk in input_:
            # Move each result to the CPU right away, so the pickled file
            # does not hold tensors on the computing device.
            x, y = aev_computer(chunk)
            aevs.append((x.cpu(), y.cpu()))
        with open(os.path.join(output, str(i)), 'wb') as f:
            pickle.dump(aevs, f)
__all__ = ['BatchedANIDataset', 'AEVCacheLoader', 'cache_aev']
...@@ -4,57 +4,8 @@ ...@@ -4,57 +4,8 @@
computed aevs. Use the ``-h`` option for help. computed aevs. Use the ``-h`` option for help.
""" """
import os
import torch import torch
from .. import aev, neurochem from . import cache_aev, builtin, default_device
from . import BatchedANIDataset
import pickle
builtin = neurochem.Builtins()
default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
default_dtype = str(torch.get_default_dtype()).split('.')[1]
def cache_aev(output, dataset_path, batchsize, device=default_device,
              constfile=builtin.const_file, subtract_sae=False,
              sae_file=builtin.sae_file, enable_tqdm=True, **kwargs):
    """Compute the AEVs of a dataset and store them in a cache directory.

    The batched dataset object is pickled to ``<output>/dataset`` and the
    AEVs computed for batch ``i`` are pickled to ``<output>/<i>``.

    Arguments:
        output: Directory that receives the cache; created if missing.
        dataset_path: Dataset path forwarded to ``BatchedANIDataset``.
        batchsize: Batch size forwarded to ``BatchedANIDataset``.
        device: Device used to compute the AEVs; anything accepted by
            ``torch.device``.
        constfile: Constants file passed to ``neurochem.Constants``.
        subtract_sae: If true, the energy shifter loaded from ``sae_file``
            is applied to the dataset as a transform.
        sae_file: File passed to ``neurochem.load_sae``; only read when
            ``subtract_sae`` is true.
        enable_tqdm: Whether to display a ``tqdm`` progress bar.
        **kwargs: Extra keyword arguments forwarded to
            ``BatchedANIDataset``.
    """
    # exist_ok avoids the race between checking for the directory and
    # creating it when several processes build caches concurrently.
    os.makedirs(output, exist_ok=True)

    device = torch.device(device)
    consts = neurochem.Constants(constfile)
    aev_computer = aev.AEVComputer(**consts).to(device)

    if subtract_sae:
        energy_shifter = neurochem.load_sae(sae_file)
        transform = (energy_shifter.subtract_from_dataset,)
    else:
        transform = ()

    dataset = BatchedANIDataset(
        dataset_path, consts.species_to_tensor, batchsize,
        device=device, transform=transform, **kwargs
    )

    # dump out the dataset
    with open(os.path.join(output, 'dataset'), 'wb') as f:
        pickle.dump(dataset, f)

    if enable_tqdm:
        # imported lazily so tqdm is only required when the bar is shown
        import tqdm
        indices = tqdm.trange(len(dataset))
    else:
        indices = range(len(dataset))

    for i in indices:
        input_, _ = dataset[i]
        aevs = []
        for chunk in input_:
            # Move each result to the CPU right away, so the pickled file
            # does not hold tensors on the computing device.
            x, y = aev_computer(chunk)
            aevs.append((x.cpu(), y.cpu()))
        with open(os.path.join(output, str(i)), 'wb') as f:
            pickle.dump(aevs, f)
if __name__ == '__main__': if __name__ == '__main__':
...@@ -72,6 +23,7 @@ if __name__ == '__main__': ...@@ -72,6 +23,7 @@ if __name__ == '__main__':
parser.add_argument('--properties', nargs='+', parser.add_argument('--properties', nargs='+',
help='Output properties to load.`', help='Output properties to load.`',
default=['energies']) default=['energies'])
default_dtype = str(torch.get_default_dtype()).split('.')[1]
parser.add_argument('--dtype', help='Data type', default=default_dtype) parser.add_argument('--dtype', help='Data type', default=default_dtype)
parser.add_argument('-d', '--device', help='Device for training', parser.add_argument('-d', '--device', help='Device for training',
default=default_device) default=default_device)
......
...@@ -16,7 +16,6 @@ from ..nn import ANIModel, Ensemble, Gaussian ...@@ -16,7 +16,6 @@ from ..nn import ANIModel, Ensemble, Gaussian
from ..utils import EnergyShifter from ..utils import EnergyShifter
from ..aev import AEVComputer from ..aev import AEVComputer
from ..ignite import Container, MSELoss, TransformedLoss, RMSEMetric, MAEMetric from ..ignite import Container, MSELoss, TransformedLoss, RMSEMetric, MAEMetric
from ..data import BatchedANIDataset
class Constants(Mapping): class Constants(Mapping):
...@@ -304,6 +303,9 @@ def hartree2kcal(x): ...@@ -304,6 +303,9 @@ def hartree2kcal(x):
return 627.509 * x return 627.509 * x
from ..data import BatchedANIDataset # noqa: E402
class Trainer: class Trainer:
"""Train with NeuroChem training configurations. """Train with NeuroChem training configurations.
...@@ -676,4 +678,4 @@ class Trainer: ...@@ -676,4 +678,4 @@ class Trainer:
__all__ = ['Constants', 'load_sae', 'load_model', 'load_model_ensemble', __all__ = ['Constants', 'load_sae', 'load_model', 'load_model_ensemble',
'Trainer'] 'Builtins', 'Trainer']
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment