Unverified Commit b546adb8 authored by Gao, Xiang, committed by GitHub

AEV Cache, part2 (#89)

parent bfc04ac8
......@@ -21,3 +21,5 @@ benchmark_xyz
/*.params
/*.dat
/tmp
*_cache
......@@ -37,4 +37,5 @@ steps:
Docs:
image: '${{BuildTorchANI}}'
commands:
- sphinx-build docs build
- find . -name '*.pt' -delete
- sphinx-build -D plot_gallery=0 docs build
......@@ -15,6 +15,7 @@ Datasets
.. automodule:: torchani.data
.. autoclass:: torchani.data.BatchedANIDataset
.. autoclass:: torchani.data.AEVCacheLoader
.. automodule:: torchani.data.cache_aev
......
......@@ -2,8 +2,6 @@
Welcome to TorchANI's documentation!
====================================
Precompute AEVs to Improve Training Performance
.. automodule:: torchani
.. toctree::
......@@ -18,6 +16,7 @@ Precompute AEVs to Improve Training Performance
examples/energy_force
examples/nnp_training
examples/cache_aev
examples/neurochem_trainer
.. toctree::
......
# -*- coding: utf-8 -*-
"""
Use Disk Cache of AEV to Boost Training
=======================================
In the previous :ref:`training-example` example, AEVs are computed every time
they are needed. This is not very efficient because the AEVs never change
during training. If you have a fast SSD, it is beneficial to cache these
AEVs on disk. This example shows how to use a disk cache to speed up training.
"""
###############################################################################
# Most of the code in this example is a line-by-line copy of
# :ref:`training-example`.
import torch
import ignite
import torchani
import tqdm
import timeit
import tensorboardX
import os
import sys
# training and validation set
try:
    path = os.path.dirname(os.path.realpath(__file__))
except NameError:
    path = os.getcwd()
training_path = os.path.join(path, '../dataset/ani_gdb_s01.h5')
validation_path = os.path.join(path, '../dataset/ani_gdb_s01.h5')
# checkpoint file to save model when validation RMSE improves
model_checkpoint = 'model.pt'
# max epochs to run the training
max_epochs = 20
# Compute training RMSE every this many epochs. Since the training set is
# usually huge and the loss function does not directly give us the RMSE, we
# periodically check the training RMSE to detect overfitting.
training_rmse_every = 5
# device to run the training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# batch size
batch_size = 1024
# log directory for tensorboardX
log = 'runs'
###############################################################################
# Here there is no need to manually construct the AEV computer and energy
# shifter, but we do need to generate a disk cache for the datasets.
const_file = os.path.join(path, '../torchani/resources/ani-1x_dft_x8ens/rHCNO-5.2R_16-3.5A_a4-8.params') # noqa: E501
sae_file = os.path.join(path, '../torchani/resources/ani-1x_dft_x8ens/sae_linfit.dat') # noqa: E501
training_cache = './training_cache'
validation_cache = './validation_cache'
# If the cache directories already exist, we assume the data have already been
# cached and skip the generation step.
if not os.path.exists(training_cache):
    torchani.data.cache_aev(training_cache, training_path, batch_size, device,
                            const_file, True, sae_file)
if not os.path.exists(validation_cache):
    torchani.data.cache_aev(validation_cache, validation_path, batch_size,
                            device, const_file, True, sae_file)
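# (The positional ``True`` above is the ``subtract_sae`` argument of
# ``torchani.data.cache_aev``: the self atomic energies from ``sae_file`` are
# subtracted from the dataset energies while the cache is generated.)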
###############################################################################
# The code that defines the network is also the same.
def atomic():
    model = torch.nn.Sequential(
        torch.nn.Linear(384, 128),
        torch.nn.CELU(0.1),
        torch.nn.Linear(128, 128),
        torch.nn.CELU(0.1),
        torch.nn.Linear(128, 64),
        torch.nn.CELU(0.1),
        torch.nn.Linear(64, 1)
    )
    return model
nn = torchani.ANIModel([atomic() for _ in range(4)])
print(nn)
if os.path.isfile(model_checkpoint):
    nn.load_state_dict(torch.load(model_checkpoint))
else:
    torch.save(nn.state_dict(), model_checkpoint)

class Flatten(torch.nn.Module):
    def forward(self, x):
        return x[0], x[1].flatten()
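###############################################################################
# (The ``x`` passed to ``Flatten`` above is the ``(species, energies)`` pair
# produced by the ANIModel; flattening the energies lets the loss compare them
# directly with the target energies.)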
###############################################################################
# Except that here we do not include the AEV computer in our pipeline, because
# the cache loader will load the precomputed AEVs from disk.
model = torch.nn.Sequential(nn, Flatten()).to(device)
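###############################################################################
# For comparison, in :ref:`training-example` the pipeline includes the AEV
# computer explicitly (repeated here only as a reminder, not executed):
#
#   model = torch.nn.Sequential(aev_computer, nn, Flatten()).to(device)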
###############################################################################
# This part is also a line-by-line copy.
writer = tensorboardX.SummaryWriter(log_dir=log)
###############################################################################
# Here we don't construct a :class:`torchani.data.BatchedANIDataset` object;
# instead we construct :class:`torchani.data.AEVCacheLoader` objects.
training = torchani.data.AEVCacheLoader(training_cache)
validation = torchani.data.AEVCacheLoader(validation_cache)
###############################################################################
# The rest of the code is again the same.
container = torchani.ignite.Container({'energies': model})
optimizer = torch.optim.Adam(model.parameters())
trainer = ignite.engine.create_supervised_trainer(
    container, optimizer, torchani.ignite.MSELoss('energies'))
evaluator = ignite.engine.create_supervised_evaluator(container, metrics={
    'RMSE': torchani.ignite.RMSEMetric('energies')
})
@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def init_tqdm(trainer):
    trainer.state.tqdm = tqdm.tqdm(total=len(training),
                                   file=sys.stdout, desc='epoch')
@trainer.on(ignite.engine.Events.ITERATION_COMPLETED)
def update_tqdm(trainer):
    trainer.state.tqdm.update(1)
@trainer.on(ignite.engine.Events.EPOCH_COMPLETED)
def finalize_tqdm(trainer):
    trainer.state.tqdm.close()
def hartree2kcal(x):
    return 627.509 * x
@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def validation_and_checkpoint(trainer):
    def evaluate(dataset, name):
        evaluator = ignite.engine.create_supervised_evaluator(
            container,
            metrics={
                'RMSE': torchani.ignite.RMSEMetric('energies')
            }
        )
        evaluator.run(dataset)
        metrics = evaluator.state.metrics
        rmse = hartree2kcal(metrics['RMSE'])
        writer.add_scalar(name, rmse, trainer.state.epoch)
    # compute validation RMSE
    evaluate(validation, 'validation_rmse_vs_epoch')
    # compute training RMSE
    if trainer.state.epoch % training_rmse_every == 1:
        evaluate(training, 'training_rmse_vs_epoch')
    # checkpoint model
    torch.save(nn.state_dict(), model_checkpoint)
start = timeit.default_timer()
@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def log_time(trainer):
    elapsed = round(timeit.default_timer() - start, 2)
    writer.add_scalar('time_vs_epoch', elapsed, trainer.state.epoch)
@trainer.on(ignite.engine.Events.ITERATION_COMPLETED)
def log_loss(trainer):
    iteration = trainer.state.iteration
    writer.add_scalar('loss_vs_iteration', trainer.state.output, iteration)
trainer.run(training, max_epochs)
###############################################################################
# Finally, we explicitly close the loaders' background processes. If the
# loading processes are not closed, they will prevent the whole program from
# terminating. The loading process is closed automatically when a
# :class:`torchani.data.AEVCacheLoader` object is garbage collected, but since
# our cache loader objects live in the global scope they won't be garbage
# collected here, so we need to terminate these processes manually.
training.__del__()
validation.__del__()
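###############################################################################
# As an alternative (a sketch, not part of the original example), the run and
# cleanup steps above could be wrapped in ``try``/``finally`` so that the
# loader processes are terminated even if training raises. The helper below is
# hypothetical and is not called in this example.
def run_and_cleanup(trainer, data, epochs, loaders):
    try:
        trainer.run(data, epochs)
    finally:
        # terminate the background loader processes no matter what happened
        for loader in loaders:
            loader.__del__()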
# -*- coding: utf-8 -*-
"""
.. _training-example:
Train Your Own Neural Network Potential
=======================================
......@@ -83,8 +85,15 @@ def atomic():
return model
model = torchani.ANIModel([atomic() for _ in range(4)])
print(model)
nn = torchani.ANIModel([atomic() for _ in range(4)])
print(nn)
###############################################################################
# If checkpoint from previous training exists, then load it.
if os.path.isfile(model_checkpoint):
    nn.load_state_dict(torch.load(model_checkpoint))
else:
    torch.save(nn.state_dict(), model_checkpoint)
###############################################################################
......@@ -97,15 +106,7 @@ class Flatten(torch.nn.Module):
return x[0], x[1].flatten()
model = torch.nn.Sequential(aev_computer, model, Flatten())
###############################################################################
# If checkpoint from previous training exists, then load it.
if os.path.isfile(model_checkpoint):
model.load_state_dict(torch.load(model_checkpoint))
else:
torch.save(model.state_dict(), model_checkpoint)
model.to(device)
model = torch.nn.Sequential(aev_computer, nn, Flatten()).to(device)
###############################################################################
......@@ -208,6 +209,9 @@ def validation_and_checkpoint(trainer):
if trainer.state.epoch % training_rmse_every == 1:
evaluate(training, 'training_rmse_vs_epoch')
# checkpoint model
torch.save(nn.state_dict(), model_checkpoint)
###############################################################################
# Also some to log elapsed time:
......
......@@ -6,9 +6,11 @@ from os.path import join, isfile, isdir
import os
from ._pyanitools import anidataloader
import torch
from .. import utils
from .. import utils, neurochem, aev
import pickle
default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
def chunk_counts(counts, split):
split = [x + 1 for x in split] + [None]
......@@ -131,7 +133,7 @@ class BatchedANIDataset(Dataset):
def __init__(self, path, species_tensor_converter, batch_size,
shuffle=True, properties=['energies'], transform=(),
dtype=torch.get_default_dtype(), device=torch.device('cpu')):
dtype=torch.get_default_dtype(), device=default_device):
super(BatchedANIDataset, self).__init__()
self.properties = properties
self.device = device
......@@ -256,7 +258,7 @@ class AEVCacheLoader:
self.in_memory_size = in_memory_size
if len(self.dataset) < in_memory_size:
self.in_memory_size = len(self.dataset)
for i in range(in_memory_size):
for i in range(self.in_memory_size):
self.index_queue.put(i)
self.loader = torch.multiprocessing.Process(
target=_disk_cache_loader,
......@@ -289,5 +291,52 @@ class AEVCacheLoader:
def __del__(self):
self.loader.terminate()
__all__ = ['BatchedANIDataset', 'AEVCacheLoader']
def __len__(self):
return len(self.dataset)
builtin = neurochem.Builtins()
def cache_aev(output, dataset_path, batchsize, device=default_device,
              constfile=builtin.const_file, subtract_sae=False,
              sae_file=builtin.sae_file, enable_tqdm=True, **kwargs):
    # if output directory does not exist, then create it
    if not os.path.exists(output):
        os.makedirs(output)
    device = torch.device(device)
    consts = neurochem.Constants(constfile)
    aev_computer = aev.AEVComputer(**consts).to(device)
    if subtract_sae:
        energy_shifter = neurochem.load_sae(sae_file)
        transform = (energy_shifter.subtract_from_dataset,)
    else:
        transform = ()
    dataset = BatchedANIDataset(
        dataset_path, consts.species_to_tensor, batchsize,
        device=device, transform=transform, **kwargs
    )
    # dump out the dataset
    filename = os.path.join(output, 'dataset')
    with open(filename, 'wb') as f:
        pickle.dump(dataset, f)
    if enable_tqdm:
        import tqdm
        indices = tqdm.trange(len(dataset))
    else:
        indices = range(len(dataset))
    for i in indices:
        input_, _ = dataset[i]
        aevs = [aev_computer(j) for j in input_]
        aevs = [(x.cpu(), y.cpu()) for x, y in aevs]
        filename = os.path.join(output, '{}'.format(i))
        with open(filename, 'wb') as f:
            pickle.dump(aevs, f)
__all__ = ['BatchedANIDataset', 'AEVCacheLoader', 'cache_aev']
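For reference, a minimal usage sketch of the API added above (the cache directory, dataset path, and batch size are placeholders): generate the cache once with ``cache_aev``, then read it back with ``AEVCacheLoader`` during training.

import torchani

# build a disk cache of AEVs for a dataset (paths are placeholders)
torchani.data.cache_aev('./aev_cache', './dataset/ani_gdb_s01.h5', 256,
                        subtract_sae=True)
# load the cached AEVs instead of recomputing them every epoch
training = torchani.data.AEVCacheLoader('./aev_cache')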
......@@ -4,57 +4,8 @@
computed AEVs. Use the ``-h`` option for help.
"""
import os
import torch
from .. import aev, neurochem
from . import BatchedANIDataset
import pickle
builtin = neurochem.Builtins()
default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
default_dtype = str(torch.get_default_dtype()).split('.')[1]
def cache_aev(output, dataset_path, batchsize, device=default_device,
constfile=builtin.const_file, subtract_sae=False,
sae_file=builtin.sae_file, enable_tqdm=True, **kwargs):
# if output directory does not exist, then create it
if not os.path.exists(output):
os.makedirs(output)
device = torch.device(device)
consts = neurochem.Constants(constfile)
aev_computer = aev.AEVComputer(**consts).to(device)
if subtract_sae:
energy_shifter = neurochem.load_sae(sae_file)
transform = (energy_shifter.subtract_from_dataset,)
else:
transform = ()
dataset = BatchedANIDataset(
dataset_path, consts.species_to_tensor, batchsize,
device=device, transform=transform, **kwargs
)
# dump out the dataset
filename = os.path.join(output, 'dataset')
with open(filename, 'wb') as f:
pickle.dump(dataset, f)
if enable_tqdm:
import tqdm
indices = tqdm.trange(len(dataset))
else:
indices = range(len(dataset))
for i in indices:
input_, _ = dataset[i]
aevs = [aev_computer(j) for j in input_]
aevs = [(x.cpu(), y.cpu()) for x, y in aevs]
filename = os.path.join(output, '{}'.format(i))
with open(filename, 'wb') as f:
pickle.dump(aevs, f)
from . import cache_aev, builtin, default_device
if __name__ == '__main__':
......@@ -72,6 +23,7 @@ if __name__ == '__main__':
    parser.add_argument('--properties', nargs='+',
                        help='Output properties to load.',
                        default=['energies'])
    default_dtype = str(torch.get_default_dtype()).split('.')[1]
    parser.add_argument('--dtype', help='Data type', default=default_dtype)
    parser.add_argument('-d', '--device', help='Device for training',
                        default=default_device)
......
......@@ -16,7 +16,6 @@ from ..nn import ANIModel, Ensemble, Gaussian
from ..utils import EnergyShifter
from ..aev import AEVComputer
from ..ignite import Container, MSELoss, TransformedLoss, RMSEMetric, MAEMetric
from ..data import BatchedANIDataset
class Constants(Mapping):
......@@ -304,6 +303,9 @@ def hartree2kcal(x):
return 627.509 * x
from ..data import BatchedANIDataset # noqa: E402
class Trainer:
"""Train with NeuroChem training configurations.
......@@ -676,4 +678,4 @@ class Trainer:
__all__ = ['Constants', 'load_sae', 'load_model', 'load_model_ensemble',
'Trainer']
'Builtins', 'Trainer']