Unverified Commit b546adb8 authored by Gao, Xiang, committed by GitHub

AEV Cache, part2 (#89)

parent bfc04ac8
......@@ -21,3 +21,5 @@ benchmark_xyz
/*.params
/*.dat
/tmp
*_cache
......@@ -37,4 +37,5 @@ steps:
Docs:
image: '${{BuildTorchANI}}'
commands:
- sphinx-build docs build
- find . -name '*.pt' -delete
- sphinx-build -D plot_gallery=0 docs build
......@@ -15,6 +15,7 @@ Datasets
.. automodule:: torchani.data
.. autoclass:: torchani.data.BatchedANIDataset
.. autoclass:: torchani.data.AEVCacheLoader
.. automodule:: torchani.data.cache_aev
......
......@@ -2,8 +2,6 @@
Welcome to TorchANI's documentation!
====================================
Precompute AEVs to Improve Training Performance
.. automodule:: torchani
.. toctree::
......@@ -18,6 +16,7 @@ Precompute AEVs to Improve Training Performance
examples/energy_force
examples/nnp_training
examples/cache_aev
examples/neurochem_trainer
.. toctree::
......
# -*- coding: utf-8 -*-
"""
Use Disk Cache of AEV to Boost Training
=======================================
In the previous :ref:`training-example` example, AEVs are computed every time
they are needed. This is not very efficient because the AEVs never change
during training. If you have a fast SSD, it is beneficial to cache these
AEVs on disk. This example shows how to use a disk cache to speed up training.
"""
###############################################################################
# Most of the code in this example is a line-by-line copy of
# :ref:`training-example`.
import torch
import ignite
import torchani
import tqdm
import timeit
import tensorboardX
import os
import sys
# training and validation set
try:
    path = os.path.dirname(os.path.realpath(__file__))
except NameError:
    path = os.getcwd()
training_path = os.path.join(path, '../dataset/ani_gdb_s01.h5')
validation_path = os.path.join(path, '../dataset/ani_gdb_s01.h5')
# checkpoint file to save model when validation RMSE improves
model_checkpoint = 'model.pt'
# max epochs to run the training
max_epochs = 20
# Compute training RMSE every this many epochs. Since the training set is
# usually huge and the loss function does not directly give us the RMSE, we
# periodically check the training RMSE to detect overfitting.
training_rmse_every = 5
# device to run the training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# batch size
batch_size = 1024
# log directory for tensorboardX
log = 'runs'
###############################################################################
# Here there is no need to manually construct the AEV computer and energy
# shifter, but we do need to generate a disk cache for the datasets.
const_file = os.path.join(path, '../torchani/resources/ani-1x_dft_x8ens/rHCNO-5.2R_16-3.5A_a4-8.params') # noqa: E501
sae_file = os.path.join(path, '../torchani/resources/ani-1x_dft_x8ens/sae_linfit.dat') # noqa: E501
training_cache = './training_cache'
validation_cache = './validation_cache'
# If the cache directories already exist, we assume the data have already been
# cached and skip the generation step.
if not os.path.exists(training_cache):
    torchani.data.cache_aev(training_cache, training_path, batch_size, device,
                            const_file, True, sae_file)
if not os.path.exists(validation_cache):
    torchani.data.cache_aev(validation_cache, validation_path, batch_size,
                            device, const_file, True, sae_file)
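# (The positional ``True`` above is the ``subtract_sae`` argument of
# ``torchani.data.cache_aev``: the self atomic energies from ``sae_file`` are
# subtracted from the dataset energies while the cache is generated.)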
###############################################################################
# The code that defines the network is also the same.
def atomic():
    model = torch.nn.Sequential(
        torch.nn.Linear(384, 128),
        torch.nn.CELU(0.1),
        torch.nn.Linear(128, 128),
        torch.nn.CELU(0.1),
        torch.nn.Linear(128, 64),
        torch.nn.CELU(0.1),
        torch.nn.Linear(64, 1)
    )
    return model
nn = torchani.ANIModel([atomic() for _ in range(4)])
print(nn)
if os.path.isfile(model_checkpoint):
    nn.load_state_dict(torch.load(model_checkpoint))
else:
    torch.save(nn.state_dict(), model_checkpoint)

class Flatten(torch.nn.Module):
    def forward(self, x):
        return x[0], x[1].flatten()
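###############################################################################
# (The ``x`` passed to ``Flatten`` above is the ``(species, energies)`` pair
# produced by the ANIModel; flattening the energies lets the loss compare them
# directly with the target energies.)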
###############################################################################
# Except that here we do not include the AEV computer in our pipeline, because
# the cache loader will load the precomputed AEVs from disk.
model = torch.nn.Sequential(nn, Flatten()).to(device)
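###############################################################################
# For comparison, in :ref:`training-example` the pipeline includes the AEV
# computer explicitly (repeated here only as a reminder, not executed):
#
#   model = torch.nn.Sequential(aev_computer, nn, Flatten()).to(device)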
###############################################################################
# This part is also a line-by-line copy.
writer = tensorboardX.SummaryWriter(log_dir=log)
###############################################################################
# Here we don't construct a :class:`torchani.data.BatchedANIDataset` object;
# instead we construct :class:`torchani.data.AEVCacheLoader` objects.
training = torchani.data.AEVCacheLoader(training_cache)
validation = torchani.data.AEVCacheLoader(validation_cache)
###############################################################################
# The rest of the code is again the same.
container = torchani.ignite.Container({'energies': model})
optimizer = torch.optim.Adam(model.parameters())
trainer = ignite.engine.create_supervised_trainer(
    container, optimizer, torchani.ignite.MSELoss('energies'))
evaluator = ignite.engine.create_supervised_evaluator(container, metrics={
    'RMSE': torchani.ignite.RMSEMetric('energies')
})
@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def init_tqdm(trainer):
    trainer.state.tqdm = tqdm.tqdm(total=len(training),
                                   file=sys.stdout, desc='epoch')
@trainer.on(ignite.engine.Events.ITERATION_COMPLETED)
def update_tqdm(trainer):
    trainer.state.tqdm.update(1)
@trainer.on(ignite.engine.Events.EPOCH_COMPLETED)
def finalize_tqdm(trainer):
    trainer.state.tqdm.close()
def hartree2kcal(x):
    return 627.509 * x
@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def validation_and_checkpoint(trainer):
    def evaluate(dataset, name):
        evaluator = ignite.engine.create_supervised_evaluator(
            container,
            metrics={
                'RMSE': torchani.ignite.RMSEMetric('energies')
            }
        )
        evaluator.run(dataset)
        metrics = evaluator.state.metrics
        rmse = hartree2kcal(metrics['RMSE'])
        writer.add_scalar(name, rmse, trainer.state.epoch)
    # compute validation RMSE
    evaluate(validation, 'validation_rmse_vs_epoch')
    # compute training RMSE
    if trainer.state.epoch % training_rmse_every == 1:
        evaluate(training, 'training_rmse_vs_epoch')
    # checkpoint model
    torch.save(nn.state_dict(), model_checkpoint)
start = timeit.default_timer()
@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def log_time(trainer):
    elapsed = round(timeit.default_timer() - start, 2)
    writer.add_scalar('time_vs_epoch', elapsed, trainer.state.epoch)
@trainer.on(ignite.engine.Events.ITERATION_COMPLETED)
def log_loss(trainer):
    iteration = trainer.state.iteration
    writer.add_scalar('loss_vs_iteration', trainer.state.output, iteration)
trainer.run(training, max_epochs)
###############################################################################
# Finally, we explicitly close the loaders' background processes. If the
# loading processes are not closed, they will prevent the whole program from
# terminating. The loading process is closed automatically when a
# :class:`torchani.data.AEVCacheLoader` object is garbage collected, but since
# our cache loader objects live in the global scope they won't be garbage
# collected here, so we need to terminate these processes manually.
training.__del__()
validation.__del__()
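###############################################################################
# As an alternative (a sketch, not part of the original example), the run and
# cleanup steps above could be wrapped in ``try``/``finally`` so that the
# loader processes are terminated even if training raises. The helper below is
# hypothetical and is not called in this example.
def run_and_cleanup(trainer, data, epochs, loaders):
    try:
        trainer.run(data, epochs)
    finally:
        # terminate the background loader processes no matter what happened
        for loader in loaders:
            loader.__del__()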
# -*- coding: utf-8 -*-
"""
.. _training-example:
Train Your Own Neural Network Potential
=======================================
......@@ -83,8 +85,15 @@ def atomic():
return model
model = torchani.ANIModel([atomic() for _ in range(4)])
print(model)
nn = torchani.ANIModel([atomic() for _ in range(4)])
print(nn)
###############################################################################
# If checkpoint from previous training exists, then load it.
if os.path.isfile(model_checkpoint):
    nn.load_state_dict(torch.load(model_checkpoint))
else:
    torch.save(nn.state_dict(), model_checkpoint)
###############################################################################
......@@ -97,15 +106,7 @@ class Flatten(torch.nn.Module):
return x[0], x[1].flatten()
model = torch.nn.Sequential(aev_computer, model, Flatten())
###############################################################################
# If checkpoint from previous training exists, then load it.
if os.path.isfile(model_checkpoint):
model.load_state_dict(torch.load(model_checkpoint))
else:
torch.save(model.state_dict(), model_checkpoint)
model.to(device)
model = torch.nn.Sequential(aev_computer, nn, Flatten()).to(device)
###############################################################################
......@@ -208,6 +209,9 @@ def validation_and_checkpoint(trainer):
if trainer.state.epoch % training_rmse_every == 1:
evaluate(training, 'training_rmse_vs_epoch')
# checkpoint model
torch.save(nn.state_dict(), model_checkpoint)
###############################################################################
# Also some to log elapsed time:
......
......@@ -6,9 +6,11 @@ from os.path import join, isfile, isdir
import os
from ._pyanitools import anidataloader
import torch
from .. import utils
from .. import utils, neurochem, aev
import pickle
default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
def chunk_counts(counts, split):
split = [x + 1 for x in split] + [None]
......@@ -131,7 +133,7 @@ class BatchedANIDataset(Dataset):
def __init__(self, path, species_tensor_converter, batch_size,
shuffle=True, properties=['energies'], transform=(),
dtype=torch.get_default_dtype(), device=torch.device('cpu')):
dtype=torch.get_default_dtype(), device=default_device):
super(BatchedANIDataset, self).__init__()
self.properties = properties
self.device = device
......@@ -256,7 +258,7 @@ class AEVCacheLoader:
self.in_memory_size = in_memory_size
if len(self.dataset) < in_memory_size:
self.in_memory_size = len(self.dataset)
for i in range(in_memory_size):
for i in range(self.in_memory_size):
self.index_queue.put(i)
self.loader = torch.multiprocessing.Process(
target=_disk_cache_loader,
......@@ -289,5 +291,52 @@ class AEVCacheLoader:
def __del__(self):
self.loader.terminate()
__all__ = ['BatchedANIDataset', 'AEVCacheLoader']
def __len__(self):
return len(self.dataset)
builtin = neurochem.Builtins()
def cache_aev(output, dataset_path, batchsize, device=default_device,
              constfile=builtin.const_file, subtract_sae=False,
              sae_file=builtin.sae_file, enable_tqdm=True, **kwargs):
    # if output directory does not exist, then create it
    if not os.path.exists(output):
        os.makedirs(output)
    device = torch.device(device)
    consts = neurochem.Constants(constfile)
    aev_computer = aev.AEVComputer(**consts).to(device)
    if subtract_sae:
        energy_shifter = neurochem.load_sae(sae_file)
        transform = (energy_shifter.subtract_from_dataset,)
    else:
        transform = ()
    dataset = BatchedANIDataset(
        dataset_path, consts.species_to_tensor, batchsize,
        device=device, transform=transform, **kwargs
    )
    # dump out the dataset
    filename = os.path.join(output, 'dataset')
    with open(filename, 'wb') as f:
        pickle.dump(dataset, f)
    if enable_tqdm:
        import tqdm
        indices = tqdm.trange(len(dataset))
    else:
        indices = range(len(dataset))
    for i in indices:
        input_, _ = dataset[i]
        aevs = [aev_computer(j) for j in input_]
        aevs = [(x.cpu(), y.cpu()) for x, y in aevs]
        filename = os.path.join(output, '{}'.format(i))
        with open(filename, 'wb') as f:
            pickle.dump(aevs, f)
__all__ = ['BatchedANIDataset', 'AEVCacheLoader', 'cache_aev']
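For reference, a minimal usage sketch of the API added above (the cache directory, dataset path, and batch size are placeholders): generate the cache once with ``cache_aev``, then read it back with ``AEVCacheLoader`` during training.

import torchani

# build a disk cache of AEVs for a dataset (paths are placeholders)
torchani.data.cache_aev('./aev_cache', './dataset/ani_gdb_s01.h5', 256,
                        subtract_sae=True)
# load the cached AEVs instead of recomputing them every epoch
training = torchani.data.AEVCacheLoader('./aev_cache')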
......@@ -4,57 +4,8 @@
computed AEVs. Use the ``-h`` option for help.
"""
import os
import torch
from .. import aev, neurochem
from . import BatchedANIDataset
import pickle
builtin = neurochem.Builtins()
default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
default_dtype = str(torch.get_default_dtype()).split('.')[1]
def cache_aev(output, dataset_path, batchsize, device=default_device,
constfile=builtin.const_file, subtract_sae=False,
sae_file=builtin.sae_file, enable_tqdm=True, **kwargs):
# if output directory does not exist, then create it
if not os.path.exists(output):
os.makedirs(output)
device = torch.device(device)
consts = neurochem.Constants(constfile)
aev_computer = aev.AEVComputer(**consts).to(device)
if subtract_sae:
energy_shifter = neurochem.load_sae(sae_file)
transform = (energy_shifter.subtract_from_dataset,)
else:
transform = ()
dataset = BatchedANIDataset(
dataset_path, consts.species_to_tensor, batchsize,
device=device, transform=transform, **kwargs
)
# dump out the dataset
filename = os.path.join(output, 'dataset')
with open(filename, 'wb') as f:
pickle.dump(dataset, f)
if enable_tqdm:
import tqdm
indices = tqdm.trange(len(dataset))
else:
indices = range(len(dataset))
for i in indices:
input_, _ = dataset[i]
aevs = [aev_computer(j) for j in input_]
aevs = [(x.cpu(), y.cpu()) for x, y in aevs]
filename = os.path.join(output, '{}'.format(i))
with open(filename, 'wb') as f:
pickle.dump(aevs, f)
from . import cache_aev, builtin, default_device
if __name__ == '__main__':
......@@ -72,6 +23,7 @@ if __name__ == '__main__':
    parser.add_argument('--properties', nargs='+',
                        help='Output properties to load.',
                        default=['energies'])
    default_dtype = str(torch.get_default_dtype()).split('.')[1]
    parser.add_argument('--dtype', help='Data type', default=default_dtype)
    parser.add_argument('-d', '--device', help='Device for training',
                        default=default_device)
......
......@@ -16,7 +16,6 @@ from ..nn import ANIModel, Ensemble, Gaussian
from ..utils import EnergyShifter
from ..aev import AEVComputer
from ..ignite import Container, MSELoss, TransformedLoss, RMSEMetric, MAEMetric
from ..data import BatchedANIDataset
class Constants(Mapping):
......@@ -304,6 +303,9 @@ def hartree2kcal(x):
return 627.509 * x
from ..data import BatchedANIDataset # noqa: E402
class Trainer:
"""Train with NeuroChem training configurations.
......@@ -676,4 +678,4 @@ class Trainer:
__all__ = ['Constants', 'load_sae', 'load_model', 'load_model_ensemble',
'Trainer']
'Builtins', 'Trainer']