Commit 2359a387 authored by Xiang Gao's avatar Xiang Gao Committed by Gao, Xiang
Browse files

torchani 0.1

parents
import torch
import torchani
import torchani.data
import tqdm
import math
import timeit
import configs
import functools

# Turn on benchmark mode before importing the shared helpers, so the model /
# AEV computer built there presumably pick it up -- TODO confirm against common.py
configs.benchmark = True
from common import *

# Dataset and loader: chunks of up to 256 conformations, 4 chunks per batch,
# collated per-molecule by torchani.data.collate.
ds = torchani.data.load_dataset(configs.data_path)
sampler = torchani.data.BatchSampler(ds, 256, 4)
dataloader = torch.utils.data.DataLoader(
    ds, batch_sampler=sampler, collate_fn=torchani.data.collate, num_workers=20)

# get_or_create_model comes from `from common import *` -- TODO confirm
model = get_or_create_model('/tmp/model.pt')
optimizer = torch.optim.Adam(model.parameters(), amsgrad=True)
def benchmark(timer, index):
    """Decorator factory that accumulates a function's wall time.

    Returns a decorator; every call of the decorated function adds its
    elapsed time (measured with timeit.default_timer) to ``timer[index]``.
    The decorated function's return value is passed through unchanged.
    """
    def decorator(fun):
        @functools.wraps(fun)
        def timed(*args, **kwargs):
            begin = timeit.default_timer()
            result = fun(*args, **kwargs)
            finish = timeit.default_timer()
            timer[index] += finish - begin
            return result
        return timed
    return decorator
# Accumulator for the time spent in backward pass + optimizer step.
timer = {'backward': 0}


@benchmark(timer, 'backward')
def optimize_step(a):
    # `a` is an Averager (from common) of per-molecule losses; its avg() is
    # treated as the batch loss here -- presumably an MSE, TODO confirm.
    mse = a.avg()
    optimizer.zero_grad()
    mse.backward()
    optimizer.step()
# Run one epoch over the dataset and report per-stage timings.
start = timeit.default_timer()
for batch in tqdm.tqdm(dataloader, total=len(sampler)):
    a = Averager()
    for molecule_id in batch:
        _species = ds.species[molecule_id]
        coordinates, energies = batch[molecule_id]
        # aev_computer is expected to come from `from common import *`
        # -- TODO confirm
        coordinates = coordinates.to(aev_computer.device)
        energies = energies.to(aev_computer.device)
        a.add(*evaluate(model, coordinates, energies, _species))
    optimize_step(a)
elapsed = round(timeit.default_timer() - start, 2)

# Dump the per-stage timers collected by the benchmarked modules.
print('Radial terms:', aev_computer.timers['radial terms'])
print('Angular terms:', aev_computer.timers['angular terms'])
print('Terms and indices:', aev_computer.timers['terms and indices'])
print('Combinations:', aev_computer.timers['combinations'])
print('Mask R:', aev_computer.timers['mask_r'])
print('Mask A:', aev_computer.timers['mask_a'])
print('Assemble:', aev_computer.timers['assemble'])
print('Total AEV:', aev_computer.timers['total'])
print('NN:', model.timers['nn'])
print('Total Forward:', model.timers['forward'])
print('Total Backward:', timer['backward'])
print('Epoch time:', elapsed)
.. torchani documentation master file, created by
sphinx-quickstart on Tue May 1 00:00:05 2018.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to torchani's documentation!
====================================
.. toctree::
:maxdepth: 2
:caption: Contents:
.. automodule:: torchani
:members:
:special-members: __init__, __call__
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
from setuptools import setup
from sphinx.setup_command import BuildDoc

# Allow `python setup.py build_sphinx` to build the documentation.
cmdclass = {'build_sphinx': BuildDoc}

setup(name='torchani',
      version='0.1',
      description='ANI based on pytorch',
      url='https://github.com/zasdfgbnm/torchani',
      author='Xiang Gao',
      author_email='qasdfgtyuiop@ufl.edu',
      license='MIT',
      packages=['torchani'],
      include_package_data=True,
      install_requires=[
          'torch',
          'lark-parser',
          'h5py',
      ],
      test_suite='nose.collector',
      # NOTE(review): 'cntk' in tests_require looks unrelated to this
      # pytorch-based project -- confirm it is really needed for the tests.
      tests_require=['nose', 'cntk'],
      cmdclass=cmdclass,
      )
import pkg_resources
import torch

# Paths to the packaged ANI-1x parameter / network resources shipped with
# the library.
buildin_const_file = pkg_resources.resource_filename(
    __name__, 'resources/ani-1x_dft_x8ens/rHCNO-5.2R_16-3.5A_a4-8.params')
buildin_sae_file = pkg_resources.resource_filename(
    __name__, 'resources/ani-1x_dft_x8ens/sae_linfit.dat')
buildin_network_dir = pkg_resources.resource_filename(
    __name__, 'resources/ani-1x_dft_x8ens/train0/networks/')
buildin_model_prefix = pkg_resources.resource_filename(
    __name__, 'resources/ani-1x_dft_x8ens/train')
buildin_dataset_dir = pkg_resources.resource_filename(
    __name__, 'resources/')

# Library-wide defaults: single precision, GPU when available.
default_dtype = torch.float32
default_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from .energyshifter import EnergyShifter
from .nn import ModelOnAEV, PerSpeciesFromNeuroChem
from .aev import SortedAEV
import logging

# NOTE(review): buildin_model_prefix is defined above but not listed here;
# confirm whether it is meant to be part of the public API.
__all__ = ['SortedAEV', 'EnergyShifter', 'ModelOnAEV', 'PerSpeciesFromNeuroChem', 'data',
           'buildin_const_file', 'buildin_sae_file', 'buildin_network_dir', 'buildin_dataset_dir',
           'default_dtype', 'default_device']

# NeuroChemAEV requires the optional pyNeuroChem package; export it only when
# that import succeeds.
try:
    from .neurochem_aev import NeuroChemAEV
    __all__.append('NeuroChemAEV')
except ImportError:
    logging.log(logging.WARNING,
                'Unable to import NeuroChemAEV, please check your pyNeuroChem installation.')
import torch
import itertools
import numpy
from .aev_base import AEVComputer
from . import buildin_const_file, default_dtype, default_device
def _cutoff_cosine(distances, cutoff):
"""Compute the elementwise cutoff cosine function
The cutoff cosine function is define in https://arxiv.org/pdf/1610.08935.pdf equation 2
Parameters
----------
distances : torch.Tensor
The pytorch tensor that stores Rij values. This tensor can have any shape since the cutoff
cosine function is computed elementwise.
cutoff : float
The cutoff radius, i.e. the Rc in the equation. For any Rij > Rc, the function value is defined to be zero.
Returns
-------
torch.Tensor
The tensor of the same shape as `distances` that stores the computed function values.
"""
return torch.where(distances <= cutoff, 0.5 * torch.cos(numpy.pi * distances / cutoff) + 0.5, torch.zeros_like(distances))
class SortedAEV(AEVComputer):
    """The AEV computer assuming input coordinates sorted by species

    Attributes
    ----------
    timers : dict
        Dictionary storing the benchmark results (only present when
        constructed with `benchmark=True`). It has the following keys:

        radial terms : time spent on computing radial subAEV terms
        angular terms : time spent on computing angular subAEV terms
        terms and indices : time spent on computing sorted terms and indices
        combinations : time spent on enumerating neighbor pairs
        mask_r : time spent on computing radial species masks
        mask_a : time spent on computing angular species masks
        assemble : time spent on assembling the final AEVs
        total : total time for computing everything.
    """

    def __init__(self, benchmark=False, device=default_device, dtype=default_dtype, const_file=buildin_const_file):
        super(SortedAEV, self).__init__(benchmark, dtype, device, const_file)
        if benchmark:
            # Wrap each pipeline stage so its wall time accumulates into
            # self.timers under the given key.
            self.radial_subaev_terms = self._enable_benchmark(
                self.radial_subaev_terms, 'radial terms')
            self.angular_subaev_terms = self._enable_benchmark(
                self.angular_subaev_terms, 'angular terms')
            self.terms_and_indices = self._enable_benchmark(
                self.terms_and_indices, 'terms and indices')
            self.combinations = self._enable_benchmark(
                self.combinations, 'combinations')
            self.compute_mask_r = self._enable_benchmark(
                self.compute_mask_r, 'mask_r')
            self.compute_mask_a = self._enable_benchmark(
                self.compute_mask_a, 'mask_a')
            self.assemble = self._enable_benchmark(self.assemble, 'assemble')
            self.forward = self._enable_benchmark(self.forward, 'total')

    def species_to_tensor(self, species):
        """Convert species list into a long tensor.

        Parameters
        ----------
        species : list
            List of string for the species of each atoms.

        Returns
        -------
        torch.Tensor
            Long tensor for the species, where a value k means the species is
            the same as self.species[k].
        """
        indices = {self.species[i]: i for i in range(len(self.species))}
        values = [indices[i] for i in species]
        return torch.tensor(values, dtype=torch.long, device=self.device)

    def radial_subaev_terms(self, distances):
        """Compute the radial subAEV terms of the center atom given neighbors

        The radial AEV is defined in https://arxiv.org/pdf/1610.08935.pdf equation 3.
        The sum computed by this method is over all given neighbors, so the caller
        of this method needs to select neighbors if the caller wants a per-species subAEV.

        Parameters
        ----------
        distances : torch.Tensor
            Pytorch tensor of shape (..., neighbors) storing the |Rij| length where i are the
            center atoms, and j are their neighbors.

        Returns
        -------
        torch.Tensor
            A tensor of shape (..., neighbors, `radial_sublength`) storing the subAEVs.
        """
        # Two trailing singleton dims broadcast against the (EtaR, ShfR)
        # constant grid shaped in AEVComputer.__init__.
        distances = distances.unsqueeze(-1).unsqueeze(-1)  # TODO: allow unsqueeze to insert multiple dimensions
        fc = _cutoff_cosine(distances, self.Rcr)
        # Note that in the equation in the paper there is no 0.25 coefficient, but in NeuroChem there is such a coefficient. We choose to be consistent with NeuroChem instead of the paper here.
        ret = 0.25 * torch.exp(-self.EtaR * (distances - self.ShfR)**2) * fc
        # flatten the (EtaR, ShfR) grid into the subAEV vector dimension
        return ret.flatten(start_dim=-2)

    def angular_subaev_terms(self, vectors1, vectors2):
        """Compute the angular subAEV terms of the center atom given neighbor pairs.

        The angular AEV is defined in https://arxiv.org/pdf/1610.08935.pdf equation 4.
        The sum computed by this method is over all given neighbor pairs, so the caller
        of this method needs to select neighbors if the caller wants a per-species subAEV.

        Parameters
        ----------
        vectors1, vectors2: torch.Tensor
            Tensor of shape (..., pairs, 3) storing the Rij vectors of pairs of neighbors.
            The vectors1(..., j, :) and vectors2(..., j, :) are the Rij vectors of the
            two atoms of pair j.

        Returns
        -------
        torch.Tensor
            Tensor of shape (..., pairs, `angular_sublength`) storing the subAEVs.
        """
        # Four trailing singleton dims broadcast against the
        # (EtaA, Zeta, ShfA, ShfZ) constant grid.
        vectors1 = vectors1.unsqueeze(
            -1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)  # TODO: allow unsqueeze to plug in multiple dims
        vectors2 = vectors2.unsqueeze(
            -1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)  # TODO: allow unsqueeze to plug in multiple dims
        distances1 = vectors1.norm(2, dim=-5)
        distances2 = vectors2.norm(2, dim=-5)

        # 0.95 is multiplied to the cos values to prevent acos from returning NaN.
        cos_angles = 0.95 * \
            torch.nn.functional.cosine_similarity(
                vectors1, vectors2, dim=-5)
        angles = torch.acos(cos_angles)

        fcj1 = _cutoff_cosine(distances1, self.Rca)
        fcj2 = _cutoff_cosine(distances2, self.Rca)
        factor1 = ((1 + torch.cos(angles - self.ShfZ)) / 2) ** self.Zeta
        factor2 = torch.exp(-self.EtaA *
                            ((distances1 + distances2) / 2 - self.ShfA) ** 2)
        ret = 2 * factor1 * factor2 * fcj1 * fcj2
        # ret now have shape (..., pairs, ?, ?, ?, ?) where ? depend on constants
        # flat the last 4 dimensions to view the subAEV as one dimension vector
        return ret.flatten(start_dim=-4)

    def terms_and_indices(self, coordinates):
        """Compute radial and angular subAEV terms, and original indices.

        Terms will be sorted according to their distances to central atoms, and only
        those within cutoff radius are valid. The returned indices contain what would
        their original indices be if they were unsorted.

        Parameters
        ----------
        coordinates : torch.Tensor
            The tensor that specifies the xyz coordinates of atoms in the molecule.
            The tensor must have shape (conformations, atoms, 3)

        Returns
        -------
        (radial_terms, angular_terms, indices_r, indices_a)
        radial_terms : torch.Tensor
            Tensor of shape (conformations, atoms, neighbors, `radial_sublength`) for
            the (unsummed) radial subAEV terms.
        angular_terms : torch.Tensor
            Tensor of shape (conformations, atoms, pairs, `angular_sublength`) for the
            (unsummed) angular subAEV terms.
        indices_r : torch.Tensor
            Tensor of shape (conformations, atoms, neighbors). Let l = indices_r(i,j,k),
            then this means that radial_terms(i,j,k,:) is in the subAEV term of conformation
            i between atom j and atom l.
        indices_a : torch.Tensor
            Same as indices_r, except that the cutoff radius is Rca instead of Rcr.
        """
        vec = coordinates.unsqueeze(2) - coordinates.unsqueeze(1)
        """Shape (conformations, atoms, atoms, 3) storing Rij vectors"""
        distances = vec.norm(2, -1)
        """Shape (conformations, atoms, atoms) storing Rij distances"""

        distances, indices = distances.sort(-1)

        # After sorting, column 0 of every row is the atom itself at distance
        # zero, so [1:] drops the self term from the kept columns.
        min_distances, _ = distances.flatten(end_dim=1).min(0)
        inRcr = (min_distances <= self.Rcr).nonzero().flatten()[1:]  # TODO: can we use something like find_first?
        inRca = (min_distances <= self.Rca).nonzero().flatten()[1:]

        distances = distances.index_select(-1, inRcr)
        indices_r = indices.index_select(-1, inRcr)
        radial_terms = self.radial_subaev_terms(distances)

        indices_a = indices.index_select(-1, inRca)
        new_shape = list(indices_a.shape) + [3]
        # gather the Rij vectors of the neighbors kept for the angular part
        _indices_a = indices_a.unsqueeze(-1).expand(*new_shape)  # TODO: can we add something like expand_dim(dim=0, repeat=3)
        vec = vec.gather(-2, _indices_a)  # TODO: can we make gather broadcast??
        vec = self.combinations(vec, -2)  # TODO: can we move combinations to ATen?
        angular_terms = self.angular_subaev_terms(
            *vec) if vec is not None else None
        return radial_terms, angular_terms, indices_r, indices_a

    def combinations(self, tensor, dim=0):
        """Enumerate unordered pairs of slices along dimension `dim`.

        Returns a tuple (tensor1, tensor2) where slice j of each holds the
        two members of pair j (upper-triangular index pairs, i < j), or None
        when there are fewer than two slices along `dim`.
        """
        n = tensor.shape[dim]
        r = torch.arange(n).type(torch.long).to(tensor.device)
        grid_x, grid_y = torch.meshgrid([r, r])
        # keep only the strictly upper triangle, i.e. each pair once
        index1 = grid_y[torch.triu(torch.ones(n, n), diagonal=1) == 1]
        index2 = grid_x[torch.triu(torch.ones(n, n), diagonal=1) == 1]
        if torch.numel(index1) == 0:
            # TODO: pytorch are unable to handle size 0 tensor well. Is this an expected behavior?
            # See: https://github.com/pytorch/pytorch/issues/5014
            return None
        return tensor.index_select(dim, index1), tensor.index_select(dim, index2)

    def compute_mask_r(self, species_r):
        """Partition indices according to their species, radial part

        Parameters
        ----------
        species_r : torch.Tensor
            Tensor of shape (conformations, atoms, neighbors) storing species of
            neighbors.

        Returns
        -------
        torch.Tensor
            Tensor of shape (conformations, atoms, neighbors, all species) storing
            the mask for each species.
        """
        mask_r = (species_r.unsqueeze(-1) ==
                  torch.arange(len(self.species), device=self.device))
        return mask_r

    def compute_mask_a(self, species_a, present_species):
        """Partition indices according to their species, angular part

        Parameters
        ----------
        species_a : torch.Tensor
            Tensor of shape (conformations, atoms, neighbors) storing the species of
            neighbors
        present_species : torch.Tensor
            Long tensor for the species, already uniqued.

        Returns
        -------
        torch.Tensor
            Tensor of shape (conformations, atoms, pairs, present species, present species)
            storing the mask for each pair.
        """
        species_a = self.combinations(species_a, -1)
        if species_a is not None:
            # TODO: can we remove this if pytorch support 0 size tensors?
            species_a1, species_a2 = species_a
        # NOTE(review): this None check duplicates the one above; the two
        # blocks could be merged into a single conditional.
        if species_a is not None:
            mask_a1 = (species_a1.unsqueeze(-1) ==
                       present_species).unsqueeze(-1)
            mask_a2 = (species_a2.unsqueeze(-1).unsqueeze(-1)
                       == present_species)
            mask = mask_a1 * mask_a2
            # symmetrize: pair (s1, s2) must also count as (s2, s1)
            mask_rev = mask.permute(0, 1, 2, 4, 3)
            mask_a = (mask + mask_rev) > 0
            return mask_a
        else:
            return None

    def assemble(self, radial_terms, angular_terms, present_species, mask_r, mask_a):
        """Assemble radial and angular AEV from computed terms according to the given partition information.

        Parameters
        ----------
        radial_terms : torch.Tensor
            Tensor of shape (conformations, atoms, neighbors, `radial_sublength`) for
            the (unsummed) radial subAEV terms.
        angular_terms : torch.Tensor
            Tensor of shape (conformations, atoms, pairs, `angular_sublength`) for the
            (unsummed) angular subAEV terms.
        present_species : torch.Tensor
            Long tensor for species of atoms present in the molecules.
        mask_r : torch.Tensor
            Tensor of shape (conformations, atoms, neighbors, present species) storing
            the mask for each species.
        mask_a : torch.Tensor
            Tensor of shape (conformations, atoms, pairs, present species, present species)
            storing the mask for each pair.

        Returns
        -------
        (torch.Tensor, torch.Tensor)
            Returns (radial AEV, angular AEV), both are pytorch tensor of `dtype`.
            The radial AEV must be of shape (conformations, atoms, radial_length)
            The angular AEV must be of shape (conformations, atoms, angular_length)
        """
        conformations = radial_terms.shape[0]
        atoms = radial_terms.shape[1]

        # assemble radial subaev
        present_radial_aevs = (radial_terms.unsqueeze(-2)
                               * mask_r.unsqueeze(-1).type(self.dtype)).sum(-3)
        """Tensor of shape (conformations, atoms, present species, radial_length)"""
        radial_aevs = present_radial_aevs.flatten(start_dim=2)

        # assemble angular subaev
        rev_indices = {present_species[i].item(): i  # TODO: can we use find_first?
                       for i in range(len(present_species))}
        """Tensor of shape (conformations, atoms, present species, present species, angular_length)"""
        angular_aevs = []
        zero_angular_subaev = torch.zeros(  # TODO: can we make stack and cat broadcast?
            conformations, atoms, self.angular_sublength, dtype=self.dtype, device=self.device)  # TODO: can we make torch.zeros, torch.ones typeless and deviceless?
        for s1, s2 in itertools.combinations_with_replacement(range(len(self.species)), 2):
            # TODO: can we remove this if pytorch support 0 size tensors?
            # Species pairs that are absent from this molecule contribute an
            # all-zero subAEV so the output layout stays fixed.
            if s1 in rev_indices and s2 in rev_indices and mask_a is not None:
                i1 = rev_indices[s1]
                i2 = rev_indices[s2]
                mask = mask_a[..., i1, i2].unsqueeze(-1).type(self.dtype)
                subaev = (angular_terms * mask).sum(-2)
            else:
                subaev = zero_angular_subaev
            angular_aevs.append(subaev)

        return radial_aevs, torch.cat(angular_aevs, dim=2)

    def forward(self, coordinates, species):
        """Compute (radial AEV, angular AEV) from coordinates and species.

        Unlike the base-class contract, `species` here is a list of species
        strings (one per atom); it is converted to a long tensor first. See
        `AEVComputer.forward` for the shapes of the returned tensors.
        """
        species = self.species_to_tensor(species)
        present_species = species.unique(sorted=True)

        radial_terms, angular_terms, indices_r, indices_a = self.terms_and_indices(
            coordinates)

        species_r = species[indices_r]
        mask_r = self.compute_mask_r(species_r)
        species_a = species[indices_a]
        mask_a = self.compute_mask_a(species_a, present_species)

        return self.assemble(radial_terms, angular_terms, present_species, mask_r, mask_a)

    def export_radial_subaev_onnx(self, filename):
        """Export the operation that compute radial subaev into onnx format

        Parameters
        ----------
        filename : string
            Name of the file to store exported networks.
        """
        class M(torch.nn.Module):
            # Thin wrapper so torch.onnx.export can trace the subaev
            # computation of the enclosing SortedAEV instance.
            def __init__(self, outerself):
                super(M, self).__init__()
                self.outerself = outerself

            def forward(self, center, neighbors):
                # NOTE(review): SortedAEV defines radial_subaev_terms, not
                # radial_subaev -- this call looks stale; confirm before use.
                return self.outerself.radial_subaev(center, neighbors)
        dummy_center = torch.randn(1, 3)
        dummy_neighbors = torch.randn(1, 5, 3)
        torch.onnx.export(M(self), (dummy_center, dummy_neighbors), filename)
import torch
import torch.nn as nn
from . import buildin_const_file, default_dtype, default_device
from .benchmarked import BenchmarkedModule
class AEVComputer(BenchmarkedModule):
    """Base class of various implementations of AEV computer

    Attributes
    ----------
    benchmark : boolean
        Whether to enable benchmark
    dtype : torch.dtype
        Data type of pytorch tensors for all the computations. This is also used
        to specify whether to use CPU or GPU.
    device : torch.Device
        The device where tensors should be.
    const_file : str
        The name of the original file that stores constant.
    Rcr, Rca : float
        Cutoff radius
    EtaR, ShfR, Zeta, ShfZ, EtaA, ShfA : torch.Tensor
        Tensor storing constants.
    radial_sublength : int
        The length of radial subaev of a single species
    radial_length : int
        The length of full radial aev
    angular_sublength : int
        The length of angular subaev of a single species
    angular_length : int
        The length of full angular aev
    aev_length : int
        The length of full aev
    """
    # NOTE: the docstring above must be the FIRST statement of the class body
    # to become AEVComputer.__doc__; it previously sat after __constants__
    # and was silently discarded.
    __constants__ = ['Rcr', 'Rca', 'dtype', 'device', 'radial_sublength',
                     'radial_length', 'angular_sublength', 'angular_length', 'aev_length']

    def __init__(self, benchmark=False, dtype=default_dtype, device=default_device, const_file=buildin_const_file):
        super(AEVComputer, self).__init__(benchmark)

        self.dtype = dtype
        self.const_file = const_file
        self.device = device

        # load constants from const file
        with open(const_file) as f:
            for i in f:
                try:
                    line = [x.strip() for x in i.split('=')]
                    name = line[0]
                    value = line[1]
                    if name in ('Rcr', 'Rca'):
                        setattr(self, name, float(value))
                    elif name in ['EtaR', 'ShfR', 'Zeta', 'ShfZ', 'EtaA', 'ShfA']:
                        value = [float(x.strip()) for x in value.replace(
                            '[', '').replace(']', '').split(',')]
                        value = torch.tensor(value, dtype=dtype, device=device)
                        setattr(self, name, value)
                    elif name == 'Atyp':
                        value = [x.strip() for x in value.replace(
                            '[', '').replace(']', '').split(',')]
                        self.species = value
                except Exception as e:
                    # Narrowed from a bare `except:` so KeyboardInterrupt and
                    # SystemExit are no longer swallowed; chain the cause for
                    # easier debugging of malformed const files.
                    raise ValueError('unable to parse const file') from e

        # Compute lengths
        self.radial_sublength = self.EtaR.shape[0] * self.ShfR.shape[0]
        self.radial_length = len(self.species) * self.radial_sublength
        self.angular_sublength = self.EtaA.shape[0] * \
            self.Zeta.shape[0] * self.ShfA.shape[0] * self.ShfZ.shape[0]
        species = len(self.species)
        # one angular block per unordered species pair (with replacement)
        self.angular_length = int(
            (species * (species + 1)) / 2) * self.angular_sublength
        self.aev_length = self.radial_length + self.angular_length

        # convert constant tensors to a ready-to-broadcast shape
        # shape convension (..., EtaR, ShfR)
        self.EtaR = self.EtaR.view(-1, 1)
        self.ShfR = self.ShfR.view(1, -1)
        # shape convension (..., EtaA, Zeta, ShfA, ShfZ)
        self.EtaA = self.EtaA.view(-1, 1, 1, 1)
        self.Zeta = self.Zeta.view(1, -1, 1, 1)
        self.ShfA = self.ShfA.view(1, 1, -1, 1)
        self.ShfZ = self.ShfZ.view(1, 1, 1, -1)

    def sort_by_species(self, data, species):
        """Sort the data by its species according to the order in `self.species`

        Parameters
        ----------
        data : torch.Tensor
            Tensor of shape (conformations, atoms, ...) for data.
        species : list
            List storing species of each atom.

        Returns
        -------
        (torch.Tensor, list)
            Tuple of (sorted data, sorted species).
        """
        atoms = list(zip(species, torch.unbind(data, 1)))
        # stable sort keyed by each atom's position in self.species
        atoms = sorted(atoms, key=lambda x: self.species.index(x[0]))
        species = [s for s, _ in atoms]
        data = torch.stack([c for _, c in atoms], dim=1)
        return data, species

    def forward(self, coordinates, species):
        """Compute AEV from coordinates and species

        Parameters
        ----------
        coordinates : torch.Tensor
            The tensor that specifies the xyz coordinates of atoms in the molecule.
            The tensor must have shape (conformations, atoms, 3)
        species : torch.LongTensor
            Long tensor for the species, where a value k means the species is
            the same as self.species[k]

        Returns
        -------
        (torch.Tensor, torch.Tensor)
            Returns (radial AEV, angular AEV), both are pytorch tensor of `dtype`.
            The radial AEV must be of shape (conformations, atoms, radial_length)
            The angular AEV must be of shape (conformations, atoms, angular_length)
        """
        raise NotImplementedError('subclass must override this method')
import torch
import timeit
import functools
class BenchmarkedModule(torch.jit.ScriptModule):
    """Module base class with opt-in benchmarking of member functions.

    Benchmarking works by replacing a member function with a wrapper that
    calls the original and accumulates its wall-clock running time (seconds)
    into `self.timers`, keyed by one or more accumulator names.

    A subclass opts in by calling this class's `__init__` with
    `benchmark=True` and wrapping the functions it wants timed:

    ```
    if self.benchmark:
        self._enable_benchmark(self.function_to_be_benchmarked, 'key1', 'key2')
    ```

    Example
    -------
    Timing member functions `f` and `g`, plus their combined total::
    ```
    class BenchmarkFG(BenchmarkedModule):
        def __init__(self, benchmark=False):
            super(BenchmarkFG, self).__init__(benchmark)
            if benchmark:
                self.f = self._enable_benchmark(self.f, 'function f', 'total')
                self.g = self._enable_benchmark(self.g, 'function g', 'total')
        def f(self):
            print('in function f')
        def g(self):
            print('in function g')
    ```

    Attributes
    ----------
    benchmark : boolean
        Whether benchmark is enabled
    timers : dict
        Dictionary storing the benchmark results (present only when
        benchmarking is enabled).
    """

    def __init__(self, benchmark=False):
        super(BenchmarkedModule, self).__init__()
        self.benchmark = benchmark
        if benchmark:
            # accumulators are registered lazily by _enable_benchmark
            self.timers = {}

    def _enable_benchmark(self, fun, *keys):
        """Wrap `fun` so that each call accumulates its running time.

        Parameters
        ----------
        fun : function
            The function to be benchmarked.
        keys
            Accumulator names in `self.timers`; when several are given, the
            elapsed time is added to every one of them.

        Returns
        -------
        function
            Wrapped function that times the original and updates the
            corresponding entries of `self.timers` automatically.
        """
        for key in keys:
            self.timers[key] = 0

        @functools.wraps(fun)
        def timed(*args, **kwargs):
            begin = timeit.default_timer()
            result = fun(*args, **kwargs)
            finish = timeit.default_timer()
            for key in keys:
                self.timers[key] += finish - begin
            return result
        return timed

    def reset_timers(self):
        """Reset all timers. If benchmark is not enabled, a `ValueError` will be raised."""
        if not self.benchmark:
            raise ValueError('Can not reset timers, benchmark not enabled')
        for key in self.timers:
            self.timers[key] = 0
from .pyanitools import anidataloader
from os import listdir
from os.path import join, isfile, isdir
from torch import tensor, full_like, long
from torch.utils.data import Dataset, Subset, TensorDataset, ConcatDataset
from torch.utils.data.dataloader import default_collate
from math import ceil
from . import default_dtype
from random import shuffle
from itertools import chain, accumulate
class ANIDataset(Dataset):
    """Dataset wrapper carrying extra per-molecule bookkeeping for ANI.

    Attributes
    ----------
    dataset : Dataset
        The wrapped dataset; indexing and length are delegated to it.
    sizes : sequence
        Number of conformations for each molecule.
    cumulative_sizes : sequence
        Running totals of `sizes` (cumulative sums).
    species : sequence
        Per-molecule species information.
    """

    def __init__(self, dataset, sizes, species):
        super(ANIDataset, self).__init__()
        self.dataset = dataset
        self.sizes = sizes
        self.species = species
        # precompute the cumulative sums once, for index arithmetic elsewhere
        self.cumulative_sizes = list(accumulate(sizes))

    def __getitem__(self, idx):
        # delegate straight to the wrapped dataset
        return self.dataset[idx]

    def __len__(self):
        return len(self.dataset)
def load_dataset(path, dtype=default_dtype):
    """Load an ANIDataset from an HDF5 file or a directory of HDF5 files.

    Parameters
    ----------
    path : str
        Either a single .h5/.hdf5 file or a directory containing such files.
    dtype : torch.dtype
        Data type used for the coordinate and energy tensors.

    Returns
    -------
    ANIDataset
        Concatenation of one TensorDataset per molecule, where each sample is
        (molecule_id, coordinates, energies); the dataset also carries the
        per-molecule `sizes` and `species` lists.

    Raises
    ------
    ValueError
        If `path` is neither a file nor a directory.
    """
    # get name of files storing data
    files = []
    if isdir(path):
        for f in listdir(path):
            f = join(path, f)
            if isfile(f) and (f.endswith('.h5') or f.endswith('.hdf5')):
                files.append(f)
    elif isfile(path):
        files = [path]
    else:
        raise ValueError('Bad path')
    # read tensors from file and build a dataset
    species = []
    molecule_id = 0
    datasets = []
    for f in files:
        for m in anidataloader(f):
            coordinates = tensor(m['coordinates'], dtype=dtype)
            energies = tensor(m['energies'], dtype=dtype)
            # tag every conformation of this molecule with its integer id
            _molecule_id = full_like(energies, molecule_id).type(long)
            datasets.append(TensorDataset(_molecule_id, coordinates, energies))
            species.append(m['species'])
            molecule_id += 1
    dataset = ConcatDataset(datasets)
    sizes = [len(x) for x in dataset.datasets]
    return ANIDataset(dataset, sizes, species)
class BatchSampler(object):
    """Batch sampler yielding lists of dataset indices.

    A batch is assembled from `batch_chunks` chunks, where each chunk holds
    up to `chunk_size` consecutive conformations of a single molecule, so a
    batch mixes conformations from several molecules.
    """

    def __init__(self, source, chunk_size, batch_chunks):
        # source : ANIDataset supplying `sizes` and `cumulative_sizes`
        # chunk_size : maximum conformations per chunk
        # batch_chunks : number of chunks per yielded batch
        if not isinstance(source, ANIDataset):
            raise ValueError("BatchSampler must take ANIDataset as input")
        self.source = source
        self.chunk_size = chunk_size
        self.batch_chunks = batch_chunks

    def _concated_index(self, molecule, conformation):
        """
        Get the index in the dataset of the specified conformation
        of the specified molecule.
        """
        src = self.source
        cumulative_sizes = [0] + src.cumulative_sizes
        return cumulative_sizes[molecule] + conformation

    def __iter__(self):
        """Yield batches of dataset indices.

        Molecules are visited round-robin: each pass over `unfinished` takes
        the next chunk of every molecule that still has conformations left,
        and a batch is emitted whenever `batch_chunks` chunks have been
        accumulated.
        """
        molecules = len(self.source.sizes)
        sizes = self.source.sizes
        """Number of conformations of each molecule"""
        unfinished = list(zip(range(molecules), [0] * molecules))
        """List of pairs (molecule, progress) storing the current progress
        of iterating each molecules."""
        batch = []
        batch_molecules = 0
        """The number of molecules already in batch"""
        while len(unfinished) > 0:
            new_unfinished = []
            for molecule, progress in unfinished:
                size = sizes[molecule]
                # the last incomplete chunk is not dropped
                end = min(progress + self.chunk_size, size)
                if end < size:
                    new_unfinished.append((molecule, end))
                batch += [self._concated_index(molecule, x)
                          for x in range(progress, end)]
                batch_molecules += 1
                if batch_molecules >= self.batch_chunks:
                    yield batch
                    batch = []
                    batch_molecules = 0
            unfinished = new_unfinished
        # the last incomplete batch is not dropped
        if len(batch) > 0:
            yield batch

    def __len__(self):
        """Number of batches per epoch: ceil(total chunks / batch_chunks)."""
        sizes = self.source.sizes
        chunks = [ceil(x/self.chunk_size) for x in sizes]
        chunks = sum(chunks)
        return ceil(chunks / self.batch_chunks)
def collate(batch):
    """Collate a batch of (molecule_id, coordinates, energies) samples.

    Samples are grouped by molecule id, and each group is collated with
    pytorch's `default_collate`, which stacks the coordinates and energies
    of the same molecule along a new leading dimension.

    Parameters
    ----------
    batch : sequence
        Sequence of (molecule_id, xyz, energy) tuples, where molecule_id is
        a scalar long tensor.

    Returns
    -------
    dict
        Mapping from molecule id (int) to the default-collated
        (coordinates, energies) of that molecule.
    """
    by_molecules = {}
    for molecule_id, xyz, energy in batch:
        # setdefault replaces the separate membership test + append of the
        # original implementation
        by_molecules.setdefault(molecule_id.item(), []).append((xyz, energy))
    return {i: default_collate(samples)
            for i, samples in by_molecules.items()}
def random_split(dataset, num_chunks, chunk_size):
    """
    Randomly split a dataset into non-overlapping new datasets of given lengths

    The splitting is by chunk, which makes batching possible: the whole
    dataset is first split into chunks of the specified size, each chunk being
    different conformations of the same isomer/molecule; these chunks are then
    randomly shuffled and split according to the given `num_chunks`. After
    splitting, chunks belonging to the same molecule/isomer within the same
    subset are merged to allow larger batches.

    Parameters
    ----------
    dataset : Dataset
        Dataset to be split; must be an ANIDataset (it is fed to BatchSampler).
    num_chunks : sequence
        Number of chunks each produced split should contain.
    chunk_size : integer
        Size of each chunk

    Raises
    ------
    ValueError
        If sum(num_chunks) does not match the total number of chunks.
    """
    # BatchSampler with batch_chunks=1 yields exactly one chunk per batch,
    # i.e. iterating it enumerates every chunk of the dataset.
    chunks = list(BatchSampler(dataset, chunk_size, 1))
    shuffle(chunks)
    if sum(num_chunks) != len(chunks):
        raise ValueError(
            "Sum of input number of chunks does not equal the length of the total dataset!")
    offset = 0
    subsets = []
    for i in num_chunks:
        _chunks = chunks[offset:offset+i]
        offset += i
        # merge chunks by molecule
        by_molecules = {}
        for chunk in _chunks:
            # dataset[index] yields (molecule_id, coordinates, energies); the
            # first sample's molecule id identifies the whole chunk
            molecule_id = dataset[chunk[0]][0].item()
            if molecule_id not in by_molecules:
                by_molecules[molecule_id] = []
            by_molecules[molecule_id] += chunk
        _chunks = list(by_molecules.values())
        shuffle(_chunks)
        # construct subset
        sizes = [len(j) for j in _chunks]
        indices = list(chain.from_iterable(_chunks))
        _dataset = Subset(dataset, indices)
        _dataset = ANIDataset(_dataset, sizes, dataset.species)
        subsets.append(_dataset)
    return subsets
from . import buildin_sae_file
class EnergyShifter:
    """Class that deals with self atomic energies (SAE).

    Attributes
    ----------
    self_energies : dict
        The dictionary that stores self energies of species, mapping a
        species name (str) to its self energy (float).
    """

    def __init__(self, self_energy_file=buildin_sae_file):
        """Parse the self-energy file.

        Each recognizable line has the form `<name>,...=<value>`; lines that
        fail to parse are skipped, preserving the original best-effort
        behavior.
        """
        # load self energies
        self.self_energies = {}
        with open(self_energy_file) as f:
            for raw_line in f:
                try:
                    parts = [x.strip() for x in raw_line.split('=')]
                    name = parts[0].split(',')[0].strip()
                    self.self_energies[name] = float(parts[1])
                except (IndexError, ValueError):
                    # Narrowed from a bare `except:` so unrelated errors
                    # (e.g. KeyboardInterrupt) are no longer swallowed.
                    pass  # ignore unrecognizable line

    def _sae_sum(self, species):
        """Total self atomic energy of the given species list.

        Factored out of subtract_sae/add_sae, which previously duplicated
        this summation loop.
        """
        return sum(self.self_energies[s] for s in species)

    def subtract_sae(self, energies, species):
        """Subtract self atomic energies from `energies`.

        Parameters
        ----------
        energies : pytorch tensor of `dtype`
            The tensor of any shape that stores the raw energies.
        species : list of str
            The list specifying the species of each atom. The length of the
            list must be the same as the number of atoms.

        Returns
        -------
        pytorch tensor of `dtype`
            The tensor of the same shape as `energies` that stores the energies
            with self atomic energies subtracted.
        """
        return energies - self._sae_sum(species)

    def add_sae(self, energies, species):
        """Add self atomic energies to `energies`

        Parameters
        ----------
        energies : pytorch tensor of `dtype`
            The tensor of any shape that stores the energies excluding self
            atomic energies.
        species : list of str
            The list specifying the species of each atom. The length of the
            list must be the same as the number of atoms.

        Returns
        -------
        pytorch tensor of `dtype`
            The tensor of the same shape as `energies` that stores the raw
            energies, i.e. the energy including self atomic energies.
        """
        return energies + self._sae_sum(species)
import torch
import ase
import pyNeuroChem
import ase_interface
import numpy
from .aev_base import AEVComputer
from . import buildin_const_file, buildin_sae_file, buildin_network_dir, default_dtype, default_device
class NeuroChemAEV(AEVComputer):
    """AEV computer that dumps AEVs out of pyNeuroChem.

    Attributes
    ----------
    sae_file : str
        The name of the original file that stores self atomic energies.
    network_dir : str
        The name ending with '/' of the directory that stores networks in
        NeuroChem's format.
    nc : pyNeuroChem.molecule
        The internal pyNeuroChem object, usable to dump out AEVs, energies,
        forces, activations, etc.
    """

    def __init__(self, dtype=default_dtype, device=default_device, const_file=buildin_const_file, sae_file=buildin_sae_file, network_dir=buildin_network_dir):
        super(NeuroChemAEV, self).__init__(False, dtype, device, const_file)
        self.sae_file = sae_file
        self.network_dir = network_dir
        # handle into pyNeuroChem used by all subsequent dumps
        self.nc = pyNeuroChem.molecule(const_file, sae_file, network_dir, 0)

    def _get_radial_part(self, fullaev):
        """Slice the radial part out of a full AEV.

        `fullaev` has shape (conformations, atoms, radial_length + angular_length);
        the leading radial_length entries of the last dimension are returned.
        """
        return fullaev[:, :, :self.radial_length]

    def _get_angular_part(self, fullaev):
        """Slice the angular part out of a full AEV.

        Returns the trailing angular_length entries of the last dimension
        of `fullaev`.
        """
        return fullaev[:, :, self.radial_length:]

    def _compute_neurochem_aevs_per_conformation(self, coordinates, species):
        """Dump the full AEV of a single conformation from NeuroChem.

        `coordinates` has shape (atoms, 3) and `species` lists one species
        string per atom. Returns a numpy array of shape
        (atoms, radial_length + angular_length).
        """
        num_atoms = coordinates.shape[0]
        molecule = ase.Atoms(''.join(species), positions=coordinates)
        molecule.set_calculator(ase_interface.ANI(False))
        molecule.calc.setnc(self.nc)
        # Triggering an energy evaluation makes NeuroChem compute the AEVs.
        _ = molecule.get_potential_energy()
        return numpy.stack([self.nc.atomicenvironments(i)
                            for i in range(num_atoms)])

    def __call__(self, coordinates, species):
        # One NeuroChem dump per conformation, stacked into a single tensor.
        per_conformation = [
            self._compute_neurochem_aevs_per_conformation(c, species)
            for c in coordinates]
        aevs = torch.from_numpy(numpy.stack(per_conformation)).type(
            self.dtype).to(self.device)
        return self._get_radial_part(aevs), self._get_angular_part(aevs)
This diff is collapsed.
# Written by Roman Zubatyuk and Justin S. Smith
import h5py
import numpy as np
import platform
import os
PY_VERSION = int(platform.python_version().split('.')[0]) > 3
class datapacker(object):
    """Wrapper that stores arrays as datasets inside an HDF5 file."""

    def __init__(self, store_file, mode='w-', complib='gzip', complevel=6):
        """Open the underlying HDF5 store.

        Parameters
        ----------
        store_file : str
            Path of the HDF5 file to open.
        mode : str
            h5py file mode; the default 'w-' fails if the file exists.
        complib : str
            Compression filter name passed to h5py (e.g. 'gzip').
        complevel : int
            Compression level for the chosen filter.
        """
        self.store = h5py.File(store_file, mode=mode)
        self.clib = complib
        self.clev = complevel

    def store_data(self, store_loc, **kwargs):
        """Store each keyword argument as a dataset under group `store_loc`.

        Non-empty lists of unicode strings are encoded to UTF-8 bytes first,
        since HDF5 cannot hold Python unicode objects directly.
        """
        g = self.store.create_group(store_loc)
        for k, v in kwargs.items():
            # isinstance replaces the original `type(...) == ...` checks;
            # np.str_ is kept so numpy string scalars are still recognized.
            if isinstance(v, list) and v and isinstance(v[0], (str, np.str_)):
                v = [a.encode('utf8') for a in v]
            g.create_dataset(k, data=v, compression=self.clib,
                             compression_opts=self.clev)

    def cleanup(self):
        """Wrapper to close HDF5 file."""
        self.store.close()
class anidataloader(object):
    """Iterator over the datasets stored in an ANI-style HDF5 file."""

    def __init__(self, store_file):
        """Open `store_file` read-only.

        NOTE(review): `exit()` in a library is questionable — consider
        raising FileNotFoundError instead; kept to preserve behavior.
        """
        if not os.path.exists(store_file):
            exit('Error: file not found - '+store_file)
        self.store = h5py.File(store_file, 'r')

    @staticmethod
    def _extract_data(item, path):
        """Collect all datasets directly under `item` into a dict.

        Byte-string arrays are decoded back to lists of ASCII str — the
        inverse of datapacker's UTF-8 encoding. This helper replaces the
        logic previously duplicated in `h5py_dataset_iterator` and
        `get_data`.
        """
        data = {'path': path}
        for k in item.keys():
            if not isinstance(item[k], h5py.Group):
                # `[()]` reads the whole dataset; the old `.value` accessor
                # was deprecated and removed in h5py 3.0.
                dataset = np.array(item[k][()])
                if isinstance(dataset, np.ndarray) and dataset.size != 0:
                    if type(dataset[0]) is np.bytes_:
                        dataset = [a.decode('ascii') for a in dataset]
                data.update({k: dataset})
        return data

    def h5py_dataset_iterator(self, g, prefix=''):
        """Recursively walk group `g`, yielding one dict per leaf group
        (a group whose first child is a dataset)."""
        for key in g.keys():
            item = g[key]
            path = '{}/{}'.format(prefix, key)
            keys = [i for i in item.keys()]
            if isinstance(item[keys[0]], h5py.Dataset):  # test for dataset
                yield self._extract_data(item, path)
            else:  # test for group (go down)
                yield from self.h5py_dataset_iterator(item, path)

    def __iter__(self):
        """Default class iterator (iterate through all data)."""
        for data in self.h5py_dataset_iterator(self.store):
            yield data

    def get_group_list(self):
        """Return a list of all top-level groups in the file."""
        return [g for g in self.store.values()]

    def iter_group(self, g):
        """Allow iteration through the data in a given group."""
        for data in self.h5py_dataset_iterator(g):
            yield data

    def get_data(self, path, prefix=''):
        """Return the dict of datasets stored at `path`."""
        item = self.store[path]
        return self._extract_data(item, '{}/{}'.format(prefix, path))

    def group_size(self):
        """Return the number of top-level groups."""
        return len(self.get_group_list())

    def size(self):
        """Return the total number of items across all top-level groups."""
        count = 0
        for g in self.store.values():
            count = count + len(g.items())
        return count

    def cleanup(self):
        """Close the HDF5 file."""
        self.store.close()
TM = 1
Rcr = 5.2000e+00
Rca = 3.5000e+00
EtaR = [1.6000000e+01]
ShfR = [9.0000000e-01,1.1687500e+00,1.4375000e+00,1.7062500e+00,1.9750000e+00,2.2437500e+00,2.5125000e+00,2.7812500e+00,3.0500000e+00,3.3187500e+00,3.5875000e+00,3.8562500e+00,4.1250000e+00,4.3937500e+00,4.6625000e+00,4.9312500e+00]
Zeta = [3.2000000e+01]
ShfZ = [1.9634954e-01,5.8904862e-01,9.8174770e-01,1.3744468e+00,1.7671459e+00,2.1598449e+00,2.5525440e+00,2.9452431e+00]
EtaA = [8.0000000e+00]
ShfA = [9.0000000e-01,1.5500000e+00,2.2000000e+00,2.8500000e+00]
Atyp = [H,C,N,O]
H,0=-0.600952980000
C,1=-38.08316124000
N,2=-54.70775770000
O,3=-75.19446356000
This diff is collapsed.
!InputFile for Force Prediction Network
sflparamsfile=rHCNO-5.2R_16-3.5A_a4-8.params
ntwkStoreDir=networks/
atomEnergyFile=sae_linfit.dat
nmax=0! Maximum number of iterations (0 = inf)
tolr=100! Tolerance - early stopping
emult=0.5!Multiplier by eta after tol switch
eta=0.001! Eta -- Learning rate
tcrit=1.0E-5! Eta termination criterion
tmax=0! Maximum time (0 = inf)
tbtchsz=2560
vbtchsz=2560
gpuid=0
ntwshr=0
nkde=2
energy=1
force=0
fmult=0.0
pbc=0
cmult =0.001
runtype=ANNP_CREATE_HDNN_AND_TRAIN!Create and train an HDNN network
network_setup {
inputsize=384;
atom_net H $
layer [
nodes=160;
activation=9;
type=0;
l2norm=1;
l2valu=0.0001;
]
layer [
nodes=128;
activation=9;
type=0;
l2norm=1;
l2valu=0.00001;
]
layer [
nodes=96;
activation=9;
type=0;
l2norm=1;
l2valu=0.000001;
]
layer [
nodes=1;
activation=6;
type=0;
]
$
atom_net C $
layer [
nodes=144;
activation=9;
type=0;
l2norm=1;
l2valu=0.0001;
]
layer [
nodes=112;
activation=9;
type=0;
l2norm=1;
l2valu=0.00001;
]
layer [
nodes=96;
activation=9;
type=0;
l2norm=1;
l2valu=0.000001;
]
layer [
nodes=1;
activation=6;
type=0;
]
$
atom_net N $
layer [
nodes=128;
activation=9;
type=0;
l2norm=1;
l2valu=0.0001;
]
layer [
nodes=112;
activation=9;
type=0;
l2norm=1;
l2valu=0.00001;
]
layer [
nodes=96;
activation=9;
type=0;
l2norm=1;
l2valu=0.000001;
]
layer [
nodes=1;
activation=6;
type=0;
]
$
atom_net O $
layer [
nodes=128;
activation=9;
type=0;
l2norm=1;
l2valu=0.0001;
]
layer [
nodes=112;
activation=9;
type=0;
l2norm=1;
l2valu=0.00001;
]
layer [
nodes=96;
activation=9;
type=0;
l2norm=1;
l2valu=0.000001;
]
layer [
nodes=1;
activation=6;
type=0;
]
$
}
adptlrn=OFF ! Adaptive learning (OFF,RMSPROP)
decrate=0.9 !Decay rate of RMSPROP
moment=ADAM! Turn on momentum or nesterov momentum (OFF,CNSTTEMP,TMANNEAL,REGULAR,NESTEROV)
mu=0.99 ! Mu factor for momentum
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment