Unverified Commit 2cc64cb7 authored by Gao, Xiang's avatar Gao, Xiang Committed by GitHub
Browse files

Add 3D structures downloaded from NIST to unit test set (#146)

parent 91cec854
This source diff could not be displayed because it is stored in LFS. You can view the blob instead.
This source diff could not be displayed because it is stored in LFS. You can view the blob instead.
...@@ -3,6 +3,8 @@ import torchani ...@@ -3,6 +3,8 @@ import torchani
import unittest import unittest
import os import os
import pickle import pickle
import math
import random
path = os.path.dirname(os.path.realpath(__file__)) path = os.path.dirname(os.path.realpath(__file__))
...@@ -19,11 +21,23 @@ class TestEnergies(unittest.TestCase): ...@@ -19,11 +21,23 @@ class TestEnergies(unittest.TestCase):
shift_energy = builtins.energy_shifter shift_energy = builtins.energy_shifter
self.model = torch.nn.Sequential(self.aev_computer, nnp, shift_energy) self.model = torch.nn.Sequential(self.aev_computer, nnp, shift_energy)
def random_skip(self):
    """Hook for subclasses: return True to skip the current test case.

    The base class never skips anything.
    """
    return False
def transform(self, x):
    """Hook for subclasses to shrink test data; identity in the base class."""
    return x
def testIsomers(self): def testIsomers(self):
for i in range(N): for i in range(N):
datafile = os.path.join(path, 'test_data/{}'.format(i)) datafile = os.path.join(path, 'test_data/ANI1_subset/{}'.format(i))
with open(datafile, 'rb') as f: with open(datafile, 'rb') as f:
coordinates, species, _, _, energies, _ = pickle.load(f) coordinates, species, _, _, energies, _ = pickle.load(f)
coordinates = torch.from_numpy(coordinates).to(torch.float)
species = torch.from_numpy(species)
energies = torch.from_numpy(energies).to(torch.float)
coordinates = self.transform(coordinates)
species = self.transform(species)
energies = self.transform(energies)
_, energies_ = self.model((species, coordinates)) _, energies_ = self.model((species, coordinates))
max_diff = (energies - energies_).abs().max().item() max_diff = (energies - energies_).abs().max().item()
self.assertLess(max_diff, self.tolerance) self.assertLess(max_diff, self.tolerance)
...@@ -32,9 +46,15 @@ class TestEnergies(unittest.TestCase): ...@@ -32,9 +46,15 @@ class TestEnergies(unittest.TestCase):
species_coordinates = [] species_coordinates = []
energies = [] energies = []
for i in range(N): for i in range(N):
datafile = os.path.join(path, 'test_data/{}'.format(i)) datafile = os.path.join(path, 'test_data/ANI1_subset/{}'.format(i))
with open(datafile, 'rb') as f: with open(datafile, 'rb') as f:
coordinates, species, _, _, e, _ = pickle.load(f) coordinates, species, _, _, e, _ = pickle.load(f)
coordinates = torch.from_numpy(coordinates).to(torch.float)
species = torch.from_numpy(species)
e = torch.from_numpy(e).to(torch.float)
coordinates = self.transform(coordinates)
species = self.transform(species)
e = self.transform(e)
species_coordinates.append((species, coordinates)) species_coordinates.append((species, coordinates))
energies.append(e) energies.append(e)
species, coordinates = torchani.utils.pad_coordinates( species, coordinates = torchani.utils.pad_coordinates(
...@@ -44,6 +64,21 @@ class TestEnergies(unittest.TestCase): ...@@ -44,6 +64,21 @@ class TestEnergies(unittest.TestCase):
max_diff = (energies - energies_).abs().max().item() max_diff = (energies - energies_).abs().max().item()
self.assertLess(max_diff, self.tolerance) self.assertLess(max_diff, self.tolerance)
def testNIST(self):
    """Compare model energies against precomputed NeuroChem reference
    energies for the NIST-derived 3D structures.

    The pickle holds a list of tuples of the form
    (coordinates, species, radial, angular, energies, forces).
    """
    datafile = os.path.join(path, 'test_data/NIST/all')
    with open(datafile, 'rb') as f:
        data = pickle.load(f)
    for coordinates, species, _, _, e, _ in data:
        # Subclasses may randomly skip cases to keep runtime down.
        if self.random_skip():
            continue
        coordinates = torch.from_numpy(coordinates).to(torch.float)
        species = torch.from_numpy(species)
        energies = torch.from_numpy(e).to(torch.float)
        _, energies_ = self.model((species, coordinates))
        natoms = coordinates.shape[1]
        max_diff = (energies - energies_).abs().max().item()
        # Tolerance scales with sqrt(natoms): absolute energy error grows
        # with molecule size.
        self.assertLess(max_diff / math.sqrt(natoms), self.tolerance)
class TestEnergiesASEComputer(TestEnergies): class TestEnergiesASEComputer(TestEnergies):
...@@ -51,6 +86,14 @@ class TestEnergiesASEComputer(TestEnergies): ...@@ -51,6 +86,14 @@ class TestEnergiesASEComputer(TestEnergies):
super(TestEnergiesASEComputer, self).setUp() super(TestEnergiesASEComputer, self).setUp()
self.aev_computer.neighborlist = torchani.ase.NeighborList() self.aev_computer.neighborlist = torchani.ase.NeighborList()
def transform(self, x):
    """To reduce the size of test cases for faster test speed"""
    # Keep only the first two conformations along the leading axis.
    return x[:2, ...]
def random_skip(self):
    """To reduce the size of test cases for faster test speed"""
    # Randomly drop ~95% of the NIST cases for this slower configuration.
    return random.random() < 0.95
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -34,9 +34,11 @@ class TestEnsemble(unittest.TestCase): ...@@ -34,9 +34,11 @@ class TestEnsemble(unittest.TestCase):
def testGDB(self): def testGDB(self):
for i in range(N): for i in range(N):
datafile = os.path.join(path, 'test_data/{}'.format(i)) datafile = os.path.join(path, 'test_data/ANI1_subset/{}'.format(i))
with open(datafile, 'rb') as f: with open(datafile, 'rb') as f:
coordinates, species, _, _, _, _ = pickle.load(f) coordinates, species, _, _, _, _ = pickle.load(f)
coordinates = torch.from_numpy(coordinates)
species = torch.from_numpy(species)
self._test_molecule(coordinates, species) self._test_molecule(coordinates, species)
......
...@@ -3,6 +3,7 @@ import torchani ...@@ -3,6 +3,7 @@ import torchani
import unittest import unittest
import os import os
import pickle import pickle
import random
path = os.path.dirname(os.path.realpath(__file__)) path = os.path.dirname(os.path.realpath(__file__))
N = 97 N = 97
...@@ -17,14 +18,20 @@ class TestForce(unittest.TestCase): ...@@ -17,14 +18,20 @@ class TestForce(unittest.TestCase):
nnp = builtins.models[0] nnp = builtins.models[0]
self.model = torch.nn.Sequential(self.aev_computer, nnp) self.model = torch.nn.Sequential(self.aev_computer, nnp)
def random_skip(self):
    """Hook for subclasses: return True to skip the current test case.

    The base class never skips anything.
    """
    return False
def transform(self, x): def transform(self, x):
return x return x
def testIsomers(self): def testIsomers(self):
for i in range(N): for i in range(N):
datafile = os.path.join(path, 'test_data/{}'.format(i)) datafile = os.path.join(path, 'test_data/ANI1_subset/{}'.format(i))
with open(datafile, 'rb') as f: with open(datafile, 'rb') as f:
coordinates, species, _, _, _, forces = pickle.load(f) coordinates, species, _, _, _, forces = pickle.load(f)
coordinates = torch.from_numpy(coordinates)
species = torch.from_numpy(species)
forces = torch.from_numpy(forces)
coordinates = self.transform(coordinates) coordinates = self.transform(coordinates)
species = self.transform(species) species = self.transform(species)
forces = self.transform(forces) forces = self.transform(forces)
...@@ -39,9 +46,12 @@ class TestForce(unittest.TestCase): ...@@ -39,9 +46,12 @@ class TestForce(unittest.TestCase):
species_coordinates = [] species_coordinates = []
coordinates_forces = [] coordinates_forces = []
for i in range(N): for i in range(N):
datafile = os.path.join(path, 'test_data/{}'.format(i)) datafile = os.path.join(path, 'test_data/ANI1_subset/{}'.format(i))
with open(datafile, 'rb') as f: with open(datafile, 'rb') as f:
coordinates, species, _, _, _, forces = pickle.load(f) coordinates, species, _, _, _, forces = pickle.load(f)
coordinates = torch.from_numpy(coordinates)
species = torch.from_numpy(species)
forces = torch.from_numpy(forces)
coordinates = self.transform(coordinates) coordinates = self.transform(coordinates)
species = self.transform(species) species = self.transform(species)
forces = self.transform(forces) forces = self.transform(forces)
...@@ -58,6 +68,23 @@ class TestForce(unittest.TestCase): ...@@ -58,6 +68,23 @@ class TestForce(unittest.TestCase):
max_diff = (forces + derivative).abs().max().item() max_diff = (forces + derivative).abs().max().item()
self.assertLess(max_diff, self.tolerance) self.assertLess(max_diff, self.tolerance)
def testNIST(self):
    """Compare analytic forces (-dE/dx from autograd) against precomputed
    NeuroChem reference forces for the NIST-derived 3D structures.

    The pickle holds a list of tuples of the form
    (coordinates, species, radial, angular, energies, forces).
    """
    datafile = os.path.join(path, 'test_data/NIST/all')
    with open(datafile, 'rb') as f:
        data = pickle.load(f)
    for coordinates, species, _, _, _, forces in data:
        # Subclasses may randomly skip cases to keep runtime down.
        if self.random_skip():
            continue
        # Gradients w.r.t. coordinates are needed to derive forces.
        coordinates = torch.from_numpy(coordinates).to(torch.float) \
            .requires_grad_(True)
        species = torch.from_numpy(species)
        forces = torch.from_numpy(forces).to(torch.float)
        _, energies = self.model((species, coordinates))
        derivative = torch.autograd.grad(energies.sum(),
                                         coordinates)[0]
        # Force is the negative energy gradient, so forces + derivative
        # should be approximately zero.
        max_diff = (forces + derivative).abs().max().item()
        self.assertLess(max_diff, self.tolerance)
class TestForceASEComputer(TestForce): class TestForceASEComputer(TestForce):
...@@ -67,7 +94,11 @@ class TestForceASEComputer(TestForce): ...@@ -67,7 +94,11 @@ class TestForceASEComputer(TestForce):
def transform(self, x): def transform(self, x):
"""To reduce the size of test cases for faster test speed""" """To reduce the size of test cases for faster test speed"""
return x[:3, ...] return x[:2, ...]
def random_skip(self):
    """To reduce the size of test cases for faster test speed"""
    # Randomly drop ~95% of the NIST cases for this slower configuration.
    return random.random() < 0.95
if __name__ == '__main__': if __name__ == '__main__':
......
# Commands
## Download all 3D structures from NIST
```
scrapy runspider nist.py -o result.json
```
\ No newline at end of file
import scrapy
# Elements accepted in downloaded structures (the ANI-1 element set).
species = ['H', 'C', 'N', 'O']
# Wildcard formula string ("H*C*N*O*") used in the NIST search query.
formula = ''.join('{}*'.format(x) for x in species)
# Search-by-molecular-weight URL; the two {} slots take the weight range.
urltemplate = ("https://webbook.nist.gov/cgi/cbook.cgi?Value={}-{}"
               "&VType=MW&Formula=" + formula + "&NoIon=on&MatchIso=on")
# Molecular-weight bounds for the initial (full-range) query.
min_weight = 0
max_weight = 9999999
class NISTSpider(scrapy.Spider):
    """Scrapy spider that downloads 3D structures (SDF files) from the
    NIST WebBook for molecules containing only H, C, N, and O.

    Strategy: search by molecular-weight range. NIST truncates results at
    400 hits per query, so any range returning exactly 400 results is
    recursively bisected until every query fits under the cap.
    """

    name = "NIST"

    def start_requests(self):
        # Start with a single query covering the entire weight range.
        start_url = urltemplate.format(min_weight, max_weight)
        yield scrapy.Request(
            url=start_url,
            callback=lambda x: self.parse_range(x, min_weight, max_weight))

    def parse_range(self, response, from_, to):
        """Parse a weight-range search page.

        ``from_`` and ``to`` are the integer molecular-weight bounds of
        the query that produced ``response``.
        """
        search_result_list = response.xpath('//*[@id="main"]/ol/li/a')
        # if there are 400 results, the range is too large (NIST truncates
        # at 400 hits), so halve the range and repeat the search
        if len(search_result_list) == 400 and to - from_ > 1:
            mid = (from_ + to) // 2
            next_url1 = urltemplate.format(from_, mid)
            next_url2 = urltemplate.format(mid, to)
            yield scrapy.Request(
                url=next_url1,
                callback=lambda x: self.parse_range(x, from_, mid))
            yield scrapy.Request(
                url=next_url2,
                callback=lambda x: self.parse_range(x, mid, to))
        elif len(search_result_list) == 400 and to - from_ == 1:
            # Range cannot be halved further; query each endpoint alone.
            next_url1 = urltemplate.format(from_, from_)
            next_url2 = urltemplate.format(to, to)
            yield scrapy.Request(
                url=next_url1,
                callback=lambda x: self.parse_range(x, from_, from_))
            yield scrapy.Request(
                url=next_url2,
                callback=lambda x: self.parse_range(x, to, to))
        else:
            # Under the cap: request the 3D SDF file for every hit.
            for i in search_result_list:
                href = i.css('a::attr("href")').extract_first()
                prefix = '/cgi/cbook.cgi?ID='
                suffix = '&Units=SI'
                # Strip the CGI prefix/suffix to recover the compound ID.
                nist_id = href[len(prefix): len(href) - len(suffix)]
                url3d = 'https://webbook.nist.gov/cgi/cbook.cgi?Str3File=' + nist_id  # noqa: E501
                yield scrapy.Request(
                    url=url3d,
                    # nist_id is bound as a default argument so each
                    # callback keeps its own id (avoids the late-binding
                    # closure pitfall in this loop).
                    callback=lambda x, n=nist_id: self.parse_sdf(x, n))

    def parse_sdf(self, response, nist_id):
        """Parse an SDF (MDL molfile) response and yield one item with the
        element symbols and 3D coordinates, if a structure is present."""
        # An empty body means NIST has no 3D structure for this compound.
        if response.text:
            text = response.text
            lines = [x.strip() for x in text.split("\r\n")]
            # Skip the three molfile header lines; the next line is the
            # counts line, whose first field is the atom count.
            lines = lines[3:]
            count = int(lines[0].split()[0])
            lines = lines[1: count + 1]
            # Each atom-block line begins with: x y z element-symbol.
            lines = [x.split()[:4] for x in lines]
            # double check that it contains no unexpected elements
            good = True
            atoms = []
            for x, y, z, atype in lines:
                if atype not in species:
                    good = False
                    break
                atoms.append((atype, float(x), float(y), float(z)))
            if good:
                yield {'id': nist_id, 'size': count, 'atoms': atoms}
import torch
import os import os
import ase
import pyNeuroChem
import ase_interface
import numpy
import torchani
import pickle import pickle
from torchani import buildin_const_file, buildin_sae_file, \ import pyanitools
buildin_network_dir import numpy
import torchani.training.pyanitools from neurochem_calculator import NeuroChem, path
import json
path = os.path.dirname(os.path.realpath(__file__)) import tqdm
conv_au_ev = 27.21138505 import random
class NeuroChem (torchani.aev.AEVComputer):
def __init__(self, const_file=buildin_const_file,
sae_file=buildin_sae_file,
network_dir=buildin_network_dir):
super(NeuroChem, self).__init__(False, const_file)
self.sae_file = sae_file
self.network_dir = network_dir
self.nc = pyNeuroChem.molecule(
self.const_file, self.sae_file, self.network_dir, 0)
def _get_radial_part(self, fullaev):
return fullaev[:, :, :self.radial_length]
def _get_angular_part(self, fullaev):
return fullaev[:, :, self.radial_length:]
def _per_conformation(self, coordinates, species):
atoms = coordinates.shape[0]
mol = ase.Atoms(''.join(species), positions=coordinates)
mol.set_calculator(ase_interface.ANI(False))
mol.calc.setnc(self.nc)
energy = mol.get_potential_energy() / conv_au_ev
aevs = [mol.calc.nc.atomicenvironments(j) for j in range(atoms)]
force = mol.get_forces() / conv_au_ev
aevs = numpy.stack(aevs)
return aevs, energy, force
def forward(self, coordinates, species):
conformations = coordinates.shape[0]
results = [self._per_conformation(
coordinates[i], species) for i in range(conformations)]
aevs, energies, forces = zip(*results)
aevs = torch.from_numpy(numpy.stack(aevs)).type(
self.EtaR.dtype).to(self.EtaR.device)
energies = torch.from_numpy(numpy.stack(energies)).type(
self.EtaR.dtype).to(self.EtaR.device)
forces = torch.from_numpy(numpy.stack(forces)).type(
self.EtaR.dtype).to(self.EtaR.device)
return self._get_radial_part(aevs), \
self._get_angular_part(aevs), \
energies, forces
neurochem = NeuroChem()
consts = torchani.neurochem.Constants() # generate expect for ANI1 subset
aev = torchani.AEVComputer(**consts)
ncaev = NeuroChem().to(torch.device('cpu'))
mol_count = 0 mol_count = 0
species_indices = {consts.species[i]: i for i in range(len(aev.species))}
for i in [1, 2, 3, 4]: for i in [1, 2, 3, 4]:
data_file = os.path.join( data_file = os.path.join(
path, '../dataset/ani_gdb_s0{}.h5'.format(i)) path, '../dataset/ani_gdb_s0{}.h5'.format(i))
adl = torchani.training.pyanitools.anidataloader(data_file) adl = pyanitools.anidataloader(data_file)
for data in adl: for data in tqdm.tqdm(adl, desc='ANI1: {} heavy atoms'.format(i)):
coordinates = data['coordinates'][:10, :] coordinates = data['coordinates'][:10, :]
coordinates = torch.from_numpy(coordinates).type(ncaev.EtaR.dtype) pickleobj = neurochem(coordinates, data['species'])
species = torch.tensor([species_indices[i] for i in data['species']],
dtype=torch.long, device=torch.device('cpu')) \
.expand(10, -1)
smiles = ''.join(data['smiles'])
radial, angular, energies, forces = ncaev(coordinates, data['species'])
pickleobj = (coordinates, species, radial, angular, energies, forces)
dumpfile = os.path.join( dumpfile = os.path.join(
path, '../tests/test_data/{}'.format(mol_count)) path, '../tests/test_data/ANI1_subset/{}'.format(mol_count))
with open(dumpfile, 'wb') as f: with open(dumpfile, 'wb') as f:
pickle.dump(pickleobj, f, protocol=2) pickle.dump(pickleobj, f)
mol_count += 1
# generate expect for NIST
keep_ratio = 0.1 # reduce the size of generated file by discarding
mol_count = 0
with open(os.path.join(path, 'diverse_test_set/result.json')) as f:
pickle_objects = []
for i in tqdm.tqdm(json.load(f), desc='NIST'):
if random.random() > keep_ratio:
continue
atoms = i['atoms']
natoms = len(atoms)
species = []
coordinates = []
for atype, x, y, z in atoms:
species.append(atype)
coordinates.append([x, y, z])
pickleobj = neurochem(numpy.array(coordinates), species)
pickle_objects.append(pickleobj)
mol_count += 1 mol_count += 1
dumpfile = os.path.join(
path, '../tests/test_data/NIST/all')
with open(dumpfile, 'wb') as f:
pickle.dump(pickle_objects, f)
import os
import ase
import pyNeuroChem
import ase_interface
import numpy
# Directory containing this script; resource paths below are relative to it.
path = os.path.dirname(os.path.realpath(__file__))
# Location of the built-in ANI-1x network files that NeuroChem loads.
builtin_path = os.path.join(path, '../torchani/resources/ani-1x_dft_x8ens/')
const_file = os.path.join(builtin_path, 'rHCNO-5.2R_16-3.5A_a4-8.params')
sae_file = os.path.join(builtin_path, 'sae_linfit.dat')
network_dir = os.path.join(builtin_path, 'train0/networks/')
# Number of radial features in each atom's AEV; the remainder is angular.
radial_length = 64
# Hartree expressed in eV (used to convert NeuroChem output to a.u.).
conv_au_ev = 27.21138505
all_species = ['H', 'C', 'N', 'O']
# Map element symbol -> integer species index (list order defines the index).
species_indices = {s: i for i, s in enumerate(all_species)}
class NeuroChem:
    """Thin wrapper around pyNeuroChem (via the ASE interface) used to
    generate reference AEVs, energies, and forces for the unit tests."""

    def __init__(self):
        # Load the built-in ANI-1x network on device 0.
        self.nc = pyNeuroChem.molecule(const_file, sae_file, network_dir, 0)

    def _get_radial_part(self, fullaev):
        # The first `radial_length` features of each atom's AEV are radial.
        return fullaev[:, :, :radial_length]

    def _get_angular_part(self, fullaev):
        # The remaining features are the angular part.
        return fullaev[:, :, radial_length:]

    def _per_conformation(self, coordinates, species):
        """Run NeuroChem on a single conformation.

        ``coordinates`` is an (atoms, 3) array; ``species`` is a sequence
        of element symbols. Returns (aevs, energy, force); energy and
        force are divided by conv_au_ev, i.e. converted from eV to Hartree.
        """
        atoms = coordinates.shape[0]
        mol = ase.Atoms(''.join(species), positions=coordinates)
        mol.set_calculator(ase_interface.ANI(False))
        mol.calc.setnc(self.nc)
        energy = mol.get_potential_energy() / conv_au_ev
        # NOTE(review): atomic environments are queried after the energy
        # call — presumably the calculator must have run first; confirm
        # against the pyNeuroChem API before reordering.
        aevs = [mol.calc.nc.atomicenvironments(j) for j in range(atoms)]
        force = mol.get_forces() / conv_au_ev
        aevs = numpy.stack(aevs)
        return aevs, energy, force

    def __call__(self, coordinates, species):
        """Compute reference data for one or more conformations.

        ``coordinates``: (conformations, atoms, 3) or (atoms, 3) array;
        ``species``: element symbols shared by all conformations.

        Returns (coordinates, species_index_array, radial_aevs,
        angular_aevs, energies, forces) as numpy arrays.
        """
        # Promote a single conformation to a batch of one.
        if len(coordinates.shape) == 2:
            coordinates = coordinates.reshape(1, -1, 3)
        conformations = coordinates.shape[0]
        results = [self._per_conformation(
            coordinates[i], species) for i in range(conformations)]
        aevs, energies, forces = zip(*results)
        aevs = numpy.stack(aevs)
        energies = numpy.stack(energies)
        forces = numpy.stack(forces)
        # Encode symbols as integer indices and replicate the row for
        # every conformation.
        species = numpy.array([species_indices[i] for i in species])
        species = species.reshape(1, -1)
        species = numpy.broadcast_to(species,
                                     (coordinates.shape[0], species.shape[1]))
        return coordinates, species, self._get_radial_part(aevs), \
            self._get_angular_part(aevs), energies, forces
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment