Unverified Commit 2cc64cb7 authored by Gao, Xiang's avatar Gao, Xiang Committed by GitHub
Browse files

Add 3D structures downloaded from NIST to unit test set (#146)

parent 91cec854
This source diff could not be displayed because it is stored in LFS. You can view the blob instead.
This source diff could not be displayed because it is stored in LFS. You can view the blob instead.
...@@ -3,6 +3,8 @@ import torchani ...@@ -3,6 +3,8 @@ import torchani
import unittest import unittest
import os import os
import pickle import pickle
import math
import random
path = os.path.dirname(os.path.realpath(__file__)) path = os.path.dirname(os.path.realpath(__file__))
...@@ -19,11 +21,23 @@ class TestEnergies(unittest.TestCase): ...@@ -19,11 +21,23 @@ class TestEnergies(unittest.TestCase):
shift_energy = builtins.energy_shifter shift_energy = builtins.energy_shifter
self.model = torch.nn.Sequential(self.aev_computer, nnp, shift_energy) self.model = torch.nn.Sequential(self.aev_computer, nnp, shift_energy)
def random_skip(self):
    """Hook for subclasses: return True to skip the current test case.

    The base class never skips anything.
    """
    return False
def transform(self, x):
    """Hook for subclasses to shrink test data; identity in the base class."""
    return x
def testIsomers(self): def testIsomers(self):
for i in range(N): for i in range(N):
datafile = os.path.join(path, 'test_data/{}'.format(i)) datafile = os.path.join(path, 'test_data/ANI1_subset/{}'.format(i))
with open(datafile, 'rb') as f: with open(datafile, 'rb') as f:
coordinates, species, _, _, energies, _ = pickle.load(f) coordinates, species, _, _, energies, _ = pickle.load(f)
coordinates = torch.from_numpy(coordinates).to(torch.float)
species = torch.from_numpy(species)
energies = torch.from_numpy(energies).to(torch.float)
coordinates = self.transform(coordinates)
species = self.transform(species)
energies = self.transform(energies)
_, energies_ = self.model((species, coordinates)) _, energies_ = self.model((species, coordinates))
max_diff = (energies - energies_).abs().max().item() max_diff = (energies - energies_).abs().max().item()
self.assertLess(max_diff, self.tolerance) self.assertLess(max_diff, self.tolerance)
...@@ -32,9 +46,15 @@ class TestEnergies(unittest.TestCase): ...@@ -32,9 +46,15 @@ class TestEnergies(unittest.TestCase):
species_coordinates = [] species_coordinates = []
energies = [] energies = []
for i in range(N): for i in range(N):
datafile = os.path.join(path, 'test_data/{}'.format(i)) datafile = os.path.join(path, 'test_data/ANI1_subset/{}'.format(i))
with open(datafile, 'rb') as f: with open(datafile, 'rb') as f:
coordinates, species, _, _, e, _ = pickle.load(f) coordinates, species, _, _, e, _ = pickle.load(f)
coordinates = torch.from_numpy(coordinates).to(torch.float)
species = torch.from_numpy(species)
e = torch.from_numpy(e).to(torch.float)
coordinates = self.transform(coordinates)
species = self.transform(species)
e = self.transform(e)
species_coordinates.append((species, coordinates)) species_coordinates.append((species, coordinates))
energies.append(e) energies.append(e)
species, coordinates = torchani.utils.pad_coordinates( species, coordinates = torchani.utils.pad_coordinates(
...@@ -44,6 +64,21 @@ class TestEnergies(unittest.TestCase): ...@@ -44,6 +64,21 @@ class TestEnergies(unittest.TestCase):
max_diff = (energies - energies_).abs().max().item() max_diff = (energies - energies_).abs().max().item()
self.assertLess(max_diff, self.tolerance) self.assertLess(max_diff, self.tolerance)
def testNIST(self):
    """Compare model energies against precomputed NeuroChem reference
    energies for the NIST-derived 3D structures.

    The pickle holds a list of tuples of the form
    (coordinates, species, radial, angular, energies, forces).
    """
    datafile = os.path.join(path, 'test_data/NIST/all')
    with open(datafile, 'rb') as f:
        data = pickle.load(f)
    for coordinates, species, _, _, e, _ in data:
        # Subclasses may randomly skip cases to keep runtime down.
        if self.random_skip():
            continue
        coordinates = torch.from_numpy(coordinates).to(torch.float)
        species = torch.from_numpy(species)
        energies = torch.from_numpy(e).to(torch.float)
        _, energies_ = self.model((species, coordinates))
        natoms = coordinates.shape[1]
        max_diff = (energies - energies_).abs().max().item()
        # Tolerance scales with sqrt(natoms): absolute energy error grows
        # with molecule size.
        self.assertLess(max_diff / math.sqrt(natoms), self.tolerance)
class TestEnergiesASEComputer(TestEnergies): class TestEnergiesASEComputer(TestEnergies):
...@@ -51,6 +86,14 @@ class TestEnergiesASEComputer(TestEnergies): ...@@ -51,6 +86,14 @@ class TestEnergiesASEComputer(TestEnergies):
super(TestEnergiesASEComputer, self).setUp() super(TestEnergiesASEComputer, self).setUp()
self.aev_computer.neighborlist = torchani.ase.NeighborList() self.aev_computer.neighborlist = torchani.ase.NeighborList()
def transform(self, x):
    """To reduce the size of test cases for faster test speed"""
    # Keep only the first two conformations along the leading axis.
    return x[:2, ...]
def random_skip(self):
    """To reduce the size of test cases for faster test speed"""
    # Randomly drop ~95% of the NIST cases for this slower configuration.
    return random.random() < 0.95
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -34,9 +34,11 @@ class TestEnsemble(unittest.TestCase): ...@@ -34,9 +34,11 @@ class TestEnsemble(unittest.TestCase):
def testGDB(self): def testGDB(self):
for i in range(N): for i in range(N):
datafile = os.path.join(path, 'test_data/{}'.format(i)) datafile = os.path.join(path, 'test_data/ANI1_subset/{}'.format(i))
with open(datafile, 'rb') as f: with open(datafile, 'rb') as f:
coordinates, species, _, _, _, _ = pickle.load(f) coordinates, species, _, _, _, _ = pickle.load(f)
coordinates = torch.from_numpy(coordinates)
species = torch.from_numpy(species)
self._test_molecule(coordinates, species) self._test_molecule(coordinates, species)
......
...@@ -3,6 +3,7 @@ import torchani ...@@ -3,6 +3,7 @@ import torchani
import unittest import unittest
import os import os
import pickle import pickle
import random
path = os.path.dirname(os.path.realpath(__file__)) path = os.path.dirname(os.path.realpath(__file__))
N = 97 N = 97
...@@ -17,14 +18,20 @@ class TestForce(unittest.TestCase): ...@@ -17,14 +18,20 @@ class TestForce(unittest.TestCase):
nnp = builtins.models[0] nnp = builtins.models[0]
self.model = torch.nn.Sequential(self.aev_computer, nnp) self.model = torch.nn.Sequential(self.aev_computer, nnp)
def random_skip(self):
    """Hook for subclasses: return True to skip the current test case.

    The base class never skips anything.
    """
    return False
def transform(self, x): def transform(self, x):
return x return x
def testIsomers(self): def testIsomers(self):
for i in range(N): for i in range(N):
datafile = os.path.join(path, 'test_data/{}'.format(i)) datafile = os.path.join(path, 'test_data/ANI1_subset/{}'.format(i))
with open(datafile, 'rb') as f: with open(datafile, 'rb') as f:
coordinates, species, _, _, _, forces = pickle.load(f) coordinates, species, _, _, _, forces = pickle.load(f)
coordinates = torch.from_numpy(coordinates)
species = torch.from_numpy(species)
forces = torch.from_numpy(forces)
coordinates = self.transform(coordinates) coordinates = self.transform(coordinates)
species = self.transform(species) species = self.transform(species)
forces = self.transform(forces) forces = self.transform(forces)
...@@ -39,9 +46,12 @@ class TestForce(unittest.TestCase): ...@@ -39,9 +46,12 @@ class TestForce(unittest.TestCase):
species_coordinates = [] species_coordinates = []
coordinates_forces = [] coordinates_forces = []
for i in range(N): for i in range(N):
datafile = os.path.join(path, 'test_data/{}'.format(i)) datafile = os.path.join(path, 'test_data/ANI1_subset/{}'.format(i))
with open(datafile, 'rb') as f: with open(datafile, 'rb') as f:
coordinates, species, _, _, _, forces = pickle.load(f) coordinates, species, _, _, _, forces = pickle.load(f)
coordinates = torch.from_numpy(coordinates)
species = torch.from_numpy(species)
forces = torch.from_numpy(forces)
coordinates = self.transform(coordinates) coordinates = self.transform(coordinates)
species = self.transform(species) species = self.transform(species)
forces = self.transform(forces) forces = self.transform(forces)
...@@ -58,6 +68,23 @@ class TestForce(unittest.TestCase): ...@@ -58,6 +68,23 @@ class TestForce(unittest.TestCase):
max_diff = (forces + derivative).abs().max().item() max_diff = (forces + derivative).abs().max().item()
self.assertLess(max_diff, self.tolerance) self.assertLess(max_diff, self.tolerance)
def testNIST(self):
    """Compare analytic forces (-dE/dx from autograd) against precomputed
    NeuroChem reference forces for the NIST-derived 3D structures.

    The pickle holds a list of tuples of the form
    (coordinates, species, radial, angular, energies, forces).
    """
    datafile = os.path.join(path, 'test_data/NIST/all')
    with open(datafile, 'rb') as f:
        data = pickle.load(f)
    for coordinates, species, _, _, _, forces in data:
        # Subclasses may randomly skip cases to keep runtime down.
        if self.random_skip():
            continue
        # Gradients w.r.t. coordinates are needed to derive forces.
        coordinates = torch.from_numpy(coordinates).to(torch.float) \
            .requires_grad_(True)
        species = torch.from_numpy(species)
        forces = torch.from_numpy(forces).to(torch.float)
        _, energies = self.model((species, coordinates))
        derivative = torch.autograd.grad(energies.sum(),
                                         coordinates)[0]
        # Force is the negative energy gradient, so forces + derivative
        # should be approximately zero.
        max_diff = (forces + derivative).abs().max().item()
        self.assertLess(max_diff, self.tolerance)
class TestForceASEComputer(TestForce): class TestForceASEComputer(TestForce):
...@@ -67,7 +94,11 @@ class TestForceASEComputer(TestForce): ...@@ -67,7 +94,11 @@ class TestForceASEComputer(TestForce):
def transform(self, x): def transform(self, x):
"""To reduce the size of test cases for faster test speed""" """To reduce the size of test cases for faster test speed"""
return x[:3, ...] return x[:2, ...]
def random_skip(self):
    """To reduce the size of test cases for faster test speed"""
    # Randomly drop ~95% of the NIST cases for this slower configuration.
    return random.random() < 0.95
if __name__ == '__main__': if __name__ == '__main__':
......
# Commands
## Download all 3D structures from NIST
```
scrapy runspider nist.py -o result.json
```
\ No newline at end of file
import scrapy
# Elements accepted in downloaded structures (the ANI-1 element set).
species = ['H', 'C', 'N', 'O']
# Wildcard formula string ("H*C*N*O*") used in the NIST search query.
formula = ''.join('{}*'.format(x) for x in species)
# Search-by-molecular-weight URL; the two {} slots take the weight range.
urltemplate = ("https://webbook.nist.gov/cgi/cbook.cgi?Value={}-{}"
               "&VType=MW&Formula=" + formula + "&NoIon=on&MatchIso=on")
# Molecular-weight bounds for the initial (full-range) query.
min_weight = 0
max_weight = 9999999
class NISTSpider(scrapy.Spider):
    """Scrapy spider that downloads 3D structures (SDF files) from the
    NIST WebBook for molecules containing only H, C, N, and O.

    Strategy: search by molecular-weight range. NIST truncates results at
    400 hits per query, so any range returning exactly 400 results is
    recursively bisected until every query fits under the cap.
    """

    name = "NIST"

    def start_requests(self):
        # Start with a single query covering the entire weight range.
        start_url = urltemplate.format(min_weight, max_weight)
        yield scrapy.Request(
            url=start_url,
            callback=lambda x: self.parse_range(x, min_weight, max_weight))

    def parse_range(self, response, from_, to):
        """Parse a weight-range search page.

        ``from_`` and ``to`` are the integer molecular-weight bounds of
        the query that produced ``response``.
        """
        search_result_list = response.xpath('//*[@id="main"]/ol/li/a')
        # if there are 400 results, the range is too large (NIST truncates
        # at 400 hits), so halve the range and repeat the search
        if len(search_result_list) == 400 and to - from_ > 1:
            mid = (from_ + to) // 2
            next_url1 = urltemplate.format(from_, mid)
            next_url2 = urltemplate.format(mid, to)
            yield scrapy.Request(
                url=next_url1,
                callback=lambda x: self.parse_range(x, from_, mid))
            yield scrapy.Request(
                url=next_url2,
                callback=lambda x: self.parse_range(x, mid, to))
        elif len(search_result_list) == 400 and to - from_ == 1:
            # Range cannot be halved further; query each endpoint alone.
            next_url1 = urltemplate.format(from_, from_)
            next_url2 = urltemplate.format(to, to)
            yield scrapy.Request(
                url=next_url1,
                callback=lambda x: self.parse_range(x, from_, from_))
            yield scrapy.Request(
                url=next_url2,
                callback=lambda x: self.parse_range(x, to, to))
        else:
            # Under the cap: request the 3D SDF file for every hit.
            for i in search_result_list:
                href = i.css('a::attr("href")').extract_first()
                prefix = '/cgi/cbook.cgi?ID='
                suffix = '&Units=SI'
                # Strip the CGI prefix/suffix to recover the compound ID.
                nist_id = href[len(prefix): len(href) - len(suffix)]
                url3d = 'https://webbook.nist.gov/cgi/cbook.cgi?Str3File=' + nist_id  # noqa: E501
                yield scrapy.Request(
                    url=url3d,
                    # nist_id is bound as a default argument so each
                    # callback keeps its own id (avoids the late-binding
                    # closure pitfall in this loop).
                    callback=lambda x, n=nist_id: self.parse_sdf(x, n))

    def parse_sdf(self, response, nist_id):
        """Parse an SDF (MDL molfile) response and yield one item with the
        element symbols and 3D coordinates, if a structure is present."""
        # An empty body means NIST has no 3D structure for this compound.
        if response.text:
            text = response.text
            lines = [x.strip() for x in text.split("\r\n")]
            # Skip the three molfile header lines; the next line is the
            # counts line, whose first field is the atom count.
            lines = lines[3:]
            count = int(lines[0].split()[0])
            lines = lines[1: count + 1]
            # Each atom-block line begins with: x y z element-symbol.
            lines = [x.split()[:4] for x in lines]
            # double check that it contains no unexpected elements
            good = True
            atoms = []
            for x, y, z, atype in lines:
                if atype not in species:
                    good = False
                    break
                atoms.append((atype, float(x), float(y), float(z)))
            if good:
                yield {'id': nist_id, 'size': count, 'atoms': atoms}
import torch
import os import os
import ase
import pyNeuroChem
import ase_interface
import numpy
import torchani
import pickle import pickle
from torchani import buildin_const_file, buildin_sae_file, \ import pyanitools
buildin_network_dir import numpy
import torchani.training.pyanitools from neurochem_calculator import NeuroChem, path
import json
path = os.path.dirname(os.path.realpath(__file__)) import tqdm
conv_au_ev = 27.21138505 import random
class NeuroChem (torchani.aev.AEVComputer):
def __init__(self, const_file=buildin_const_file,
sae_file=buildin_sae_file,
network_dir=buildin_network_dir):
super(NeuroChem, self).__init__(False, const_file)
self.sae_file = sae_file
self.network_dir = network_dir
self.nc = pyNeuroChem.molecule(
self.const_file, self.sae_file, self.network_dir, 0)
def _get_radial_part(self, fullaev):
return fullaev[:, :, :self.radial_length]
def _get_angular_part(self, fullaev):
return fullaev[:, :, self.radial_length:]
def _per_conformation(self, coordinates, species):
atoms = coordinates.shape[0]
mol = ase.Atoms(''.join(species), positions=coordinates)
mol.set_calculator(ase_interface.ANI(False))
mol.calc.setnc(self.nc)
energy = mol.get_potential_energy() / conv_au_ev
aevs = [mol.calc.nc.atomicenvironments(j) for j in range(atoms)]
force = mol.get_forces() / conv_au_ev
aevs = numpy.stack(aevs)
return aevs, energy, force
def forward(self, coordinates, species):
conformations = coordinates.shape[0]
results = [self._per_conformation(
coordinates[i], species) for i in range(conformations)]
aevs, energies, forces = zip(*results)
aevs = torch.from_numpy(numpy.stack(aevs)).type(
self.EtaR.dtype).to(self.EtaR.device)
energies = torch.from_numpy(numpy.stack(energies)).type(
self.EtaR.dtype).to(self.EtaR.device)
forces = torch.from_numpy(numpy.stack(forces)).type(
self.EtaR.dtype).to(self.EtaR.device)
return self._get_radial_part(aevs), \
self._get_angular_part(aevs), \
energies, forces
neurochem = NeuroChem()
consts = torchani.neurochem.Constants() # generate expect for ANI1 subset
aev = torchani.AEVComputer(**consts)
ncaev = NeuroChem().to(torch.device('cpu'))
mol_count = 0 mol_count = 0
species_indices = {consts.species[i]: i for i in range(len(aev.species))}
for i in [1, 2, 3, 4]: for i in [1, 2, 3, 4]:
data_file = os.path.join( data_file = os.path.join(
path, '../dataset/ani_gdb_s0{}.h5'.format(i)) path, '../dataset/ani_gdb_s0{}.h5'.format(i))
adl = torchani.training.pyanitools.anidataloader(data_file) adl = pyanitools.anidataloader(data_file)
for data in adl: for data in tqdm.tqdm(adl, desc='ANI1: {} heavy atoms'.format(i)):
coordinates = data['coordinates'][:10, :] coordinates = data['coordinates'][:10, :]
coordinates = torch.from_numpy(coordinates).type(ncaev.EtaR.dtype) pickleobj = neurochem(coordinates, data['species'])
species = torch.tensor([species_indices[i] for i in data['species']],
dtype=torch.long, device=torch.device('cpu')) \
.expand(10, -1)
smiles = ''.join(data['smiles'])
radial, angular, energies, forces = ncaev(coordinates, data['species'])
pickleobj = (coordinates, species, radial, angular, energies, forces)
dumpfile = os.path.join( dumpfile = os.path.join(
path, '../tests/test_data/{}'.format(mol_count)) path, '../tests/test_data/ANI1_subset/{}'.format(mol_count))
with open(dumpfile, 'wb') as f: with open(dumpfile, 'wb') as f:
pickle.dump(pickleobj, f, protocol=2) pickle.dump(pickleobj, f)
mol_count += 1
# generate expect for NIST
keep_ratio = 0.1 # reduce the size of generated file by discarding
mol_count = 0
with open(os.path.join(path, 'diverse_test_set/result.json')) as f:
pickle_objects = []
for i in tqdm.tqdm(json.load(f), desc='NIST'):
if random.random() > keep_ratio:
continue
atoms = i['atoms']
natoms = len(atoms)
species = []
coordinates = []
for atype, x, y, z in atoms:
species.append(atype)
coordinates.append([x, y, z])
pickleobj = neurochem(numpy.array(coordinates), species)
pickle_objects.append(pickleobj)
mol_count += 1 mol_count += 1
dumpfile = os.path.join(
path, '../tests/test_data/NIST/all')
with open(dumpfile, 'wb') as f:
pickle.dump(pickle_objects, f)
import os
import ase
import pyNeuroChem
import ase_interface
import numpy
# Directory containing this script; resource paths below are relative to it.
path = os.path.dirname(os.path.realpath(__file__))
# Location of the built-in ANI-1x network files that NeuroChem loads.
builtin_path = os.path.join(path, '../torchani/resources/ani-1x_dft_x8ens/')
const_file = os.path.join(builtin_path, 'rHCNO-5.2R_16-3.5A_a4-8.params')
sae_file = os.path.join(builtin_path, 'sae_linfit.dat')
network_dir = os.path.join(builtin_path, 'train0/networks/')
# Number of radial features in each atom's AEV; the remainder is angular.
radial_length = 64
# Hartree expressed in eV (used to convert NeuroChem output to a.u.).
conv_au_ev = 27.21138505
all_species = ['H', 'C', 'N', 'O']
# Map element symbol -> integer species index (list order defines the index).
species_indices = {s: i for i, s in enumerate(all_species)}
class NeuroChem:
    """Thin wrapper around pyNeuroChem (via the ASE interface) used to
    generate reference AEVs, energies, and forces for the unit tests."""

    def __init__(self):
        # Load the built-in ANI-1x network on device 0.
        self.nc = pyNeuroChem.molecule(const_file, sae_file, network_dir, 0)

    def _get_radial_part(self, fullaev):
        # The first `radial_length` features of each atom's AEV are radial.
        return fullaev[:, :, :radial_length]

    def _get_angular_part(self, fullaev):
        # The remaining features are the angular part.
        return fullaev[:, :, radial_length:]

    def _per_conformation(self, coordinates, species):
        """Run NeuroChem on a single conformation.

        ``coordinates`` is an (atoms, 3) array; ``species`` is a sequence
        of element symbols. Returns (aevs, energy, force); energy and
        force are divided by conv_au_ev, i.e. converted from eV to Hartree.
        """
        atoms = coordinates.shape[0]
        mol = ase.Atoms(''.join(species), positions=coordinates)
        mol.set_calculator(ase_interface.ANI(False))
        mol.calc.setnc(self.nc)
        energy = mol.get_potential_energy() / conv_au_ev
        # NOTE(review): atomic environments are queried after the energy
        # call — presumably the calculator must have run first; confirm
        # against the pyNeuroChem API before reordering.
        aevs = [mol.calc.nc.atomicenvironments(j) for j in range(atoms)]
        force = mol.get_forces() / conv_au_ev
        aevs = numpy.stack(aevs)
        return aevs, energy, force

    def __call__(self, coordinates, species):
        """Compute reference data for one or more conformations.

        ``coordinates``: (conformations, atoms, 3) or (atoms, 3) array;
        ``species``: element symbols shared by all conformations.

        Returns (coordinates, species_index_array, radial_aevs,
        angular_aevs, energies, forces) as numpy arrays.
        """
        # Promote a single conformation to a batch of one.
        if len(coordinates.shape) == 2:
            coordinates = coordinates.reshape(1, -1, 3)
        conformations = coordinates.shape[0]
        results = [self._per_conformation(
            coordinates[i], species) for i in range(conformations)]
        aevs, energies, forces = zip(*results)
        aevs = numpy.stack(aevs)
        energies = numpy.stack(energies)
        forces = numpy.stack(forces)
        # Encode symbols as integer indices and replicate the row for
        # every conformation.
        species = numpy.array([species_indices[i] for i in species])
        species = species.reshape(1, -1)
        species = numpy.broadcast_to(species,
                                     (coordinates.shape[0], species.shape[1]))
        return coordinates, species, self._get_radial_part(aevs), \
            self._get_angular_part(aevs), energies, forces
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment