Unverified Commit 910cca89 authored by Jinze Xue, committed by GitHub

cuaev benchmark file (#564)

* update

* rm

* update

* update

* update

* ase test: relax absolute tolerance (atol)
parent 23c9816c
@@ -61,7 +61,8 @@ To run the tests and examples, you must manually download a data package
 ./download.sh
 ```

-(Optional) To install AEV CUDA Extension (speedup for AEV computation), please follow the instruction at [torchani/cuaev](https://github.com/aiqm/torchani/tree/master/torchani/cuaev).
+[CUAEV](https://github.com/aiqm/torchani/tree/master/torchani/cuaev) (Optional)
+To install AEV CUDA Extension (speedup for AEV forward and backward), please follow the instruction at [torchani/cuaev](https://github.com/aiqm/torchani/tree/master/torchani/cuaev).

 # Citation

@@ -37,7 +37,7 @@ class TestASE(torchani.testing.TestCase):
         dyn.run(100)
         f = atoms.get_forces()
         fn = get_numeric_force(atoms, 0.001)
-        self.assertEqual(f, fn, rtol=0.1, atol=0)
+        self.assertEqual(f, fn, rtol=0.1, atol=0.1)

     def testWithNumericalStressWithPBCEnabled(self):
         # Run NPT dynamics for some steps and periodically check that the

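The relaxed tolerance above compares analytic forces against a finite-difference estimate. A minimal sketch of such a central-difference check (illustration only; `numeric_force` is a hypothetical stand-in for the test's `get_numeric_force` helper, assumed to work roughly like this for every atom and axis):

```python
# Hypothetical central-difference force check for an ASE Atoms object that
# already has a calculator attached.
def numeric_force(atoms, atom_index, axis, eps=0.001):
    positions = atoms.get_positions()
    positions[atom_index, axis] += eps
    atoms.set_positions(positions)
    e_plus = atoms.get_potential_energy()
    positions[atom_index, axis] -= 2 * eps
    atoms.set_positions(positions)
    e_minus = atoms.get_potential_energy()
    positions[atom_index, axis] += eps      # restore the original geometry
    atoms.set_positions(positions)
    return -(e_plus - e_minus) / (2 * eps)  # F = -dE/dx
```

With a loose rtol of 0.1, the nonzero atol avoids spurious failures on force components that are close to zero, where a purely relative criterion is ill-conditioned.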
@@ -161,7 +161,8 @@ class TestCUAEV(TestCase):
     def testVeryDenseMolecule(self):
         """
-        Test very dense molecule for aev correctness, especially for angular part
+        Test very dense molecule for aev correctness, especially for angular kernel when center atom pairs are more than 32.
+        issue: https://github.com/aiqm/torchani/pull/555
         """
         for i in range(100):
             datafile = os.path.join(path, 'test_data/tripeptide-md/{}.dat'.format(i))

@@ -32,6 +32,7 @@ def benchmark(speciesPositions, aev_comp, N, check_gpu_mem):
     torch.cuda.synchronize()
     start = time.time()
+    aev = None
     for i in range(N):
         aev = aev_comp(speciesPositions).aevs
         if i == 2 and check_gpu_mem:

@@ -58,10 +59,13 @@ def check_speedup_error(aev, aev_ref, speed, speed_ref):

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-c', '--check_gpu_mem',
+    parser.add_argument('-m', '--check_gpu_mem',
                         dest='check_gpu_mem',
                         action='store_const',
                         const=1)
+    parser.add_argument('--nsight',
+                        action='store_true',
+                        help='use nsight profile')
     parser.set_defaults(check_gpu_mem=0)
     parser = parser.parse_args()
     path = os.path.dirname(os.path.realpath(__file__))

@@ -70,6 +74,11 @@ if __name__ == "__main__":
     device = torch.device('cuda')
     files = ['small.pdb', '1hz5.pdb', '6W8H.pdb']
+    N = 500
+    if parser.nsight:
+        N = 3
+        torch.cuda.profiler.start()
+
     for file in files:
         datafile = os.path.join(path, f'../dataset/pdb/{file}')
         mol = read(datafile)

@@ -81,8 +90,8 @@ if __name__ == "__main__":
         speciesPositions = nnp.species_converter((species, positions))
         aev_computer = nnp.aev_computer

-        N = 500
+        if parser.nsight:
+            torch.cuda.nvtx.range_push(file)
         print('Original TorchANI:')
         aev_ref, delta_ref = benchmark(speciesPositions, aev_computer, N, check_gpu_mem)
         print()

@@ -91,6 +100,11 @@ if __name__ == "__main__":
         nnp.aev_computer.use_cuda_extension = True
         cuaev_computer = nnp.aev_computer
         aev, delta = benchmark(speciesPositions, cuaev_computer, N, check_gpu_mem)
+        if parser.nsight:
+            torch.cuda.nvtx.range_pop()
         check_speedup_error(aev, aev_ref, delta, delta_ref)
         print('-' * 70 + '\n')
+
+    if parser.nsight:
+        torch.cuda.profiler.stop()

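The `--nsight` flag added above shortens the run to a few iterations and brackets each molecule with an NVTX range inside a CUDA-profiler capture. A minimal, self-contained sketch of that pattern (illustration only; `profile_with_nsight` and `workload` are placeholders, not names from the script):

```python
# Gate Nsight capture with the CUDA profiler API and mark the region of
# interest with a named NVTX range so it is easy to find on the timeline.
import torch

def profile_with_nsight(label, workload, nsight=True):
    if nsight and torch.cuda.is_available():
        torch.cuda.profiler.start()          # opens the cudaProfilerApi capture range
        torch.cuda.nvtx.range_push(label)    # named range shown in the Nsight timeline
    try:
        workload()
    finally:
        if nsight and torch.cuda.is_available():
            torch.cuda.nvtx.range_pop()
            torch.cuda.synchronize()         # make sure queued kernels land inside the range
            torch.cuda.profiler.stop()
```

Run under Nsight Systems with capture gated on the profiler API (for example `nsys profile --capture-range=cudaProfilerApi python tools/aev-benchmark-size.py --nsight`, assuming a recent nsys), so only the bracketed iterations are recorded.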
@@ -7,6 +7,7 @@ import pkbar
 import gc
 import pynvml
 import os
+import pickle
 from torchani.units import hartree2kcalmol

@@ -76,8 +77,16 @@ def sync_cuda(sync):
         torch.cuda.synchronize()


-def benchmark(parser, dataset, use_cuda_extension, force_training=False):
-    synchronize = True if parser.synchronize else False
+def print_timer(label, t):
+    if t < 1:
+        t = f'{t * 1000:.1f} ms'
+    else:
+        t = f'{t:.3f} sec'
+    print(f'{label} - {t}')
+
+
+def benchmark(parser, dataset, use_cuda_extension, force_inference=False):
+    synchronize = True
     timers = {}

     def time_func(key, func):

@@ -86,8 +95,7 @@ def benchmark(parser, dataset, use_cuda_extension, force_training=False):
         def wrapper(*args, **kwargs):
             start = timeit.default_timer()
             ret = func(*args, **kwargs)
-            if synchronize:
-                torch.cuda.synchronize()
+            sync_cuda(synchronize)
             end = timeit.default_timer()
             timers[key] += end - start
             return ret

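For reference, the timing machinery these hunks touch can be restated as a small self-contained sketch (names mirror the script, but this is an illustration, not the file itself):

```python
# Accumulate per-key wall-clock time, synchronizing the GPU before reading the
# timer so queued asynchronous CUDA kernels are included in the measurement.
import timeit
import torch

timers = {}

def sync_cuda(sync):
    if sync and torch.cuda.is_available():
        torch.cuda.synchronize()

def time_func(key, func, synchronize=True):
    timers[key] = 0.0

    def wrapper(*args, **kwargs):
        start = timeit.default_timer()
        ret = func(*args, **kwargs)
        sync_cuda(synchronize)               # wait for pending kernels before stopping the clock
        end = timeit.default_timer()
        timers[key] += end - start
        return ret

    return wrapper
```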
@@ -128,6 +136,7 @@ def benchmark(parser, dataset, use_cuda_extension, force_training=False):
     print('=> start training')
     start = time.time()
     loss_time = 0
+    force_time = 0

     for epoch in range(0, parser.num_epochs):

@@ -136,55 +145,62 @@ def benchmark(parser, dataset, use_cuda_extension, force_training=False):
         for i, properties in enumerate(dataset):
             species = properties['species'].to(parser.device)
-            coordinates = properties['coordinates'].to(parser.device).float().requires_grad_(force_training)
+            coordinates = properties['coordinates'].to(parser.device).float().requires_grad_(force_inference)
             true_energies = properties['energies'].to(parser.device).float()
             num_atoms = (species >= 0).sum(dim=1, dtype=true_energies.dtype)
             _, predicted_energies = model((species, coordinates))
             # TODO add sync after aev is done
             sync_cuda(synchronize)
             energy_loss = (mse(predicted_energies, true_energies) / num_atoms.sqrt()).mean()
-            if force_training:
-                sync_cuda(synchronize)
+            if force_inference:
                 force_coefficient = 0.1
                 true_forces = properties['forces'].to(parser.device).float()
+                force_start = time.time()
                 try:
-                    sync_cuda(synchronize)
                     forces = -torch.autograd.grad(predicted_energies.sum(), coordinates, create_graph=True, retain_graph=True)[0]
+                    sync_cuda(synchronize)
                 except Exception as e:
                     alert('Error: {}'.format(e))
                     return
+                force_time += time.time() - force_start
                 force_loss = (mse(true_forces, forces).sum(dim=(1, 2)) / num_atoms).mean()
                 loss = energy_loss + force_coefficient * force_loss
-                sync_cuda(synchronize)
             else:
                 loss = energy_loss
             rmse = hartree2kcalmol((mse(predicted_energies, true_energies)).mean()).detach().cpu().numpy()
-            sync_cuda(synchronize)
-            loss_start = time.time()
-            loss.backward()
-            sync_cuda(synchronize)
-            loss_stop = time.time()
-            loss_time += loss_stop - loss_start
-            optimizer.step()
-            sync_cuda(synchronize)
-            progbar.update(i, values=[("rmse", rmse)])
+            progbar.update(i, values=[("rmse", rmse)])
+            if not force_inference:
+                sync_cuda(synchronize)
+                loss_start = time.time()
+                loss.backward()
+                # print('2', coordinates.grad)
+                sync_cuda(synchronize)
+                loss_stop = time.time()
+                loss_time += loss_stop - loss_start
+                optimizer.step()
+                sync_cuda(synchronize)
         checkgpu()
     sync_cuda(synchronize)
     stop = time.time()

     print('=> More detail about benchmark PER EPOCH')
+    for k in timers:
+        if k.startswith('torchani.'):
+            print(' {} - {:.1f}s'.format(k, timers[k] / parser.num_epochs))
     total_time = (stop - start) / parser.num_epochs
     loss_time = loss_time / parser.num_epochs
+    force_time = force_time / parser.num_epochs
     opti_time = timers['optimizer.step'] / parser.num_epochs
     forward_time = timers['forward'] / parser.num_epochs
     aev_time = timers['total'] / parser.num_epochs
-    print('Total AEV - {:.1f}s'.format(aev_time))
-    print('Forward - {:.1f}s'.format(forward_time))
-    print('Backward - {:.1f}s'.format(loss_time))
-    print('Optimizer - {:.1f}s'.format(opti_time))
-    print('Others - {:.1f}s'.format(total_time - loss_time - aev_time - forward_time - opti_time))
-    print('Epoch time - {:.1f}s'.format(total_time))
+    print_timer(' Total AEV', aev_time)
+    print_timer(' Forward', forward_time)
+    print_timer(' Backward', loss_time)
+    print_timer(' Force', force_time)
+    print_timer(' Optimizer', opti_time)
+    print_timer(' Others', total_time - loss_time - aev_time - forward_time - opti_time - force_time)
+    print_timer(' Epoch time', total_time)


 if __name__ == "__main__":

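The force path that the renamed `force_inference` flag measures boils down to one `autograd.grad` of the summed energies with respect to the input coordinates. A minimal sketch using the public `torchani.models.ANI1x` model (illustration only; the toy CH3-like fragment is made up):

```python
# Forces as the negative gradient of the predicted energies w.r.t. coordinates.
# Species indices follow the model's internal order (H, C, N, O) because
# periodic_table_index is left at its default of False.
import torch
import torchani

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torchani.models.ANI1x().to(device)

species = torch.tensor([[1, 0, 0, 0]], device=device)           # C, H, H, H
coordinates = torch.tensor([[[0.00, 0.00, 0.00],
                             [0.00, 0.00, 1.09],
                             [1.03, 0.00, -0.36],
                             [-0.51, -0.89, -0.36]]],
                           device=device, requires_grad=True)

_, energies = model((species, coordinates))
forces = -torch.autograd.grad(energies.sum(), coordinates)[0]    # shape (1, 4, 3), F = -dE/dR
```

For pure inference `create_graph=True` is not required; the benchmark keeps it because the same code path is shared with training.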
@@ -199,22 +215,30 @@ if __name__ == "__main__":
     parser.add_argument('-b', '--batch_size',
                         help='Number of conformations of each batch',
                         default=2560, type=int)
-    parser.add_argument('-y', '--synchronize',
+    parser.add_argument('-p', '--pickle',
                         action='store_true',
-                        help='whether to insert torch.cuda.synchronize() at the end of each function')
+                        help='Dataset is pickled or not')
+    parser.add_argument('--nsight',
+                        action='store_true',
+                        help='use nsight profile')
     parser.add_argument('-n', '--num_epochs',
                         help='epochs',
                         default=1, type=int)
     parser = parser.parse_args()

     print('=> loading dataset...')
-    shifter = torchani.EnergyShifter(None)
-    # parser.batch_size = 1280
-    dataset = torchani.data.load(parser.dataset_path, additional_properties=('forces',)).subtract_self_energies(shifter).species_to_indices()
-    print('=> Caching shuffled dataset...')
-    dataset_shuffled = list(dataset.shuffle().collate(parser.batch_size))
-    print('=> Caching non-shuffled dataset...')
-    dataset = list(dataset.collate(parser.batch_size))
+    if parser.pickle:
+        f = open(parser.dataset_path, 'rb')
+        dataset_shuffled = pickle.load(f)
+        f.close()
+    else:
+        shifter = torchani.EnergyShifter(None)
+        dataset = torchani.data.load(parser.dataset_path, additional_properties=('forces',)).subtract_self_energies(shifter).species_to_indices()
+        print('=> Caching shuffled dataset...')
+        dataset_shuffled = list(dataset.shuffle().collate(parser.batch_size))
+        f = open(f'{parser.dataset_path}.pickle', 'wb')
+        pickle.dump(dataset_shuffled, f)
+        f.close()

     print("=> CUDA info:")
     devices = torch.cuda.device_count()

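The new `-p/--pickle` option reloads batches that an earlier run dumped to `<dataset_path>.pickle`, skipping the expensive preprocessing. A compact sketch of that cache-then-reload idea (illustration only; the script itself requires the cache file to be passed explicitly together with `-p`):

```python
# Cache expensive dataset preprocessing once, then reload the pickled batches.
import os
import pickle

def load_or_cache(dataset_path, build_batches):
    cache_path = f'{dataset_path}.pickle'
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    batches = build_batches(dataset_path)    # e.g. torchani.data.load(...).collate(...)
    with open(cache_path, 'wb') as f:
        pickle.dump(batches, f)
    return batches
```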
@@ -225,36 +249,20 @@ if __name__ == "__main__":
         print(' {}'.format(torch.cuda.get_device_properties(i)))
         checkgpu(i)

-    print("\n\n=> Test 1/8: Shuffled Dataset, USE cuda extension, Energy training")
+    print("\n\n=> Test 1: USE cuda extension, Energy training")
     torch.cuda.empty_cache()
     gc.collect()
-    benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_training=False)
+    benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_inference=False)

-    print("\n\n=> Test 2/8: Shuffled Dataset, NO cuda extension, Energy training")
+    print("\n\n=> Test 2: NO cuda extension, Energy training")
     torch.cuda.empty_cache()
     gc.collect()
-    benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_training=False)
+    benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_inference=False)

-    print("\n\n=> Test 3/8: Non-Shuffled Dataset, USE cuda extension, Energy training")
-    torch.cuda.empty_cache()
-    gc.collect()
-    benchmark(parser, dataset, use_cuda_extension=True, force_training=False)
-
-    print("\n\n=> Test 4/8: Non-Shuffled Dataset, NO cuda extension, Energy training")
-    torch.cuda.empty_cache()
-    gc.collect()
-    benchmark(parser, dataset, use_cuda_extension=False, force_training=False)
-
-    print("\n\n=> Test 5/8: Shuffled Dataset, USE cuda extension, Force and Energy training")
-    torch.cuda.empty_cache()
-    gc.collect()
-    benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_training=True)
-
-    print("\n\n=> Test 6/8: Shuffled Dataset, NO cuda extension, Force and Energy training")
-    torch.cuda.empty_cache()
-    gc.collect()
-    benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_training=True)
-
-    print("\n\n=> Test 7/8: Non-Shuffled Dataset, USE cuda extension, Force and Energy training")
+    print("\n\n=> Test 3: USE cuda extension, Force and Energy inference")
     torch.cuda.empty_cache()
     gc.collect()
-    benchmark(parser, dataset, use_cuda_extension=True, force_training=True)
+    benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_inference=True)

-    print("\n\n=> Test 8/8: Non-Shuffled Dataset, NO cuda extension, Force and Energy training")
+    print("\n\n=> Test 4: NO cuda extension, Force and Energy inference")
     torch.cuda.empty_cache()
     gc.collect()
-    benchmark(parser, dataset, use_cuda_extension=False, force_training=True)
+    benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_inference=True)

@@ -29,22 +29,29 @@ Pass `use_cuda_extension=True` when construct aev_computer, for example:
 cuaev_computer = torchani.AEVComputer(Rcr, Rca, EtaR, ShfR, EtaA, Zeta, ShfA, ShfZ, num_species, use_cuda_extension=True)
 ```

-## Limitations
-Current implementation of CUAEV does not support pbc and force calculation.
+## TODOs
+- [x] CUAEV Forward
+- [x] CUAEV Backward (Force)
+- [ ] PBC
+- [ ] Force training (needs cuaev's second derivative; see the sketch below)
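The remaining force-training item needs a second derivative because the force term in the loss is itself a gradient: backpropagating such a loss differentiates through the first `autograd.grad` call, which requires `create_graph=True` and a double-differentiable AEV backward. A minimal sketch (illustration only, using the Python AEV path, which already supports this; the toy inputs are the same made-up fragment as in the earlier sketch):

```python
# Force training needs d(loss)/d(params) where the loss contains forces, i.e.
# gradients of the energy; create_graph=True keeps the graph of the first
# derivative so loss.backward() can take the second one.
import torch
import torchani

model = torchani.models.ANI1x()
species = torch.tensor([[1, 0, 0, 0]])
coordinates = torch.tensor([[[0.00, 0.00, 0.00],
                             [0.00, 0.00, 1.09],
                             [1.03, 0.00, -0.36],
                             [-0.51, -0.89, -0.36]]], requires_grad=True)
true_forces = torch.zeros_like(coordinates)          # placeholder target

_, energies = model((species, coordinates))
forces = -torch.autograd.grad(energies.sum(), coordinates, create_graph=True)[0]
force_loss = (forces - true_forces).pow(2).mean()
force_loss.backward()   # second derivative of the AEV computation happens here
```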
 ## Benchmark
-Benchmark of [torchani/tools/training-aev-benchmark.py](https://github.com/aiqm/torchani/tree/master/torchani/tools/training-aev-benchmark.py) on RTX 2080 Ti:
+Benchmark of [torchani/tools/training-aev-benchmark.py](https://github.com/aiqm/torchani/tree/master/torchani/tools/training-aev-benchmark.py) on TITAN V:

-| ANI-1x                  | Without Shuffle         | Shuffle                 |
-|:-----------------------:|:-----------------------:|:-----------------------:|
-| Time per Epoch / Memory | AEV / Total / GPU Mem   | AEV / Total / GPU Mem   |
-| aev cuda extension      | 7.7s / 26.3s / 2289 MB  | 8.5s / 27.6s / 2425 MB  |
-| aev python code         | 21.1s / 40.0s / 7361 MB | 28.7s / 47.8s / 3475 MB |
-| improvements            | 2.74 / 1.52 / 3.22      | 3.38 / 1.73 / 1.43      |
+| ANI-1x dataset (Batchsize 2560) | Energy Training         | Energy and Force Inference       |
+|---------------------------------|-------------------------|----------------------------------|
+| Time per Epoch / Memory         | AEV / Total / GPU Mem   | AEV / Force / Total / GPU Mem    |
+| aev cuda extension              | 3.90s / 31.5s / 2088 MB | 3.90s / 22.6s / 43.0s / 4234 MB  |
+| aev python code                 | 23.7s / 50.2s / 3540 MB | 25.3s / 48.0s / 88.2s / 11316 MB |
 ## Test
 ```bash
 cd torchani
-python tools/training-aev-benchmark.py download/dataset/ani-1x/sample.h5 -y
 python tests/test_cuaev.py
 ```
+
+Benchmark:
+```
+python tools/training-aev-benchmark.py download/dataset/ani-1x/sample.h5
+python tools/aev-benchmark-size.py
+```
\ No newline at end of file