Unverified Commit 910cca89 authored by Jinze Xue, committed by GitHub

cuaev benchmark file (#564)

* update

* rm

* update

* update

* update

* ase test: relax absolute tolerance (atol)
parent 23c9816c
@@ -61,7 +61,8 @@ To run the tests and examples, you must manually download a data package
 ./download.sh
 ```

-(Optional) To install AEV CUDA Extension (speedup for AEV computation), please follow the instruction at [torchani/cuaev](https://github.com/aiqm/torchani/tree/master/torchani/cuaev).
+[CUAEV](https://github.com/aiqm/torchani/tree/master/torchani/cuaev) (Optional)
+To install AEV CUDA Extension (speedup for AEV forward and backward), please follow the instruction at [torchani/cuaev](https://github.com/aiqm/torchani/tree/master/torchani/cuaev).

 # Citation

@@ -37,7 +37,7 @@ class TestASE(torchani.testing.TestCase):
         dyn.run(100)
         f = atoms.get_forces()
         fn = get_numeric_force(atoms, 0.001)
-        self.assertEqual(f, fn, rtol=0.1, atol=0)
+        self.assertEqual(f, fn, rtol=0.1, atol=0.1)

     def testWithNumericalStressWithPBCEnabled(self):
         # Run NPT dynamics for some steps and periodically check that the

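The relaxed tolerance above compares analytic forces against a finite-difference estimate. A minimal sketch of such a central-difference check (illustration only; `numeric_force` is a hypothetical stand-in for the test's `get_numeric_force` helper, assumed to work roughly like this for every atom and axis):

```python
# Hypothetical central-difference force check for an ASE Atoms object that
# already has a calculator attached.
def numeric_force(atoms, atom_index, axis, eps=0.001):
    positions = atoms.get_positions()
    positions[atom_index, axis] += eps
    atoms.set_positions(positions)
    e_plus = atoms.get_potential_energy()
    positions[atom_index, axis] -= 2 * eps
    atoms.set_positions(positions)
    e_minus = atoms.get_potential_energy()
    positions[atom_index, axis] += eps      # restore the original geometry
    atoms.set_positions(positions)
    return -(e_plus - e_minus) / (2 * eps)  # F = -dE/dx
```

With a loose rtol of 0.1, the nonzero atol avoids spurious failures on force components that are close to zero, where a purely relative criterion is ill-conditioned.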
@@ -161,7 +161,8 @@ class TestCUAEV(TestCase):
     def testVeryDenseMolecule(self):
         """
-        Test very dense molecule for aev correctness, especially for angular part
+        Test very dense molecule for aev correctness, especially for angular kernel when center atom pairs are more than 32.
+        issue: https://github.com/aiqm/torchani/pull/555
         """
         for i in range(100):
             datafile = os.path.join(path, 'test_data/tripeptide-md/{}.dat'.format(i))

@@ -32,6 +32,7 @@ def benchmark(speciesPositions, aev_comp, N, check_gpu_mem):
     torch.cuda.synchronize()
     start = time.time()
+    aev = None
     for i in range(N):
         aev = aev_comp(speciesPositions).aevs
         if i == 2 and check_gpu_mem:

@@ -58,10 +59,13 @@ def check_speedup_error(aev, aev_ref, speed, speed_ref):

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-c', '--check_gpu_mem',
+    parser.add_argument('-m', '--check_gpu_mem',
                         dest='check_gpu_mem',
                         action='store_const',
                         const=1)
+    parser.add_argument('--nsight',
+                        action='store_true',
+                        help='use nsight profile')
     parser.set_defaults(check_gpu_mem=0)
     parser = parser.parse_args()
     path = os.path.dirname(os.path.realpath(__file__))

@@ -70,6 +74,11 @@ if __name__ == "__main__":
     device = torch.device('cuda')
     files = ['small.pdb', '1hz5.pdb', '6W8H.pdb']
+    N = 500
+    if parser.nsight:
+        N = 3
+        torch.cuda.profiler.start()
+
     for file in files:
         datafile = os.path.join(path, f'../dataset/pdb/{file}')
         mol = read(datafile)

@@ -81,8 +90,8 @@ if __name__ == "__main__":
         speciesPositions = nnp.species_converter((species, positions))
         aev_computer = nnp.aev_computer

-        N = 500
+        if parser.nsight:
+            torch.cuda.nvtx.range_push(file)
         print('Original TorchANI:')
         aev_ref, delta_ref = benchmark(speciesPositions, aev_computer, N, check_gpu_mem)
         print()

@@ -91,6 +100,11 @@ if __name__ == "__main__":
         nnp.aev_computer.use_cuda_extension = True
         cuaev_computer = nnp.aev_computer
         aev, delta = benchmark(speciesPositions, cuaev_computer, N, check_gpu_mem)
+        if parser.nsight:
+            torch.cuda.nvtx.range_pop()
         check_speedup_error(aev, aev_ref, delta, delta_ref)
         print('-' * 70 + '\n')
+
+    if parser.nsight:
+        torch.cuda.profiler.stop()

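The `--nsight` flag added above shortens the run to a few iterations and brackets each molecule with an NVTX range inside a CUDA-profiler capture. A minimal, self-contained sketch of that pattern (illustration only; `profile_with_nsight` and `workload` are placeholders, not names from the script):

```python
# Gate Nsight capture with the CUDA profiler API and mark the region of
# interest with a named NVTX range so it is easy to find on the timeline.
import torch

def profile_with_nsight(label, workload, nsight=True):
    if nsight and torch.cuda.is_available():
        torch.cuda.profiler.start()          # opens the cudaProfilerApi capture range
        torch.cuda.nvtx.range_push(label)    # named range shown in the Nsight timeline
    try:
        workload()
    finally:
        if nsight and torch.cuda.is_available():
            torch.cuda.nvtx.range_pop()
            torch.cuda.synchronize()         # make sure queued kernels land inside the range
            torch.cuda.profiler.stop()
```

Run under Nsight Systems with capture gated on the profiler API (for example `nsys profile --capture-range=cudaProfilerApi python tools/aev-benchmark-size.py --nsight`, assuming a recent nsys), so only the bracketed iterations are recorded.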
@@ -7,6 +7,7 @@ import pkbar
 import gc
 import pynvml
 import os
+import pickle
 from torchani.units import hartree2kcalmol

@@ -76,8 +77,16 @@ def sync_cuda(sync):
         torch.cuda.synchronize()


-def benchmark(parser, dataset, use_cuda_extension, force_training=False):
-    synchronize = True if parser.synchronize else False
+def print_timer(label, t):
+    if t < 1:
+        t = f'{t * 1000:.1f} ms'
+    else:
+        t = f'{t:.3f} sec'
+    print(f'{label} - {t}')
+
+
+def benchmark(parser, dataset, use_cuda_extension, force_inference=False):
+    synchronize = True
     timers = {}

     def time_func(key, func):

@@ -86,8 +95,7 @@ def benchmark(parser, dataset, use_cuda_extension, force_training=False):
         def wrapper(*args, **kwargs):
             start = timeit.default_timer()
             ret = func(*args, **kwargs)
-            if synchronize:
-                torch.cuda.synchronize()
+            sync_cuda(synchronize)
             end = timeit.default_timer()
             timers[key] += end - start
             return ret

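For reference, the timing machinery these hunks touch can be restated as a small self-contained sketch (names mirror the script, but this is an illustration, not the file itself):

```python
# Accumulate per-key wall-clock time, synchronizing the GPU before reading the
# timer so queued asynchronous CUDA kernels are included in the measurement.
import timeit
import torch

timers = {}

def sync_cuda(sync):
    if sync and torch.cuda.is_available():
        torch.cuda.synchronize()

def time_func(key, func, synchronize=True):
    timers[key] = 0.0

    def wrapper(*args, **kwargs):
        start = timeit.default_timer()
        ret = func(*args, **kwargs)
        sync_cuda(synchronize)               # wait for pending kernels before stopping the clock
        end = timeit.default_timer()
        timers[key] += end - start
        return ret

    return wrapper
```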
@@ -128,6 +136,7 @@ def benchmark(parser, dataset, use_cuda_extension, force_training=False):
     print('=> start training')
     start = time.time()
     loss_time = 0
+    force_time = 0

     for epoch in range(0, parser.num_epochs):

@@ -136,55 +145,62 @@ def benchmark(parser, dataset, use_cuda_extension, force_training=False):
         for i, properties in enumerate(dataset):
             species = properties['species'].to(parser.device)
-            coordinates = properties['coordinates'].to(parser.device).float().requires_grad_(force_training)
+            coordinates = properties['coordinates'].to(parser.device).float().requires_grad_(force_inference)
             true_energies = properties['energies'].to(parser.device).float()
             num_atoms = (species >= 0).sum(dim=1, dtype=true_energies.dtype)
             _, predicted_energies = model((species, coordinates))
             # TODO add sync after aev is done
             sync_cuda(synchronize)
             energy_loss = (mse(predicted_energies, true_energies) / num_atoms.sqrt()).mean()
-            if force_training:
-                sync_cuda(synchronize)
+            if force_inference:
                 force_coefficient = 0.1
                 true_forces = properties['forces'].to(parser.device).float()
+                force_start = time.time()
                 try:
-                    sync_cuda(synchronize)
                     forces = -torch.autograd.grad(predicted_energies.sum(), coordinates, create_graph=True, retain_graph=True)[0]
+                    sync_cuda(synchronize)
                 except Exception as e:
                     alert('Error: {}'.format(e))
                     return
+                force_time += time.time() - force_start
                 force_loss = (mse(true_forces, forces).sum(dim=(1, 2)) / num_atoms).mean()
                 loss = energy_loss + force_coefficient * force_loss
-                sync_cuda(synchronize)
             else:
                 loss = energy_loss
             rmse = hartree2kcalmol((mse(predicted_energies, true_energies)).mean()).detach().cpu().numpy()
-            sync_cuda(synchronize)
-            loss_start = time.time()
-            loss.backward()
-            sync_cuda(synchronize)
-            loss_stop = time.time()
-            loss_time += loss_stop - loss_start
-            optimizer.step()
-            sync_cuda(synchronize)
-            progbar.update(i, values=[("rmse", rmse)])
+            progbar.update(i, values=[("rmse", rmse)])
+            if not force_inference:
+                sync_cuda(synchronize)
+                loss_start = time.time()
+                loss.backward()
+                # print('2', coordinates.grad)
+                sync_cuda(synchronize)
+                loss_stop = time.time()
+                loss_time += loss_stop - loss_start
+                optimizer.step()
+                sync_cuda(synchronize)
         checkgpu()
     sync_cuda(synchronize)
     stop = time.time()

     print('=> More detail about benchmark PER EPOCH')
+    for k in timers:
+        if k.startswith('torchani.'):
+            print(' {} - {:.1f}s'.format(k, timers[k] / parser.num_epochs))
     total_time = (stop - start) / parser.num_epochs
     loss_time = loss_time / parser.num_epochs
+    force_time = force_time / parser.num_epochs
     opti_time = timers['optimizer.step'] / parser.num_epochs
     forward_time = timers['forward'] / parser.num_epochs
     aev_time = timers['total'] / parser.num_epochs
-    print('Total AEV - {:.1f}s'.format(aev_time))
-    print('Forward - {:.1f}s'.format(forward_time))
-    print('Backward - {:.1f}s'.format(loss_time))
-    print('Optimizer - {:.1f}s'.format(opti_time))
-    print('Others - {:.1f}s'.format(total_time - loss_time - aev_time - forward_time - opti_time))
-    print('Epoch time - {:.1f}s'.format(total_time))
+    print_timer(' Total AEV', aev_time)
+    print_timer(' Forward', forward_time)
+    print_timer(' Backward', loss_time)
+    print_timer(' Force', force_time)
+    print_timer(' Optimizer', opti_time)
+    print_timer(' Others', total_time - loss_time - aev_time - forward_time - opti_time - force_time)
+    print_timer(' Epoch time', total_time)


 if __name__ == "__main__":

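The force path that the renamed `force_inference` flag measures boils down to one `autograd.grad` of the summed energies with respect to the input coordinates. A minimal sketch using the public `torchani.models.ANI1x` model (illustration only; the toy CH3-like fragment is made up):

```python
# Forces as the negative gradient of the predicted energies w.r.t. coordinates.
# Species indices follow the model's internal order (H, C, N, O) because
# periodic_table_index is left at its default of False.
import torch
import torchani

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = torchani.models.ANI1x().to(device)

species = torch.tensor([[1, 0, 0, 0]], device=device)           # C, H, H, H
coordinates = torch.tensor([[[0.00, 0.00, 0.00],
                             [0.00, 0.00, 1.09],
                             [1.03, 0.00, -0.36],
                             [-0.51, -0.89, -0.36]]],
                           device=device, requires_grad=True)

_, energies = model((species, coordinates))
forces = -torch.autograd.grad(energies.sum(), coordinates)[0]    # shape (1, 4, 3), F = -dE/dR
```

For pure inference `create_graph=True` is not required; the benchmark keeps it because the same code path is shared with training.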
@@ -199,22 +215,30 @@ if __name__ == "__main__":
     parser.add_argument('-b', '--batch_size',
                         help='Number of conformations of each batch',
                         default=2560, type=int)
-    parser.add_argument('-y', '--synchronize',
+    parser.add_argument('-p', '--pickle',
                         action='store_true',
-                        help='whether to insert torch.cuda.synchronize() at the end of each function')
+                        help='Dataset is pickled or not')
+    parser.add_argument('--nsight',
+                        action='store_true',
+                        help='use nsight profile')
     parser.add_argument('-n', '--num_epochs',
                         help='epochs',
                         default=1, type=int)
     parser = parser.parse_args()

     print('=> loading dataset...')
-    shifter = torchani.EnergyShifter(None)
-    # parser.batch_size = 1280
-    dataset = torchani.data.load(parser.dataset_path, additional_properties=('forces',)).subtract_self_energies(shifter).species_to_indices()
-    print('=> Caching shuffled dataset...')
-    dataset_shuffled = list(dataset.shuffle().collate(parser.batch_size))
-    print('=> Caching non-shuffled dataset...')
-    dataset = list(dataset.collate(parser.batch_size))
+    if parser.pickle:
+        f = open(parser.dataset_path, 'rb')
+        dataset_shuffled = pickle.load(f)
+        f.close()
+    else:
+        shifter = torchani.EnergyShifter(None)
+        dataset = torchani.data.load(parser.dataset_path, additional_properties=('forces',)).subtract_self_energies(shifter).species_to_indices()
+        print('=> Caching shuffled dataset...')
+        dataset_shuffled = list(dataset.shuffle().collate(parser.batch_size))
+        f = open(f'{parser.dataset_path}.pickle', 'wb')
+        pickle.dump(dataset_shuffled, f)
+        f.close()

     print("=> CUDA info:")
     devices = torch.cuda.device_count()

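The new `-p/--pickle` option reloads batches that an earlier run dumped to `<dataset_path>.pickle`, skipping the expensive preprocessing. A compact sketch of that cache-then-reload idea (illustration only; the script itself requires the cache file to be passed explicitly together with `-p`):

```python
# Cache expensive dataset preprocessing once, then reload the pickled batches.
import os
import pickle

def load_or_cache(dataset_path, build_batches):
    cache_path = f'{dataset_path}.pickle'
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    batches = build_batches(dataset_path)    # e.g. torchani.data.load(...).collate(...)
    with open(cache_path, 'wb') as f:
        pickle.dump(batches, f)
    return batches
```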
@@ -225,36 +249,20 @@ if __name__ == "__main__":
         print(' {}'.format(torch.cuda.get_device_properties(i)))
         checkgpu(i)

-    print("\n\n=> Test 1/8: Shuffled Dataset, USE cuda extension, Energy training")
+    print("\n\n=> Test 1: USE cuda extension, Energy training")
     torch.cuda.empty_cache()
     gc.collect()
-    benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_training=False)
+    benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_inference=False)

-    print("\n\n=> Test 2/8: Shuffled Dataset, NO cuda extension, Energy training")
+    print("\n\n=> Test 2: NO cuda extension, Energy training")
     torch.cuda.empty_cache()
     gc.collect()
-    benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_training=False)
+    benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_inference=False)

-    print("\n\n=> Test 3/8: Non-Shuffled Dataset, USE cuda extension, Energy training")
-    torch.cuda.empty_cache()
-    gc.collect()
-    benchmark(parser, dataset, use_cuda_extension=True, force_training=False)
-
-    print("\n\n=> Test 4/8: Non-Shuffled Dataset, NO cuda extension, Energy training")
-    torch.cuda.empty_cache()
-    gc.collect()
-    benchmark(parser, dataset, use_cuda_extension=False, force_training=False)
-
-    print("\n\n=> Test 5/8: Shuffled Dataset, USE cuda extension, Force and Energy training")
-    torch.cuda.empty_cache()
-    gc.collect()
-    benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_training=True)
-
-    print("\n\n=> Test 6/8: Shuffled Dataset, NO cuda extension, Force and Energy training")
-    torch.cuda.empty_cache()
-    gc.collect()
-    benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_training=True)
-
-    print("\n\n=> Test 7/8: Non-Shuffled Dataset, USE cuda extension, Force and Energy training")
+    print("\n\n=> Test 3: USE cuda extension, Force and Energy inference")
     torch.cuda.empty_cache()
     gc.collect()
-    benchmark(parser, dataset, use_cuda_extension=True, force_training=True)
+    benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_inference=True)

-    print("\n\n=> Test 8/8: Non-Shuffled Dataset, NO cuda extension, Force and Energy training")
+    print("\n\n=> Test 4: NO cuda extension, Force and Energy inference")
     torch.cuda.empty_cache()
     gc.collect()
-    benchmark(parser, dataset, use_cuda_extension=False, force_training=True)
+    benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_inference=True)

@@ -29,22 +29,29 @@ Pass `use_cuda_extension=True` when construct aev_computer, for example:
 cuaev_computer = torchani.AEVComputer(Rcr, Rca, EtaR, ShfR, EtaA, Zeta, ShfA, ShfZ, num_species, use_cuda_extension=True)
 ```

-## Limitations
-Current implementation of CUAEV does not support pbc and force calculation.
+## TODOs
+- [x] CUAEV Forward
+- [x] CUAEV Backward (Force)
+- [ ] PBC
+- [ ] Force training (needs cuaev's second derivative; see the sketch below)
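The remaining force-training item needs a second derivative because the force term in the loss is itself a gradient: backpropagating such a loss differentiates through the first `autograd.grad` call, which requires `create_graph=True` and a double-differentiable AEV backward. A minimal sketch (illustration only, using the Python AEV path, which already supports this; the toy inputs are the same made-up fragment as in the earlier sketch):

```python
# Force training needs d(loss)/d(params) where the loss contains forces, i.e.
# gradients of the energy; create_graph=True keeps the graph of the first
# derivative so loss.backward() can take the second one.
import torch
import torchani

model = torchani.models.ANI1x()
species = torch.tensor([[1, 0, 0, 0]])
coordinates = torch.tensor([[[0.00, 0.00, 0.00],
                             [0.00, 0.00, 1.09],
                             [1.03, 0.00, -0.36],
                             [-0.51, -0.89, -0.36]]], requires_grad=True)
true_forces = torch.zeros_like(coordinates)          # placeholder target

_, energies = model((species, coordinates))
forces = -torch.autograd.grad(energies.sum(), coordinates, create_graph=True)[0]
force_loss = (forces - true_forces).pow(2).mean()
force_loss.backward()   # second derivative of the AEV computation happens here
```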
 ## Benchmark
-Benchmark of [torchani/tools/training-aev-benchmark.py](https://github.com/aiqm/torchani/tree/master/torchani/tools/training-aev-benchmark.py) on RTX 2080 Ti:
+Benchmark of [torchani/tools/training-aev-benchmark.py](https://github.com/aiqm/torchani/tree/master/torchani/tools/training-aev-benchmark.py) on TITAN V:

-| ANI-1x                  | Without Shuffle         | Shuffle                 |
-|:-----------------------:|:-----------------------:|:-----------------------:|
-| Time per Epoch / Memory | AEV / Total / GPU Mem   | AEV / Total / GPU Mem   |
-| aev cuda extension      | 7.7s / 26.3s / 2289 MB  | 8.5s / 27.6s / 2425 MB  |
-| aev python code         | 21.1s / 40.0s / 7361 MB | 28.7s / 47.8s / 3475 MB |
-| improvements            | 2.74 / 1.52 / 3.22      | 3.38 / 1.73 / 1.43      |
+| ANI-1x dataset (Batchsize 2560) | Energy Training         | Energy and Force Inference       |
+|---------------------------------|-------------------------|----------------------------------|
+| Time per Epoch / Memory         | AEV / Total / GPU Mem   | AEV / Force / Total / GPU Mem    |
+| aev cuda extension              | 3.90s / 31.5s / 2088 MB | 3.90s / 22.6s / 43.0s / 4234 MB  |
+| aev python code                 | 23.7s / 50.2s / 3540 MB | 25.3s / 48.0s / 88.2s / 11316 MB |
 ## Test
 ```bash
 cd torchani
-python tools/training-aev-benchmark.py download/dataset/ani-1x/sample.h5 -y
 python tests/test_cuaev.py
 ```
+
+Benchmark:
+```
+python tools/training-aev-benchmark.py download/dataset/ani-1x/sample.h5
+python tools/aev-benchmark-size.py
+```
\ No newline at end of file