Unverified Commit 910cca89 authored by Jinze Xue's avatar Jinze Xue Committed by GitHub
Browse files

cuaev benchmark file (#564)

* update

* rm

* update

* update

* update

* ase test atolerent
parent 23c9816c
......@@ -61,7 +61,8 @@ To run the tests and examples, you must manually download a data package
./download.sh
```
(Optional) To install AEV CUDA Extension (speedup for AEV computation), please follow the instruction at [torchani/cuaev](https://github.com/aiqm/torchani/tree/master/torchani/cuaev).
[CUAEV](https://github.com/aiqm/torchani/tree/master/torchani/cuaev) (Optional)
To install AEV CUDA Extension (speedup for AEV forward and backward), please follow the instruction at [torchani/cuaev](https://github.com/aiqm/torchani/tree/master/torchani/cuaev).
# Citation
......
......@@ -37,7 +37,7 @@ class TestASE(torchani.testing.TestCase):
dyn.run(100)
f = atoms.get_forces()
fn = get_numeric_force(atoms, 0.001)
self.assertEqual(f, fn, rtol=0.1, atol=0)
self.assertEqual(f, fn, rtol=0.1, atol=0.1)
def testWithNumericalStressWithPBCEnabled(self):
# Run NPT dynamics for some steps and periodically check that the
......
......@@ -161,7 +161,8 @@ class TestCUAEV(TestCase):
def testVeryDenseMolecule(self):
"""
Test very dense molecule for aev correctness, especially for angular part
Test very dense molecule for aev correctness, especially for angular kernel when center atom pairs are more than 32.
issue: https://github.com/aiqm/torchani/pull/555
"""
for i in range(100):
datafile = os.path.join(path, 'test_data/tripeptide-md/{}.dat'.format(i))
......
......@@ -32,6 +32,7 @@ def benchmark(speciesPositions, aev_comp, N, check_gpu_mem):
torch.cuda.synchronize()
start = time.time()
aev = None
for i in range(N):
aev = aev_comp(speciesPositions).aevs
if i == 2 and check_gpu_mem:
......@@ -58,10 +59,13 @@ def check_speedup_error(aev, aev_ref, speed, speed_ref):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--check_gpu_mem',
parser.add_argument('-m', '--check_gpu_mem',
dest='check_gpu_mem',
action='store_const',
const=1)
parser.add_argument('--nsight',
action='store_true',
help='use nsight profile')
parser.set_defaults(check_gpu_mem=0)
parser = parser.parse_args()
path = os.path.dirname(os.path.realpath(__file__))
......@@ -70,6 +74,11 @@ if __name__ == "__main__":
device = torch.device('cuda')
files = ['small.pdb', '1hz5.pdb', '6W8H.pdb']
N = 500
if parser.nsight:
N = 3
torch.cuda.profiler.start()
for file in files:
datafile = os.path.join(path, f'../dataset/pdb/{file}')
mol = read(datafile)
......@@ -81,8 +90,8 @@ if __name__ == "__main__":
speciesPositions = nnp.species_converter((species, positions))
aev_computer = nnp.aev_computer
N = 500
if parser.nsight:
torch.cuda.nvtx.range_push(file)
print('Original TorchANI:')
aev_ref, delta_ref = benchmark(speciesPositions, aev_computer, N, check_gpu_mem)
print()
......@@ -91,6 +100,11 @@ if __name__ == "__main__":
nnp.aev_computer.use_cuda_extension = True
cuaev_computer = nnp.aev_computer
aev, delta = benchmark(speciesPositions, cuaev_computer, N, check_gpu_mem)
check_speedup_error(aev, aev_ref, delta, delta_ref)
if parser.nsight:
torch.cuda.nvtx.range_pop()
check_speedup_error(aev, aev_ref, delta, delta_ref)
print('-' * 70 + '\n')
if parser.nsight:
torch.cuda.profiler.stop()
......@@ -7,6 +7,7 @@ import pkbar
import gc
import pynvml
import os
import pickle
from torchani.units import hartree2kcalmol
......@@ -76,8 +77,16 @@ def sync_cuda(sync):
torch.cuda.synchronize()
def benchmark(parser, dataset, use_cuda_extension, force_training=False):
synchronize = True if parser.synchronize else False
def print_timer(label, t):
    """Print a labelled duration, choosing an adaptive unit.

    Durations under one second are shown in milliseconds with one decimal;
    longer durations are shown in seconds with three decimals.
    """
    # Avoid rebinding the numeric parameter: build the formatted string once.
    formatted = f'{t * 1000:.1f} ms' if t < 1 else f'{t:.3f} sec'
    print(f'{label} - {formatted}')
def benchmark(parser, dataset, use_cuda_extension, force_inference=False):
synchronize = True
timers = {}
def time_func(key, func):
......@@ -86,8 +95,7 @@ def benchmark(parser, dataset, use_cuda_extension, force_training=False):
def wrapper(*args, **kwargs):
start = timeit.default_timer()
ret = func(*args, **kwargs)
if synchronize:
torch.cuda.synchronize()
sync_cuda(synchronize)
end = timeit.default_timer()
timers[key] += end - start
return ret
......@@ -128,6 +136,7 @@ def benchmark(parser, dataset, use_cuda_extension, force_training=False):
print('=> start training')
start = time.time()
loss_time = 0
force_time = 0
for epoch in range(0, parser.num_epochs):
......@@ -136,55 +145,62 @@ def benchmark(parser, dataset, use_cuda_extension, force_training=False):
for i, properties in enumerate(dataset):
species = properties['species'].to(parser.device)
coordinates = properties['coordinates'].to(parser.device).float().requires_grad_(force_training)
coordinates = properties['coordinates'].to(parser.device).float().requires_grad_(force_inference)
true_energies = properties['energies'].to(parser.device).float()
num_atoms = (species >= 0).sum(dim=1, dtype=true_energies.dtype)
_, predicted_energies = model((species, coordinates))
# TODO add sync after aev is done
sync_cuda(synchronize)
energy_loss = (mse(predicted_energies, true_energies) / num_atoms.sqrt()).mean()
if force_training:
if force_inference:
sync_cuda(synchronize)
force_coefficient = 0.1
true_forces = properties['forces'].to(parser.device).float()
force_start = time.time()
try:
sync_cuda(synchronize)
forces = -torch.autograd.grad(predicted_energies.sum(), coordinates, create_graph=True, retain_graph=True)[0]
sync_cuda(synchronize)
except Exception as e:
alert('Error: {}'.format(e))
return
force_time += time.time() - force_start
force_loss = (mse(true_forces, forces).sum(dim=(1, 2)) / num_atoms).mean()
loss = energy_loss + force_coefficient * force_loss
sync_cuda(synchronize)
else:
loss = energy_loss
rmse = hartree2kcalmol((mse(predicted_energies, true_energies)).mean()).detach().cpu().numpy()
progbar.update(i, values=[("rmse", rmse)])
if not force_inference:
sync_cuda(synchronize)
loss_start = time.time()
loss.backward()
# print('2', coordinates.grad)
sync_cuda(synchronize)
loss_stop = time.time()
loss_time += loss_stop - loss_start
optimizer.step()
sync_cuda(synchronize)
progbar.update(i, values=[("rmse", rmse)])
checkgpu()
sync_cuda(synchronize)
stop = time.time()
print('=> More detail about benchmark PER EPOCH')
for k in timers:
if k.startswith('torchani.'):
print(' {} - {:.1f}s'.format(k, timers[k] / parser.num_epochs))
total_time = (stop - start) / parser.num_epochs
loss_time = loss_time / parser.num_epochs
force_time = force_time / parser.num_epochs
opti_time = timers['optimizer.step'] / parser.num_epochs
forward_time = timers['forward'] / parser.num_epochs
aev_time = timers['total'] / parser.num_epochs
print('Total AEV - {:.1f}s'.format(aev_time))
print('Forward - {:.1f}s'.format(forward_time))
print('Backward - {:.1f}s'.format(loss_time))
print('Optimizer - {:.1f}s'.format(opti_time))
print('Others - {:.1f}s'.format(total_time - loss_time - aev_time - forward_time - opti_time))
print('Epoch time - {:.1f}s'.format(total_time))
print_timer(' Total AEV', aev_time)
print_timer(' Forward', forward_time)
print_timer(' Backward', loss_time)
print_timer(' Force', force_time)
print_timer(' Optimizer', opti_time)
print_timer(' Others', total_time - loss_time - aev_time - forward_time - opti_time - force_time)
print_timer(' Epoch time', total_time)
if __name__ == "__main__":
......@@ -199,22 +215,30 @@ if __name__ == "__main__":
parser.add_argument('-b', '--batch_size',
help='Number of conformations of each batch',
default=2560, type=int)
parser.add_argument('-y', '--synchronize',
parser.add_argument('-p', '--pickle',
action='store_true',
help='whether to insert torch.cuda.synchronize() at the end of each function')
help='Dataset is pickled or not')
parser.add_argument('--nsight',
action='store_true',
help='use nsight profile')
parser.add_argument('-n', '--num_epochs',
help='epochs',
default=1, type=int)
parser = parser.parse_args()
print('=> loading dataset...')
if parser.pickle:
f = open(parser.dataset_path, 'rb')
dataset_shuffled = pickle.load(f)
f.close()
else:
shifter = torchani.EnergyShifter(None)
# parser.batch_size = 1280
dataset = torchani.data.load(parser.dataset_path, additional_properties=('forces',)).subtract_self_energies(shifter).species_to_indices()
print('=> Caching shuffled dataset...')
dataset_shuffled = list(dataset.shuffle().collate(parser.batch_size))
print('=> Caching non-shuffled dataset...')
dataset = list(dataset.collate(parser.batch_size))
f = open(f'{parser.dataset_path}.pickle', 'wb')
pickle.dump(dataset_shuffled, f)
f.close()
print("=> CUDA info:")
devices = torch.cuda.device_count()
......@@ -225,36 +249,20 @@ if __name__ == "__main__":
print(' {}'.format(torch.cuda.get_device_properties(i)))
checkgpu(i)
print("\n\n=> Test 1/8: Shuffled Dataset, USE cuda extension, Energy training")
torch.cuda.empty_cache()
gc.collect()
benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_training=False)
print("\n\n=> Test 2/8: Shuffled Dataset, NO cuda extension, Energy training")
torch.cuda.empty_cache()
gc.collect()
benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_training=False)
print("\n\n=> Test 3/8: Non-Shuffled Dataset, USE cuda extension, Energy training")
print("\n\n=> Test 1: USE cuda extension, Energy training")
torch.cuda.empty_cache()
gc.collect()
benchmark(parser, dataset, use_cuda_extension=True, force_training=False)
print("\n\n=> Test 4/8: Non-Shuffled Dataset, NO cuda extension, Energy training")
benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_inference=False)
print("\n\n=> Test 2: NO cuda extension, Energy training")
torch.cuda.empty_cache()
gc.collect()
benchmark(parser, dataset, use_cuda_extension=False, force_training=False)
benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_inference=False)
print("\n\n=> Test 5/8: Shuffled Dataset, USE cuda extension, Force and Energy training")
torch.cuda.empty_cache()
gc.collect()
benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_training=True)
print("\n\n=> Test 6/8: Shuffled Dataset, NO cuda extension, Force and Energy training")
torch.cuda.empty_cache()
gc.collect()
benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_training=True)
print("\n\n=> Test 7/8: Non-Shuffled Dataset, USE cuda extension, Force and Energy training")
print("\n\n=> Test 3: USE cuda extension, Force and Energy inference")
torch.cuda.empty_cache()
gc.collect()
benchmark(parser, dataset, use_cuda_extension=True, force_training=True)
print("\n\n=> Test 8/8: Non-Shuffled Dataset, NO cuda extension, Force and Energy training")
benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_inference=True)
print("\n\n=> Test 4: NO cuda extension, Force and Energy inference")
torch.cuda.empty_cache()
gc.collect()
benchmark(parser, dataset, use_cuda_extension=False, force_training=True)
benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_inference=True)
......@@ -29,22 +29,29 @@ Pass `use_cuda_extension=True` when construct aev_computer, for example:
cuaev_computer = torchani.AEVComputer(Rcr, Rca, EtaR, ShfR, EtaA, Zeta, ShfA, ShfZ, num_species, use_cuda_extension=True)
```
## Limitations
Current implementation of CUAEV does not support pbc and force calculation.
## TODOs
- [x] CUAEV Forward
- [x] CUAEV Backward (Force)
- [ ] PBC
- [ ] Force training (Need cuaev's second derivative)
## Benchmark
Benchmark of [torchani/tools/training-aev-benchmark.py](https://github.com/aiqm/torchani/tree/master/torchani/tools/training-aev-benchmark.py) on RTX 2080 Ti:
Benchmark of [torchani/tools/training-aev-benchmark.py](https://github.com/aiqm/torchani/tree/master/torchani/tools/training-aev-benchmark.py) on TITAN V:
| ANI-1x | Without Shuffle | Shuffle |
|:-----------------------:|:-----------------------:|:-----------------------:|
| Time per Epoch / Memory | AEV / Total / GPU Mem   | AEV / Total / GPU Mem  |
| aev cuda extension | 7.7s / 26.3s / 2289 MB | 8.5s / 27.6s / 2425 MB |
| aev python code | 21.1s / 40.0s / 7361 MB | 28.7s / 47.8s / 3475 MB |
| improvements | 2.74 / 1.52 / 3.22 | 3.38 / 1.73 / 1.43 |
| ANI-1x dataset (Batchsize 2560) | Energy Training | Energy and Force Inference |
|---------------------------------|-------------------------|-----------------------------------|
| Time per Epoch / Memory | AEV / Total / GPU Mem | AEV / Force / Total / GPU Mem |
| aev cuda extension | 3.90s / 31.5s / 2088 MB | 3.90s / 22.6s / 43.0s / 4234 MB |
| aev python code | 23.7s / 50.2s / 3540 MB | 25.3s / 48.0s / 88.2s / 11316 MB |
## Test
```bash
cd torchani
python tools/training-aev-benchmark.py download/dataset/ani-1x/sample.h5 -y
python tests/test_cuaev.py
```
benchmark
```
python tools/training-aev-benchmark.py download/dataset/ani-1x/sample.h5
python tools/aev-benchmark-size.py
```
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment