Unverified Commit 813f6e61 authored by Jinze Xue's avatar Jinze Xue Committed by GitHub
Browse files

CUAEV double backward for force training (#571)



* init

* init

* double backward test

* fix doublebackward test

* add another test

* rm gaev

* radial done

* angular init

* angular done

* update

* force training benchmark

* format

* update

* benchmark

* update

* update

* clean redundancy codes

* update

* adapt review request

* update

* update

* update

* update

* update

* update

* fix

* fix

* cuAngularAEVs code deduplicate

* pairwise double backward

* cuRadialAEVs dedup

* pairwiseDistance dedup

* format

* readme build notes

* save

* update

* save

* save

* update

* fix

* save

* add equations on comments
Co-authored-by: default avatarGao, Xiang <qasdfgtyuiop@gmail.com>
parent efae6d9d
......@@ -50,6 +50,7 @@ def cuda_extension(build_all=False):
import torch
from torch.utils.cpp_extension import CUDAExtension
SMs = None
print('-' * 75)
if not build_all:
SMs = []
devices = torch.cuda.device_count()
......@@ -81,12 +82,13 @@ def cuda_extension(build_all=False):
if cuda_version >= 11.1:
nvcc_args.append("-gencode=arch=compute_86,code=sm_86")
print("nvcc_args: ", nvcc_args)
print('-' * 75)
return CUDAExtension(
name='torchani.cuaev',
pkg='torchani.cuaev',
sources=glob.glob('torchani/cuaev/*.cu'),
include_dirs=maybe_download_cub(),
extra_compile_args={'cxx': ['-std=c++14'], 'nvcc': nvcc_args})
extra_compile_args={'cxx': ['-std=c++17'], 'nvcc': nvcc_args})
def cuaev_kwargs():
......
......@@ -3,9 +3,9 @@ import torch
import torchani
import unittest
import pickle
import copy
from torchani.testing import TestCase, make_tensor
path = os.path.dirname(os.path.realpath(__file__))
skipIfNoGPU = unittest.skipIf(not torch.cuda.is_available(),
......@@ -52,6 +52,64 @@ class TestCUAEV(TestCase):
num_species = 4
self.aev_computer = torchani.AEVComputer(Rcr, Rca, EtaR, ShfR, EtaA, Zeta, ShfA, ShfZ, num_species)
self.cuaev_computer = torchani.AEVComputer(Rcr, Rca, EtaR, ShfR, EtaA, Zeta, ShfA, ShfZ, num_species, use_cuda_extension=True)
self.nn = torch.nn.Sequential(torch.nn.Linear(384, 1, False)).to(self.device)
self.radial_length = self.aev_computer.radial_length
def _double_backward_1_test(self, species, coordinates):
    """Compare force-training double backward between the Python AEV
    computer and the CUDA extension, using the linear layer's parameter
    gradient after a force-loss backward as the check."""

    def run(computer, species, coordinates):
        # Fixed seed so both computers draw the same random force target.
        torch.manual_seed(12345)
        self.nn.zero_grad()
        _, aev = computer((species, coordinates))
        energy = self.nn(aev).sum()
        # create_graph=True keeps the graph alive so the force itself can
        # be differentiated again by loss.backward() below.
        force = -torch.autograd.grad(energy, coordinates, create_graph=True, retain_graph=True)[0]
        target = torch.randn_like(force)
        loss = torch.abs(target - force).sum(dim=(1, 2)).mean()
        loss.backward()
        weight = next(self.nn.parameters())
        # Deep-copy the gradient so the next run's zero_grad() cannot clobber it.
        return aev, force, copy.deepcopy(weight.grad)

    aev, force_ref, param_grad_ref = run(self.aev_computer, species, coordinates)
    cu_aev, force_cuaev, param_grad = run(self.cuaev_computer, species, coordinates)
    self.assertEqual(cu_aev, aev, f'cu_aev: {cu_aev}\n aev: {aev}')
    self.assertEqual(force_cuaev, force_ref, f'\nforce_cuaev: {force_cuaev}\n force_ref: {force_ref}')
    self.assertEqual(param_grad, param_grad_ref, f'\nparam_grad: {param_grad}\n param_grad_ref: {param_grad_ref}', atol=5e-5, rtol=5e-5)
def _double_backward_2_test(self, species, coordinates):
    """Compare the AEV op's double backward directly between the Python
    and CUDA implementations.

    ``torch.autograd.grad`` can only return gradients w.r.t. leaf tensors,
    so the coord -> aev -> energy chain is split into two graphs
    (coord -> aev and aev -> energy); ``aev`` and ``grad_aev`` then become
    leaves whose gradients can be requested directly.
    """

    def run(computer, species, coordinates):
        torch.manual_seed(12345)
        # graph 1: coordinates -> aev
        coordinates = coordinates.clone().detach().requires_grad_()
        _, aev = computer((species, coordinates))
        # graph 2: aev -> E (detached copy of aev as the leaf input)
        aev_leaf = aev.clone().detach().requires_grad_()
        energy = self.nn(aev_leaf).sum()
        # backward through graph 2: dE/dAEV
        aev_grad = torch.autograd.grad(energy, aev_leaf, create_graph=True, retain_graph=True)[0]
        # backward through graph 1, feeding dE/dAEV in as the incoming gradient
        aev_grad_leaf = aev_grad.clone().detach().requires_grad_()
        force = torch.autograd.grad(aev, coordinates, aev_grad_leaf, create_graph=True, retain_graph=True)[0]
        # force loss, then differentiate w.r.t. dE/dAEV — the double backward under test
        target = torch.randn_like(force)
        loss = torch.abs(target - force).sum(dim=(1, 2)).mean()
        aev_grad_grad = torch.autograd.grad(loss, aev_grad_leaf, create_graph=True, retain_graph=True)[0]
        return aev, force, aev_grad_grad

    aev, force_ref, aev_grad_grad = run(self.aev_computer, species, coordinates)
    cu_aev, force_cuaev, cuaev_grad_grad = run(self.cuaev_computer, species, coordinates)
    self.assertEqual(cu_aev, aev, f'cu_aev: {cu_aev}\n aev: {aev}', atol=5e-5, rtol=5e-5)
    self.assertEqual(force_cuaev, force_ref, f'\nforce_cuaev: {force_cuaev}\n force_ref: {force_ref}', atol=5e-5, rtol=5e-5)
    self.assertEqual(cuaev_grad_grad, aev_grad_grad, f'\ncuaev_grad_grad: {cuaev_grad_grad}\n aev_grad_grad: {aev_grad_grad}', atol=5e-5, rtol=5e-5)
def testSimple(self):
coordinates = torch.tensor([
......@@ -89,15 +147,58 @@ class TestCUAEV(TestCase):
_, aev = self.aev_computer((species, coordinates))
aev.backward(torch.ones_like(aev))
aev_grad = coordinates.grad
force_ref = coordinates.grad
coordinates = coordinates.clone().detach()
coordinates.requires_grad_()
_, cu_aev = self.cuaev_computer((species, coordinates))
cu_aev.backward(torch.ones_like(cu_aev))
cuaev_grad = coordinates.grad
force_cuaev = coordinates.grad
self.assertEqual(cu_aev, aev, f'cu_aev: {cu_aev}\n aev: {aev}')
self.assertEqual(cuaev_grad, aev_grad, f'\ncuaev_grad: {cuaev_grad}\n aev_grad: {aev_grad}')
self.assertEqual(force_cuaev, force_ref, f'\nforce_cuaev: {force_cuaev}\n aev_grad: {force_ref}')
def testSimpleDoubleBackward_1(self):
    """Double backward (force training) checked via the parameters' gradient."""
    # Two 5-atom systems; the second uses species -1 padding for the last atom.
    coordinates = torch.tensor(
        [[[0.03192167, 0.00638559, 0.01301679],
          [-0.83140486, 0.39370209, -0.26395324],
          [-0.66518241, -0.84461308, 0.20759389],
          [0.45554739, 0.54289633, 0.81170881],
          [0.66091919, -0.16799635, -0.91037834]],
         [[-4.1862600, 0.0575700, -0.0381200],
          [-3.1689400, 0.0523700, 0.0200000],
          [-4.4978600, 0.8211300, 0.5604100],
          [-4.4978700, -0.8000100, 0.4155600],
          [0.00000000, -0.00000000, -0.00000000]]],
        requires_grad=True, device=self.device)
    species = torch.tensor([[1, 0, 0, 0, 0], [2, 0, 0, 0, -1]], device=self.device)
    self._double_backward_1_test(species, coordinates)
def testSimpleDoubleBackward_2(self):
    """Double backward (force training) checked directly on the AEV op.

    The double-backward op under test:
      forward:  input is dE/dAEV, output is force
      backward: input is dLoss/dForce, output is dLoss/(dE/dAEV)
    """
    # Same two 5-atom systems as testSimpleDoubleBackward_1.
    coordinates = torch.tensor(
        [[[0.03192167, 0.00638559, 0.01301679],
          [-0.83140486, 0.39370209, -0.26395324],
          [-0.66518241, -0.84461308, 0.20759389],
          [0.45554739, 0.54289633, 0.81170881],
          [0.66091919, -0.16799635, -0.91037834]],
         [[-4.1862600, 0.0575700, -0.0381200],
          [-3.1689400, 0.0523700, 0.0200000],
          [-4.4978600, 0.8211300, 0.5604100],
          [-4.4978700, -0.8000100, 0.4155600],
          [0.00000000, -0.00000000, -0.00000000]]],
        requires_grad=True, device=self.device)
    species = torch.tensor([[1, 0, 0, 0, 0], [2, 0, 0, 0, -1]], device=self.device)
    self._double_backward_2_test(species, coordinates)
def testTripeptideMD(self):
for i in range(100):
......@@ -129,6 +230,15 @@ class TestCUAEV(TestCase):
self.assertEqual(cu_aev, aev)
self.assertEqual(cuaev_grad, aev_grad, atol=5e-5, rtol=5e-5)
def testTripeptideMDDoubleBackward_2(self):
    """Direct double-backward comparison over 100 tripeptide MD snapshots."""
    for frame in range(100):
        datafile = os.path.join(path, 'test_data/tripeptide-md/{}.dat'.format(frame))
        with open(datafile, 'rb') as fh:
            coordinates, species, *_ = pickle.load(fh)
        coordinates = torch.from_numpy(coordinates).float().unsqueeze(0).to(self.device).requires_grad_(True)
        species = torch.from_numpy(species).unsqueeze(0).to(self.device)
        self._double_backward_2_test(species, coordinates)
def testNIST(self):
datafile = os.path.join(path, 'test_data/NIST/all')
with open(datafile, 'rb') as f:
......@@ -144,7 +254,7 @@ class TestCUAEV(TestCase):
datafile = os.path.join(path, 'test_data/NIST/all')
with open(datafile, 'rb') as f:
data = pickle.load(f)
for coordinates, species, _, _, _, _ in data:
for coordinates, species, _, _, _, _ in data[:10]:
coordinates = torch.from_numpy(coordinates).to(torch.float).to(self.device).requires_grad_(True)
species = torch.from_numpy(species).to(self.device)
_, aev = self.aev_computer((species, coordinates))
......@@ -159,12 +269,21 @@ class TestCUAEV(TestCase):
self.assertEqual(cu_aev, aev)
self.assertEqual(cuaev_grad, aev_grad, atol=5e-5, rtol=5e-5)
def testNISTDoubleBackward_2(self):
    """Direct double-backward comparison on NIST molecules.

    Only the first three entries are used — double backward is expensive.
    """
    datafile = os.path.join(path, 'test_data/NIST/all')
    with open(datafile, 'rb') as fh:
        data = pickle.load(fh)
    for coordinates, species, _, _, _, _ in data[:3]:
        coordinates = torch.from_numpy(coordinates).to(torch.float).to(self.device).requires_grad_(True)
        species = torch.from_numpy(species).to(self.device)
        self._double_backward_2_test(species, coordinates)
def testVeryDenseMolecule(self):
"""
Test very dense molecule for aev correctness, especially for angular kernel when center atom pairs are more than 32.
issue: https://github.com/aiqm/torchani/pull/555
"""
for i in range(100):
for i in range(5):
datafile = os.path.join(path, 'test_data/tripeptide-md/{}.dat'.format(i))
with open(datafile, 'rb') as f:
coordinates, species, *_ = pickle.load(f)
......@@ -176,7 +295,7 @@ class TestCUAEV(TestCase):
self.assertEqual(cu_aev, aev, atol=5e-5, rtol=5e-5)
def testVeryDenseMoleculeBackward(self):
for i in range(100):
for i in range(5):
datafile = os.path.join(path, 'test_data/tripeptide-md/{}.dat'.format(i))
with open(datafile, 'rb') as f:
coordinates, species, *_ = pickle.load(f)
......
......@@ -26,7 +26,7 @@ def info(text):
print('\033[32m{}\33[0m'.format(text)) # green
def benchmark(speciesPositions, aev_comp, N, check_gpu_mem):
def benchmark(speciesPositions, aev_comp, N, check_gpu_mem, nn=None, verbose=True):
torch.cuda.empty_cache()
gc.collect()
torch.cuda.synchronize()
......@@ -34,14 +34,25 @@ def benchmark(speciesPositions, aev_comp, N, check_gpu_mem):
aev = None
for i in range(N):
aev = aev_comp(speciesPositions).aevs
species, coordinates = speciesPositions
if nn is not None: # double backward
coordinates = coordinates.requires_grad_()
_, aev = aev_computer((species, coordinates))
E = nn(aev).sum()
force = -torch.autograd.grad(E, coordinates, create_graph=True, retain_graph=True)[0]
force_true = torch.randn_like(force)
loss = torch.abs(force_true - force).sum(dim=(1, 2)).mean()
loss.backward()
else:
_, aev = aev_comp((species, coordinates))
if i == 2 and check_gpu_mem:
checkgpu()
torch.cuda.synchronize()
delta = time.time() - start
print(f' Duration: {delta:.2f} s')
print(f' Speed: {delta/N*1000:.2f} ms/it')
if verbose:
print(f' Duration: {delta:.2f} s')
print(f' Speed: {delta/N*1000:.2f} ms/it')
return aev, delta
......@@ -63,10 +74,14 @@ if __name__ == "__main__":
dest='check_gpu_mem',
action='store_const',
const=1)
parser.add_argument('--nsight',
parser.add_argument('-s', '--nsight',
action='store_true',
help='use nsight profile')
parser.add_argument('-b', '--backward',
action='store_true',
help='benchmark double backward')
parser.set_defaults(check_gpu_mem=0)
parser.set_defaults(backward=0)
parser = parser.parse_args()
path = os.path.dirname(os.path.realpath(__file__))
......@@ -74,7 +89,7 @@ if __name__ == "__main__":
device = torch.device('cuda')
files = ['small.pdb', '1hz5.pdb', '6W8H.pdb']
N = 500
N = 200
if parser.nsight:
N = 3
torch.cuda.profiler.start()
......@@ -89,17 +104,24 @@ if __name__ == "__main__":
nnp = torchani.models.ANI2x(periodic_table_index=True, model_index=None).to(device)
speciesPositions = nnp.species_converter((species, positions))
aev_computer = nnp.aev_computer
if parser.backward:
nn = torch.nn.Sequential(torch.nn.Linear(nnp.aev_computer.aev_length, 1, False)).to(device)
else:
nn = None
if parser.nsight:
torch.cuda.nvtx.range_push(file)
print('Original TorchANI:')
aev_ref, delta_ref = benchmark(speciesPositions, aev_computer, N, check_gpu_mem)
aev_ref, delta_ref = benchmark(speciesPositions, aev_computer, N, check_gpu_mem, nn)
print()
print('CUaev:')
nnp.aev_computer.use_cuda_extension = True
cuaev_computer = nnp.aev_computer
aev, delta = benchmark(speciesPositions, cuaev_computer, N, check_gpu_mem)
# warm up
_, _ = benchmark(speciesPositions, cuaev_computer, 1, check_gpu_mem, nn, verbose=False)
# run
aev, delta = benchmark(speciesPositions, cuaev_computer, N, check_gpu_mem, nn)
if parser.nsight:
torch.cuda.nvtx.range_pop()
......
......@@ -10,6 +10,9 @@ import os
import pickle
from torchani.units import hartree2kcalmol
summary = ''
runcounter = 0
def build_network():
H_network = torch.nn.Sequential(
......@@ -51,7 +54,17 @@ def build_network():
torch.nn.CELU(0.1),
torch.nn.Linear(96, 1)
)
return [H_network, C_network, N_network, O_network]
nets = [H_network, C_network, N_network, O_network]
for net in nets:
net.apply(init_normal)
return nets
def init_normal(m):
    """Weight-init hook for ``torch.nn.Module.apply``.

    Re-initializes the weights of every linear layer with Kaiming (He)
    uniform initialization; all other module types are left untouched.

    Args:
        m: a submodule passed in by ``Module.apply``.
    """
    # isinstance (rather than type(m) == torch.nn.Linear) is the idiomatic
    # type check and also covers subclasses of Linear.
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.kaiming_uniform_(m.weight)
def checkgpu(device=None):
......@@ -66,6 +79,7 @@ def checkgpu(device=None):
info = pynvml.nvmlDeviceGetMemoryInfo(h)
name = pynvml.nvmlDeviceGetName(h)
print(' GPU Memory Used (nvidia-smi): {:7.1f}MB / {:.1f}MB ({})'.format(info.used / 1024 / 1024, info.total / 1024 / 1024, name.decode()))
return f'{(info.used / 1024 / 1024):.1f}MB'
def alert(text):
......@@ -85,7 +99,20 @@ def print_timer(label, t):
print(f'{label} - {t}')
def benchmark(parser, dataset, use_cuda_extension, force_inference=False):
def format_time(t):
    """Render a duration in seconds as a human-readable string:
    milliseconds below one second, seconds otherwise."""
    if t < 1:
        return f'{t * 1000:.1f} ms'
    return f'{t:.3f} sec'
def benchmark(parser, dataset, use_cuda_extension, force_train=False):
global summary
global runcounter
if parser.nsight and runcounter >= 0:
torch.cuda.nvtx.range_push(parser.runname)
synchronize = True
timers = {}
......@@ -145,14 +172,14 @@ def benchmark(parser, dataset, use_cuda_extension, force_inference=False):
for i, properties in enumerate(dataset):
species = properties['species'].to(parser.device)
coordinates = properties['coordinates'].to(parser.device).float().requires_grad_(force_inference)
coordinates = properties['coordinates'].to(parser.device).float().requires_grad_(force_train)
true_energies = properties['energies'].to(parser.device).float()
num_atoms = (species >= 0).sum(dim=1, dtype=true_energies.dtype)
_, predicted_energies = model((species, coordinates))
# TODO add sync after aev is done
sync_cuda(synchronize)
energy_loss = (mse(predicted_energies, true_energies) / num_atoms.sqrt()).mean()
if force_inference:
if force_train:
sync_cuda(synchronize)
force_coefficient = 0.1
true_forces = properties['forces'].to(parser.device).float()
......@@ -172,21 +199,21 @@ def benchmark(parser, dataset, use_cuda_extension, force_inference=False):
loss = energy_loss
rmse = hartree2kcalmol((mse(predicted_energies, true_energies)).mean()).detach().cpu().numpy()
progbar.update(i, values=[("rmse", rmse)])
if not force_inference:
sync_cuda(synchronize)
loss_start = time.time()
loss.backward()
# print('2', coordinates.grad)
sync_cuda(synchronize)
loss_stop = time.time()
loss_time += loss_stop - loss_start
optimizer.step()
sync_cuda(synchronize)
sync_cuda(synchronize)
loss_start = time.time()
loss.backward()
sync_cuda(synchronize)
loss_stop = time.time()
loss_time += loss_stop - loss_start
optimizer.step()
sync_cuda(synchronize)
checkgpu()
gpumem = checkgpu()
sync_cuda(synchronize)
stop = time.time()
if parser.nsight and runcounter >= 0:
torch.cuda.nvtx.range_pop()
print('=> More detail about benchmark PER EPOCH')
total_time = (stop - start) / parser.num_epochs
loss_time = loss_time / parser.num_epochs
......@@ -199,9 +226,18 @@ def benchmark(parser, dataset, use_cuda_extension, force_inference=False):
print_timer(' Backward', loss_time)
print_timer(' Force', force_time)
print_timer(' Optimizer', opti_time)
print_timer(' Others', total_time - loss_time - aev_time - forward_time - opti_time - force_time)
others_time = total_time - loss_time - aev_time - forward_time - opti_time - force_time
print_timer(' Others', others_time)
print_timer(' Epoch time', total_time)
if runcounter == 0:
summary += '\n' + 'RUN'.ljust(27) + 'Total AEV'.ljust(13) + 'Forward'.ljust(13) + 'Backward'.ljust(13) + 'Force'.ljust(13) + \
'Optimizer'.ljust(13) + 'Others'.ljust(13) + 'Epoch time'.ljust(13) + 'GPU'.ljust(13) + '\n'
if runcounter >= 0:
summary += f'{runcounter} {parser.runname}'.ljust(27) + f'{format_time(aev_time)}'.ljust(13) + f'{format_time(forward_time)}'.ljust(13) + f'{format_time(loss_time)}'.ljust(13) + f'{format_time(force_time)}'.ljust(13) + \
f'{format_time(opti_time)}'.ljust(13) + f'{format_time(others_time)}'.ljust(13) + f'{format_time(total_time)}'.ljust(13) + f'{gpumem}'.ljust(13) + '\n'
runcounter += 1
if __name__ == "__main__":
# parse command line arguments
......@@ -249,20 +285,43 @@ if __name__ == "__main__":
print(' {}'.format(torch.cuda.get_device_properties(i)))
checkgpu(i)
print("\n\n=> Test 1: USE cuda extension, Energy training")
# Warming UP
if len(dataset_shuffled) < 100:
runcounter = -1
parser.runname = 'Warning UP'
print(f"\n\n=> Test 0: {parser.runname}")
torch.cuda.empty_cache()
gc.collect()
benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_train=False)
if parser.nsight:
torch.cuda.profiler.start()
parser.runname = 'cu Energy train'
print(f"\n\n=> Test 1: {parser.runname}")
torch.cuda.empty_cache()
gc.collect()
benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_inference=False)
print("\n\n=> Test 2: NO cuda extension, Energy training")
benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_train=False)
parser.runname = 'py Energy train'
print(f"\n\n=> Test 2: {parser.runname}")
torch.cuda.empty_cache()
gc.collect()
benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_inference=False)
benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_train=False)
print("\n\n=> Test 3: USE cuda extension, Force and Energy inference")
parser.runname = 'cu Energy + Force train'
print(f"\n\n=> Test 3: {parser.runname}")
torch.cuda.empty_cache()
gc.collect()
benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_inference=True)
print("\n\n=> Test 4: NO cuda extension, Force and Energy inference")
benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_train=True)
parser.runname = 'py Energy + Force train'
print(f"\n\n=> Test 4: {parser.runname}")
torch.cuda.empty_cache()
gc.collect()
benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_inference=True)
benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_train=True)
print(summary)
if parser.nsight:
torch.cuda.profiler.stop()
# CUAEV
CUDA Extension for AEV calculation.
Performance improvement is expected to be ~3X for AEV computation and ~1.5X for overall training workflow.
Performance improvement is expected to be ~3X for AEV computation and ~1.5X for energy training, 2.6X for energy+force training.
## Requirement
CUAEV needs the nightly version [pytorch](https://pytorch.org/) to be able to work.
If you you use conda, you could install it by
If you use conda, you can install it with
```
conda install pytorch torchvision torchaudio cudatoolkit={YOUR_CUDA_VERSION} -c pytorch-nightly
```
......@@ -18,21 +18,102 @@ cd torchani
# choose one option below
# use --cuaev-all-sms if you are building in SLURM environment and there are multiple different gpus in a node
# use --cuaev will only build for detected gpus
python setup.py install --cuaev-all-sms # build for all sms
python setup.py install --cuaev # only build for detected gpus
python setup.py install --cuaev-all-sms # build for all gpus
# or for development
# `pip install -e . && ` is only needed for the very first install (because issue of https://github.com/pypa/pip/issues/1883)
pip install -e . && pip install -v -e . --global-option="--cuaev-all-sms" # build for all sms
pip install -e . && pip install -v -e . --global-option="--cuaev" # only build for detected gpus
pip install -e . && pip install -v -e . --global-option="--cuaev-all-sms" # build for all gpus
```
<del>Notes for install on Hipergator</del> (Currently not working because Pytorch dropped the official build for cuda/10.0)
Notes on building CUAEV on various HPC clusters
<details>
<summary>Bridges2</summary>
```bash
# prepare
srun -p GPU-small --ntasks=1 --cpus-per-task=5 --gpus=1 --time=02:00:00 --mem=20gb --pty -u bash -i
module load cuda/10.2.0
conda create -n cuaev python=3.8
conda activate cuaev
conda install pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch-nightly
# install torchani
git clone https://github.com/aiqm/torchani.git
cd torchani
pip install -e . && pip install -v -e . --global-option="--cuaev"
```
</details>
<details>
<summary>Hipergator</summary>
```bash
srun -p gpu --ntasks=1 --cpus-per-task=2 --gpus=geforce:1 --time=02:00:00 --mem=10gb --pty -u bash -i
module load cuda/10.0.130 gcc/7.3.0 git
conda remove --name cuaev --all -y && conda create -n cuaev python=3.8 -y
conda activate cuaev
# install compiled torch-cu100 because pytorch droped official build for cuda 10.0
. /home/jinzexue/pytorch/loadmodule # note that there is a space after .
. /home/jinzexue/pytorch/install_deps
pip install $(realpath /home/jinzexue/pytorch/dist/torch-nightly-cu100.whl)
# check that pytorch is working; it should print information about the available GPUs
python /home/jinzexue/pytorch/testcuda/testcuda.py
# install torchani
git clone https://github.com/aiqm/torchani.git
cd torchani
pip install -e . && pip install -v -e . --global-option="--cuaev"
```
</details>
<details>
<summary>Expanse</summary>
```bash
srun -p gpu-shared --ntasks=1 --account=cwr109 --cpus-per-task=1 --gpus=1 --time=01:00:00 --mem=10gb --pty -u bash -i
# create env if necessary
conda create -n cuaev python=3.8
conda activate cuaev
# modules
module load cuda10.2/toolkit/10.2.89 gcc/7.5.0
# pytorch
conda install pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch-nightly
# install
git clone https://github.com/aiqm/torchani.git
cd torchani
pip install -e . && pip install -v -e . --global-option="--cuaev"
```
</details>
<details>
<summary>Moria</summary>
```bash
srun -p gpu --gpus=geforce:1 --time=01:00:00 --mem=10gb --pty -u bash -i # compilation may fail when memory is low (less than 5 GB)
conda install pytorch torchvision cudatoolkit=10.0 -c pytorch # make sure it's cudatoolkit=10.0
module load cuda/10.0.130
module load gcc/7.3.0
python setup.py install --cuaev-all-sms
srun --ntasks=1 --cpus-per-task=2 --gpus=1 --time=02:00:00 --mem=10gb --pty -u bash -i
# create env if necessary
conda create -n cuaev python=3.8
conda activate cuaev
# cuda path (could be added to ~/.bashrc)
export PATH=/usr/local/cuda/bin:$PATH # nvcc for cuda 9.2
# pytorch
conda install pytorch torchvision cudatoolkit=9.2 -c pytorch-nightly
# install
git clone https://github.com/aiqm/torchani.git
cd torchani
pip install -e . && pip install -v -e . --global-option="--cuaev"
```
</details>
## Test
```bash
cd torchani
./download.sh
python tests/test_cuaev.py
```
## Usage
......@@ -44,27 +125,33 @@ cuaev_computer = torchani.AEVComputer(Rcr, Rca, EtaR, ShfR, EtaA, Zeta, ShfA, Sh
## TODOs
- [x] CUAEV Forward
- [x] CUAEV Backward (Force)
- [x] CUAEV Double Backward (Force training needs the AEV's double backward w.r.t. grad_aev)
- [ ] PBC
- [ ] Force training (Need cuaev's second derivative)
## Benchmark
Benchmark of [torchani/tools/training-aev-benchmark.py](https://github.com/aiqm/torchani/blob/master/tools/training-aev-benchmark.py) on TITAN V:
Benchmark of [torchani/tools/training-aev-benchmark.py](https://github.com/aiqm/torchani/blob/master/tools/training-aev-benchmark.py):
| ANI-1x dataset (Batchsize 2560) | Energy Training | Energy and Force Inference |
|---------------------------------|-------------------------|-----------------------------------|
| Time per Epoch / Memory | AEV / Total / GPU Mem | AEV / Force / Total / GPU Mem |
| aev cuda extension | 3.90s / 31.5s / 2088 MB | 3.90s / 22.6s / 43.0s / 4234 MB |
| aev python code | 23.7s / 50.2s / 3540 MB | 25.3s / 48.0s / 88.2s / 11316 MB |
Train ANI-1x dataset (Batchsize 2560) on Tesla V100 for 1 epoch:
```
RUN Total AEV Forward Backward Force Optimizer Others Epoch time GPU
0 cu Energy 3.355 sec 4.470 sec 4.685 sec 0.0 ms 3.508 sec 2.223 sec 18.241 sec 2780.8MB
1 py Energy 19.682 sec 4.149 sec 4.663 sec 0.0 ms 3.495 sec 2.220 sec 34.209 sec 4038.8MB
2 cu Energy+Force 3.351 sec 4.200 sec 27.402 sec 16.514 sec 3.467 sec 4.556 sec 59.490 sec 7492.8MB
3 py Energy+Force 19.964 sec 4.176 sec 91.866 sec 36.554 sec 3.473 sec 5.403 sec 161.435 sec 8034.8MB
```
## Test
```bash
cd torchani
./download.sh
python tests/test_cuaev.py
Train ANI-1x dataset (Batchsize 1500) on GTX 1080 for 1 epoch:
```
RUN Total AEV Forward Backward Force Optimizer Others Epoch time GPU
0 cu Energy 14.373 sec 10.870 sec 13.100 sec 0.0 ms 11.043 sec 2.913 sec 52.299 sec 1527.5MB
1 py Energy 51.545 sec 10.228 sec 13.154 sec 0.0 ms 11.384 sec 2.874 sec 89.185 sec 2403.5MB
2 cu Energy+Force 14.275 sec 10.024 sec 85.423 sec 51.380 sec 7.396 sec 5.494 sec 173.992 sec 3577.5MB
3 py Energy+Force 51.305 sec 9.951 sec 271.078 sec 107.252 sec 7.835 sec 4.941 sec 452.362 sec 7307.5MB
```
benchmark
```
pip install pynvml pkbar
python tools/training-aev-benchmark.py download/dataset/ani-1x/sample.h5
python tools/aev-benchmark-size.py
```
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment