Unverified Commit 813f6e61 authored by Jinze Xue's avatar Jinze Xue Committed by GitHub
Browse files

CUAEV double backward for force training (#571)



* init

* init

* double backward test

* fix doublebackward test

* add another test

* rm gaev

* radial done

* angular init

* angular done

* update

* force training benchmark

* format

* update

* benchmark

* update

* update

* clean redundancy codes

* update

* adapt review request

* update

* update

* update

* update

* update

* update

* fix

* fix

* cuAngularAEVs code deduplicate

* pairwise double backward

* cuRadialAEVs dedup

* pairwiseDistance dedup

* format

* readme build notes

* save

* update

* save

* save

* update

* fix

* save

* add equations on comments
Co-authored-by: default avatarGao, Xiang <qasdfgtyuiop@gmail.com>
parent efae6d9d
...@@ -50,6 +50,7 @@ def cuda_extension(build_all=False): ...@@ -50,6 +50,7 @@ def cuda_extension(build_all=False):
import torch import torch
from torch.utils.cpp_extension import CUDAExtension from torch.utils.cpp_extension import CUDAExtension
SMs = None SMs = None
print('-' * 75)
if not build_all: if not build_all:
SMs = [] SMs = []
devices = torch.cuda.device_count() devices = torch.cuda.device_count()
...@@ -81,12 +82,13 @@ def cuda_extension(build_all=False): ...@@ -81,12 +82,13 @@ def cuda_extension(build_all=False):
if cuda_version >= 11.1: if cuda_version >= 11.1:
nvcc_args.append("-gencode=arch=compute_86,code=sm_86") nvcc_args.append("-gencode=arch=compute_86,code=sm_86")
print("nvcc_args: ", nvcc_args) print("nvcc_args: ", nvcc_args)
print('-' * 75)
return CUDAExtension( return CUDAExtension(
name='torchani.cuaev', name='torchani.cuaev',
pkg='torchani.cuaev', pkg='torchani.cuaev',
sources=glob.glob('torchani/cuaev/*.cu'), sources=glob.glob('torchani/cuaev/*.cu'),
include_dirs=maybe_download_cub(), include_dirs=maybe_download_cub(),
extra_compile_args={'cxx': ['-std=c++14'], 'nvcc': nvcc_args}) extra_compile_args={'cxx': ['-std=c++17'], 'nvcc': nvcc_args})
def cuaev_kwargs(): def cuaev_kwargs():
......
...@@ -3,9 +3,9 @@ import torch ...@@ -3,9 +3,9 @@ import torch
import torchani import torchani
import unittest import unittest
import pickle import pickle
import copy
from torchani.testing import TestCase, make_tensor from torchani.testing import TestCase, make_tensor
path = os.path.dirname(os.path.realpath(__file__)) path = os.path.dirname(os.path.realpath(__file__))
skipIfNoGPU = unittest.skipIf(not torch.cuda.is_available(), skipIfNoGPU = unittest.skipIf(not torch.cuda.is_available(),
...@@ -52,6 +52,64 @@ class TestCUAEV(TestCase): ...@@ -52,6 +52,64 @@ class TestCUAEV(TestCase):
num_species = 4 num_species = 4
self.aev_computer = torchani.AEVComputer(Rcr, Rca, EtaR, ShfR, EtaA, Zeta, ShfA, ShfZ, num_species) self.aev_computer = torchani.AEVComputer(Rcr, Rca, EtaR, ShfR, EtaA, Zeta, ShfA, ShfZ, num_species)
self.cuaev_computer = torchani.AEVComputer(Rcr, Rca, EtaR, ShfR, EtaA, Zeta, ShfA, ShfZ, num_species, use_cuda_extension=True) self.cuaev_computer = torchani.AEVComputer(Rcr, Rca, EtaR, ShfR, EtaA, Zeta, ShfA, ShfZ, num_species, use_cuda_extension=True)
self.nn = torch.nn.Sequential(torch.nn.Linear(384, 1, False)).to(self.device)
self.radial_length = self.aev_computer.radial_length
def _double_backward_1_test(self, species, coordinates):
    """
    Check double backward through the parameter gradient of ``self.nn``.

    One force-training step is executed with ``self.aev_computer`` and one
    with ``self.cuaev_computer``. The AEVs, the forces and the gradient that
    ends up on the first parameter of ``self.nn`` are then compared; only the
    parameter gradient is compared with an explicit tolerance (5e-5).

    ``coordinates`` must require grad, since the energy is differentiated
    with respect to it.
    """
    def run_force_step(computer):
        # Reseed so that both runs draw the same random force target.
        torch.manual_seed(12345)
        # Drop gradients left on the network by a previous run.
        self.nn.zero_grad()
        _, aev = computer((species, coordinates))
        energy = self.nn(aev).sum()
        # create_graph keeps the force differentiable for the second backward.
        energy_grad = torch.autograd.grad(energy, coordinates, create_graph=True, retain_graph=True)[0]
        force = -energy_grad
        target = torch.randn_like(force)
        loss = (target - force).abs().sum(dim=(1, 2)).mean()
        loss.backward()
        weight = next(self.nn.parameters())
        # Copy the gradient: the next run zeroes and overwrites it.
        return aev, force, copy.deepcopy(weight.grad)

    aev, force_ref, param_grad_ref = run_force_step(self.aev_computer)
    cu_aev, force_cuaev, param_grad = run_force_step(self.cuaev_computer)

    self.assertEqual(cu_aev, aev, f'cu_aev: {cu_aev}\n aev: {aev}')
    self.assertEqual(force_cuaev, force_ref, f'\nforce_cuaev: {force_cuaev}\n force_ref: {force_ref}')
    self.assertEqual(param_grad, param_grad_ref, f'\nparam_grad: {param_grad}\n param_grad_ref: {param_grad_ref}', atol=5e-5, rtol=5e-5)
def _double_backward_2_test(self, species, coordinates):
    """
    Check double backward directly, via the gradient w.r.t. ``grad_aev``.

    The same split-graph computation is run with ``self.aev_computer`` and
    with ``self.cuaev_computer``; the AEVs, the coordinate gradients and the
    gradient of the loss with respect to dE/dAEV are compared with
    atol = rtol = 5e-5.
    """
    def run_split_graph(computer):
        """
        Return ``(aev, force, d loss / d grad_aev)`` for one AEV computer.

        The gradient of `grad_aev` is wanted, and `torch.autograd` only
        hands that out if `grad_aev` is a leaf node. The coord->aev->energy
        graph is therefore cut into two separate graphs, coord->aev and
        aev->energy, which makes both aev and grad_aev leaves.
        """
        torch.manual_seed(12345)

        # graph 1: coordinates -> aev (on a fresh leaf copy of the input)
        xyz = coordinates.clone().detach().requires_grad_()
        _, aev = computer((species, xyz))

        # graph 2: aev -> E (on a detached leaf copy of the aev)
        aev_leaf = aev.clone().detach().requires_grad_()
        energy = self.nn(aev_leaf).sum()

        # backward of graph 2: dE/dAEV
        aev_grad = torch.autograd.grad(energy, aev_leaf, create_graph=True, retain_graph=True)[0]

        # backward of graph 1, fed with a leaf copy of dE/dAEV
        aev_grad_leaf = aev_grad.clone().detach().requires_grad_()
        force = torch.autograd.grad(aev, xyz, aev_grad_leaf, create_graph=True, retain_graph=True)[0]

        # backward of the force loss down to dE/dAEV
        target = torch.randn_like(force)
        loss = (target - force).abs().sum(dim=(1, 2)).mean()
        grad_of_aev_grad = torch.autograd.grad(loss, aev_grad_leaf, create_graph=True, retain_graph=True)[0]
        return aev, force, grad_of_aev_grad

    aev, force_ref, aev_grad_grad = run_split_graph(self.aev_computer)
    cu_aev, force_cuaev, cuaev_grad_grad = run_split_graph(self.cuaev_computer)

    self.assertEqual(cu_aev, aev, f'cu_aev: {cu_aev}\n aev: {aev}', atol=5e-5, rtol=5e-5)
    self.assertEqual(force_cuaev, force_ref, f'\nforce_cuaev: {force_cuaev}\n force_ref: {force_ref}', atol=5e-5, rtol=5e-5)
    self.assertEqual(cuaev_grad_grad, aev_grad_grad, f'\ncuaev_grad_grad: {cuaev_grad_grad}\n aev_grad_grad: {aev_grad_grad}', atol=5e-5, rtol=5e-5)
def testSimple(self): def testSimple(self):
coordinates = torch.tensor([ coordinates = torch.tensor([
...@@ -89,15 +147,58 @@ class TestCUAEV(TestCase): ...@@ -89,15 +147,58 @@ class TestCUAEV(TestCase):
_, aev = self.aev_computer((species, coordinates)) _, aev = self.aev_computer((species, coordinates))
aev.backward(torch.ones_like(aev)) aev.backward(torch.ones_like(aev))
aev_grad = coordinates.grad force_ref = coordinates.grad
coordinates = coordinates.clone().detach() coordinates = coordinates.clone().detach()
coordinates.requires_grad_() coordinates.requires_grad_()
_, cu_aev = self.cuaev_computer((species, coordinates)) _, cu_aev = self.cuaev_computer((species, coordinates))
cu_aev.backward(torch.ones_like(cu_aev)) cu_aev.backward(torch.ones_like(cu_aev))
cuaev_grad = coordinates.grad force_cuaev = coordinates.grad
self.assertEqual(cu_aev, aev, f'cu_aev: {cu_aev}\n aev: {aev}') self.assertEqual(cu_aev, aev, f'cu_aev: {cu_aev}\n aev: {aev}')
self.assertEqual(cuaev_grad, aev_grad, f'\ncuaev_grad: {cuaev_grad}\n aev_grad: {aev_grad}') self.assertEqual(force_cuaev, force_ref, f'\nforce_cuaev: {force_cuaev}\n aev_grad: {force_ref}')
def testSimpleDoubleBackward_1(self):
"""
Test Double Backward (Force training) by parameters' gradient
"""
coordinates = torch.tensor([
[[0.03192167, 0.00638559, 0.01301679],
[-0.83140486, 0.39370209, -0.26395324],
[-0.66518241, -0.84461308, 0.20759389],
[0.45554739, 0.54289633, 0.81170881],
[0.66091919, -0.16799635, -0.91037834]],
[[-4.1862600, 0.0575700, -0.0381200],
[-3.1689400, 0.0523700, 0.0200000],
[-4.4978600, 0.8211300, 0.5604100],
[-4.4978700, -0.8000100, 0.4155600],
[0.00000000, -0.00000000, -0.00000000]]
], requires_grad=True, device=self.device)
species = torch.tensor([[1, 0, 0, 0, 0], [2, 0, 0, 0, -1]], device=self.device)
self._double_backward_1_test(species, coordinates)
def testSimpleDoubleBackward_2(self):
"""
Test Double Backward (Force training) directly.
Double backward:
Forward: input is dE/dAEV, output is force
Backward: input is dLoss/dForce, output is dLoss/(dE/dAEV)
"""
coordinates = torch.tensor([
[[0.03192167, 0.00638559, 0.01301679],
[-0.83140486, 0.39370209, -0.26395324],
[-0.66518241, -0.84461308, 0.20759389],
[0.45554739, 0.54289633, 0.81170881],
[0.66091919, -0.16799635, -0.91037834]],
[[-4.1862600, 0.0575700, -0.0381200],
[-3.1689400, 0.0523700, 0.0200000],
[-4.4978600, 0.8211300, 0.5604100],
[-4.4978700, -0.8000100, 0.4155600],
[0.00000000, -0.00000000, -0.00000000]]
], requires_grad=True, device=self.device)
species = torch.tensor([[1, 0, 0, 0, 0], [2, 0, 0, 0, -1]], device=self.device)
self._double_backward_2_test(species, coordinates)
def testTripeptideMD(self): def testTripeptideMD(self):
for i in range(100): for i in range(100):
...@@ -129,6 +230,15 @@ class TestCUAEV(TestCase): ...@@ -129,6 +230,15 @@ class TestCUAEV(TestCase):
self.assertEqual(cu_aev, aev) self.assertEqual(cu_aev, aev)
self.assertEqual(cuaev_grad, aev_grad, atol=5e-5, rtol=5e-5) self.assertEqual(cuaev_grad, aev_grad, atol=5e-5, rtol=5e-5)
def testTripeptideMDDoubleBackward_2(self):
    """
    Run the direct double-backward check on the tripeptide MD frames
    ``0.dat`` .. ``99.dat`` found under ``test_data/tripeptide-md``.
    """
    for frame in range(100):
        datafile = os.path.join(path, 'test_data/tripeptide-md/{}.dat'.format(frame))
        # Each file is a pickle whose first two entries are coordinates and species.
        with open(datafile, 'rb') as f:
            coordinates, species, *_ = pickle.load(f)

        # unsqueeze(0) adds a leading dimension of size one in front of the frame.
        xyz = torch.from_numpy(coordinates).float().unsqueeze(0)
        xyz = xyz.to(self.device).requires_grad_(True)
        atom_species = torch.from_numpy(species).unsqueeze(0).to(self.device)

        self._double_backward_2_test(atom_species, xyz)
def testNIST(self): def testNIST(self):
datafile = os.path.join(path, 'test_data/NIST/all') datafile = os.path.join(path, 'test_data/NIST/all')
with open(datafile, 'rb') as f: with open(datafile, 'rb') as f:
...@@ -144,7 +254,7 @@ class TestCUAEV(TestCase): ...@@ -144,7 +254,7 @@ class TestCUAEV(TestCase):
datafile = os.path.join(path, 'test_data/NIST/all') datafile = os.path.join(path, 'test_data/NIST/all')
with open(datafile, 'rb') as f: with open(datafile, 'rb') as f:
data = pickle.load(f) data = pickle.load(f)
for coordinates, species, _, _, _, _ in data: for coordinates, species, _, _, _, _ in data[:10]:
coordinates = torch.from_numpy(coordinates).to(torch.float).to(self.device).requires_grad_(True) coordinates = torch.from_numpy(coordinates).to(torch.float).to(self.device).requires_grad_(True)
species = torch.from_numpy(species).to(self.device) species = torch.from_numpy(species).to(self.device)
_, aev = self.aev_computer((species, coordinates)) _, aev = self.aev_computer((species, coordinates))
...@@ -159,12 +269,21 @@ class TestCUAEV(TestCase): ...@@ -159,12 +269,21 @@ class TestCUAEV(TestCase):
self.assertEqual(cu_aev, aev) self.assertEqual(cu_aev, aev)
self.assertEqual(cuaev_grad, aev_grad, atol=5e-5, rtol=5e-5) self.assertEqual(cuaev_grad, aev_grad, atol=5e-5, rtol=5e-5)
def testNISTDoubleBackward_2(self):
    """
    Run the direct double-backward check on the first three entries of the
    pickled NIST data set (``test_data/NIST/all``).
    """
    datafile = os.path.join(path, 'test_data/NIST/all')
    with open(datafile, 'rb') as f:
        data = pickle.load(f)

    # Only a small prefix of the data set is used.
    for record in data[:3]:
        coordinates, species, _, _, _, _ = record
        xyz = torch.from_numpy(coordinates).to(torch.float)
        xyz = xyz.to(self.device).requires_grad_(True)
        atom_species = torch.from_numpy(species).to(self.device)
        self._double_backward_2_test(atom_species, xyz)
def testVeryDenseMolecule(self): def testVeryDenseMolecule(self):
""" """
Test very dense molecule for aev correctness, especially for angular kernel when center atom pairs are more than 32. Test very dense molecule for aev correctness, especially for angular kernel when center atom pairs are more than 32.
issue: https://github.com/aiqm/torchani/pull/555 issue: https://github.com/aiqm/torchani/pull/555
""" """
for i in range(100): for i in range(5):
datafile = os.path.join(path, 'test_data/tripeptide-md/{}.dat'.format(i)) datafile = os.path.join(path, 'test_data/tripeptide-md/{}.dat'.format(i))
with open(datafile, 'rb') as f: with open(datafile, 'rb') as f:
coordinates, species, *_ = pickle.load(f) coordinates, species, *_ = pickle.load(f)
...@@ -176,7 +295,7 @@ class TestCUAEV(TestCase): ...@@ -176,7 +295,7 @@ class TestCUAEV(TestCase):
self.assertEqual(cu_aev, aev, atol=5e-5, rtol=5e-5) self.assertEqual(cu_aev, aev, atol=5e-5, rtol=5e-5)
def testVeryDenseMoleculeBackward(self): def testVeryDenseMoleculeBackward(self):
for i in range(100): for i in range(5):
datafile = os.path.join(path, 'test_data/tripeptide-md/{}.dat'.format(i)) datafile = os.path.join(path, 'test_data/tripeptide-md/{}.dat'.format(i))
with open(datafile, 'rb') as f: with open(datafile, 'rb') as f:
coordinates, species, *_ = pickle.load(f) coordinates, species, *_ = pickle.load(f)
......
...@@ -26,7 +26,7 @@ def info(text): ...@@ -26,7 +26,7 @@ def info(text):
print('\033[32m{}\33[0m'.format(text)) # green print('\033[32m{}\33[0m'.format(text)) # green
def benchmark(speciesPositions, aev_comp, N, check_gpu_mem): def benchmark(speciesPositions, aev_comp, N, check_gpu_mem, nn=None, verbose=True):
torch.cuda.empty_cache() torch.cuda.empty_cache()
gc.collect() gc.collect()
torch.cuda.synchronize() torch.cuda.synchronize()
...@@ -34,14 +34,25 @@ def benchmark(speciesPositions, aev_comp, N, check_gpu_mem): ...@@ -34,14 +34,25 @@ def benchmark(speciesPositions, aev_comp, N, check_gpu_mem):
aev = None aev = None
for i in range(N): for i in range(N):
aev = aev_comp(speciesPositions).aevs species, coordinates = speciesPositions
if nn is not None: # double backward
coordinates = coordinates.requires_grad_()
_, aev = aev_computer((species, coordinates))
E = nn(aev).sum()
force = -torch.autograd.grad(E, coordinates, create_graph=True, retain_graph=True)[0]
force_true = torch.randn_like(force)
loss = torch.abs(force_true - force).sum(dim=(1, 2)).mean()
loss.backward()
else:
_, aev = aev_comp((species, coordinates))
if i == 2 and check_gpu_mem: if i == 2 and check_gpu_mem:
checkgpu() checkgpu()
torch.cuda.synchronize() torch.cuda.synchronize()
delta = time.time() - start delta = time.time() - start
print(f' Duration: {delta:.2f} s') if verbose:
print(f' Speed: {delta/N*1000:.2f} ms/it') print(f' Duration: {delta:.2f} s')
print(f' Speed: {delta/N*1000:.2f} ms/it')
return aev, delta return aev, delta
...@@ -63,10 +74,14 @@ if __name__ == "__main__": ...@@ -63,10 +74,14 @@ if __name__ == "__main__":
dest='check_gpu_mem', dest='check_gpu_mem',
action='store_const', action='store_const',
const=1) const=1)
parser.add_argument('--nsight', parser.add_argument('-s', '--nsight',
action='store_true', action='store_true',
help='use nsight profile') help='use nsight profile')
parser.add_argument('-b', '--backward',
action='store_true',
help='benchmark double backward')
parser.set_defaults(check_gpu_mem=0) parser.set_defaults(check_gpu_mem=0)
parser.set_defaults(backward=0)
parser = parser.parse_args() parser = parser.parse_args()
path = os.path.dirname(os.path.realpath(__file__)) path = os.path.dirname(os.path.realpath(__file__))
...@@ -74,7 +89,7 @@ if __name__ == "__main__": ...@@ -74,7 +89,7 @@ if __name__ == "__main__":
device = torch.device('cuda') device = torch.device('cuda')
files = ['small.pdb', '1hz5.pdb', '6W8H.pdb'] files = ['small.pdb', '1hz5.pdb', '6W8H.pdb']
N = 500 N = 200
if parser.nsight: if parser.nsight:
N = 3 N = 3
torch.cuda.profiler.start() torch.cuda.profiler.start()
...@@ -89,17 +104,24 @@ if __name__ == "__main__": ...@@ -89,17 +104,24 @@ if __name__ == "__main__":
nnp = torchani.models.ANI2x(periodic_table_index=True, model_index=None).to(device) nnp = torchani.models.ANI2x(periodic_table_index=True, model_index=None).to(device)
speciesPositions = nnp.species_converter((species, positions)) speciesPositions = nnp.species_converter((species, positions))
aev_computer = nnp.aev_computer aev_computer = nnp.aev_computer
if parser.backward:
nn = torch.nn.Sequential(torch.nn.Linear(nnp.aev_computer.aev_length, 1, False)).to(device)
else:
nn = None
if parser.nsight: if parser.nsight:
torch.cuda.nvtx.range_push(file) torch.cuda.nvtx.range_push(file)
print('Original TorchANI:') print('Original TorchANI:')
aev_ref, delta_ref = benchmark(speciesPositions, aev_computer, N, check_gpu_mem) aev_ref, delta_ref = benchmark(speciesPositions, aev_computer, N, check_gpu_mem, nn)
print() print()
print('CUaev:') print('CUaev:')
nnp.aev_computer.use_cuda_extension = True nnp.aev_computer.use_cuda_extension = True
cuaev_computer = nnp.aev_computer cuaev_computer = nnp.aev_computer
aev, delta = benchmark(speciesPositions, cuaev_computer, N, check_gpu_mem) # warm up
_, _ = benchmark(speciesPositions, cuaev_computer, 1, check_gpu_mem, nn, verbose=False)
# run
aev, delta = benchmark(speciesPositions, cuaev_computer, N, check_gpu_mem, nn)
if parser.nsight: if parser.nsight:
torch.cuda.nvtx.range_pop() torch.cuda.nvtx.range_pop()
......
...@@ -10,6 +10,9 @@ import os ...@@ -10,6 +10,9 @@ import os
import pickle import pickle
from torchani.units import hartree2kcalmol from torchani.units import hartree2kcalmol
summary = ''
runcounter = 0
def build_network(): def build_network():
H_network = torch.nn.Sequential( H_network = torch.nn.Sequential(
...@@ -51,7 +54,17 @@ def build_network(): ...@@ -51,7 +54,17 @@ def build_network():
torch.nn.CELU(0.1), torch.nn.CELU(0.1),
torch.nn.Linear(96, 1) torch.nn.Linear(96, 1)
) )
return [H_network, C_network, N_network, O_network] nets = [H_network, C_network, N_network, O_network]
for net in nets:
net.apply(init_normal)
return nets
def init_normal(m):
    """
    Re-initialize the weight of a linear layer in place.

    Intended as a callback for ``torch.nn.Module.apply``: modules that are
    not linear layers are left untouched. Despite the name, the weight is
    filled with ``torch.nn.init.kaiming_uniform_``; the bias is not modified.

    Args:
        m: any ``torch.nn.Module``.
    """
    # isinstance (rather than an exact type comparison) also covers
    # subclasses of torch.nn.Linear.
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.kaiming_uniform_(m.weight)
def checkgpu(device=None): def checkgpu(device=None):
...@@ -66,6 +79,7 @@ def checkgpu(device=None): ...@@ -66,6 +79,7 @@ def checkgpu(device=None):
info = pynvml.nvmlDeviceGetMemoryInfo(h) info = pynvml.nvmlDeviceGetMemoryInfo(h)
name = pynvml.nvmlDeviceGetName(h) name = pynvml.nvmlDeviceGetName(h)
print(' GPU Memory Used (nvidia-smi): {:7.1f}MB / {:.1f}MB ({})'.format(info.used / 1024 / 1024, info.total / 1024 / 1024, name.decode())) print(' GPU Memory Used (nvidia-smi): {:7.1f}MB / {:.1f}MB ({})'.format(info.used / 1024 / 1024, info.total / 1024 / 1024, name.decode()))
return f'{(info.used / 1024 / 1024):.1f}MB'
def alert(text): def alert(text):
...@@ -85,7 +99,20 @@ def print_timer(label, t): ...@@ -85,7 +99,20 @@ def print_timer(label, t):
print(f'{label} - {t}') print(f'{label} - {t}')
def benchmark(parser, dataset, use_cuda_extension, force_inference=False): def format_time(t):
if t < 1:
t = f'{t * 1000:.1f} ms'
else:
t = f'{t:.3f} sec'
return t
def benchmark(parser, dataset, use_cuda_extension, force_train=False):
global summary
global runcounter
if parser.nsight and runcounter >= 0:
torch.cuda.nvtx.range_push(parser.runname)
synchronize = True synchronize = True
timers = {} timers = {}
...@@ -145,14 +172,14 @@ def benchmark(parser, dataset, use_cuda_extension, force_inference=False): ...@@ -145,14 +172,14 @@ def benchmark(parser, dataset, use_cuda_extension, force_inference=False):
for i, properties in enumerate(dataset): for i, properties in enumerate(dataset):
species = properties['species'].to(parser.device) species = properties['species'].to(parser.device)
coordinates = properties['coordinates'].to(parser.device).float().requires_grad_(force_inference) coordinates = properties['coordinates'].to(parser.device).float().requires_grad_(force_train)
true_energies = properties['energies'].to(parser.device).float() true_energies = properties['energies'].to(parser.device).float()
num_atoms = (species >= 0).sum(dim=1, dtype=true_energies.dtype) num_atoms = (species >= 0).sum(dim=1, dtype=true_energies.dtype)
_, predicted_energies = model((species, coordinates)) _, predicted_energies = model((species, coordinates))
# TODO add sync after aev is done # TODO add sync after aev is done
sync_cuda(synchronize) sync_cuda(synchronize)
energy_loss = (mse(predicted_energies, true_energies) / num_atoms.sqrt()).mean() energy_loss = (mse(predicted_energies, true_energies) / num_atoms.sqrt()).mean()
if force_inference: if force_train:
sync_cuda(synchronize) sync_cuda(synchronize)
force_coefficient = 0.1 force_coefficient = 0.1
true_forces = properties['forces'].to(parser.device).float() true_forces = properties['forces'].to(parser.device).float()
...@@ -172,21 +199,21 @@ def benchmark(parser, dataset, use_cuda_extension, force_inference=False): ...@@ -172,21 +199,21 @@ def benchmark(parser, dataset, use_cuda_extension, force_inference=False):
loss = energy_loss loss = energy_loss
rmse = hartree2kcalmol((mse(predicted_energies, true_energies)).mean()).detach().cpu().numpy() rmse = hartree2kcalmol((mse(predicted_energies, true_energies)).mean()).detach().cpu().numpy()
progbar.update(i, values=[("rmse", rmse)]) progbar.update(i, values=[("rmse", rmse)])
if not force_inference: sync_cuda(synchronize)
sync_cuda(synchronize) loss_start = time.time()
loss_start = time.time() loss.backward()
loss.backward() sync_cuda(synchronize)
# print('2', coordinates.grad) loss_stop = time.time()
sync_cuda(synchronize) loss_time += loss_stop - loss_start
loss_stop = time.time() optimizer.step()
loss_time += loss_stop - loss_start sync_cuda(synchronize)
optimizer.step()
sync_cuda(synchronize)
checkgpu() gpumem = checkgpu()
sync_cuda(synchronize) sync_cuda(synchronize)
stop = time.time() stop = time.time()
if parser.nsight and runcounter >= 0:
torch.cuda.nvtx.range_pop()
print('=> More detail about benchmark PER EPOCH') print('=> More detail about benchmark PER EPOCH')
total_time = (stop - start) / parser.num_epochs total_time = (stop - start) / parser.num_epochs
loss_time = loss_time / parser.num_epochs loss_time = loss_time / parser.num_epochs
...@@ -199,9 +226,18 @@ def benchmark(parser, dataset, use_cuda_extension, force_inference=False): ...@@ -199,9 +226,18 @@ def benchmark(parser, dataset, use_cuda_extension, force_inference=False):
print_timer(' Backward', loss_time) print_timer(' Backward', loss_time)
print_timer(' Force', force_time) print_timer(' Force', force_time)
print_timer(' Optimizer', opti_time) print_timer(' Optimizer', opti_time)
print_timer(' Others', total_time - loss_time - aev_time - forward_time - opti_time - force_time) others_time = total_time - loss_time - aev_time - forward_time - opti_time - force_time
print_timer(' Others', others_time)
print_timer(' Epoch time', total_time) print_timer(' Epoch time', total_time)
if runcounter == 0:
summary += '\n' + 'RUN'.ljust(27) + 'Total AEV'.ljust(13) + 'Forward'.ljust(13) + 'Backward'.ljust(13) + 'Force'.ljust(13) + \
'Optimizer'.ljust(13) + 'Others'.ljust(13) + 'Epoch time'.ljust(13) + 'GPU'.ljust(13) + '\n'
if runcounter >= 0:
summary += f'{runcounter} {parser.runname}'.ljust(27) + f'{format_time(aev_time)}'.ljust(13) + f'{format_time(forward_time)}'.ljust(13) + f'{format_time(loss_time)}'.ljust(13) + f'{format_time(force_time)}'.ljust(13) + \
f'{format_time(opti_time)}'.ljust(13) + f'{format_time(others_time)}'.ljust(13) + f'{format_time(total_time)}'.ljust(13) + f'{gpumem}'.ljust(13) + '\n'
runcounter += 1
if __name__ == "__main__": if __name__ == "__main__":
# parse command line arguments # parse command line arguments
...@@ -249,20 +285,43 @@ if __name__ == "__main__": ...@@ -249,20 +285,43 @@ if __name__ == "__main__":
print(' {}'.format(torch.cuda.get_device_properties(i))) print(' {}'.format(torch.cuda.get_device_properties(i)))
checkgpu(i) checkgpu(i)
print("\n\n=> Test 1: USE cuda extension, Energy training") # Warming UP
if len(dataset_shuffled) < 100:
runcounter = -1
parser.runname = 'Warning UP'
print(f"\n\n=> Test 0: {parser.runname}")
torch.cuda.empty_cache()
gc.collect()
benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_train=False)
if parser.nsight:
torch.cuda.profiler.start()
parser.runname = 'cu Energy train'
print(f"\n\n=> Test 1: {parser.runname}")
torch.cuda.empty_cache() torch.cuda.empty_cache()
gc.collect() gc.collect()
benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_inference=False) benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_train=False)
print("\n\n=> Test 2: NO cuda extension, Energy training")
parser.runname = 'py Energy train'
print(f"\n\n=> Test 2: {parser.runname}")
torch.cuda.empty_cache() torch.cuda.empty_cache()
gc.collect() gc.collect()
benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_inference=False) benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_train=False)
print("\n\n=> Test 3: USE cuda extension, Force and Energy inference") parser.runname = 'cu Energy + Force train'
print(f"\n\n=> Test 3: {parser.runname}")
torch.cuda.empty_cache() torch.cuda.empty_cache()
gc.collect() gc.collect()
benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_inference=True) benchmark(parser, dataset_shuffled, use_cuda_extension=True, force_train=True)
print("\n\n=> Test 4: NO cuda extension, Force and Energy inference")
parser.runname = 'py Energy + Force train'
print(f"\n\n=> Test 4: {parser.runname}")
torch.cuda.empty_cache() torch.cuda.empty_cache()
gc.collect() gc.collect()
benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_inference=True) benchmark(parser, dataset_shuffled, use_cuda_extension=False, force_train=True)
print(summary)
if parser.nsight:
torch.cuda.profiler.stop()
# CUAEV # CUAEV
CUDA Extension for AEV calculation. CUDA Extension for AEV calculation.
Performance improvement is expected to be ~3X for AEV computation and ~1.5X for overall training workflow. Performance improvement is expected to be ~3X for AEV computation and ~1.5X for energy training, 2.6X for energy+force training.
## Requirement ## Requirement
CUAEV needs the nightly version [pytorch](https://pytorch.org/) to be able to work. CUAEV needs the nightly version [pytorch](https://pytorch.org/) to be able to work.
If you you use conda, you could install it by If you use conda, you could install it by
``` ```
conda install pytorch torchvision torchaudio cudatoolkit={YOUR_CUDA_VERSION} -c pytorch-nightly conda install pytorch torchvision torchaudio cudatoolkit={YOUR_CUDA_VERSION} -c pytorch-nightly
``` ```
...@@ -18,21 +18,102 @@ cd torchani ...@@ -18,21 +18,102 @@ cd torchani
# choose one option below # choose one option below
# use --cuaev-all-sms if you are building in SLURM environment and there are multiple different gpus in a node # use --cuaev-all-sms if you are building in SLURM environment and there are multiple different gpus in a node
# use --cuaev will only build for detected gpus # use --cuaev will only build for detected gpus
python setup.py install --cuaev-all-sms # build for all sms
python setup.py install --cuaev # only build for detected gpus python setup.py install --cuaev # only build for detected gpus
python setup.py install --cuaev-all-sms # build for all gpus
# or for development # or for development
# `pip install -e . && ` is only needed for the very first install (because issue of https://github.com/pypa/pip/issues/1883) # `pip install -e . && ` is only needed for the very first install (because issue of https://github.com/pypa/pip/issues/1883)
pip install -e . && pip install -v -e . --global-option="--cuaev-all-sms" # build for all sms
pip install -e . && pip install -v -e . --global-option="--cuaev" # only build for detected gpus pip install -e . && pip install -v -e . --global-option="--cuaev" # only build for detected gpus
pip install -e . && pip install -v -e . --global-option="--cuaev-all-sms" # build for all gpus
``` ```
<del>Notes for install on Hipergator</del> (Currently not working because Pytorch dropped the official build for cuda/10.0)
Notes for building CUAEV on various HPC clusters
<details>
<summary>Bridges2</summary>
```bash
# prepare
srun -p GPU-small --ntasks=1 --cpus-per-task=5 --gpus=1 --time=02:00:00 --mem=20gb --pty -u bash -i
module load cuda/10.2.0
conda create -n cuaev python=3.8
conda activate cuaev
conda install pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch-nightly
# install torchani
git clone https://github.com/aiqm/torchani.git
cd torchani
pip install -e . && pip install -v -e . --global-option="--cuaev"
```
</details>
<details>
<summary>Hipergator</summary>
```bash
srun -p gpu --ntasks=1 --cpus-per-task=2 --gpus=geforce:1 --time=02:00:00 --mem=10gb --pty -u bash -i
module load cuda/10.0.130 gcc/7.3.0 git
conda remove --name cuaev --all -y && conda create -n cuaev python=3.8 -y
conda activate cuaev
# install a locally compiled torch-cu100 because pytorch dropped the official build for cuda 10.0
. /home/jinzexue/pytorch/loadmodule # note that there is a space after .
. /home/jinzexue/pytorch/install_deps
pip install $(realpath /home/jinzexue/pytorch/dist/torch-nightly-cu100.whl)
# check that pytorch is working; this should print information about the available GPUs
python /home/jinzexue/pytorch/testcuda/testcuda.py
# install torchani
git clone https://github.com/aiqm/torchani.git
cd torchani
pip install -e . && pip install -v -e . --global-option="--cuaev"
```
</details>
<details>
<summary>Expanse</summary>
```bash
srun -p gpu-shared --ntasks=1 --account=cwr109 --cpus-per-task=1 --gpus=1 --time=01:00:00 --mem=10gb --pty -u bash -i
# create env if necessary
conda create -n cuaev python=3.8
conda activate cuaev
# modules
module load cuda10.2/toolkit/10.2.89 gcc/7.5.0
# pytorch
conda install pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch-nightly
# install
git clone https://github.com/aiqm/torchani.git
cd torchani
pip install -e . && pip install -v -e . --global-option="--cuaev"
```
</details>
<details>
<summary>Moria</summary>
```bash ```bash
srun -p gpu --gpus=geforce:1 --time=01:00:00 --mem=10gb --pty -u bash -i # compile may fail because of low on memery (when memery is less than 5gb) srun --ntasks=1 --cpus-per-task=2 --gpus=1 --time=02:00:00 --mem=10gb --pty -u bash -i
conda install pytorch torchvision cudatoolkit=10.0 -c pytorch # make sure it's cudatoolkit=10.0 # create env if necessary
module load cuda/10.0.130 conda create -n cuaev python=3.8
module load gcc/7.3.0 conda activate cuaev
python setup.py install --cuaev-all-sms # cuda path (could be added to ~/.bashrc)
export PATH=/usr/local/cuda/bin:$PATH # nvcc for cuda 9.2
# pytorch
conda install pytorch torchvision cudatoolkit=9.2 -c pytorch-nightly
# install
git clone https://github.com/aiqm/torchani.git
cd torchani
pip install -e . && pip install -v -e . --global-option="--cuaev"
```
</details>
## Test
```bash
cd torchani
./download.sh
python tests/test_cuaev.py
``` ```
## Usage ## Usage
...@@ -44,27 +125,33 @@ cuaev_computer = torchani.AEVComputer(Rcr, Rca, EtaR, ShfR, EtaA, Zeta, ShfA, Sh ...@@ -44,27 +125,33 @@ cuaev_computer = torchani.AEVComputer(Rcr, Rca, EtaR, ShfR, EtaA, Zeta, ShfA, Sh
## TODOs ## TODOs
- [x] CUAEV Forward - [x] CUAEV Forward
- [x] CUAEV Backward (Force) - [x] CUAEV Backward (Force)
- [x] CUAEV Double Backward (force training needs the AEV's double backward w.r.t. grad_aev)
- [ ] PBC - [ ] PBC
- [ ] Force training (Need cuaev's second derivative)
## Benchmark ## Benchmark
Benchmark of [torchani/tools/training-aev-benchmark.py](https://github.com/aiqm/torchani/blob/master/tools/training-aev-benchmark.py) on TITAN V: Benchmark of [torchani/tools/training-aev-benchmark.py](https://github.com/aiqm/torchani/blob/master/tools/training-aev-benchmark.py):
| ANI-1x dataset (Batchsize 2560) | Energy Training | Energy and Force Inference | Train ANI-1x dataset (Batchsize 2560) on Tesla V100 for 1 epoch:
|---------------------------------|-------------------------|-----------------------------------| ```
| Time per Epoch / Memory | AEV / Total / GPU Mem | AEV / Force / Total / GPU Mem | RUN Total AEV Forward Backward Force Optimizer Others Epoch time GPU
| aev cuda extension | 3.90s / 31.5s / 2088 MB | 3.90s / 22.6s / 43.0s / 4234 MB | 0 cu Energy 3.355 sec 4.470 sec 4.685 sec 0.0 ms 3.508 sec 2.223 sec 18.241 sec 2780.8MB
| aev python code | 23.7s / 50.2s / 3540 MB | 25.3s / 48.0s / 88.2s / 11316 MB | 1 py Energy 19.682 sec 4.149 sec 4.663 sec 0.0 ms 3.495 sec 2.220 sec 34.209 sec 4038.8MB
2 cu Energy+Force 3.351 sec 4.200 sec 27.402 sec 16.514 sec 3.467 sec 4.556 sec 59.490 sec 7492.8MB
3 py Energy+Force 19.964 sec 4.176 sec 91.866 sec 36.554 sec 3.473 sec 5.403 sec 161.435 sec 8034.8MB
```
## Test Train ANI-1x dataset (Batchsize 1500) on GTX 1080 for 1 epoch:
```bash ```
cd torchani RUN Total AEV Forward Backward Force Optimizer Others Epoch time GPU
./download.sh 0 cu Energy 14.373 sec 10.870 sec 13.100 sec 0.0 ms 11.043 sec 2.913 sec 52.299 sec 1527.5MB
python tests/test_cuaev.py 1 py Energy 51.545 sec 10.228 sec 13.154 sec 0.0 ms 11.384 sec 2.874 sec 89.185 sec 2403.5MB
2 cu Energy+Force 14.275 sec 10.024 sec 85.423 sec 51.380 sec 7.396 sec 5.494 sec 173.992 sec 3577.5MB
3 py Energy+Force 51.305 sec 9.951 sec 271.078 sec 107.252 sec 7.835 sec 4.941 sec 452.362 sec 7307.5MB
``` ```
benchmark benchmark
``` ```
pip install pynvml pkbar
python tools/training-aev-benchmark.py download/dataset/ani-1x/sample.h5 python tools/training-aev-benchmark.py download/dataset/ani-1x/sample.h5
python tools/aev-benchmark-size.py python tools/aev-benchmark-size.py
``` ```
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment