OpenDAS / torchani · Commits · b546adb8

Unverified commit b546adb8, authored Sep 06, 2018 by Gao, Xiang; committed by GitHub on Sep 06, 2018.
AEV Cache, part2 (#89)

Parent: bfc04ac8
Showing 9 changed files with 278 additions and 71 deletions (+278, -71):
- .gitignore (+2, -0)
- codefresh.yml (+2, -1)
- docs/api.rst (+1, -0)
- docs/index.rst (+1, -2)
- examples/cache_aev.py (+197, -0)
- examples/nnp_training.py (+15, -11)
- torchani/data/__init__.py (+54, -5)
- torchani/data/cache_aev.py (+2, -50)
- torchani/neurochem/__init__.py (+4, -2)
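Taken together, the commit moves AEV caching into `torchani.data`: a new `cache_aev` function writes precomputed atomic environment vectors (AEVs, the fixed input features of the ANI networks) to disk, and `AEVCacheLoader` (which gains a `__len__`) serves them back during training. Since AEVs depend only on the input geometries, they never change across epochs and can be computed once. A minimal sketch of the resulting workflow, with illustrative paths and batch size:

```python
import torchani

# One-time: compute AEVs for a dataset and cache them on disk
# (illustrative paths; see examples/cache_aev.py below for the full recipe).
torchani.data.cache_aev('./training_cache', './dataset/ani_gdb_s01.h5', 256)

# During training: serve cached AEVs instead of recomputing them each epoch.
training = torchani.data.AEVCacheLoader('./training_cache')
```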
.gitignore

```diff
@@ -21,3 +21,5 @@ benchmark_xyz
 /*.params
 /*.dat
 /tmp
+*_cache
```
codefresh.yml

```diff
@@ -37,4 +37,5 @@ steps:
   Docs:
     image: '${{BuildTorchANI}}'
     commands:
-      - sphinx-build docs build
+      - find . -name '*.pt' -delete
+      - sphinx-build -D plot_gallery=0 docs build
```
docs/api.rst

```diff
@@ -15,6 +15,7 @@ Datasets
 .. automodule:: torchani.data
 .. autoclass:: torchani.data.BatchedANIDataset
+.. autoclass:: torchani.data.AEVCacheLoader
 .. automodule:: torchani.data.cache_aev
```
docs/index.rst

```diff
@@ -2,8 +2,6 @@
 Welcome to TorchANI's documentation!
 ====================================
 
-Precompute AEVs to Improve Training Performance
-
 .. automodule:: torchani
 
 .. toctree::
@@ -18,6 +16,7 @@ Precompute AEVs to Improve Training Performance
    examples/energy_force
    examples/nnp_training
+   examples/cache_aev
    examples/neurochem_trainer
 
 .. toctree::
```
examples/cache_aev.py (new file, 0 → 100644)

```python
# -*- coding: utf-8 -*-
"""
Use Disk Cache of AEV to Boost Training
=======================================

In the previous :ref:`training-example` example, AEVs are computed every time
they are needed. This is not very efficient, because the AEVs never change
during training. With a good SSD, it is beneficial to cache them. This example
shows how to use a disk cache to speed up training.
"""
###############################################################################
# Most of the code in this example is a line-by-line copy of
# :ref:`training-example`.
import torch
import ignite
import torchani
import tqdm
import timeit
import tensorboardX
import os
import sys

# training and validation set
try:
    path = os.path.dirname(os.path.realpath(__file__))
except NameError:
    path = os.getcwd()
training_path = os.path.join(path, '../dataset/ani_gdb_s01.h5')
validation_path = os.path.join(path, '../dataset/ani_gdb_s01.h5')

# checkpoint file to save model when validation RMSE improves
model_checkpoint = 'model.pt'

# max epochs to run the training
max_epochs = 20

# Compute the training RMSE every this many epochs. The training set is
# usually huge and the loss function does not directly give us the RMSE, so we
# only check the training RMSE occasionally to watch for overfitting.
training_rmse_every = 5

# device to run the training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# batch size
batch_size = 1024

# log directory for tensorboardX
log = 'runs'

###############################################################################
# Here there is no need to manually construct an AEV computer or an energy
# shifter, but we do need to generate a disk cache for the datasets.
const_file = os.path.join(path, '../torchani/resources/ani-1x_dft_x8ens/rHCNO-5.2R_16-3.5A_a4-8.params')  # noqa: E501
sae_file = os.path.join(path, '../torchani/resources/ani-1x_dft_x8ens/sae_linfit.dat')  # noqa: E501
training_cache = './training_cache'
validation_cache = './validation_cache'

# If the cache directories already exist, we assume the data have already
# been cached and skip the generation step.
if not os.path.exists(training_cache):
    torchani.data.cache_aev(training_cache, training_path, batch_size, device,
                            const_file, True, sae_file)
if not os.path.exists(validation_cache):
    torchani.data.cache_aev(validation_cache, validation_path, batch_size,
                            device, const_file, True, sae_file)


###############################################################################
# The code that defines the network is also the same.
def atomic():
    model = torch.nn.Sequential(
        torch.nn.Linear(384, 128),
        torch.nn.CELU(0.1),
        torch.nn.Linear(128, 128),
        torch.nn.CELU(0.1),
        torch.nn.Linear(128, 64),
        torch.nn.CELU(0.1),
        torch.nn.Linear(64, 1)
    )
    return model


nn = torchani.ANIModel([atomic() for _ in range(4)])
print(nn)

if os.path.isfile(model_checkpoint):
    nn.load_state_dict(torch.load(model_checkpoint))
else:
    torch.save(nn.state_dict(), model_checkpoint)


class Flatten(torch.nn.Module):
    def forward(self, x):
        return x[0], x[1].flatten()


###############################################################################
# Except that here we do not include the AEV computer in the pipeline, because
# the cache loader will load the precomputed AEVs from disk.
model = torch.nn.Sequential(nn, Flatten()).to(device)

###############################################################################
# This part is also a line-by-line copy.
writer = tensorboardX.SummaryWriter(log_dir=log)

###############################################################################
# Here we don't construct a :class:`torchani.data.BatchedANIDataset` object,
# but instead a :class:`torchani.data.AEVCacheLoader` for each cache.
training = torchani.data.AEVCacheLoader(training_cache)
validation = torchani.data.AEVCacheLoader(validation_cache)

###############################################################################
# The rest of the code is again the same.
container = torchani.ignite.Container({'energies': model})
optimizer = torch.optim.Adam(model.parameters())

trainer = ignite.engine.create_supervised_trainer(
    container, optimizer, torchani.ignite.MSELoss('energies'))
evaluator = ignite.engine.create_supervised_evaluator(
    container, metrics={'RMSE': torchani.ignite.RMSEMetric('energies')})


@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def init_tqdm(trainer):
    trainer.state.tqdm = tqdm.tqdm(total=len(training), file=sys.stdout,
                                   desc='epoch')


@trainer.on(ignite.engine.Events.ITERATION_COMPLETED)
def update_tqdm(trainer):
    trainer.state.tqdm.update(1)


@trainer.on(ignite.engine.Events.EPOCH_COMPLETED)
def finalize_tqdm(trainer):
    trainer.state.tqdm.close()


def hartree2kcal(x):
    return 627.509 * x


@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def validation_and_checkpoint(trainer):
    def evaluate(dataset, name):
        evaluator = ignite.engine.create_supervised_evaluator(
            container,
            metrics={'RMSE': torchani.ignite.RMSEMetric('energies')}
        )
        evaluator.run(dataset)
        metrics = evaluator.state.metrics
        rmse = hartree2kcal(metrics['RMSE'])
        writer.add_scalar(name, rmse, trainer.state.epoch)

    # compute validation RMSE
    evaluate(validation, 'validation_rmse_vs_epoch')

    # compute training RMSE
    if trainer.state.epoch % training_rmse_every == 1:
        evaluate(training, 'training_rmse_vs_epoch')

    # checkpoint model
    torch.save(nn.state_dict(), model_checkpoint)


start = timeit.default_timer()


@trainer.on(ignite.engine.Events.EPOCH_STARTED)
def log_time(trainer):
    elapsed = round(timeit.default_timer() - start, 2)
    writer.add_scalar('time_vs_epoch', elapsed, trainer.state.epoch)


@trainer.on(ignite.engine.Events.ITERATION_COMPLETED)
def log_loss(trainer):
    iteration = trainer.state.iteration
    writer.add_scalar('loss_vs_iteration', trainer.state.output, iteration)


trainer.run(training, max_epochs)

###############################################################################
# Finally, we explicitly terminate the loader processes. If the loading
# processes are not closed, they will prevent the whole program from
# terminating. This happens automatically when a
# :class:`torchani.data.AEVCacheLoader` object is garbage collected, but since
# our cache loader objects live in the global scope they will never be
# collected, so we terminate the processes manually.
training.__del__()
validation.__del__()
```
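Calling `__del__` by hand is legal but unusual Python. Under CPython's reference counting, dropping the last references has the same effect, so the cleanup could equivalently read as follows (a sketch, assuming nothing else holds a reference to the loaders):

```python
# Equivalent manual cleanup: deleting the names drops the last references,
# so CPython collects the loaders and their __del__ terminates the processes.
del training, validation
```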
examples/nnp_training.py

```diff
 # -*- coding: utf-8 -*-
 """
+.. _training-example:
+
 Train Your Own Neural Network Potential
 =======================================
@@ -83,8 +85,15 @@ def atomic():
     return model
 
-model = torchani.ANIModel([atomic() for _ in range(4)])
-print(model)
+nn = torchani.ANIModel([atomic() for _ in range(4)])
+print(nn)
+
+###############################################################################
+# If checkpoint from previous training exists, then load it.
+if os.path.isfile(model_checkpoint):
+    nn.load_state_dict(torch.load(model_checkpoint))
+else:
+    torch.save(nn.state_dict(), model_checkpoint)
 
 ###############################################################################
@@ -97,15 +106,7 @@ class Flatten(torch.nn.Module):
         return x[0], x[1].flatten()
 
-model = torch.nn.Sequential(aev_computer, model, Flatten())
-
-###############################################################################
-# If checkpoint from previous training exists, then load it.
-if os.path.isfile(model_checkpoint):
-    model.load_state_dict(torch.load(model_checkpoint))
-else:
-    torch.save(model.state_dict(), model_checkpoint)
-model.to(device)
+model = torch.nn.Sequential(aev_computer, nn, Flatten()).to(device)
 
 ###############################################################################
@@ -208,6 +209,9 @@ def validation_and_checkpoint(trainer):
     if trainer.state.epoch % training_rmse_every == 1:
         evaluate(training, 'training_rmse_vs_epoch')
 
+    # checkpoint model
+    torch.save(nn.state_dict(), model_checkpoint)
+
 ###############################################################################
 # Also some to log elapsed time:
```
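The reshuffling above is not cosmetic: the checkpoint is now saved from and loaded into `nn` (the bare `ANIModel`) rather than the full `Sequential` pipeline. One plausible reading is that this keeps `model.pt` portable between this example and `cache_aev.py`, whose pipelines place `nn` at different positions, since `Sequential` prefixes every state-dict key with the submodule's index:

```python
import torch

net = torch.nn.Linear(4, 2)
pipeline = torch.nn.Sequential(net)

print(list(net.state_dict()))       # ['weight', 'bias']
print(list(pipeline.state_dict()))  # ['0.weight', '0.bias']
# A checkpoint of `net` loads wherever `net` appears; a checkpoint of
# `pipeline` only loads into a pipeline with the same layout.
```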
torchani/data/__init__.py

```diff
@@ -6,9 +6,11 @@ from os.path import join, isfile, isdir
 import os
 from ._pyanitools import anidataloader
 import torch
-from .. import utils
+from .. import utils, neurochem, aev
 import pickle
 
+default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
 
 def chunk_counts(counts, split):
     split = [x + 1 for x in split] + [None]
@@ -131,7 +133,7 @@ class BatchedANIDataset(Dataset):
     def __init__(self, path, species_tensor_converter, batch_size,
                  shuffle=True, properties=['energies'], transform=(),
-                 dtype=torch.get_default_dtype(), device=torch.device('cpu')):
+                 dtype=torch.get_default_dtype(), device=default_device):
         super(BatchedANIDataset, self).__init__()
         self.properties = properties
         self.device = device
@@ -256,7 +258,7 @@ class AEVCacheLoader:
         self.in_memory_size = in_memory_size
         if len(self.dataset) < in_memory_size:
             self.in_memory_size = len(self.dataset)
-        for i in range(in_memory_size):
+        for i in range(self.in_memory_size):
             self.index_queue.put(i)
         self.loader = torch.multiprocessing.Process(
             target=_disk_cache_loader,
@@ -289,5 +291,52 @@ class AEVCacheLoader:
     def __del__(self):
         self.loader.terminate()
 
-__all__ = ['BatchedANIDataset', 'AEVCacheLoader']
+    def __len__(self):
+        return len(self.dataset)
+
+
+builtin = neurochem.Builtins()
+
+
+def cache_aev(output, dataset_path, batchsize, device=default_device,
+              constfile=builtin.const_file, subtract_sae=False,
+              sae_file=builtin.sae_file, enable_tqdm=True, **kwargs):
+    # if output directory does not exist, then create it
+    if not os.path.exists(output):
+        os.makedirs(output)
+    device = torch.device(device)
+    consts = neurochem.Constants(constfile)
+    aev_computer = aev.AEVComputer(**consts).to(device)
+    if subtract_sae:
+        energy_shifter = neurochem.load_sae(sae_file)
+        transform = (energy_shifter.subtract_from_dataset,)
+    else:
+        transform = ()
+    dataset = BatchedANIDataset(dataset_path, consts.species_to_tensor,
+                                batchsize, device=device,
+                                transform=transform, **kwargs)
+    # dump out the dataset
+    filename = os.path.join(output, 'dataset')
+    with open(filename, 'wb') as f:
+        pickle.dump(dataset, f)
+    if enable_tqdm:
+        import tqdm
+        indices = tqdm.trange(len(dataset))
+    else:
+        indices = range(len(dataset))
+    for i in indices:
+        input_, _ = dataset[i]
+        aevs = [aev_computer(j) for j in input_]
+        aevs = [(x.cpu(), y.cpu()) for x, y in aevs]
+        filename = os.path.join(output, '{}'.format(i))
+        with open(filename, 'wb') as f:
+            pickle.dump(aevs, f)
+
+
+__all__ = ['BatchedANIDataset', 'AEVCacheLoader', 'cache_aev']
```
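The on-disk layout that `cache_aev` produces is simple: one pickle named `dataset` holding the `BatchedANIDataset` (and with it the labels), plus one pickle per batch index (`0`, `1`, ...) holding the list of CPU tensor pairs for that batch's chunks. A sketch of reading a cache by hand; the `(species, aev)` pair naming follows the usual `AEVComputer` output convention and is an assumption here, and `AEVCacheLoader` remains the supported interface:

```python
import os
import pickle

cache_dir = './training_cache'  # a directory written by torchani.data.cache_aev

# The pickled dataset object carries the batched labels.
with open(os.path.join(cache_dir, 'dataset'), 'rb') as f:
    dataset = pickle.load(f)

# File '0' holds the AEVs of batch 0: a list with one (species, aev)
# CPU tensor pair per chunk of the batch, as written by cache_aev.
with open(os.path.join(cache_dir, '0'), 'rb') as f:
    batch0_aevs = pickle.load(f)
```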
torchani/data/cache_aev.py

```diff
@@ -4,57 +4,8 @@
 computed aevs. Use the ``-h`` option for help.
 """
-import os
 import torch
-from .. import aev, neurochem
-from . import BatchedANIDataset
-import pickle
-
-builtin = neurochem.Builtins()
-default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
-default_dtype = str(torch.get_default_dtype()).split('.')[1]
-
-
-def cache_aev(output, dataset_path, batchsize, device=default_device,
-              constfile=builtin.const_file, subtract_sae=False,
-              sae_file=builtin.sae_file, enable_tqdm=True, **kwargs):
-    # if output directory does not exist, then create it
-    if not os.path.exists(output):
-        os.makedirs(output)
-    device = torch.device(device)
-    consts = neurochem.Constants(constfile)
-    aev_computer = aev.AEVComputer(**consts).to(device)
-    if subtract_sae:
-        energy_shifter = neurochem.load_sae(sae_file)
-        transform = (energy_shifter.subtract_from_dataset,)
-    else:
-        transform = ()
-    dataset = BatchedANIDataset(dataset_path, consts.species_to_tensor,
-                                batchsize, device=device,
-                                transform=transform, **kwargs)
-    # dump out the dataset
-    filename = os.path.join(output, 'dataset')
-    with open(filename, 'wb') as f:
-        pickle.dump(dataset, f)
-    if enable_tqdm:
-        import tqdm
-        indices = tqdm.trange(len(dataset))
-    else:
-        indices = range(len(dataset))
-    for i in indices:
-        input_, _ = dataset[i]
-        aevs = [aev_computer(j) for j in input_]
-        aevs = [(x.cpu(), y.cpu()) for x, y in aevs]
-        filename = os.path.join(output, '{}'.format(i))
-        with open(filename, 'wb') as f:
-            pickle.dump(aevs, f)
+from . import cache_aev, builtin, default_device
 
 if __name__ == '__main__':
@@ -72,6 +23,7 @@ if __name__ == '__main__':
     parser.add_argument('--properties', nargs='+',
                         help='Output properties to load.`',
                         default=['energies'])
+    default_dtype = str(torch.get_default_dtype()).split('.')[1]
     parser.add_argument('--dtype', help='Data type', default=default_dtype)
     parser.add_argument('-d', '--device', help='Device for training',
                         default=default_device)
```
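After this change, `torchani/data/cache_aev.py` is purely a command-line wrapper: the caching logic now lives in `torchani.data.cache_aev`, and this module only parses arguments and calls it (per its docstring, run it with `-h` for help, e.g. `python -m torchani.data.cache_aev -h`). The Python-level entry point can also be called directly; a sketch with illustrative arguments, using only the keywords visible in the new signature:

```python
from torchani.data import cache_aev

# Uses the built-in ANI-1x constants and SAE file by default; here we also
# subtract self atomic energies while caching.
cache_aev('./aev_cache', './dataset/ani_gdb_s01.h5', 256, subtract_sae=True)
```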
torchani/neurochem/__init__.py

```diff
@@ -16,7 +16,6 @@ from ..nn import ANIModel, Ensemble, Gaussian
 from ..utils import EnergyShifter
 from ..aev import AEVComputer
 from ..ignite import Container, MSELoss, TransformedLoss, RMSEMetric, MAEMetric
-from ..data import BatchedANIDataset
 
 
 class Constants(Mapping):
@@ -304,6 +303,9 @@ def hartree2kcal(x):
     return 627.509 * x
 
 
+from ..data import BatchedANIDataset  # noqa: E402
+
+
 class Trainer:
     """Train with NeuroChem training configurations.
@@ -676,4 +678,4 @@ class Trainer:
 __all__ = ['Constants', 'load_sae', 'load_model', 'load_model_ensemble',
-           'Trainer']
+           'Builtins', 'Trainer']
```
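Moving `from ..data import BatchedANIDataset` below the module's other definitions is the standard way to break an import cycle: `torchani.data` now imports `torchani.neurochem` at load time (for `Builtins`), so `neurochem` can only import `data` once everything `data` needs from it already exists. A minimal sketch of the pattern with two hypothetical modules, assuming `b` is imported first (as the package's own import order would arrange):

```python
# b.py -- plays the role of torchani.neurochem
def tool():
    return 42          # everything a.py needs is defined before the import below

from a import Thing    # noqa: E402 -- deferred to the bottom of the module


# a.py -- plays the role of torchani.data
import b               # b is mid-import here, but b.tool already exists

class Thing:
    def use(self):
        return b.tool()  # resolved at call time, after both modules loaded
```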