nnp_training_force.py 12.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# -*- coding: utf-8 -*-
"""
.. _force-training-example:

Train Neural Network Potential To Both Energies and Forces
==========================================================

We have seen how to train a neural network potential by manually writing a
training loop in :ref:`training-example`. This tutorial shows how to modify
that script to train to forces as well as energies.
"""

###############################################################################
# Most parts of this script are the same as in :ref:`training-example`, so we
# omit the comments for those parts. Please refer to :ref:`training-example`
# for more information.
17

18
19
20
21
22
23
24
import torch
import torchani
import os
import math
import torch.utils.tensorboard
import tqdm

Ignacio Pickering's avatar
Ignacio Pickering committed
25
26
27
# helper function to convert energy unit from Hartree to kcal/mol
from torchani.units import hartree2kcalmol

28
29
30
31
32
33
34
35
36
37
38
39
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Constants handed to ``torchani.AEVComputer`` below; every tensor is created
# directly on ``device``.
# NOTE(review): Rcr/Rca look like the radial/angular cutoffs and the
# Eta*/Shf*/Zeta tensors like the symmetry-function hyperparameters --
# confirm against the torchani documentation.
Rcr = 5.2000e+00
Rca = 3.5000e+00
EtaR = torch.tensor([1.6000000e+01], device=device)
ShfR = torch.tensor(
    [
        9.0000000e-01, 1.1687500e+00, 1.4375000e+00, 1.7062500e+00,
        1.9750000e+00, 2.2437500e+00, 2.5125000e+00, 2.7812500e+00,
        3.0500000e+00, 3.3187500e+00, 3.5875000e+00, 3.8562500e+00,
        4.1250000e+00, 4.3937500e+00, 4.6625000e+00, 4.9312500e+00,
    ],
    device=device,
)
Zeta = torch.tensor([3.2000000e+01], device=device)
ShfZ = torch.tensor(
    [
        1.9634954e-01, 5.8904862e-01, 9.8174770e-01, 1.3744468e+00,
        1.7671459e+00, 2.1598449e+00, 2.5525440e+00, 2.9452431e+00,
    ],
    device=device,
)
EtaA = torch.tensor([8.0000000e+00], device=device)
ShfA = torch.tensor(
    [9.0000000e-01, 1.5500000e+00, 2.2000000e+00, 2.8500000e+00],
    device=device,
)
num_species = 4
aev_computer = torchani.AEVComputer(
    Rcr, Rca, EtaR, ShfR, EtaA, Zeta, ShfA, ShfZ, num_species,
)
40
# The shifter is created with ``None``; its ``subtract_from_dataset`` method
# is later passed to the dataset loader as a transform.
energy_shifter = torchani.utils.EnergyShifter(None)
species_to_tensor = torchani.utils.ChemicalSymbolsToInts('HCNO')


# Locate the sample dataset relative to this script.  ``__file__`` raises
# NameError when it is not defined (e.g. in an interactive session), in which
# case the current working directory is used instead.
try:
    path = os.path.dirname(os.path.realpath(__file__))
except NameError:
    path = os.getcwd()
dspath = os.path.join(path, '../dataset/ani-1x/sample.h5')

batch_size = 2560

###############################################################################
# The code to create the dataset is a bit different: we need to manually
# specify that ``atomic_properties=['forces']`` so that forces will be read
# from hdf5 files.

57
# Load the dataset with per-atom forces and apply the energy shifter as a
# transform while loading.
# NOTE(review): ``split=[0.8, None]`` presumably means 80% training and the
# remainder validation -- confirm against the torchani.data documentation.
training, validation = torchani.data.load_ani_dataset(
    dspath, species_to_tensor, batch_size, rm_outlier=True,
    device=device, atomic_properties=['forces'],
    transform=[energy_shifter.subtract_from_dataset], split=[0.8, None])

print('Self atomic energies: ', energy_shifter.self_energies)

64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
###############################################################################
# When iterating over the dataset, we get pairs of input and output
# ``(species_coordinates, properties)``. In this case, ``properties`` contains
# a key ``'atomic'``, where ``properties['atomic']`` is a list of dicts
# containing the forces:

# Peek at the first training batch; element 1 of the pair holds the
# properties, and its 'atomic' entry is what we inspect here.
data = training[0]
properties = data[1]
atomic_properties = properties['atomic']
print(type(atomic_properties))
print([*atomic_properties[0].keys()])

###############################################################################
# Because of padding, some of the force entries might be 0
print(atomic_properties[0]['forces'][0])

###############################################################################
# The code to define the networks and optimizers is mostly the same

def _atomic_network(hidden_sizes):
    """Build one per-element network as a ``torch.nn.Sequential``.

    The stack is ``Linear(384, h0) -> CELU(0.1) -> ... -> Linear(h_last, 1)``:
    a CELU(0.1) follows every linear layer except the final one.

    Args:
        hidden_sizes: output widths of the hidden linear layers, in order.

    Returns:
        The assembled ``torch.nn.Sequential``.
    """
    widths = [384, *hidden_sizes, 1]
    layers = [torch.nn.Linear(widths[0], widths[1])]
    for fan_in, fan_out in zip(widths[1:-1], widths[2:]):
        layers += [torch.nn.CELU(0.1), torch.nn.Linear(fan_in, fan_out)]
    return torch.nn.Sequential(*layers)


# One network per element; only the hidden-layer widths differ.
H_network = _atomic_network((160, 128, 96))
C_network = _atomic_network((144, 112, 96))
N_network = _atomic_network((128, 112, 96))
O_network = _atomic_network((128, 112, 96))

# Combine the per-element networks into one ANI model.  The list order
# (H, C, N, O) follows the 'HCNO' string given to ChemicalSymbolsToInts.
nn = torchani.ANIModel([H_network, C_network, N_network, O_network])
print(nn)

###############################################################################
# Initialize the weights and biases.
#
# .. note::
#   PyTorch's default initialization for the weights and biases of linear
#   layers is Kaiming uniform. See: `TORCH.NN.MODULES.LINEAR`_
#   We initialize the weights similarly, but from the normal distribution.
#   The biases are initialized to zero.
#
# .. _TORCH.NN.MODULES.LINEAR:
#   https://pytorch.org/docs/stable/_modules/torch/nn/modules/linear.html#Linear


def init_params(m):
    """Re-initialize a ``torch.nn.Linear`` module in place.

    Intended to be passed to ``Module.apply``, which calls it once per
    sub-module.  Modules that are not ``torch.nn.Linear`` are left untouched.

    Args:
        m: the module being visited.

    Returns:
        None.  The weights are drawn with ``kaiming_normal_(a=1.0)`` and the
        bias, when the layer has one, is set to zero.
    """
    if not isinstance(m, torch.nn.Linear):
        return
    torch.nn.init.kaiming_normal_(m.weight, a=1.0)
    # ``Linear(..., bias=False)`` stores ``bias`` as None; zeroing None would
    # raise, so only touch the bias when it exists.
    if m.bias is not None:
        torch.nn.init.zeros_(m.bias)


# Apply the custom initialization to every sub-module of the ANI model.
nn.apply(init_params)

###############################################################################
# Let's now create a pipeline of AEV Computer --> Neural Networks.
model = torchani.nn.Sequential(aev_computer, nn).to(device)
150
151

###############################################################################
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# Here we will use Adam with weight decay for the weights and Stochastic Gradient
# Descent for biases.

# Per-element networks, in the order their parameter groups are registered
# with the optimizers (H, C, N, O).
_element_networks = (H_network, C_network, N_network, O_network)

# Weights: one parameter group per linear layer (indices 0, 2, 4, 6 of each
# Sequential).  Only the two middle layers set an explicit weight decay; the
# first and last layers use the optimizer's default.
_weight_groups = []
for _net in _element_networks:
    _weight_groups += [
        {'params': [_net[0].weight]},
        {'params': [_net[2].weight], 'weight_decay': 0.00001},
        {'params': [_net[4].weight], 'weight_decay': 0.000001},
        {'params': [_net[6].weight]},
    ]
AdamW = torchani.optim.AdamW(_weight_groups)

# Biases: one parameter group per linear layer, all trained with plain SGD.
_bias_groups = [
    {'params': [_net[_layer].bias]}
    for _net in _element_networks
    for _layer in (0, 2, 4, 6)
]
SGD = torch.optim.SGD(_bias_groups, lr=1e-3)

# Both optimizers get a plateau scheduler with identical settings.
_plateau_kwargs = dict(factor=0.5, patience=100, threshold=0)
AdamW_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(AdamW, **_plateau_kwargs)
SGD_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(SGD, **_plateau_kwargs)
203
204
205
206
207

###############################################################################
# This part of the code is also the same
latest_checkpoint = 'force-training-latest.pt'

###############################################################################
# Resume training from previously saved checkpoints:
if os.path.isfile(latest_checkpoint):
    # ``map_location=device`` lets a checkpoint written on a GPU machine be
    # restored on a CPU-only one (and vice versa).
    # NOTE: ``torch.load`` unpickles the file; only load checkpoints that this
    # script wrote itself.
    checkpoint = torch.load(latest_checkpoint, map_location=device)
    nn.load_state_dict(checkpoint['nn'])
    AdamW.load_state_dict(checkpoint['AdamW'])
    SGD.load_state_dict(checkpoint['SGD'])
    AdamW_scheduler.load_state_dict(checkpoint['AdamW_scheduler'])
    SGD_scheduler.load_state_dict(checkpoint['SGD_scheduler'])
217

218
219
220
221
222
###############################################################################
# During training, we need to validate on validation set and if validation error
# is better than the best, then save the new best model to a checkpoint


223
224
225
226
227
228
229
230
231
def validate():
    """Return the energy RMSE of ``model`` over the ``validation`` set.

    Squared errors between predicted and true energies are summed over all
    validation batches, divided by the number of predictions, square-rooted
    and passed through ``hartree2kcalmol``.

    Returns:
        The value returned by ``hartree2kcalmol`` for the RMSE.

    Raises:
        ZeroDivisionError: if ``validation`` yields no predictions.
    """
    mse_sum = torch.nn.MSELoss(reduction='sum')
    total_mse = 0.0
    count = 0
    # Validation never backpropagates, so do not build an autograd graph.
    with torch.no_grad():
        for batch_x, batch_y in validation:
            true_energies = batch_y['energies']
            predicted_energies = []
            for chunk_species, chunk_coordinates in batch_x:
                chunk_energies = model((chunk_species, chunk_coordinates)).energies
                predicted_energies.append(chunk_energies)
            predicted_energies = torch.cat(predicted_energies)
            total_mse += mse_sum(predicted_energies, true_energies).item()
            count += predicted_energies.shape[0]
    return hartree2kcalmol(math.sqrt(total_mse / count))
238
239
240


###############################################################################
# We will also use TensorBoard to visualize our training process
tensorboard = torch.utils.tensorboard.SummaryWriter()

###############################################################################
# In the training loop, we need to compute the forces and the loss for forces
# ``reduction='none'`` keeps the element-wise squared errors so the training
# loop can normalize them itself.
mse = torch.nn.MSELoss(reduction='none')

print("training starting from epoch", AdamW_scheduler.last_epoch + 1)
# We only train 3 epochs here in order to generate the docs quickly.
# Real training should take many more than 3 epochs.
max_epochs = 3
early_stopping_learning_rate = 1.0E-5
force_coefficient = 0.1  # controls the importance of energy loss vs force loss
best_model_checkpoint = 'force-training-best.pt'

256
# Main training loop.  Each epoch: validate, stop early once the learning rate
# has dropped below the threshold, save the model when the scheduler reports a
# new best RMSE, step both schedulers, log to TensorBoard, train on every
# batch, and finally write the "latest" checkpoint used for resuming.
for _ in range(AdamW_scheduler.last_epoch + 1, max_epochs):
    rmse = validate()
    print('RMSE:', rmse, 'at epoch', AdamW_scheduler.last_epoch + 1)

    learning_rate = AdamW.param_groups[0]['lr']

    if learning_rate < early_stopping_learning_rate:
        break

    # checkpoint
    if AdamW_scheduler.is_better(rmse, AdamW_scheduler.best):
        torch.save(nn.state_dict(), best_model_checkpoint)

    AdamW_scheduler.step(rmse)
    SGD_scheduler.step(rmse)

    tensorboard.add_scalar('validation_rmse', rmse, AdamW_scheduler.last_epoch)
    tensorboard.add_scalar('best_validation_rmse', AdamW_scheduler.best, AdamW_scheduler.last_epoch)
    tensorboard.add_scalar('learning_rate', learning_rate, AdamW_scheduler.last_epoch)

    # Besides being stored in x, species and coordinates are also stored in y.
    # So here, for simplicity, we just ignore the x and use y for everything.
    for i, (_, batch_y) in tqdm.tqdm(
        enumerate(training),
        total=len(training),
        desc="epoch {}".format(AdamW_scheduler.last_epoch)
    ):

        true_energies = batch_y['energies']
        predicted_energies = []
        num_atoms = []
        force_loss = []

        for chunk in batch_y['atomic']:
            chunk_species = chunk['species']
            chunk_coordinates = chunk['coordinates']
            chunk_true_forces = chunk['forces']
            # Count the entries with species >= 0 in each row; this count is
            # used below to normalize the losses.
            chunk_num_atoms = (chunk_species >= 0).to(true_energies.dtype).sum(dim=1)
            num_atoms.append(chunk_num_atoms)

            # We must set `chunk_coordinates` to make it requires grad, so
            # that we could compute force from it
            chunk_coordinates.requires_grad_(True)

            chunk_energies = model((chunk_species, chunk_coordinates)).energies

            # We can use torch.autograd.grad to compute force. Remember to
            # create graph so that the loss of the force can contribute to
            # the gradient of parameters, and also to retain graph so that
            # we can backward through it a second time when computing gradient
            # w.r.t. parameters.
            chunk_forces = -torch.autograd.grad(chunk_energies.sum(), chunk_coordinates, create_graph=True, retain_graph=True)[0]

            # Now let's compute loss for force of this chunk
            chunk_force_loss = mse(chunk_true_forces, chunk_forces).sum(dim=(1, 2)) / chunk_num_atoms

            predicted_energies.append(chunk_energies)
            force_loss.append(chunk_force_loss)

        num_atoms = torch.cat(num_atoms)
        predicted_energies = torch.cat(predicted_energies)

        # Now the total loss has two parts, energy loss and force loss
        energy_loss = (mse(predicted_energies, true_energies) / num_atoms.sqrt()).mean()
        force_loss = torch.cat(force_loss).mean()
        loss = energy_loss + force_coefficient * force_loss

        AdamW.zero_grad()
        SGD.zero_grad()
        loss.backward()
        AdamW.step()
        SGD.step()

        # write current batch loss to TensorBoard
        tensorboard.add_scalar('batch_loss', loss, AdamW_scheduler.last_epoch * len(training) + i)

    # Save everything needed to resume training from this epoch.
    torch.save({
        'nn': nn.state_dict(),
        'AdamW': AdamW.state_dict(),
        'SGD': SGD.state_dict(),
        'AdamW_scheduler': AdamW_scheduler.state_dict(),
        'SGD_scheduler': SGD_scheduler.state_dict(),
    }, latest_checkpoint)