OpenDAS / torchani · Commits · 57dd26bf

Unverified commit 57dd26bf, authored Nov 17, 2020 by Ignacio Pickering, committed by GitHub on Nov 17, 2020

Add code snippet to save validation and training sets in nnp_training (#548)
parent 12422dd1

Showing 2 changed files with 27 additions and 3 deletions (+27 -3)
.gitignore                +1  -0
examples/nnp_training.py  +26 -3
.gitignore

@@ -39,3 +39,4 @@ Untitled.ipynb
 htmlcov/
 /include
 training_outputs/
+examples/dataset.pkl
examples/nnp_training.py

@@ -39,6 +39,7 @@ import os
 import math
 import torch.utils.tensorboard
 import tqdm
+import pickle

 # helper function to convert energy unit from Hartree to kcal/mol
 from torchani.units import hartree2kcalmol

@@ -95,9 +96,31 @@ except NameError:
 dspath = os.path.join(path, '../dataset/ani1-up_to_gdb4/ani_gdb_s01.h5')
 batch_size = 2560

-training, validation = torchani.data.load(dspath).subtract_self_energies(energy_shifter, species_order).species_to_indices(species_order).shuffle().split(0.8, None)
-training = training.collate(batch_size).cache()
-validation = validation.collate(batch_size).cache()
+pickled_dataset_path = 'dataset.pkl'
+
+# We pickle the dataset after loading to ensure we use the same validation set
+# each time we restart training, otherwise we risk mixing the validation and
+# training sets on each restart.
+if os.path.isfile(pickled_dataset_path):
+    print(f'Unpickling preprocessed dataset found in {pickled_dataset_path}')
+    with open(pickled_dataset_path, 'rb') as f:
+        dataset = pickle.load(f)
+    training = dataset['training'].collate(batch_size).cache()
+    validation = dataset['validation'].collate(batch_size).cache()
+    energy_shifter.self_energies = dataset['self_energies'].to(device)
+else:
+    print(f'Processing dataset in {dspath}')
+    training, validation = torchani.data.load(dspath)\
+        .subtract_self_energies(energy_shifter, species_order)\
+        .species_to_indices(species_order)\
+        .shuffle()\
+        .split(0.8, None)
+    with open(pickled_dataset_path, 'wb') as f:
+        pickle.dump({'training': training,
+                     'validation': validation,
+                     'self_energies': energy_shifter.self_energies.cpu()}, f)
+    training = training.collate(batch_size).cache()
+    validation = validation.collate(batch_size).cache()

 print('Self atomic energies: ', energy_shifter.self_energies)

 ###############################################################################
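A quick way to verify what the new snippet writes to disk is to read dataset.pkl back in a standalone script. The sketch below assumes only what the diff shows: the file holds a dict with 'training', 'validation', and 'self_energies' keys, and the two splits are torchani data objects, so torchani must be installed for pickle to reconstruct them. The path is the one the example uses; adjust it if the example was run from another directory.

import pickle

# Same path the example uses; adjust if it was run from another directory.
pickled_dataset_path = 'dataset.pkl'

# torchani must be installed: pickle imports it behind the scenes to
# reconstruct the training/validation split objects stored in the file.
with open(pickled_dataset_path, 'rb') as f:
    dataset = pickle.load(f)

# The dict layout matches the pickle.dump call in the diff above.
print('keys:', sorted(dataset.keys()))
print('self energies:', dataset['self_energies'])

Note that the example stores energy_shifter.self_energies with .cpu(), so the tensor loads on any machine; on unpickling, the example moves it back to the training device with .to(device).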