Unverified Commit 6b058c6e authored by Gao, Xiang, committed by GitHub

Remove everything about chunking (#432)

* Remove everything about chunking

* aev.py

* neurochem trainer

* training-benchmark-nsys-profile.py

* fix eval

* training-benchmark.py

* nnp_training.py

* flake8

* nnp_training_force.py

* fix dtype of species

* fix

* flake8

* requires_grad_

* git ignore

* fix

* original

* fix

* fix

* fix

* fix

* save

* save

* save

* save

* save

* save

* save

* save

* save

* collate

* fix

* save

* fix

* save

* save

* fix

* save

* fix

* fix

* no len

* float

* save

* save

* save

* save

* save

* save

* save

* save

* save

* fix

* save

* save

* save

* save

* fix

* fix

* fix

* fix mypy

* don't remove outliers

* save

* save

* save

* fix

* flake8

* save

* fix

* flake8

* docs

* more docs

* fix test_data

* remove test_data_new

* fix
parent 338f896a
......@@ -39,3 +39,5 @@ jobs:
run: python tools/comp6.py dataset/COMP6/COMP6v1/s66x8
- name: Training Benchmark
run: python tools/training-benchmark.py dataset/ani1-up_to_gdb4/ani_gdb_s01.h5
- name: Training Benchmark Nsight System
run: python tools/training-benchmark-nsys-profile.py --dry-run dataset/ani1-up_to_gdb4/ani_gdb_s01.h5
......@@ -18,7 +18,7 @@ jobs:
python-version: [3.6, 3.8]
test-filenames: [
test_aev.py, test_aev_benzene_md.py, test_aev_nist.py, test_aev_tripeptide_md.py,
test_data_new.py, test_utils.py, test_ase.py, test_energies.py, test_periodic_table_indexing.py,
test_utils.py, test_ase.py, test_energies.py, test_periodic_table_indexing.py,
test_neurochem.py, test_vibrational.py, test_ensemble.py, test_padding.py,
test_data.py, test_forces.py, test_structure_optim.py, test_jit_builtin_models.py]
......
......@@ -32,3 +32,5 @@ dist
/download.tar.xz
*.qdrep
*.qdstrm
*.zip
Untitled.ipynb
......@@ -26,12 +26,6 @@ Datasets
========
.. automodule:: torchani.data
.. autofunction:: torchani.data.find_threshold
.. autofunction:: torchani.data.ShuffledDataset
.. autoclass:: torchani.data.CachedDataset
:members:
.. autofunction:: torchani.data.load_ani_dataset
.. autoclass:: torchani.data.PaddedBatchChunkDataset
......
......@@ -80,37 +80,17 @@ try:
except NameError:
path = os.getcwd()
dspath = os.path.join(path, '../dataset/ani1-up_to_gdb4/ani_gdb_s01.h5')
batch_size = 2560
training, validation = torchani.data.load_ani_dataset(
dspath, species_to_tensor, batch_size, rm_outlier=True, device=device,
transform=[energy_shifter.subtract_from_dataset], split=[0.8, None])
dataset = torchani.data.load(dspath).subtract_self_energies(energy_shifter).species_to_indices().shuffle()
size = len(dataset)
training, validation = dataset.split(int(0.8 * size), None)
training = training.collate(batch_size).cache()
validation = validation.collate(batch_size).cache()
print('Self atomic energies: ', energy_shifter.self_energies)
###############################################################################
# When iterating the dataset, we will get pairs of input and output
# ``(species_coordinates, properties)``, where ``species_coordinates`` is the
# input and ``properties`` is the output.
#
# ``species_coordinates`` is a list of species-coordinate pairs, with shapes
# ``(N, Na)`` and ``(N, Na, 3)``. The reason for this type is that, when the
# dataset is loaded and minibatches are generated, the whole dataset is
# shuffled, so each minibatch contains molecules with a wide range of atom
# counts. Molecules with different numbers of atoms are batched into a single
# tensor by padding: ghost atoms with species 'X' are added and computations
# are done as if they were normal atoms, but atoms with species `X` are
# ignored when computing AEVs. To avoid wasting computation on padding atoms,
# minibatches are further split into chunks. Each chunk contains molecules of
# similar size, which minimizes the total number of padding atoms that must
# be added. The input list ``species_coordinates`` contains the chunks of the
# current minibatch. Batching and chunking happen automatically, so the user
# does not need to worry about constructing chunks, but does need to compute
# the energies for each chunk and concatenate them into a single tensor.
#
# The output, ``properties``, is a dictionary holding each property. This
# allows us to extend TorchANI in the future to train on forces and other
# properties.
# When iterating the dataset, we will get a dict mapping property names to values
#
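# As an illustrative sketch (not required for training), one collated batch
# can be inspected like this; the key names follow the properties loaded
# above:
#
#   batch = next(iter(training))
#   batch['species']      # (N, A) integer tensor, padded with -1
#   batch['coordinates']  # (N, A, 3)
#   batch['energies']     # (N,)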
###############################################################################
# Now let's define atomic neural networks.
......@@ -279,16 +259,11 @@ def validate():
mse_sum = torch.nn.MSELoss(reduction='sum')
total_mse = 0.0
count = 0
for batch_x, batch_y in validation:
true_energies = batch_y['energies']
predicted_energies = []
atomic_properties = []
for chunk_species, chunk_coordinates in batch_x:
atomic_chunk = {'species': chunk_species, 'coordinates': chunk_coordinates}
atomic_properties.append(atomic_chunk)
atomic_properties = torchani.utils.pad_atomic_properties(atomic_properties)
predicted_energies = model((atomic_properties['species'], atomic_properties['coordinates'])).energies
for properties in validation:
species = properties['species'].to(device)
coordinates = properties['coordinates'].to(device).float()
true_energies = properties['energies'].to(device).float()
_, predicted_energies = model((species, coordinates))
total_mse += mse_sum(predicted_energies, true_energies).item()
count += predicted_energies.shape[0]
return hartree2kcalmol(math.sqrt(total_mse / count))
......@@ -331,26 +306,17 @@ for _ in range(AdamW_scheduler.last_epoch + 1, max_epochs):
tensorboard.add_scalar('best_validation_rmse', AdamW_scheduler.best, AdamW_scheduler.last_epoch)
tensorboard.add_scalar('learning_rate', learning_rate, AdamW_scheduler.last_epoch)
for i, (batch_x, batch_y) in tqdm.tqdm(
for i, properties in tqdm.tqdm(
enumerate(training),
total=len(training),
desc="epoch {}".format(AdamW_scheduler.last_epoch)
):
species = properties['species'].to(device)
coordinates = properties['coordinates'].to(device).float()
true_energies = properties['energies'].to(device).float()
num_atoms = (species >= 0).sum(dim=1, dtype=true_energies.dtype)
_, predicted_energies = model((species, coordinates))
true_energies = batch_y['energies']
predicted_energies = []
num_atoms = []
atomic_properties = []
for chunk_species, chunk_coordinates in batch_x:
atomic_chunk = {'species': chunk_species, 'coordinates': chunk_coordinates}
atomic_properties.append(atomic_chunk)
num_atoms.append((chunk_species >= 0).to(true_energies.dtype).sum(dim=1))
atomic_properties = torchani.utils.pad_atomic_properties(atomic_properties)
predicted_energies = model((atomic_properties['species'], atomic_properties['coordinates'])).energies
num_atoms = torch.cat(num_atoms)
loss = (mse(predicted_energies, true_energies) / num_atoms.sqrt()).mean()
AdamW.zero_grad()
......
......@@ -49,34 +49,14 @@ dspath = os.path.join(path, '../dataset/ani-1x/sample.h5')
batch_size = 2560
###############################################################################
# The code to create the dataset is a bit different: we need to manually
# specify that ``atomic_properties=['forces']`` so that forces will be read
# from hdf5 files.
training, validation = torchani.data.load_ani_dataset(
dspath, species_to_tensor, batch_size, rm_outlier=True,
device=device, atomic_properties=['forces'],
transform=[energy_shifter.subtract_from_dataset], split=[0.8, None])
dataset = torchani.data.load(dspath).subtract_self_energies(energy_shifter).species_to_indices().shuffle()
size = len(dataset)
training, validation = dataset.split(int(0.8 * size), None)
training = training.collate(batch_size).cache()
validation = validation.collate(batch_size).cache()
print('Self atomic energies: ', energy_shifter.self_energies)
###############################################################################
# When iterating the dataset, we will get pairs of input and output
# ``(species_coordinates, properties)``; in this case, ``properties`` will
# contain a key ``'atomic'``, where ``properties['atomic']`` is a list of
# dicts containing forces:
data = training[0]
properties = data[1]
atomic_properties = properties['atomic']
print(type(atomic_properties))
print(list(atomic_properties[0].keys()))
###############################################################################
# Due to padding, part of the forces might be 0
print(atomic_properties[0]['forces'][0])
###############################################################################
# The code to define networks and optimizers is mostly the same
......@@ -225,13 +205,11 @@ def validate():
mse_sum = torch.nn.MSELoss(reduction='sum')
total_mse = 0.0
count = 0
for batch_x, batch_y in validation:
true_energies = batch_y['energies']
predicted_energies = []
for chunk_species, chunk_coordinates in batch_x:
chunk_energies = model((chunk_species, chunk_coordinates)).energies
predicted_energies.append(chunk_energies)
predicted_energies = torch.cat(predicted_energies)
for properties in validation:
species = properties['species'].to(device)
coordinates = properties['coordinates'].to(device).float()
true_energies = properties['energies'].to(device).float()
_, predicted_energies = model((species, coordinates))
total_mse += mse_sum(predicted_energies, true_energies).item()
count += predicted_energies.shape[0]
return hartree2kcalmol(math.sqrt(total_mse / count))
......@@ -275,49 +253,28 @@ for _ in range(AdamW_scheduler.last_epoch + 1, max_epochs):
# Besides being stored in x, species and coordinates are also stored in y.
# So here, for simplicity, we just ignore the x and use y for everything.
for i, (_, batch_y) in tqdm.tqdm(
for i, properties in tqdm.tqdm(
enumerate(training),
total=len(training),
desc="epoch {}".format(AdamW_scheduler.last_epoch)
):
true_energies = batch_y['energies']
predicted_energies = []
num_atoms = []
force_loss = []
for chunk in batch_y['atomic']:
chunk_species = chunk['species']
chunk_coordinates = chunk['coordinates']
chunk_true_forces = chunk['forces']
chunk_num_atoms = (chunk_species >= 0).to(true_energies.dtype).sum(dim=1)
num_atoms.append(chunk_num_atoms)
# We must make `chunk_coordinates` require grad so that we can compute
# forces from it
chunk_coordinates.requires_grad_(True)
chunk_energies = model((chunk_species, chunk_coordinates)).energies
# We can use torch.autograd.grad to compute forces. Remember to
# create the graph so that the force loss can contribute to the
# gradient of the parameters, and also to retain the graph so that
# we can backward through it a second time when computing gradients
# w.r.t. parameters.
chunk_forces = -torch.autograd.grad(chunk_energies.sum(), chunk_coordinates, create_graph=True, retain_graph=True)[0]
# Now let's compute the force loss for this chunk
chunk_force_loss = mse(chunk_true_forces, chunk_forces).sum(dim=(1, 2)) / chunk_num_atoms
predicted_energies.append(chunk_energies)
force_loss.append(chunk_force_loss)
num_atoms = torch.cat(num_atoms)
predicted_energies = torch.cat(predicted_energies)
species = properties['species'].to(device)
coordinates = properties['coordinates'].to(device).float().requires_grad_(True)
true_energies = properties['energies'].to(device).float()
true_forces = properties['forces'].to(device).float()
num_atoms = (species >= 0).sum(dim=1, dtype=true_energies.dtype)
_, predicted_energies = model((species, coordinates))
# We can use torch.autograd.grad to compute forces. Remember to
# create the graph so that the force loss can contribute to the
# gradient of the parameters, and also to retain the graph so that
# we can backward through it a second time when computing gradients
# w.r.t. parameters.
forces = -torch.autograd.grad(predicted_energies.sum(), coordinates, create_graph=True, retain_graph=True)[0]
# Now the total loss has two parts, energy loss and force loss
energy_loss = (mse(predicted_energies, true_energies) / num_atoms.sqrt()).mean()
force_loss = torch.cat(force_loss).mean()
force_loss = (mse(true_forces, forces).sum(dim=(1, 2)) / num_atoms).mean()
loss = energy_loss + force_coefficient * force_loss
AdamW.zero_grad()
......
......@@ -118,7 +118,8 @@ class TestAEV(_TestAEVBase):
species = self.transform(species)
radial = self.transform(radial)
angular = self.transform(angular)
species_coordinates.append({'species': species, 'coordinates': coordinates})
species_coordinates.append(torchani.utils.broadcast_first_dim(
{'species': species, 'coordinates': coordinates}))
radial_angular.append((radial, angular))
species_coordinates = torchani.utils.pad_atomic_properties(
species_coordinates)
......
import os
import torch
import torchani
import unittest
path = os.path.dirname(os.path.realpath(__file__))
dataset_path = os.path.join(path, '../dataset/ani1-up_to_gdb4')
dataset_path2 = os.path.join(path, '../dataset/ani1-up_to_gdb4/ani_gdb_s01.h5')
dataset_path = os.path.join(path, 'dataset/ani-1x/sample.h5')
batch_size = 256
ani1x = torchani.models.ANI1x()
consts = ani1x.consts
sae_dict = ani1x.sae_dict
aev_computer = ani1x.aev_computer
class TestData(unittest.TestCase):
def setUp(self):
self.ds = torchani.data.load_ani_dataset(dataset_path,
consts.species_to_tensor,
batch_size)
def _assertTensorEqual(self, t1, t2):
self.assertLess((t1 - t2).abs().max().item(), 1e-6)
def testSplitBatch(self):
species1 = torch.randint(4, (5, 4), dtype=torch.long)
coordinates1 = torch.randn(5, 4, 3)
species2 = torch.randint(4, (2, 8), dtype=torch.long)
coordinates2 = torch.randn(2, 8, 3)
species3 = torch.randint(4, (10, 20), dtype=torch.long)
coordinates3 = torch.randn(10, 20, 3)
species_coordinates = torchani.utils.pad_atomic_properties([
{'species': species1, 'coordinates': coordinates1},
{'species': species2, 'coordinates': coordinates2},
{'species': species3, 'coordinates': coordinates3},
])
species = species_coordinates['species']
coordinates = species_coordinates['coordinates']
natoms = (species >= 0).to(torch.long).sum(1)
chunks = torchani.data.split_batch(natoms, species_coordinates)
start = 0
last = None
for chunk in chunks:
s = chunk['species']
c = chunk['coordinates']
n = (s >= 0).to(torch.long).sum(1)
if last is not None:
self.assertNotEqual(last[-1], n[0])
conformations = s.shape[0]
self.assertGreater(conformations, 0)
s_ = species[start:(start + conformations), ...]
c_ = coordinates[start:(start + conformations), ...]
sc = torchani.utils.strip_redundant_padding({'species': s_, 'coordinates': c_})
s_ = sc['species']
c_ = sc['coordinates']
self._assertTensorEqual(s, s_)
self._assertTensorEqual(c, c_)
start += conformations
sc = torchani.utils.pad_atomic_properties(chunks)
s = sc['species']
c = sc['coordinates']
self._assertTensorEqual(s, species)
self._assertTensorEqual(c, coordinates)
def testTensorShape(self):
for i in self.ds:
input_, output = i
input_ = [{'species': x[0], 'coordinates': x[1]} for x in input_]
species_coordinates = torchani.utils.pad_atomic_properties(input_)
species = species_coordinates['species']
coordinates = species_coordinates['coordinates']
energies = output['energies']
ds = torchani.data.load(dataset_path).subtract_self_energies(sae_dict).species_to_indices().shuffle().collate(batch_size).cache()
for d in ds:
species = d['species']
coordinates = d['coordinates']
energies = d['energies']
self.assertEqual(len(species.shape), 2)
self.assertLessEqual(species.shape[0], batch_size)
self.assertEqual(len(coordinates.shape), 3)
......@@ -80,11 +28,11 @@ class TestData(unittest.TestCase):
self.assertEqual(coordinates.shape[0], energies.shape[0])
def testNoUnnecessaryPadding(self):
for i in self.ds:
for input_ in i[0]:
species, _ = input_
non_padding = (species >= 0)[:, -1].nonzero()
self.assertGreater(non_padding.numel(), 0)
ds = torchani.data.load(dataset_path).subtract_self_energies(sae_dict).species_to_indices().shuffle().collate(batch_size).cache()
for d in ds:
species = d['species']
non_padding = (species >= 0)[:, -1].nonzero()
self.assertGreater(non_padding.numel(), 0)
if __name__ == '__main__':
......
import torchani
import unittest
import pkbar
import torch
import os
path = os.path.dirname(os.path.realpath(__file__))
dspath = os.path.join(path, '../dataset/ani1-up_to_gdb4/ani_gdb_s03.h5')
batch_size = 2560
chunk_threshold = 5
other_properties = {'properties': ['dipoles', 'forces', 'energies'],
'padding_values': [None, 0, None],
'padded_shapes': [(batch_size, 3), (batch_size, -1, 3), (batch_size, )],
'dtypes': [torch.float32, torch.float32, torch.float64],
}
other_properties = {'properties': ['energies'],
'padding_values': [None],
'padded_shapes': [(batch_size, )],
'dtypes': [torch.float64],
}
class TestFindThreshold(unittest.TestCase):
def setUp(self):
print('.. check find threshold to split chunks')
def testFindThreshould(self):
torchani.data.find_threshold(dspath, batch_size=batch_size, threshold_max=10)
class TestShuffledData(unittest.TestCase):
def setUp(self):
print('.. setup shuffle dataset')
self.ds = torchani.data.ShuffledDataset(dspath, batch_size=batch_size,
chunk_threshold=chunk_threshold,
num_workers=2,
other_properties=other_properties,
subtract_self_energies=True)
self.chunks, self.properties = iter(self.ds).next()
def testTensorShape(self):
print('=> checking tensor shape')
print('the first batch is ([chunk1, chunk2, ...], {"energies", "force", ...}) in which chunk1=(species, coordinates)')
batch_len = 0
print('1. chunks')
for i, chunk in enumerate(self.chunks):
print('chunk{}'.format(i + 1), 'species:', list(chunk[0].size()), chunk[0].dtype,
'coordinates:', list(chunk[1].size()), chunk[1].dtype)
# check dtype
self.assertEqual(chunk[0].dtype, torch.int64)
self.assertEqual(chunk[1].dtype, torch.float32)
# check shape
self.assertEqual(chunk[1].shape[2], 3)
self.assertEqual(chunk[1].shape[:2], chunk[0].shape[:2])
batch_len += chunk[0].shape[0]
print('2. properties')
for i, key in enumerate(other_properties['properties']):
print(key, list(self.properties[key].size()), self.properties[key].dtype)
# check dtype
self.assertEqual(self.properties[key].dtype, other_properties['dtypes'][i])
# shape[0] == batch_size
self.assertEqual(self.properties[key].shape[0], other_properties['padded_shapes'][i][0])
# check len(shape)
self.assertEqual(len(self.properties[key].shape), len(other_properties['padded_shapes'][i]))
def testLoadDataset(self):
print('=> test loading all dataset')
pbar = pkbar.Pbar('loading and processing dataset into cpu memory, total '
+ 'batches: {}, batch_size: {}'.format(len(self.ds), batch_size),
len(self.ds))
for i, _ in enumerate(self.ds):
pbar.update(i)
def testSplitDataset(self):
print('=> test splitting dataset')
train_ds, val_ds = torchani.data.ShuffledDataset(dspath, batch_size=batch_size, chunk_threshold=chunk_threshold, num_workers=2, validation_split=0.1)
frac = len(val_ds) / (len(val_ds) + len(train_ds))
self.assertLess(abs(frac - 0.1), 0.05)
def testNoUnnecessaryPadding(self):
print('=> checking No Unnecessary Padding')
for i, chunk in enumerate(self.chunks):
species, _ = chunk
non_padding = (species >= 0)[:, -1].nonzero()
self.assertGreater(non_padding.numel(), 0)
class TestCachedData(unittest.TestCase):
def setUp(self):
print('.. setup cached dataset')
self.ds = torchani.data.CachedDataset(dspath, batch_size=batch_size, device='cpu',
chunk_threshold=chunk_threshold,
other_properties=other_properties,
subtract_self_energies=True)
self.chunks, self.properties = self.ds[0]
def testTensorShape(self):
print('=> checking tensor shape')
print('the first batch is ([chunk1, chunk2, ...], {"energies", "force", ...}) in which chunk1=(species, coordinates)')
batch_len = 0
print('1. chunks')
for i, chunk in enumerate(self.chunks):
print('chunk{}'.format(i + 1), 'species:', list(chunk[0].size()), chunk[0].dtype,
'coordinates:', list(chunk[1].size()), chunk[1].dtype)
# check dtype
self.assertEqual(chunk[0].dtype, torch.int64)
self.assertEqual(chunk[1].dtype, torch.float32)
# check shape
self.assertEqual(chunk[1].shape[2], 3)
self.assertEqual(chunk[1].shape[:2], chunk[0].shape[:2])
batch_len += chunk[0].shape[0]
print('2. properties')
for i, key in enumerate(other_properties['properties']):
print(key, list(self.properties[key].size()), self.properties[key].dtype)
# check dtype
self.assertEqual(self.properties[key].dtype, other_properties['dtypes'][i])
# shape[0] == batch_size
self.assertEqual(self.properties[key].shape[0], other_properties['padded_shapes'][i][0])
# check len(shape)
self.assertEqual(len(self.properties[key].shape), len(other_properties['padded_shapes'][i]))
def testLoadDataset(self):
print('=> test loading all dataset')
self.ds.load()
def testSplitDataset(self):
print('=> test splitting dataset')
train_dataset, val_dataset = self.ds.split(0.1)
frac = len(val_dataset) / len(self.ds)
self.assertLess(abs(frac - 0.1), 0.05)
def testNoUnnecessaryPadding(self):
print('=> checking No Unnecessary Padding')
for i, chunk in enumerate(self.chunks):
species, _ = chunk
non_padding = (species >= 0)[:, -1].nonzero()
self.assertGreater(non_padding.numel(), 0)
if __name__ == "__main__":
unittest.main()
......@@ -54,7 +54,8 @@ class TestEnergies(unittest.TestCase):
coordinates = self.transform(coordinates)
species = self.transform(species)
e = self.transform(e)
species_coordinates.append({'species': species, 'coordinates': coordinates})
species_coordinates.append(
torchani.utils.broadcast_first_dim({'species': species, 'coordinates': coordinates}))
energies.append(e)
species_coordinates = torchani.utils.pad_atomic_properties(
species_coordinates)
......
......@@ -55,7 +55,8 @@ class TestForce(unittest.TestCase):
species = self.transform(species)
forces = self.transform(forces)
coordinates.requires_grad_(True)
species_coordinates.append({'species': species, 'coordinates': coordinates})
species_coordinates.append(torchani.utils.broadcast_first_dim(
{'species': species, 'coordinates': coordinates}))
species_coordinates = torchani.utils.pad_atomic_properties(
species_coordinates)
_, energies = self.model((species_coordinates['species'], species_coordinates['coordinates']))
......
......@@ -18,16 +18,14 @@ other_properties = {'properties': ['energies'],
class TestBuiltinModelsJIT(unittest.TestCase):
def setUp(self):
self.ds = torchani.data.CachedDataset(dspath, batch_size=batch_size, device='cpu',
chunk_threshold=chunk_threshold,
other_properties=other_properties,
subtract_self_energies=True)
self.ani1ccx = torchani.models.ANI1ccx()
self.ds = torchani.data.load(dspath).subtract_self_energies(self.ani1ccx.sae_dict).species_to_indices().shuffle().collate(256).cache()
def _test_model(self, model):
chunk = self.ds[0][0][0]
_, e = model(chunk)
_, e2 = torch.jit.script(model)(chunk)
properties = next(iter(self.ds))
input_ = (properties['species'], properties['coordinates'].float())
_, e = model(input_)
_, e2 = torch.jit.script(model)(input_)
self.assertTrue(torch.allclose(e, e2))
def _test_ensemble(self, ensemble):
......
......@@ -3,6 +3,9 @@ import torch
import torchani
b = torchani.utils.broadcast_first_dim
class TestPaddings(unittest.TestCase):
def testVectorSpecies(self):
......@@ -11,8 +14,8 @@ class TestPaddings(unittest.TestCase):
species2 = torch.tensor([[3, 2, 0, 1, 0]])
coordinates2 = torch.zeros(2, 5, 3)
atomic_properties = torchani.utils.pad_atomic_properties([
{'species': species1, 'coordinates': coordinates1},
{'species': species2, 'coordinates': coordinates2},
b({'species': species1, 'coordinates': coordinates1}),
b({'species': species2, 'coordinates': coordinates2}),
])
self.assertEqual(atomic_properties['species'].shape[0], 7)
self.assertEqual(atomic_properties['species'].shape[1], 5)
......@@ -34,8 +37,8 @@ class TestPaddings(unittest.TestCase):
species2 = torch.tensor([[3, 2, 0, 1, 0]])
coordinates2 = torch.zeros(2, 5, 3)
atomic_properties = torchani.utils.pad_atomic_properties([
{'species': species1, 'coordinates': coordinates1},
{'species': species2, 'coordinates': coordinates2},
b({'species': species1, 'coordinates': coordinates1}),
b({'species': species2, 'coordinates': coordinates2}),
])
self.assertEqual(atomic_properties['species'].shape[0], 7)
self.assertEqual(atomic_properties['species'].shape[1], 5)
......@@ -63,8 +66,8 @@ class TestPaddings(unittest.TestCase):
species2 = torch.tensor([[3, 2, 0, 1, 0]])
coordinates2 = torch.zeros(2, 5, 3)
atomic_properties = torchani.utils.pad_atomic_properties([
{'species': species1, 'coordinates': coordinates1},
{'species': species2, 'coordinates': coordinates2},
b({'species': species1, 'coordinates': coordinates1}),
b({'species': species2, 'coordinates': coordinates2}),
])
self.assertEqual(atomic_properties['species'].shape[0], 7)
self.assertEqual(atomic_properties['species'].shape[1], 5)
......@@ -98,28 +101,28 @@ class TestStripRedundantPadding(unittest.TestCase):
species2 = torch.randint(4, (2, 5), dtype=torch.long)
coordinates2 = torch.randn(2, 5, 3)
atomic_properties12 = torchani.utils.pad_atomic_properties([
{'species': species1, 'coordinates': coordinates1},
{'species': species2, 'coordinates': coordinates2},
b({'species': species1, 'coordinates': coordinates1}),
b({'species': species2, 'coordinates': coordinates2}),
])
species12 = atomic_properties12['species']
coordinates12 = atomic_properties12['coordinates']
species3 = torch.randint(4, (2, 10), dtype=torch.long)
coordinates3 = torch.randn(2, 10, 3)
atomic_properties123 = torchani.utils.pad_atomic_properties([
{'species': species1, 'coordinates': coordinates1},
{'species': species2, 'coordinates': coordinates2},
{'species': species3, 'coordinates': coordinates3},
b({'species': species1, 'coordinates': coordinates1}),
b({'species': species2, 'coordinates': coordinates2}),
b({'species': species3, 'coordinates': coordinates3}),
])
species123 = atomic_properties123['species']
coordinates123 = atomic_properties123['coordinates']
species_coordinates1_ = torchani.utils.strip_redundant_padding(
{'species': species123[:5, ...], 'coordinates': coordinates123[:5, ...]})
b({'species': species123[:5, ...], 'coordinates': coordinates123[:5, ...]}))
species1_ = species_coordinates1_['species']
coordinates1_ = species_coordinates1_['coordinates']
self._assertTensorEqual(species1_, species1)
self._assertTensorEqual(coordinates1_, coordinates1)
species_coordinates12_ = torchani.utils.strip_redundant_padding(
{'species': species123[:7, ...], 'coordinates': coordinates123[:7, ...]})
b({'species': species123[:7, ...], 'coordinates': coordinates123[:7, ...]}))
species12_ = species_coordinates12_['species']
coordinates12_ = species_coordinates12_['coordinates']
self._assertTensorEqual(species12_, species12)
......
......@@ -55,24 +55,11 @@ if __name__ == "__main__":
parser.add_argument('-b', '--batch_size',
help='Number of conformations of each batch',
default=2560, type=int)
parser.add_argument('-o', '--original_dataset_api',
help='use original dataset api',
dest='dataset',
action='store_const',
const='original')
parser.add_argument('-s', '--shuffle_dataset_api',
help='use shuffle dataset api',
dest='dataset',
action='store_const',
const='shuffle')
parser.add_argument('-c', '--cache_dataset_api',
help='use cache dataset api',
dest='dataset',
action='store_const',
const='cache')
parser.set_defaults(dataset='shuffle')
parser.add_argument('-d', '--dry-run',
help='just run it in a CI without GPU',
action='store_true')
parser = parser.parse_args()
parser.device = torch.device('cuda')
parser.device = torch.device('cpu' if parser.dry_run else 'cuda')
Rcr = 5.2000e+00
Rca = 3.5000e+00
......@@ -90,45 +77,9 @@ if __name__ == "__main__":
optimizer = torch.optim.Adam(model.parameters(), lr=0.000001)
mse = torch.nn.MSELoss(reduction='none')
if parser.dataset == 'shuffle':
print('using shuffle dataset API')
print('=> loading dataset...')
dataset = torchani.data.ShuffledDataset(file_path=parser.dataset_path,
species_order=['H', 'C', 'N', 'O'],
subtract_self_energies=True,
batch_size=parser.batch_size,
num_workers=2)
print('=> the first batch is ([chunk1, chunk2, ...], {"energies", "force", ...}) in which chunk1=(species, coordinates)')
chunks, properties = iter(dataset).next()
elif parser.dataset == 'original':
print('using original dataset API')
print('=> loading dataset...')
energy_shifter = torchani.utils.EnergyShifter(None)
species_to_tensor = torchani.utils.ChemicalSymbolsToInts(['H', 'C', 'N', 'O'])
dataset = torchani.data.load_ani_dataset(parser.dataset_path, species_to_tensor,
parser.batch_size, device=parser.device,
transform=[energy_shifter.subtract_from_dataset])
print('=> the first batch is ([chunk1, chunk2, ...], {"energies", "force", ...}) in which chunk1=(species, coordinates)')
chunks, properties = dataset[0]
elif parser.dataset == 'cache':
print('using cache dataset API')
print('=> loading dataset...')
dataset = torchani.data.CachedDataset(file_path=parser.dataset_path,
species_order=['H', 'C', 'N', 'O'],
subtract_self_energies=True,
batch_size=parser.batch_size)
print('=> caching all dataset into cpu')
pbar = pkbar.Pbar('loading and processing dataset into cpu memory, total '
+ 'batches: {}, batch_size: {}'.format(len(dataset), parser.batch_size),
len(dataset))
for i, t in enumerate(dataset):
pbar.update(i)
print('=> the first batch is ([chunk1, chunk2, ...], {"energies", "force", ...}) in which chunk1=(species, coordinates)')
chunks, properties = dataset[0]
for i, chunk in enumerate(chunks):
print('chunk{}'.format(i + 1), list(chunk[0].size()), list(chunk[1].size()))
print('energies', list(properties['energies'].size()))
print('=> loading dataset...')
shifter = torchani.EnergyShifter(None)
dataset = list(torchani.data.load(parser.dataset_path).subtract_self_energies(shifter).species_to_indices().shuffle().collate(parser.batch_size))
print('=> start warming up')
total_batch_counter = 0
......@@ -137,36 +88,24 @@ if __name__ == "__main__":
print('Epoch: %d/inf' % (epoch + 1,))
progbar = pkbar.Kbar(target=len(dataset) - 1, width=8)
for i, (batch_x, batch_y) in enumerate(dataset):
for i, properties in enumerate(dataset):
if total_batch_counter == WARM_UP_BATCHES:
if not parser.dry_run and total_batch_counter == WARM_UP_BATCHES:
print('=> warm up finished, start profiling')
enable_timers(model)
torch.cuda.cudart().cudaProfilerStart()
PROFILING_STARTED = (total_batch_counter >= WARM_UP_BATCHES)
PROFILING_STARTED = not parser.dry_run and (total_batch_counter >= WARM_UP_BATCHES)
if PROFILING_STARTED:
torch.cuda.nvtx.range_push("batch{}".format(total_batch_counter))
true_energies = batch_y['energies'].to(parser.device)
predicted_energies = []
num_atoms = []
for j, (chunk_species, chunk_coordinates) in enumerate(batch_x):
if PROFILING_STARTED:
torch.cuda.nvtx.range_push("chunk{}".format(j))
chunk_species = chunk_species.to(parser.device)
chunk_coordinates = chunk_coordinates.to(parser.device)
num_atoms.append((chunk_species >= 0).to(true_energies.dtype).sum(dim=1))
with torch.autograd.profiler.emit_nvtx(enabled=PROFILING_STARTED, record_shapes=True):
_, chunk_energies = model((chunk_species, chunk_coordinates))
predicted_energies.append(chunk_energies)
if PROFILING_STARTED:
torch.cuda.nvtx.range_pop()
num_atoms = torch.cat(num_atoms)
predicted_energies = torch.cat(predicted_energies).to(true_energies.dtype)
species = properties['species'].to(parser.device)
coordinates = properties['coordinates'].to(parser.device).float()
true_energies = properties['energies'].to(parser.device).float()
num_atoms = (species >= 0).sum(dim=1, dtype=true_energies.dtype)
with torch.autograd.profiler.emit_nvtx(enabled=PROFILING_STARTED, record_shapes=True):
_, predicted_energies = model((species, coordinates))
loss = (mse(predicted_energies, true_energies) / num_atoms.sqrt()).mean()
rmse = hartree2kcalmol((mse(predicted_energies, true_energies)).mean()).detach().cpu().numpy()
......
......@@ -49,25 +49,9 @@ if __name__ == "__main__":
parser.add_argument('-b', '--batch_size',
help='Number of conformations of each batch',
default=2560, type=int)
parser.add_argument('-o', '--original_dataset_api',
help='use original dataset api',
dest='dataset',
action='store_const',
const='original')
parser.add_argument('-s', '--shuffle_dataset_api',
help='use shuffle dataset api',
dest='dataset',
action='store_const',
const='shuffle')
parser.add_argument('-c', '--cache_dataset_api',
help='use cache dataset api',
dest='dataset',
action='store_const',
const='cache')
parser.add_argument('-y', '--synchronize',
action='store_true',
help='whether to insert torch.cuda.synchronize() at the end of each function')
parser.set_defaults(dataset='shuffle')
parser.add_argument('-n', '--num_epochs',
help='epochs',
default=1, type=int)
......@@ -107,48 +91,9 @@ if __name__ == "__main__":
model[0].forward = time_func('total', model[0].forward)
model[1].forward = time_func('forward', model[1].forward)
if parser.dataset == 'shuffle':
torchani.data.ShuffledDataset = time_func('data_loading', torchani.data.ShuffledDataset)
print('using shuffle dataset API')
print('=> loading dataset...')
dataset = torchani.data.ShuffledDataset(file_path=parser.dataset_path,
species_order=['H', 'C', 'N', 'O'],
subtract_self_energies=True,
batch_size=parser.batch_size,
num_workers=2)
print('=> the first batch is ([chunk1, chunk2, ...], {"energies", "force", ...}) in which chunk1=(species, coordinates)')
chunks, properties = iter(dataset).next()
elif parser.dataset == 'original':
torchani.data.load_ani_dataset = time_func('data_loading', torchani.data.load_ani_dataset)
print('using original dataset API')
print('=> loading dataset...')
energy_shifter = torchani.utils.EnergyShifter(None)
species_to_tensor = torchani.utils.ChemicalSymbolsToInts(['H', 'C', 'N', 'O'])
dataset = torchani.data.load_ani_dataset(parser.dataset_path, species_to_tensor,
parser.batch_size, device=parser.device,
transform=[energy_shifter.subtract_from_dataset])
print('=> the first batch is ([chunk1, chunk2, ...], {"energies", "force", ...}) in which chunk1=(species, coordinates)')
chunks, properties = dataset[0]
elif parser.dataset == 'cache':
torchani.data.CachedDataset = time_func('data_loading', torchani.data.CachedDataset)
print('using cache dataset API')
print('=> loading dataset...')
dataset = torchani.data.CachedDataset(file_path=parser.dataset_path,
species_order=['H', 'C', 'N', 'O'],
subtract_self_energies=True,
batch_size=parser.batch_size)
print('=> caching all dataset into cpu')
pbar = pkbar.Pbar('loading and processing dataset into cpu memory, total '
+ 'batches: {}, batch_size: {}'.format(len(dataset), parser.batch_size),
len(dataset))
for i, t in enumerate(dataset):
pbar.update(i)
print('=> the first batch is ([chunk1, chunk2, ...], {"energies", "force", ...}) in which chunk1=(species, coordinates)')
chunks, properties = dataset[0]
for i, chunk in enumerate(chunks):
print('chunk{}'.format(i + 1), list(chunk[0].size()), list(chunk[1].size()))
print('energies', list(properties['energies'].size()))
print('=> loading dataset...')
shifter = torchani.EnergyShifter(None)
dataset = list(torchani.data.load(parser.dataset_path).subtract_self_energies(shifter).species_to_indices().shuffle().collate(parser.batch_size))
print('=> start training')
start = time.time()
......@@ -158,24 +103,12 @@ if __name__ == "__main__":
print('Epoch: %d/%d' % (epoch + 1, parser.num_epochs))
progbar = pkbar.Kbar(target=len(dataset) - 1, width=8)
for i, (batch_x, batch_y) in enumerate(dataset):
true_energies = batch_y['energies'].to(parser.device)
predicted_energies = []
num_atoms = []
atomic_properties = []
for chunk_species, chunk_coordinates in batch_x:
chunk_species = chunk_species.to(parser.device)
chunk_coordinates = chunk_coordinates.to(parser.device)
atomic_chunk = {'species': chunk_species, 'coordinates': chunk_coordinates}
atomic_properties.append(atomic_chunk)
num_atoms.append((chunk_species >= 0).to(true_energies.dtype).sum(dim=1))
atomic_properties = torchani.utils.pad_atomic_properties(atomic_properties)
predicted_energies = model((atomic_properties['species'], atomic_properties['coordinates'])).energies.to(true_energies.dtype)
num_atoms = torch.cat(num_atoms)
for i, properties in enumerate(dataset):
species = properties['species'].to(parser.device)
coordinates = properties['coordinates'].to(parser.device).float()
true_energies = properties['energies'].to(parser.device).float()
num_atoms = (species >= 0).sum(dim=1, dtype=true_energies.dtype)
_, predicted_energies = model((species, coordinates))
loss = (mse(predicted_energies, true_energies) / num_atoms.sqrt()).mean()
rmse = hartree2kcalmol((mse(predicted_energies, true_energies)).mean()).detach().cpu().numpy()
loss.backward()
......@@ -191,6 +124,5 @@ if __name__ == "__main__":
if k.startswith('torchani.'):
print('{} - {:.1f}s'.format(k, timers[k]))
print('Total AEV - {:.1f}s'.format(timers['total']))
print('Data Loading - {:.1f}s'.format(timers['data_loading']))
print('NN - {:.1f}s'.format(timers['forward']))
print('Epoch time - {:.1f}s'.format(stop - start))
......@@ -408,8 +408,8 @@ class AEVComputer(torch.nn.Module):
If you don't care about periodic boundary conditions at all,
then input can be a tuple of two tensors: species, coordinates.
species must have shape ``(C, A)``, coordinates must have shape
``(C, A, 3)`` where ``C`` is the number of molecules in a chunk,
species must have shape ``(N, A)``, coordinates must have shape
``(N, A, 3)`` where ``N`` is the number of molecules in a batch,
and ``A`` is the number of atoms.
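A minimal sketch of such an input (illustrative values; ``aev_computer``
stands for an already constructed :class:`AEVComputer`, and species ``-1``
marks padding atoms):
.. code-block:: python
    species = torch.tensor([[0, 1, 1, -1], [0, 0, 0, 0]])  # (N=2, A=4)
    coordinates = torch.randn(2, 4, 3)                      # (N, A, 3)
    species, aevs = aev_computer((species, coordinates))    # aevs: (N, A, aev_length)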
.. warning::
......@@ -437,7 +437,7 @@ class AEVComputer(torch.nn.Module):
Returns:
NamedTuple: Species and AEVs. species are the species from the input
unchanged, and AEVs is a tensor of shape ``(C, A, self.aev_length())``
unchanged, and AEVs is a tensor of shape ``(N, A, self.aev_length())``
"""
species, coordinates = input_
......
# -*- coding: utf-8 -*-
"""Tools for loading, shuffling, and batching ANI datasets"""
"""Tools for loading, shuffling, and batching ANI datasets
`torchani.data.load(path)` creates an iterable of raw data,
where species are strings and coordinates are numpy ndarrays.
You can transform this iterable by chaining transformations.
To apply a transformation, just call `it.transformation_name()`.
Available transformations are listed below:
- `species_to_indices` converts species from strings to numbers.
- `subtract_self_energies` subtracts self energies. You can pass
a dict of self energies, or an `EnergyShifter` to let it infer the
self energies from the dataset and store the result in the given shifter.
- `remove_outliers` removes conformations whose energies are outliers
(it must run after `subtract_self_energies`).
- `shuffle` shuffles the dataset.
- `cache` caches the result of the previous transformations.
- `collate` pads the conformations, converts them to tensors, and stacks them
together to get batches.
- `pin_memory` copies the tensors to pinned memory so that later transfers
to CUDA are faster.
You can also use `split` to split the iterable into pieces. Use `split` as:
.. code-block:: python
it.split(size1, size2, None)
where the trailing ``None`` indicates that we want to use all of the rest.
Example:
.. code-block:: python
energy_shifter = torchani.utils.EnergyShifter(None)
dataset = torchani.data.load(path).subtract_self_energies(energy_shifter).species_to_indices().shuffle()
size = len(dataset)
training, validation = dataset.split(int(0.8 * size), None)
training = training.collate(batch_size).cache()
validation = validation.collate(batch_size).cache()
"""
from torch.utils.data import Dataset
from os.path import join, isfile, isdir
import os
from ._pyanitools import anidataloader
import torch
from .. import utils
from .new import CachedDataset, ShuffledDataset, find_threshold
default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
def chunk_counts(counts, split):
split = [x + 1 for x in split] + [None]
count_chunks = []
start = 0
for i in split:
count_chunks.append(counts[start:i])
start = i
chunk_molecules = [sum([y[1] for y in x]) for x in count_chunks]
chunk_maxatoms = [x[-1][0] for x in count_chunks]
return chunk_molecules, chunk_maxatoms
def split_cost(counts, split):
split_min_cost = 40000
cost = 0
chunk_molecules, chunk_maxatoms = chunk_counts(counts, split)
for molecules, maxatoms in zip(chunk_molecules, chunk_maxatoms):
cost += max(molecules * maxatoms ** 2, split_min_cost)
return cost
def split_batch(natoms, atomic_properties):
# count the number of conformations for each molecule size (natoms)
natoms = natoms.tolist()
counts = []
for i in natoms:
if not counts:
counts.append([i, 1])
continue
if i == counts[-1][0]:
counts[-1][1] += 1
else:
counts.append([i, 1])
# find best split using greedy strategy
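# Greedy strategy: in each pass, try every remaining candidate split point
# and keep the one that lowers the total cost the most; stop when no
# candidate improves it. split_cost() charges each chunk
# molecules * maxatoms ** 2, with a floor of split_min_cost, so very small
# chunks are not worth creating.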
split = []
cost = split_cost(counts, split)
improved = True
while improved:
improved = False
cycle_split = split
cycle_cost = cost
for i in range(len(counts) - 1):
if i not in split:
s = sorted(split + [i])
c = split_cost(counts, s)
if c < cycle_cost:
improved = True
cycle_cost = c
cycle_split = s
if improved:
split = cycle_split
cost = cycle_cost
# do split
chunk_molecules, _ = chunk_counts(counts, split)
num_chunks = None
for k in atomic_properties:
atomic_properties[k] = atomic_properties[k].split(chunk_molecules)
if num_chunks is None:
num_chunks = len(atomic_properties[k])
else:
assert num_chunks == len(atomic_properties[k])
chunks = []
for i in range(num_chunks):
chunk = {k: atomic_properties[k][i] for k in atomic_properties}
chunks.append(utils.strip_redundant_padding(chunk))
return chunks
def load_and_pad_whole_dataset(path, species_tensor_converter, shuffle=True,
properties=('energies',), atomic_properties=()):
# get name of files storing data
files = []
if isdir(path):
for f in os.listdir(path):
f = join(path, f)
if isfile(f) and (f.endswith('.h5') or f.endswith('.hdf5')):
files.append(f)
elif isfile(path):
files = [path]
else:
raise ValueError('Bad path')
# load full dataset
atomic_properties_ = []
properties = {k: [] for k in properties}
for f in files:
for m in anidataloader(f):
atomic_properties_.append(dict(
species=species_tensor_converter(m['species']).unsqueeze(0),
**{
k: torch.from_numpy(m[k]).to(torch.double)
for k in ['coordinates'] + list(atomic_properties)
}
))
for i in properties:
p = torch.from_numpy(m[i]).to(torch.double)
properties[i].append(p)
atomic_properties = utils.pad_atomic_properties(atomic_properties_)
for i in properties:
properties[i] = torch.cat(properties[i])
# shuffle if required
molecules = atomic_properties['species'].shape[0]
if shuffle:
indices = torch.randperm(molecules)
for i in properties:
properties[i] = properties[i].index_select(0, indices)
for i in atomic_properties:
atomic_properties[i] = atomic_properties[i].index_select(0, indices)
return atomic_properties, properties
def split_whole_into_batches_and_chunks(atomic_properties, properties, batch_size):
molecules = atomic_properties['species'].shape[0]
# split into minibatches
for k in properties:
properties[k] = properties[k].split(batch_size)
for k in atomic_properties:
atomic_properties[k] = atomic_properties[k].split(batch_size)
# further split batch into chunks and strip redundant padding
batches = []
num_batches = (molecules + batch_size - 1) // batch_size
for i in range(num_batches):
batch_properties = {k: v[i] for k, v in properties.items()}
batch_atomic_properties = {k: v[i] for k, v in atomic_properties.items()}
species = batch_atomic_properties['species']
natoms = (species >= 0).to(torch.long).sum(1)
# sort batch by number of atoms to prepare for splitting
natoms, indices = natoms.sort()
for k in batch_properties:
batch_properties[k] = batch_properties[k].index_select(0, indices)
for k in batch_atomic_properties:
batch_atomic_properties[k] = batch_atomic_properties[k].index_select(0, indices)
batch_atomic_properties = split_batch(natoms, batch_atomic_properties)
batches.append((batch_atomic_properties, batch_properties))
return batches
class PaddedBatchChunkDataset(Dataset):
r""" Dataset that contains batches in 'chunks', with padded structures
This dataset acts as a container of batches to be used when training. Each
batch is broken up into 'chunks', each of which holds molecules with a
similar number of atoms that have been padded with dummy atoms so that they
share the same tensor dimensions.
def __init__(self, atomic_properties, properties, batch_size,
dtype=torch.get_default_dtype(), device=default_device):
super().__init__()
self.device = device
self.dtype = dtype
# convert to desired dtype
for k in properties:
properties[k] = properties[k].to(dtype)
for k in atomic_properties:
if k == 'species':
continue
atomic_properties[k] = atomic_properties[k].to(dtype)
self.batches = split_whole_into_batches_and_chunks(atomic_properties, properties, batch_size)
def __getitem__(self, idx):
atomic_properties, properties = self.batches[idx]
atomic_properties, properties = atomic_properties.copy(), properties.copy()
species_coordinates = []
for chunk in atomic_properties:
for k in chunk:
chunk[k] = chunk[k].to(self.device)
species_coordinates.append((chunk['species'], chunk['coordinates']))
for k in properties:
properties[k] = properties[k].to(self.device)
properties['atomic'] = atomic_properties
return species_coordinates, properties
import importlib
import functools
import math
import random
from collections import Counter
import numpy
PKBAR_INSTALLED = importlib.util.find_spec('pkbar') is not None # type: ignore
if PKBAR_INSTALLED:
import pkbar
verbose = True
PROPERTIES = ('energies', 'forces')
PADDING = {
'species': -1,
'coordinates': 0.0,
'forces': 0.0,
'energies': 0.0
}
class Transformations:
@staticmethod
def species_to_indices(iter_, species_order=('H', 'C', 'N', 'O', 'F', 'Cl', 'S')):
if species_order == 'periodic_table':
species_order = utils.PERIODIC_TABLE
idx = {k: i for i, k in enumerate(species_order)}
for d in iter_:
d['species'] = numpy.array([idx[s] for s in d['species']])
yield d
@staticmethod
def subtract_self_energies(iter_, self_energies=None):
iter_ = list(iter_)
intercept = 0.0
if isinstance(self_energies, utils.EnergyShifter):
shifter = self_energies
self_energies = {}
counts = {}
Y = []
for n, d in enumerate(iter_):
species = d['species']
count = Counter()
for s in species:
count[s] += 1
for s, c in count.items():
if s not in counts:
counts[s] = [0] * n
counts[s].append(c)
for s in counts:
if len(counts[s]) != n + 1:
counts[s].append(0)
Y.append(d['energies'])
species = sorted(list(counts.keys()))
X = [counts[s] for s in species]
if shifter.fit_intercept:
X.append([1] * n)
X = numpy.array(X).transpose()
Y = numpy.array(Y)
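# Linear least squares: each row of X holds the per-species atom counts of
# one conformation (plus a constant column when fitting an intercept), and
# Y holds its total energy, so the solution gives one self energy per species.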
sae, _, _, _ = numpy.linalg.lstsq(X, Y, rcond=None)
sae_ = sae
if shifter.fit_intercept:
intercept = sae[-1]
sae_ = sae[:-1]
for s, e in zip(species, sae_):
self_energies[s] = e
shifter.__init__(sae, shifter.fit_intercept)
for d in iter_:
e = intercept
for s in d['species']:
e += self_energies[s]
d['energies'] -= e
yield d
@staticmethod
def remove_outliers(iter_, threshold1=15.0, threshold2=8.0):
assert 'subtract_self_energies' in iter_.transformations, "Transformation remove_outliers can only run after subtract_self_energies"
# pass 1: remove everything that has per-atom energy > threshold1
def scaled_energy(x):
num_atoms = len(x['species'])
return abs(x['energies']) / math.sqrt(num_atoms)
filtered = [x for x in iter_ if scaled_energy(x) < threshold1]
# pass 2: compute those that are outside the mean by threshold2 * std
n = 0
mean = 0
std = 0
for m in filtered:
n += 1
mean += m['energies']
std += m['energies'] ** 2
mean /= n
std = math.sqrt(std / n - mean ** 2)
return filter(lambda x: abs(x['energies'] - mean) < threshold2 * std, filtered)
@staticmethod
def shuffle(iter_):
list_ = list(iter_)
random.shuffle(list_)
return list_
@staticmethod
def cache(iter_):
return list(iter_)
@staticmethod
def collate(iter_, batch_size):
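# Accumulate up to batch_size conformations, convert each property to a
# tensor, and pad/stack them into a single batch dict; the final partial
# batch, if any, is also yielded.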
batch = []
i = 0
for d in iter_:
d = {k: torch.as_tensor(d[k]) for k in d}
batch.append(d)
i += 1
if i == batch_size:
i = 0
yield utils.stack_with_padding(batch, PADDING)
batch = []
if len(batch) > 0:
yield utils.stack_with_padding(batch, PADDING)
@staticmethod
def pin_memory(iter_):
for d in iter_:
yield {k: d[k].pin_memory() for k in d}
class TransformableIterable:
def __init__(self, wrapped_iter, transformations=()):
self.wrapped_iter = wrapped_iter
self.transformations = transformations
def __iter__(self):
return iter(self.wrapped_iter)
def __next__(self):
return next(self.wrapped_iter)
def __getattr__(self, name):
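# Look up the requested transformation by name on the Transformations class
# and return a function that applies it, wrapping the result in a new
# TransformableIterable so that calls can be chained,
# e.g. ``.shuffle().collate(batch_size).cache()``.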
transformation = getattr(Transformations, name)
@functools.wraps(transformation)
def f(*args, **kwargs):
return TransformableIterable(
transformation(self, *args, **kwargs),
self.transformations + (name,))
return f
def split(self, *nums):
iters = []
self_iter = iter(self)
for n in nums:
list_ = []
if n is not None:
for _ in range(n):
list_.append(next(self_iter))
else:
for i in self_iter:
list_.append(i)
iters.append(TransformableIterable(list_, self.transformations + ('split',)))
return iters
def __len__(self):
return len(self.batches)
def load_ani_dataset(path, species_tensor_converter, batch_size, shuffle=True,
rm_outlier=False, properties=('energies',), atomic_properties=(),
transform=(), dtype=torch.get_default_dtype(), device=default_device,
split=(None,)):
"""Load ANI dataset from hdf5 files, and split into subsets.
The returned datasets are already datasets of batches, so when iterated, a
batch rather than a single data point will be yielded.
Since each batch might contain molecules of very different sizes, putting
the whole batch into a single tensor would require adding ghost atoms to
pad everything to the size of the largest molecule. As a result, a huge
amount of computation would be wasted on ghost atoms. To avoid this issue,
the input of each batch, i.e. species and coordinates, is further divided
into chunks according to some heuristics, so that each chunk only contains
molecules of similar size, minimizing the padding required.
So, when iterating on this dataset, a tuple will be yielded. The first
element of this tuple is a list of (species, coordinates) pairs. Each pair
is a chunk of molecules of similar size. The second element of this tuple
is a dictionary whose keys are those specified in the argument
:attr:`properties` and whose values are single tensors for the whole batch
(properties are not split into chunks).
Splitting batches into chunks leads to some inconvenience during training,
especially when using high-level libraries like ``ignite``. To overcome
this inconvenience, :class:`torchani.ignite.Container` was created for
working with ignite.
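A sketch of a typical iteration over such a dataset (names like ``model``
and ``mse`` are illustrative; see the training examples for the full
pattern):
.. code-block:: python
    for species_coordinates, properties in dataset:
        chunk_energies = []
        for species, coordinates in species_coordinates:
            chunk_energies.append(model((species, coordinates)).energies)
        predicted_energies = torch.cat(chunk_energies)
        loss = mse(predicted_energies, properties['energies'])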
Arguments:
path (str): Path to hdf5 files. If :attr:`path` is a file, then that
file would be loaded using `pyanitools.py`_. If :attr:`path` is
a directory, then all files with suffix `.h5` or `.hdf5` will be
loaded.
species_tensor_converter (:class:`collections.abc.Callable`): A
callable that convert species in the format of list of strings
to 1D tensor.
batch_size (int): Number of different 3D structures in a single
minibatch.
shuffle (bool): Whether to shuffle the whole dataset.
rm_outlier (bool): Whether to discard the outlier energy conformers
from a given dataset.
properties (list): List of keys of `molecular` properties in the
dataset to be loaded. Here `molecular` means that, regardless of the
number of atoms, the property always has a fixed size, i.e. the tensor
shape of molecular properties should be (molecule, ...). An example
of a molecular property is the molecular energies. ``'species'`` and
``'coordinates'`` are always loaded and need not be specified
anywhere.
atomic_properties (list): List of keys of `atomic` properties in the
dataset to be loaded. Here `atomic` means that the size of the property
is proportional to the number of atoms in the molecule, i.e. the
tensor shape of atomic properties should be (molecule, atoms, ...).
An example of an atomic property is the forces. ``'species'`` and
``'coordinates'`` are always loaded and need not be specified
anywhere.
transform (list): List of :class:`collections.abc.Callable` that
transform the data. Callables must take atomic properties and
properties as arguments, and return the transformed atomic
properties and properties.
dtype (:class:`torch.dtype`): dtype to convert coordinates and
properties to.
device (:class:`torch.device`): device to put tensors on when iterating.
split (list): a sequence of integers, floats, or ``None``. Integers
are interpreted as numbers of elements, floats are interpreted as
percentages, and ``None`` is interpreted as the rest of the dataset
and can only appear as the last element of :attr:`split`. For
example, if the whole dataset has 10000 entries, and split is
``(5000, 0.1, None)``, then this function will create 3 datasets,
where the first dataset contains 5000 elements, the second dataset
contains ``int(0.1 * 10000)``, which is 1000, and the third dataset
will contain ``10000 - 5000 - 1000`` elements. By default this
creates only a single dataset.
Returns:
An instance of :class:`torchani.data.PaddedBatchChunkDataset` if there is
only one element in :attr:`split`, otherwise returns a tuple of the same
classes according to :attr:`split`.
.. _pyanitools.py:
https://github.com/isayev/ASE_ANI/blob/master/lib/pyanitools.py
"""
atomic_properties_, properties_ = load_and_pad_whole_dataset(
path, species_tensor_converter, shuffle, properties, atomic_properties)
molecules = atomic_properties_['species'].shape[0]
atomic_keys = ['species', 'coordinates', *atomic_properties]
keys = properties
# do transformations on data
for t in transform:
atomic_properties_, properties_ = t(atomic_properties_, properties_)
if rm_outlier:
transformed_energies = properties_['energies']
num_atoms = (atomic_properties_['species'] >= 0).to(transformed_energies.dtype).sum(dim=1)
scaled_diff = transformed_energies / num_atoms.sqrt()
mean = scaled_diff[torch.abs(scaled_diff) < 15.0].mean()
std = scaled_diff[torch.abs(scaled_diff) < 15.0].std()
# -8 * std + mean < scaled_diff < +8 * std + mean
tol = 8.0 * std + mean
low_idx = (torch.abs(scaled_diff) < tol).nonzero().squeeze()
outlier_count = molecules - low_idx.numel()
# discard outlier energy conformers if exist
if outlier_count > 0:
print("Note: {} outlier energy conformers have been discarded from dataset".format(outlier_count))
for key, val in atomic_properties_.items():
atomic_properties_[key] = val[low_idx]
for key, val in properties_.items():
properties_[key] = val[low_idx]
molecules = low_idx.numel()
# compute size of each subset
split_ = []
total = 0
for index, size in enumerate(split):
if isinstance(size, float):
size = int(size * molecules)
if size is None:
assert index == len(split) - 1
size = molecules - total
split_.append(size)
total += size
# split
start = 0
splitted = []
for size in split_:
ap = {k: atomic_properties_[k][start:start + size] for k in atomic_keys}
p = {k: properties_[k][start:start + size] for k in keys}
start += size
splitted.append((ap, p))
# construct batched dataset
ret = []
for ap, p in splitted:
ds = PaddedBatchChunkDataset(ap, p, batch_size, dtype, device)
ds.properties = properties
ds.atomic_properties = atomic_properties
ret.append(ds)
if len(ret) == 1:
return ret[0]
return tuple(ret)
__all__ = ['load_ani_dataset', 'PaddedBatchChunkDataset', 'CachedDataset', 'ShuffledDataset', 'find_threshold']
return len(self.wrapped_iter)
def load(path, additional_properties=()):
properties = PROPERTIES + additional_properties
def h5_files(path):
"""yield file name of all h5 files in a path"""
if isdir(path):
for f in os.listdir(path):
f = join(path, f)
yield from h5_files(f)
elif isfile(path) and (path.endswith('.h5') or path.endswith('.hdf5')):
yield path
def molecules():
for f in h5_files(path):
anidata = anidataloader(f)
anidata_size = anidata.size()
use_pbar = PKBAR_INSTALLED and verbose
if use_pbar:
pbar = pkbar.Pbar('=> loading {}, total molecules: {}'.format(f, anidata_size), anidata_size)
for i, m in enumerate(anidata):
yield m
if use_pbar:
pbar.update(i)
def conformations():
for m in molecules():
species = m['species']
coordinates = m['coordinates']
for i in range(coordinates.shape[0]):
ret = {'species': species, 'coordinates': coordinates[i]}
for k in properties:
if k in m:
ret[k] = m[k][i]
yield ret
return TransformableIterable(conformations())
__all__ = ['load']
import numpy as np
import torch
import functools
from ._pyanitools import anidataloader
from importlib import util as u
import gc
PKBAR_INSTALLED = u.find_spec('pkbar') is not None
if PKBAR_INSTALLED:
import pkbar
def find_threshold(file_path, batch_size, threshold_max=100):
"""Find resonable threshold to split chunks before using ``torchani.data.CachedDataset`` or ``torchani.data.ShuffledDataset``.
Arguments:
file_path (str): Path to one hdf5 files.
batch_size (int): batch size.
threshold_max (int): maximum threshold to test.
"""
ds = CachedDataset(file_path=file_path, batch_size=batch_size)
ds.find_threshold(threshold_max + 1)
class CachedDataset(torch.utils.data.Dataset):
""" Cached Dataset which is shuffled once, but the dataset keeps the same at every epoch.
Arguments:
file_path (str): Path to one hdf5 file.
batch_size (int): batch size.
device (str): ``'cuda'`` or ``'cpu'``, i.e. whether to cache to GPU or CPU memory. Commonly, 'cpu' is already fast enough.
Default is ``'cpu'``.
chunk_threshold (int): threshold used to split a batch into chunks. Setting it to ``None`` disables chunk splitting.
Use ``torchani.data.find_threshold`` to find a reasonable ``chunk_threshold``.
other_properties (dict): A dict which is used to extract properties other than
``energies`` from the dataset with the correct padding, shape, and dtype.\n
The example below will extract ``dipoles`` and ``forces``.\n
``padding_values``: ``None`` means there is no need to pad that property.
.. code-block:: python
other_properties = {'properties': ['dipoles', 'forces'],
'padding_values': [None, 0],
'padded_shapes': [(batch_size, 3), (batch_size, -1, 3)],
'dtypes': [torch.float32, torch.float32]
}
include_energies (bool): Whether to include energies in the properties. Default is ``True``.
species_order (list): a list that specifies how species are mapped to integers.
For example, ``['H', 'C', 'N', 'O']`` means ``{'H': 0, 'C': 1, 'N': 2, 'O': 3}``.
subtract_self_energies (bool): whether to subtract self energies from ``energies``.
self_energies (list): if ``subtract_self_energies`` is True, the order must match
``species_order``.
For example, ``[-0.600953, -38.08316, -54.707756, -75.194466]`` will be converted
to ``{'H': -0.600953, 'C': -38.08316, 'N': -54.707756, 'O': -75.194466}``.
.. note::
The resulting dataset will be:
``([chunk1, chunk2, ...], {'energies', 'force', ...})`` in which chunk1 is a
tuple of ``(species, coordinates)``.
e.g. the shape of\n
chunk1: ``[[1807, 21], [1807, 21, 3]]``\n
chunk2: ``[[193, 50], [193, 50, 3]]``\n
'energies': ``[2000, 1]``
"""
def __init__(self, file_path,
batch_size=1000,
device='cpu',
chunk_threshold=20,
other_properties={},
include_energies=True,
species_order=['H', 'C', 'N', 'O'],
subtract_self_energies=False,
self_energies=[-0.600953, -38.08316, -54.707756, -75.194466]):
super(CachedDataset, self).__init__()
# example of species_dict will looks like
# species_dict: {'H': 0, 'C': 1, 'N': 2, 'O': 3}
# self_energies_dict: {'H': -0.600953, 'C': -38.08316, 'N': -54.707756, 'O': -75.194466}
species_dict = {}
self_energies_dict = {}
for i, s in enumerate(species_order):
species_dict[s] = i
self_energies_dict[s] = self_energies[i]
self.batch_size = batch_size
self.data_species = []
self.data_coordinates = []
data_self_energies = []
self.data_properties = {}
self.properties_info = other_properties
# whether to include energies in the properties
if include_energies:
self.add_energies_to_properties()
# let the user check which properties will be loaded
self.check_properties()
# anidataloader
anidata = anidataloader(file_path)
anidata_size = anidata.group_size()
self.enable_pkbar = anidata_size > 5 and PKBAR_INSTALLED
if self.enable_pkbar:
pbar = pkbar.Pbar('=> loading h5 dataset into cpu memory, total molecules: {}'.format(anidata_size), anidata_size)
# load h5 data into cpu memory as lists
for i, molecule in enumerate(anidata):
# conformations
num_conformations = len(molecule['coordinates'])
# species and coordinates
self.data_coordinates += list(molecule['coordinates'].reshape(num_conformations, -1).astype(np.float32))
species = np.array([species_dict[x] for x in molecule['species']])
self.data_species += list(np.tile(species, (num_conformations, 1)))
# if subtract_self_energies
if subtract_self_energies:
self_energies = np.array(sum([self_energies_dict[x] for x in molecule['species']]))
data_self_energies += list(np.tile(self_energies, (num_conformations, 1)))
# properties
for key in self.data_properties:
self.data_properties[key] += list(molecule[key].reshape(num_conformations, -1))
# pkbar update
if self.enable_pkbar:
pbar.update(i)
# if subtract self energies
if subtract_self_energies and 'energies' in self.properties_info['properties']:
self.data_properties['energies'] = np.array(self.data_properties['energies']) - np.array(data_self_energies)
del data_self_energies
gc.collect()
self.length = (len(self.data_species) + self.batch_size - 1) // self.batch_size
self.device = device
self.shuffled_index = np.arange(len(self.data_species))
np.random.shuffle(self.shuffled_index)
self.chunk_threshold = chunk_threshold
if not self.chunk_threshold:
self.chunk_threshold = np.inf
# clean trash
anidata.cleanup()
del num_conformations
del species
del anidata
gc.collect()
@functools.lru_cache(maxsize=None)
def __getitem__(self, index):
if index >= self.length:
raise IndexError()
batch_indices = slice(index * self.batch_size, (index + 1) * self.batch_size)
batch_indices_shuffled = self.shuffled_index[batch_indices]
batch_species = [self.data_species[i] for i in batch_indices_shuffled]
batch_coordinates = [self.data_coordinates[i] for i in batch_indices_shuffled]
# get sort index
num_atoms_each_mole = [b.shape[0] for b in batch_species]
atoms = torch.tensor(num_atoms_each_mole, dtype=torch.int32)
sorted_atoms, sorted_atoms_idx = torch.sort(atoms)
# sort each batch of data
batch_species = self.sort_list_with_index(batch_species, sorted_atoms_idx.numpy())
batch_coordinates = self.sort_list_with_index(batch_coordinates, sorted_atoms_idx.numpy())
# get chunk size
output, count = torch.unique(atoms, sorted=True, return_counts=True)
counts = torch.cat((output.unsqueeze(-1).int(), count.unsqueeze(-1).int()), dim=-1)
chunk_size_list, chunk_max_list = split_to_chunks(counts, chunk_threshold=self.chunk_threshold * self.batch_size * 20)
chunk_size_list = torch.stack(chunk_size_list).flatten()
# split into chunks
chunks_batch_species = self.split_list_with_size(batch_species, chunk_size_list.numpy())
chunks_batch_coordinates = self.split_list_with_size(batch_coordinates, chunk_size_list.numpy())
# padding each data
chunks_batch_species = self.pad_and_convert_to_tensor(chunks_batch_species, padding_value=-1)
chunks_batch_coordinates = self.pad_and_convert_to_tensor(chunks_batch_coordinates)
# chunks
chunks = list(zip(chunks_batch_species, chunks_batch_coordinates))
for i, _ in enumerate(chunks):
chunks[i] = (chunks[i][0], chunks[i][1].reshape(chunks[i][1].shape[0], -1, 3))
# properties
properties = {}
for i, key in enumerate(self.properties_info['properties']):
# get a batch of property
prop = [self.data_properties[key][i] for i in batch_indices_shuffled]
# sort with number of atoms
prop = self.sort_list_with_index(prop, sorted_atoms_idx.numpy())
# padding and convert to tensor
if self.properties_info['padding_values'][i] is None:
prop = self.pad_and_convert_to_tensor([prop], no_padding=True)[0]
else:
prop = self.pad_and_convert_to_tensor([prop], padding_value=self.properties_info['padding_values'][i])[0]
# set property shape and dtype
padded_shape = list(self.properties_info['padded_shapes'][i])
padded_shape[0] = prop.shape[0]  # the last batch may not be a full batch
properties[key] = prop.reshape(padded_shape).to(self.properties_info['dtypes'][i])
# return: [chunk1, chunk2, ...], {"energies", "force", ...} in which chunk1=(species, coordinates)
# e.g. chunk1 = [[1807, 21], [1807, 21, 3]], chunk2 = [[193, 50], [193, 50, 3]]
# 'energies' = [2000, 1]
return chunks, properties
def __len__(self):
return self.length
def split(self, validation_split):
"""Split dataset into traning and validaiton.
Arguments:
validation_split (float): Float between 0 and 1. Fraction of the dataset to be used
as validation data.
"""
val_size = int(validation_split * len(self))
train_size = len(self) - val_size
ds = []
if self.enable_pkbar:
message = ('=> processing, splitting and caching dataset into cpu memory: \n'
+ 'total batches: {}, train batches: {}, val batches: {}, batch_size: {}')
pbar = pkbar.Pbar(message.format(len(self), train_size, val_size, self.batch_size),
len(self))
for i, _ in enumerate(self):
ds.append(self[i])
if self.enable_pkbar:
pbar.update(i)
train_dataset = ds[:train_size]
val_dataset = ds[train_size:]
return train_dataset, val_dataset
def load(self):
"""Cache dataset into CPU memory. If not called, dataset will be cached during the first epoch.
"""
if self.enable_pkbar:
pbar = pkbar.Pbar('=> processing and caching dataset into cpu memory: \ntotal '
+ 'batches: {}, batch_size: {}'.format(len(self), self.batch_size),
len(self))
for i, _ in enumerate(self):
if self.enable_pkbar:
pbar.update(i)
def add_energies_to_properties(self):
# if user does not provide energies info
if 'properties' in self.properties_info and 'energies' not in self.properties_info['properties']:
# setup energies info, so the user does not need to input energies
self.properties_info['properties'].append('energies')
self.properties_info['padding_values'].append(None)
self.properties_info['padded_shapes'].append((self.batch_size, ))
self.properties_info['dtypes'].append(torch.float64)
# if no properties provided
if 'properties' not in self.properties_info:
self.properties_info = {'properties': ['energies'],
'padding_values': [None],
'padded_shapes': [(self.batch_size, )],
'dtypes': [torch.float64],
}
def check_properties(self):
# print properties information
print('... The following properties will be loaded:')
for i, prop in enumerate(self.properties_info['properties']):
self.data_properties[prop] = []
message = '{}: (dtype: {}, padding_value: {}, padded_shape: {})'
print(message.format(prop, self.properties_info['dtypes'][i],
self.properties_info['padding_values'][i],
self.properties_info['padded_shapes'][i]))
@staticmethod
def sort_list_with_index(inputs, index):
return [inputs[i] for i in index]
@staticmethod
def split_list_with_size(inputs, split_size):
output = []
for i, _ in enumerate(split_size):
start_index = np.sum(split_size[:i])
stop_index = np.sum(split_size[:i + 1])
output.append(inputs[start_index:stop_index])
return output
def pad_and_convert_to_tensor(self, inputs, padding_value=0, no_padding=False):
if no_padding:
for i, input_tmp in enumerate(inputs):
inputs[i] = torch.from_numpy(np.stack(input_tmp)).to(self.device)
else:
for i, input_tmp in enumerate(inputs):
inputs[i] = torch.nn.utils.rnn.pad_sequence(
[torch.from_numpy(b) for b in inputs[i]],
batch_first=True, padding_value=padding_value).to(self.device)
return inputs
def find_threshold(self, threshold_max=100):
batch_indices = slice(0, self.batch_size)
batch_indices_shuffled = self.shuffled_index[batch_indices]
batch_species = [self.data_species[i] for i in batch_indices_shuffled]
num_atoms_each_mole = [b.shape[0] for b in batch_species]
atoms = torch.tensor(num_atoms_each_mole, dtype=torch.int32)
output, count = torch.unique(atoms, sorted=True, return_counts=True)
counts = torch.cat((output.unsqueeze(-1).int(), count.unsqueeze(-1).int()), dim=-1)
print('=> choose a reasonable threshold to split chunks')
print('format is [chunk_size, chunk_max]')
for b in range(0, threshold_max, 1):
test_chunk_size_list, test_chunk_max_list = split_to_chunks(counts, chunk_threshold=b * self.batch_size * 20)
size_max = []
for i, _ in enumerate(test_chunk_size_list):
size_max.append([list(test_chunk_size_list[i].numpy())[0],
list(test_chunk_max_list[i].numpy())[0]])
print('chunk_threshold = {}'.format(b))
print(size_max)
def release_h5(self):
del self.data_species
del self.data_coordinates
del self.data_properties
gc.collect()
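# Minimal usage sketch for CachedDataset (hypothetical file name and sizes, not
# part of the original file):
#
#     ds = CachedDataset('ani_gdb_s01.h5', batch_size=2560, chunk_threshold=20)
#     training, validation = ds.split(0.1)   # plain lists of prebuilt batches
#     for chunks, properties in training:
#         for species, coordinates in chunks:
#             pass                           # one forward pass per chunk
#         energies = properties['energies']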
def ShuffledDataset(file_path,
batch_size=1000, num_workers=0, shuffle=True,
chunk_threshold=20,
other_properties={},
include_energies=True,
validation_split=0.0,
species_order=['H', 'C', 'N', 'O'],
subtract_self_energies=False,
self_energies=[-0.600953, -38.08316, -54.707756, -75.194466]):
""" Shuffled Dataset which using `torch.utils.data.DataLoader`, it will shuffle at every epoch.
Arguments:
file_path (str): Path to one hdf5 file.
batch_size (int): batch size.
num_workers (int): number of worker processes that prepare the dataset in the
background while training runs.
shuffle (bool): whether to shuffle.
chunk_threshold (int): threshold used to split a batch into chunks. Setting it to ``None`` disables chunk splitting.
Use ``torchani.data.find_threshold`` to find a reasonable ``chunk_threshold``.
other_properties (dict): A dict used to extract properties other than
``energies`` from the dataset, with the correct padding, shape and dtype.\n
The example below will extract ``dipoles`` and ``forces``.\n
``padding_values``: ``None`` means this property needs no padding.
.. code-block:: python
other_properties = {'properties': ['dipoles', 'forces'],
'padding_values': [None, 0],
'padded_shapes': [(batch_size, 3), (batch_size, -1, 3)],
'dtypes': [torch.float32, torch.float32]
}
include_energies (bool): Whether to include energies in the properties. Default is ``True``.
validation_split (float): Float between 0 and 1. Fraction of the dataset to be used
as validation data.
species_order (list): a list that specifies how species are mapped to integers.
For example, ``['H', 'C', 'N', 'O']`` means ``{'H': 0, 'C': 1, 'N': 2, 'O': 3}``.
subtract_self_energies (bool): whether to subtract self energies from ``energies``.
self_energies (list): if ``subtract_self_energies`` is True, the order must match
``species_order``.
For example, ``[-0.600953, -38.08316, -54.707756, -75.194466]`` will be
converted to ``{'H': -0.600953, 'C': -38.08316, 'N': -54.707756, 'O': -75.194466}``.
.. note::
Returns a dataloader that, when iterated, yields
``([chunk1, chunk2, ...], {'energies', 'force', ...})`` in which chunk1 is a
tuple of ``(species, coordinates)``.\n
e.g. the shape of\n
chunk1: ``[[1807, 21], [1807, 21, 3]]``\n
chunk2: ``[[193, 50], [193, 50, 3]]``\n
'energies': ``[2000, 1]``
"""
dataset = TorchData(file_path,
batch_size,
other_properties,
include_energies,
species_order,
subtract_self_energies,
self_energies)
properties_info = dataset.get_properties_info()
if not chunk_threshold:
chunk_threshold = np.inf
def my_collate_fn(data, chunk_threshold=chunk_threshold, properties_info=properties_info):
return collate_fn(data, chunk_threshold, properties_info)
val_size = int(validation_split * len(dataset))
train_size = len(dataset) - val_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
train_data_loader = torch.utils.data.DataLoader(dataset=train_dataset,
batch_size=batch_size,
shuffle=shuffle,
num_workers=num_workers,
pin_memory=False,
collate_fn=my_collate_fn)
if val_size == 0:
return train_data_loader
val_data_loader = torch.utils.data.DataLoader(dataset=val_dataset,
batch_size=batch_size,
shuffle=False,
num_workers=num_workers,
pin_memory=False,
collate_fn=my_collate_fn)
return train_data_loader, val_data_loader
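# Usage sketch (hypothetical arguments): with validation_split > 0 two DataLoaders
# are returned, otherwise only the training loader:
#
#     train_loader, val_loader = ShuffledDataset('ani_gdb_s01.h5', batch_size=2560,
#                                                num_workers=2, validation_split=0.1)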
class TorchData(torch.utils.data.Dataset):
def __init__(self, file_path,
batch_size,
other_properties,
include_energies,
species_order,
subtract_self_energies,
self_energies):
super(TorchData, self).__init__()
species_dict = {}
self_energies_dict = {}
for i, s in enumerate(species_order):
species_dict[s] = i
self_energies_dict[s] = self_energies[i]
self.batch_size = batch_size
self.data_species = []
self.data_coordinates = []
data_self_energies = []
self.data_properties = {}
self.properties_info = other_properties
# whether to include energies in the properties
if include_energies:
self.add_energies_to_properties()
# let the user check which properties will be loaded
self.check_properties()
# anidataloader
anidata = anidataloader(file_path)
anidata_size = anidata.group_size()
self.enable_pkbar = anidata_size > 5 and PKBAR_INSTALLED
if self.enable_pkbar:
pbar = pkbar.Pbar('=> loading h5 dataset into cpu memory, total molecules: {}'.format(anidata_size), anidata_size)
# load h5 data into cpu memory as lists
for i, molecule in enumerate(anidata):
# conformations
num_conformations = len(molecule['coordinates'])
# species and coordinates
self.data_coordinates += list(molecule['coordinates'].reshape(num_conformations, -1).astype(np.float32))
species = np.array([species_dict[x] for x in molecule['species']])
self.data_species += list(np.tile(species, (num_conformations, 1)))
# if subtract_self_energies
if subtract_self_energies:
self_energies = np.array(sum([self_energies_dict[x] for x in molecule['species']]))
data_self_energies += list(np.tile(self_energies, (num_conformations, 1)))
# properties
for key in self.data_properties:
self.data_properties[key] += list(molecule[key].reshape(num_conformations, -1))
# pkbar update
if self.enable_pkbar:
pbar.update(i)
# if subtract self energies
if subtract_self_energies and 'energies' in self.properties_info['properties']:
self.data_properties['energies'] = np.array(self.data_properties['energies']) - np.array(data_self_energies)
del data_self_energies
gc.collect()
self.length = len(self.data_species)
# clean trash
anidata.cleanup()
del num_conformations
del species
del anidata
gc.collect()
def __getitem__(self, index):
if index >= self.length:
raise IndexError()
species = torch.from_numpy(self.data_species[index])
coordinates = torch.from_numpy(self.data_coordinates[index]).float()
properties = {}
for key in self.data_properties:
properties[key] = torch.from_numpy(self.data_properties[key][index])
return [species, coordinates, properties]
def __len__(self):
return self.length
def add_energies_to_properties(self):
# if user does not provide energies info
if 'properties' in self.properties_info and 'energies' not in self.properties_info['properties']:
# setup energies info, so the user does not need to input energies
self.properties_info['properties'].append('energies')
self.properties_info['padding_values'].append(None)
self.properties_info['padded_shapes'].append((self.batch_size, ))
self.properties_info['dtypes'].append(torch.float64)
# if no properties provided
if 'properties' not in self.properties_info:
self.properties_info = {'properties': ['energies'],
'padding_values': [None],
'padded_shapes': [(self.batch_size, )],
'dtypes': [torch.float64],
}
def check_properties(self):
# print properties information
print('... The following properties will be loaded:')
for i, prop in enumerate(self.properties_info['properties']):
self.data_properties[prop] = []
message = '{}: (dtype: {}, padding_value: {}, padded_shape: {})'
print(message.format(prop, self.properties_info['dtypes'][i],
self.properties_info['padding_values'][i],
self.properties_info['padded_shapes'][i]))
def get_properties_info(self):
return self.properties_info
def collate_fn(data, chunk_threshold, properties_info):
"""Creates a batch of chunked data.
"""
# unzip a batch of molecules (each molecule is a list)
batch_species, batch_coordinates, batch_properties = zip(*data)
batch_size = len(batch_species)
# padding - time: 13.2s
batch_species = torch.nn.utils.rnn.pad_sequence(batch_species,
batch_first=True,
padding_value=-1)
batch_coordinates = torch.nn.utils.rnn.pad_sequence(batch_coordinates,
batch_first=True,
padding_value=np.inf)
# sort - time: 0.7s
atoms = torch.sum(~(batch_species == -1), dim=-1, dtype=torch.int32)
sorted_atoms, sorted_atoms_idx = torch.sort(atoms)
batch_species = torch.index_select(batch_species, dim=0, index=sorted_atoms_idx)
batch_coordinates = torch.index_select(batch_coordinates, dim=0, index=sorted_atoms_idx)
# get chunk size - time: 2.1s
output, count = torch.unique(atoms, sorted=True, return_counts=True)
counts = torch.cat((output.unsqueeze(-1).int(), count.unsqueeze(-1).int()), dim=-1)
chunk_size_list, chunk_max_list = split_to_chunks(counts, chunk_threshold=chunk_threshold * batch_size * 20)
# split into chunks - time: 0.3s
chunks_batch_species = torch.split(batch_species, chunk_size_list, dim=0)
chunks_batch_coordinates = torch.split(batch_coordinates, chunk_size_list, dim=0)
# truncate redundant padding - time: 1.3s
chunks_batch_species = trunc_pad(list(chunks_batch_species), padding_value=-1)
chunks_batch_coordinates = trunc_pad(list(chunks_batch_coordinates), padding_value=np.inf)
for i, c in enumerate(chunks_batch_coordinates):
chunks_batch_coordinates[i] = c.reshape(c.shape[0], -1, 3)
chunks = list(zip(chunks_batch_species, chunks_batch_coordinates))
for i, _ in enumerate(chunks):
chunks[i] = (chunks[i][0], chunks[i][1])
# properties
properties = {}
for i, key in enumerate(properties_info['properties']):
# get a batch of property
prop = tuple(p[key] for p in batch_properties)
# padding and convert to tensor
if properties_info['padding_values'][i] is None:
prop = torch.stack(prop)
else:
prop = torch.nn.utils.rnn.pad_sequence(prop,
batch_first=True,
padding_value=properties_info['padding_values'][i])
# sort with number of atoms
prop = torch.index_select(prop, dim=0, index=sorted_atoms_idx)
# set property shape and dtype
padded_shape = list(properties_info['padded_shapes'][i])
padded_shape[0] = prop.shape[0]  # the last batch may not be a full batch
properties[key] = prop.reshape(padded_shape).to(properties_info['dtypes'][i])
# return: [chunk1, chunk2, ...], {"energies", "force", ...} in which chunk1=(species, coordinates)
# e.g. chunk1 = [[1807, 21], [1807, 21, 3]], chunk2 = [[193, 50], [193, 50, 3]]
# 'energies' = [2000, 1]
return chunks, properties
def split_to_two_chunks(counts, chunk_threshold):
counts = counts.cpu()
# NB (@yueyericardo): In principle this dtype should be `torch.bool`, but unfortunately
# `triu` is not implemented for bool tensors right now. This should be fixed once PyTorch
# adds support for it.
left_mask = torch.triu(torch.ones([counts.shape[0], counts.shape[0]], dtype=torch.uint8))
left_mask = left_mask.t()
counts_atoms = counts[:, 0].repeat(counts.shape[0], 1)
counts_counts = counts[:, 1].repeat(counts.shape[0], 1)
counts_atoms_left = torch.where(left_mask, counts_atoms, torch.zeros_like(counts_atoms))
counts_atoms_right = torch.where(~left_mask, counts_atoms, torch.zeros_like(counts_atoms))
counts_counts_left = torch.where(left_mask, counts_counts, torch.zeros_like(counts_atoms))
counts_counts_right = torch.where(~left_mask, counts_counts, torch.zeros_like(counts_atoms))
# chunk max
chunk_max_left = torch.max(counts_atoms_left, dim=-1, keepdim=True).values
chunk_max_right = torch.max(counts_atoms_right, dim=-1, keepdim=True).values
# chunk size
chunk_size_left = torch.sum(counts_counts_left, dim=-1, keepdim=True, dtype=torch.int32)
chunk_size_right = torch.sum(counts_counts_right, dim=-1, keepdim=True, dtype=torch.int32)
# calculate cost
min_cost_threshold = torch.tensor([chunk_threshold], dtype=torch.int32)
cost = (torch.max(chunk_size_left * chunk_max_left * chunk_max_left, min_cost_threshold)
+ torch.max(chunk_size_right * chunk_max_right * chunk_max_right, min_cost_threshold))
# find smallest cost
cost_min, cost_min_index = torch.min(cost.squeeze(), dim=-1)
# find the chunk_size with the smallest cost; if the batch was not split, it will be [max_chunk_size, 0]
final_chunk_size = [chunk_size_left[cost_min_index], chunk_size_right[cost_min_index]]
final_chunk_max = [chunk_max_left[cost_min_index], chunk_max_right[cost_min_index]]
# if not split
if cost_min_index == (counts.shape[0] - 1):
return False, counts, [final_chunk_size[0]], [final_chunk_max[0]], cost_min
# if split
return True, [counts[:cost_min_index + 1], counts[(cost_min_index + 1):]], \
final_chunk_size, final_chunk_max, cost_min
def split_to_chunks(counts, chunk_threshold=np.inf):
splitted, counts_list, chunk_size, chunk_max, cost = split_to_two_chunks(counts, chunk_threshold)
final_chunk_size = []
final_chunk_max = []
if (splitted):
for i, _ in enumerate(counts_list):
tmp_chunk_size, tmp_chunk_max = split_to_chunks(counts_list[i], chunk_threshold)
final_chunk_size.extend(tmp_chunk_size)
final_chunk_max.extend(tmp_chunk_max)
return final_chunk_size, final_chunk_max
# if not split
return chunk_size, chunk_max
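# Worked example (assumed input, not taken from the original code): for a batch
# with 1500 conformers of 3 atoms and 500 conformers of 20 atoms,
#
#     counts = torch.tensor([[3, 1500], [20, 500]], dtype=torch.int32)
#
# keeping a single chunk costs 2000 * 20**2 = 800000, while splitting costs
# 1500 * 3**2 + 500 * 20**2 = 213500, so split_to_chunks(counts, chunk_threshold=1)
# returns chunk sizes [1500, 500] with chunk maxima [3, 20]: the small molecules
# are no longer padded up to 20 atoms.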
def trunc_pad(chunks, padding_value=0):
for i, _ in enumerate(chunks):
lengths = torch.sum(~(chunks[i] == padding_value), dim=-1, dtype=torch.int32)
chunks[i] = chunks[i][..., :lengths.max()]
return chunks
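# For example (hypothetical values), a species chunk padded to 4 atoms,
#     [[0, 1, -1, -1],
#      [2, 3,  4, -1]]
# has per-row atom counts [2, 3], so trunc_pad keeps only the first 3 columns.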
......@@ -91,7 +91,7 @@ class BuiltinNet(torch.nn.Module):
self.species = self.consts.species
self.species_converter = SpeciesConverter(self.species)
self.aev_computer = AEVComputer(**self.consts)
self.energy_shifter = neurochem.load_sae(self.sae_file)
self.energy_shifter, self.sae_dict = neurochem.load_sae(self.sae_file, return_dict=True)
self.neural_networks = neurochem.load_model_ensemble(
self.species, self.ensemble_prefix, self.ensemble_size)
......
......@@ -70,17 +70,22 @@ class Constants(collections.abc.Mapping):
return getattr(self, item)
def load_sae(filename):
def load_sae(filename, return_dict=False):
"""Returns an object of :class:`EnergyShifter` with self energies from
NeuroChem sae file"""
self_energies = []
d = {}
with open(filename) as f:
for i in f:
line = [x.strip() for x in i.split('=')]
species = line[0].split(',')[0].strip()
index = int(line[0].split(',')[1].strip())
value = float(line[1])
d[species] = value
self_energies.append((index, value))
self_energies = [i for _, i in sorted(self_energies)]
if return_dict:
return EnergyShifter(self_energies), d
return EnergyShifter(self_energies)
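# Sketch of the sae file format this parser expects (the values below simply reuse
# the example self energies that appear earlier in this commit, shown only as an
# illustration):
#
#     H,0 = -0.600953
#     C,1 = -38.08316
#     N,2 = -54.707756
#     O,3 = -75.194466
#
# With return_dict=True the function returns (EnergyShifter, {'H': -0.600953, ...}).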
......@@ -285,13 +290,13 @@ if sys.version_info[0] > 2:
def __init__(self, filename, device=torch.device('cuda'), tqdm=False,
tensorboard=None, checkpoint_name='model.pt'):
from ..data import load_ani_dataset # noqa: E402
from ..data import load # noqa: E402
class dummy:
pass
self.imports = dummy()
self.imports.load_ani_dataset = load_ani_dataset
self.imports.load = load
self.filename = filename
self.device = device
......@@ -467,7 +472,7 @@ if sys.version_info[0] > 2:
self.aev_computer = AEVComputer(**self.consts)
del params['sflparamsfile']
self.sae_file = os.path.join(dir_, params['atomEnergyFile'])
self.shift_energy = load_sae(self.sae_file)
self.shift_energy, self.sae = load_sae(self.sae_file, return_dict=True)
del params['atomEnergyFile']
network_dir = os.path.join(dir_, params['ntwkStoreDir'])
if not os.path.exists(network_dir):
......@@ -552,26 +557,18 @@ if sys.version_info[0] > 2:
def load_data(self, training_path, validation_path):
"""Load training and validation dataset from file."""
self.training_set = self.imports.load_ani_dataset(
training_path, self.consts.species_to_tensor,
self.training_batch_size, rm_outlier=True, device=self.device,
transform=[self.shift_energy.subtract_from_dataset])
self.validation_set = self.imports.load_ani_dataset(
validation_path, self.consts.species_to_tensor,
self.validation_batch_size, rm_outlier=True, device=self.device,
transform=[self.shift_energy.subtract_from_dataset])
self.training_set = self.imports.load(training_path).subtract_self_energies(self.sae).species_to_indices().shuffle().collate(self.training_batch_size).cache()
self.validation_set = self.imports.load(validation_path).subtract_self_energies(self.sae).species_to_indices().shuffle().collate(self.validation_batch_size).cache()
def evaluate(self, dataset):
"""Run the evaluation"""
total_mse = 0.0
count = 0
for batch_x, batch_y in dataset:
true_energies = batch_y['energies']
predicted_energies = []
for chunk_species, chunk_coordinates in batch_x:
_, chunk_energies = self.model((chunk_species, chunk_coordinates))
predicted_energies.append(chunk_energies)
predicted_energies = torch.cat(predicted_energies)
for properties in dataset:
species = properties['species'].to(self.device)
coordinates = properties['coordinates'].to(self.device).float()
true_energies = properties['energies'].to(self.device).float()
_, predicted_energies = self.model((species, coordinates))
total_mse += self.mse_sum(predicted_energies, true_energies).item()
count += predicted_energies.shape[0]
return hartree2kcalmol(math.sqrt(total_mse / count))
......@@ -620,21 +617,16 @@ if sys.version_info[0] > 2:
self.tensorboard.add_scalar('learning_rate', learning_rate, AdamW_scheduler.last_epoch)
self.tensorboard.add_scalar('no_improve_count_vs_epoch', no_improve_count, AdamW_scheduler.last_epoch)
for i, (batch_x, batch_y) in self.tqdm(
for i, properties in self.tqdm(
enumerate(self.training_set),
total=len(self.training_set),
desc='epoch {}'.format(AdamW_scheduler.last_epoch)
):
true_energies = batch_y['energies']
predicted_energies = []
num_atoms = []
for chunk_species, chunk_coordinates in batch_x:
num_atoms.append((chunk_species >= 0).sum(dim=1))
_, chunk_energies = self.model((chunk_species, chunk_coordinates))
predicted_energies.append(chunk_energies)
num_atoms = torch.cat(num_atoms).to(true_energies.dtype)
predicted_energies = torch.cat(predicted_energies)
species = properties['species'].to(self.device)
coordinates = properties['coordinates'].to(self.device).float()
true_energies = properties['energies'].to(self.device).float()
num_atoms = (species >= 0).sum(dim=1, dtype=true_energies.dtype)
_, predicted_energies = self.model((species, coordinates))
loss = (self.mse_se(predicted_energies, true_energies) / num_atoms.sqrt()).mean()
AdamW_optim.zero_grad()
SGD_optim.zero_grad()
......