Commit b83f824a authored by Farhad Ramezanghorbani, committed by Gao, Xiang

Discard outlier energy conformers (#287)

* outlier removal process fixed

* remove outlier energies if they exist
parent 0b3e26ee
@@ -82,7 +82,7 @@ dspath = os.path.join(path, '../dataset/ani1-up_to_gdb4/ani_gdb_s01.h5')
 batch_size = 2560
 training, validation = torchani.data.load_ani_dataset(
-    dspath, species_to_tensor, batch_size, device=device,
+    dspath, species_to_tensor, batch_size, rm_outlier=True, device=device,
     transform=[energy_shifter.subtract_from_dataset], split=[0.8, None])
 print('Self atomic energies: ', energy_shifter.self_energies)
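This hunk and the two example hunks below make the same change: the call to torchani.data.load_ani_dataset gains rm_outlier=True. Note (from the data-loader hunk further down) that the filter runs after the transform list, so it operates on self-energy-subtracted energies. A minimal sketch of the resulting call; it reuses variables (dspath, species_to_tensor, energy_shifter, device) that each example script defines earlier, so it is not standalone:

# Sketch only: dspath, species_to_tensor, energy_shifter and device are
# defined earlier in each example script; rm_outlier=True enables the
# NeuroChem-style outlier filter shown in the data-loader hunk below.
training, validation = torchani.data.load_ani_dataset(
    dspath, species_to_tensor, 2560, rm_outlier=True, device=device,
    transform=[energy_shifter.subtract_from_dataset], split=[0.8, None])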
@@ -52,8 +52,8 @@ batch_size = 2560
 # from hdf5 files.
 training, validation = torchani.data.load_ani_dataset(
-    dspath, species_to_tensor, batch_size, device=device,
-    atomic_properties=['forces'],
+    dspath, species_to_tensor, batch_size, rm_outlier=True,
+    device=device, atomic_properties=['forces'],
     transform=[energy_shifter.subtract_from_dataset], split=[0.8, None])
 print('Self atomic energies: ', energy_shifter.self_energies)
@@ -102,7 +102,7 @@ writer = torch.utils.tensorboard.SummaryWriter(log_dir=log)
 ###############################################################################
 # Now load training and validation datasets into memory.
 training, validation = torchani.data.load_ani_dataset(
-    dspath, consts.species_to_tensor, batch_size, device=device,
+    dspath, consts.species_to_tensor, batch_size, rm_outlier=True, device=device,
     transform=[energy_shifter.subtract_from_dataset], split=[0.8, None])
 ###############################################################################
@@ -309,23 +309,27 @@ def load_ani_dataset(path, species_tensor_converter, batch_size, shuffle=True,
         atomic_properties_, properties_ = t(atomic_properties_, properties_)
     if rm_outlier:
         # This is how NeuroChem discards the outliers
         transformed_energies = properties_['energies']
         num_atoms = (atomic_properties_['species'] >= 0).to(transformed_energies.dtype).sum(dim=1)
         scaled_diff = transformed_energies / num_atoms.sqrt()
-        mean = transformed_energies.mean()
-        std = transformed_energies.std()
-        tol = 15.0 * std + mean
+        mean = scaled_diff[torch.abs(scaled_diff) < 15.0].mean()
+        std = torch.abs(scaled_diff[torch.abs(scaled_diff) < 15.0]).std()
-        low_idx = (torch.abs(scaled_diff) < tol).nonzero().squeeze()
+        # -15 * std - mean < scaled_diff < +11 * std + mean
+        tol = 13.0 * std + mean
+        low_idx = (torch.abs(scaled_diff + 2.0 * std) < tol).nonzero().squeeze()
         outlier_count = molecules - low_idx.numel()
+        # discard outlier energy conformers if they exist
         if outlier_count > 0:
-            print(f'Note: {outlier_count} outlier energy conformers have been discarded from dataset')
+            print("Note: {} outlier energy conformers have been discarded from dataset".format(outlier_count))
             for key, val in atomic_properties_.items():
                 atomic_properties_[key] = val[low_idx]
             for key, val in properties_.items():
                 properties_[key] = val[low_idx]
             molecules = low_idx.numel()
     # compute size of each subset
     split_ = []
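To make the new filtering rule concrete: each conformer's (self-energy-subtracted) energy is scaled by 1/sqrt(num_atoms), the mean and standard deviation are taken over the subset with |scaled_diff| < 15 Hartree so that gross outliers do not skew the statistics, and everything outside an asymmetric window is dropped. Expanding |scaled_diff + 2*std| < 13*std + mean gives -15*std - mean < scaled_diff < 11*std + mean, so the cut is tighter on the high-energy side. A self-contained sketch on synthetic data (the toy sizes and magnitudes are mine, not from the commit):

import torch

torch.manual_seed(0)
molecules = 1000
# Shifted energies (self energies already subtracted), mostly near zero,
# with a few injected gross outliers.
energies = 0.05 * torch.randn(molecules)
energies[::97] += 5.0
# The real code derives num_atoms as (species >= 0).sum(dim=1), since
# padded entries in batched species tensors are -1; here we just draw
# random atom counts.
num_atoms = torch.randint(2, 20, (molecules,)).to(energies.dtype)

scaled_diff = energies / num_atoms.sqrt()

# Statistics over the pre-filtered subset |scaled_diff| < 15 Hartree,
# as in the new loader code (which writes the mask out twice).
mask = torch.abs(scaled_diff) < 15.0
mean = scaled_diff[mask].mean()
std = torch.abs(scaled_diff[mask]).std()

# Keep conformers inside -15*std - mean < scaled_diff < 11*std + mean.
tol = 13.0 * std + mean
low_idx = (torch.abs(scaled_diff + 2.0 * std) < tol).nonzero().squeeze()
print(molecules - low_idx.numel(), 'conformers would be discarded')

Indexing both atomic_properties_ and properties_ with the same low_idx, as the loader does, keeps the per-atom and per-molecule tensors aligned after the drop.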
@@ -567,11 +567,11 @@ if sys.version_info[0] > 2:
         else:
             self.training_set = self.imports.load_ani_dataset(
                 training_path, self.consts.species_to_tensor,
-                self.training_batch_size, device=self.device,
+                self.training_batch_size, rm_outlier=True, device=self.device,
                 transform=[self.shift_energy.subtract_from_dataset])
             self.validation_set = self.imports.load_ani_dataset(
                 validation_path, self.consts.species_to_tensor,
-                self.validation_batch_size, device=self.device,
+                self.validation_batch_size, rm_outlier=True, device=self.device,
                 transform=[self.shift_energy.subtract_from_dataset])

     def evaluate(self, dataset):
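The trainer hunk above hard-codes rm_outlier=True for both the training and validation loaders, so NeuroChem-style training runs always filter; no new configuration option is added. If unfiltered data is needed, calling the loader directly should work; a sketch, where the variables mirror the trainer's own attributes and rm_outlier=False is presumed to be the keyword's default (these call sites had to opt in, which suggests it, but the diff does not show the signature's default):

# Sketch: load without the outlier filter by bypassing the trainer's
# hard-coded flag. The False default for rm_outlier is an assumption,
# not confirmed by this diff.
training_set = torchani.data.load_ani_dataset(
    training_path, consts.species_to_tensor, batch_size,
    rm_outlier=False, device=device,
    transform=[energy_shifter.subtract_from_dataset])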