import os
import pickle
import pyanitools
import numpy
from neurochem_calculator import NeuroChem, path
import json
import tqdm
import random


neurochem = NeuroChem()

# generate expect for ANI1 subset
mol_count = 0
for i in [1, 2, 3, 4]:
    data_file = os.path.join(
        path, '../dataset/ani_gdb_s0{}.h5'.format(i))
    adl = pyanitools.anidataloader(data_file)
    for data in tqdm.tqdm(adl, desc='ANI1: {} heavy atoms'.format(i)):
        coordinates = data['coordinates'][:10, :]
        pickleobj = neurochem(coordinates, data['species'])
        dumpfile = os.path.join(
            path, '../tests/test_data/ANI1_subset/{}'.format(mol_count))
        with open(dumpfile, 'wb') as f:
            pickle.dump(pickleobj, f)
        mol_count += 1


# generate expect for NIST
keep_ratio = 0.1  # reduce the size of generated file by discarding
mol_count = 0
with open(os.path.join(path, 'diverse_test_set/result.json')) as f:
    pickle_objects = []
    for i in tqdm.tqdm(json.load(f), desc='NIST'):
        if random.random() > keep_ratio:
            continue
        atoms = i['atoms']
        natoms = len(atoms)
        species = []
        coordinates = []
        for atype, x, y, z in atoms:
            species.append(atype)
            coordinates.append([x, y, z])
        pickleobj = neurochem(numpy.array(coordinates), species)
        pickle_objects.append(pickleobj)
        mol_count += 1

    dumpfile = os.path.join(
        path, '../tests/test_data/NIST/all')
    with open(dumpfile, 'wb') as f:
        pickle.dump(pickle_objects, f)