[LifeSci] Move to Independent Repo (#1592)

* Move LifeSci
* Remove doc

[LifeSci] Move to Independent Repo (#1592)
* Move LifeSci
* Remove doc
36c7b771 · Mufei Li · GitHub · 94c67203 · 94c67203 · 94c67203
Unverified · Commit 36c7b771 authored Jun 05, 2020 by Mufei Li · Committed by GitHub Jun 05, 2020
20 changed files
--- a/apps/life_sci/examples/generative_models/jtnn/jtnn/chemutils.py
+++ b/apps/life_sci/examples/generative_models/jtnn/jtnn/chemutils.py
-import rdkit.Chem as Chem
-import torch
-from collections import defaultdict
-from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers
-from scipy.sparse import csr_matrix
-from scipy.sparse.csgraph import minimum_spanning_tree
-from dgl import DGLGraph
-ELEM_LIST = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na',
-             'Ca', 'Fe', 'Al', 'I', 'B', 'K', 'Se', 'Zn', 'H', 'Cu', 'Mn', 'unknown']
-MST_MAX_WEIGHT = 100
-MAX_NCAND = 2000
-def onek_encoding_unk(x, allowable_set):
-    if x not in allowable_set:
-        x = allowable_set[-1]
-    return [x == s for s in allowable_set]
-def set_atommap(mol, num=0):
-    for atom in mol.GetAtoms():
-        atom.SetAtomMapNum(num)
-def get_mol(smiles):
-    mol = Chem.MolFromSmiles(smiles)
-    if mol is None:
-        return None
-    Chem.Kekulize(mol)
-    return mol
-def get_smiles(mol):
-    return Chem.MolToSmiles(mol, kekuleSmiles=True)
-def decode_stereo(smiles2D):
-    mol = Chem.MolFromSmiles(smiles2D)
-    dec_isomers = list(EnumerateStereoisomers(mol))
-    dec_isomers = [Chem.MolFromSmiles(Chem.MolToSmiles(
-        mol, isomericSmiles=True)) for mol in dec_isomers]
-    smiles3D = [Chem.MolToSmiles(mol, isomericSmiles=True)
-                for mol in dec_isomers]
-    chiralN = [atom.GetIdx() for atom in dec_isomers[0].GetAtoms() if int(
-        atom.GetChiralTag()) > 0 and atom.GetSymbol() == "N"]
-    if len(chiralN) > 0:
-        for mol in dec_isomers:
-            for idx in chiralN:
-                mol.GetAtomWithIdx(idx).SetChiralTag(
-                    Chem.rdchem.ChiralType.CHI_UNSPECIFIED)
-            smiles3D.append(Chem.MolToSmiles(mol, isomericSmiles=True))
-    return smiles3D
-def sanitize(mol):
-    try:
-        smiles = get_smiles(mol)
-        mol = get_mol(smiles)
-    except Exception as e:
-        return None
-    return mol
-def copy_atom(atom):
-    new_atom = Chem.Atom(atom.GetSymbol())
-    new_atom.SetFormalCharge(atom.GetFormalCharge())
-    new_atom.SetAtomMapNum(atom.GetAtomMapNum())
-    return new_atom
-def copy_edit_mol(mol):
-    new_mol = Chem.RWMol(Chem.MolFromSmiles(''))
-    for atom in mol.GetAtoms():
-        new_atom = copy_atom(atom)
-        new_mol.AddAtom(new_atom)
-    for bond in mol.GetBonds():
-        a1 = bond.GetBeginAtom().GetIdx()
-        a2 = bond.GetEndAtom().GetIdx()
-        bt = bond.GetBondType()
-        new_mol.AddBond(a1, a2, bt)
-    return new_mol
-def get_clique_mol(mol, atoms):
-    smiles = Chem.MolFragmentToSmiles(mol, atoms, kekuleSmiles=True)
-    new_mol = Chem.MolFromSmiles(smiles, sanitize=False)
-    new_mol = copy_edit_mol(new_mol).GetMol()
-    new_mol = sanitize(new_mol)  # We assume this is not None
-    return new_mol
-def tree_decomp(mol):
-    n_atoms = mol.GetNumAtoms()
-    if n_atoms == 1:
-        return [[0]], []
-    cliques = []
-    for bond in mol.GetBonds():
-        a1 = bond.GetBeginAtom().GetIdx()
-        a2 = bond.GetEndAtom().GetIdx()
-        if not bond.IsInRing():
-            cliques.append([a1, a2])
-    ssr = [list(x) for x in Chem.GetSymmSSSR(mol)]
-    cliques.extend(ssr)
-    nei_list = [[] for i in range(n_atoms)]
-    for i in range(len(cliques)):
-        for atom in cliques[i]:
-            nei_list[atom].append(i)
-    # Merge Rings with intersection > 2 atoms
-    for i in range(len(cliques)):
-        if len(cliques[i]) <= 2:
-            continue
-        for atom in cliques[i]:
-            for j in nei_list[atom]:
-                if i >= j or len(cliques[j]) <= 2:
-                    continue
-                inter = set(cliques[i]) & set(cliques[j])
-                if len(inter) > 2:
-                    cliques[i].extend(cliques[j])
-                    cliques[i] = list(set(cliques[i]))
-                    cliques[j] = []
-    cliques = [c for c in cliques if len(c) > 0]
-    nei_list = [[] for i in range(n_atoms)]
-    for i in range(len(cliques)):
-        for atom in cliques[i]:
-            nei_list[atom].append(i)
-    # Build edges and add singleton cliques
-    edges = defaultdict(int)
-    for atom in range(n_atoms):
-        if len(nei_list[atom]) <= 1:
-            continue
-        cnei = nei_list[atom]
-        bonds = [c for c in cnei if len(cliques[c]) == 2]
-        rings = [c for c in cnei if len(cliques[c]) > 4]
-        # In general, if len(cnei) >= 3, a singleton should be added, but 1 bond + 2 ring is currently not dealt with.
-        if len(bonds) > 2 or (len(bonds) == 2 and len(cnei) > 2):
-            cliques.append([atom])
-            c2 = len(cliques) - 1
-            for c1 in cnei:
-                edges[(c1, c2)] = 1
-        elif len(rings) > 2:  # Multiple (n>2) complex rings
-            cliques.append([atom])
-            c2 = len(cliques) - 1
-            for c1 in cnei:
-                edges[(c1, c2)] = MST_MAX_WEIGHT - 1
-        else:
-            for i in range(len(cnei)):
-                for j in range(i + 1, len(cnei)):
-                    c1, c2 = cnei[i], cnei[j]
-                    inter = set(cliques[c1]) & set(cliques[c2])
-                    if edges[(c1, c2)] < len(inter):
-                        # cnei[i] < cnei[j] by construction
-                        edges[(c1, c2)] = len(inter)
-    edges = [u + (MST_MAX_WEIGHT-v,) for u, v in edges.items()]
-    if len(edges) == 0:
-        return cliques, edges
-    # Compute Maximum Spanning Tree
-    row, col, data = list(zip(*edges))
-    n_clique = len(cliques)
-    clique_graph = csr_matrix((data, (row, col)), shape=(n_clique, n_clique))
-    junc_tree = minimum_spanning_tree(clique_graph)
-    row, col = junc_tree.nonzero()
-    edges = [(row[i], col[i]) for i in range(len(row))]
-    return (cliques, edges)
-def atom_equal(a1, a2):
-    return a1.GetSymbol() == a2.GetSymbol() and a1.GetFormalCharge() == a2.GetFormalCharge()
-# Bond type not considered because all aromatic (so SINGLE matches DOUBLE)
-def ring_bond_equal(b1, b2, reverse=False):
-    b1 = (b1.GetBeginAtom(), b1.GetEndAtom())
-    if reverse:
-        b2 = (b2.GetEndAtom(), b2.GetBeginAtom())
-    else:
-        b2 = (b2.GetBeginAtom(), b2.GetEndAtom())
-    return atom_equal(b1[0], b2[0]) and atom_equal(b1[1], b2[1])
-def attach_mols_nx(ctr_mol, neighbors, prev_nodes, nei_amap):
-    prev_nids = [node['nid'] for node in prev_nodes]
-    for nei_node in prev_nodes + neighbors:
-        nei_id, nei_mol = nei_node['nid'], nei_node['mol']
-        amap = nei_amap[nei_id]
-        for atom in nei_mol.GetAtoms():
-            if atom.GetIdx() not in amap:
-                new_atom = copy_atom(atom)
-                amap[atom.GetIdx()] = ctr_mol.AddAtom(new_atom)
-        if nei_mol.GetNumBonds() == 0:
-            nei_atom = nei_mol.GetAtomWithIdx(0)
-            ctr_atom = ctr_mol.GetAtomWithIdx(amap[0])
-            ctr_atom.SetAtomMapNum(nei_atom.GetAtomMapNum())
-        else:
-            for bond in nei_mol.GetBonds():
-                a1 = amap[bond.GetBeginAtom().GetIdx()]
-                a2 = amap[bond.GetEndAtom().GetIdx()]
-                if ctr_mol.GetBondBetweenAtoms(a1, a2) is None:
-                    ctr_mol.AddBond(a1, a2, bond.GetBondType())
-                elif nei_id in prev_nids:  # father node overrides
-                    ctr_mol.RemoveBond(a1, a2)
-                    ctr_mol.AddBond(a1, a2, bond.GetBondType())
-    return ctr_mol
-def local_attach_nx(ctr_mol, neighbors, prev_nodes, amap_list):
-    ctr_mol = copy_edit_mol(ctr_mol)
-    nei_amap = {nei['nid']: {} for nei in prev_nodes + neighbors}
-    for nei_id, ctr_atom, nei_atom in amap_list:
-        nei_amap[nei_id][nei_atom] = ctr_atom
-    ctr_mol = attach_mols_nx(ctr_mol, neighbors, prev_nodes, nei_amap)
-    return ctr_mol.GetMol()
-# This version records idx mapping between ctr_mol and nei_mol
-def enum_attach_nx(ctr_mol, nei_node, amap, singletons):
-    nei_mol, nei_idx = nei_node['mol'], nei_node['nid']
-    att_confs = []
-    black_list = [atom_idx for nei_id, atom_idx,
-                  _ in amap if nei_id in singletons]
-    ctr_atoms = [atom for atom in ctr_mol.GetAtoms() if atom.GetIdx()
-                 not in black_list]
-    ctr_bonds = [bond for bond in ctr_mol.GetBonds()]
-    if nei_mol.GetNumBonds() == 0:  # neighbor singleton
-        nei_atom = nei_mol.GetAtomWithIdx(0)
-        used_list = [atom_idx for _, atom_idx, _ in amap]
-        for atom in ctr_atoms:
-            if atom_equal(atom, nei_atom) and atom.GetIdx() not in used_list:
-                new_amap = amap + [(nei_idx, atom.GetIdx(), 0)]
-                att_confs.append(new_amap)
-    elif nei_mol.GetNumBonds() == 1:  # neighbor is a bond
-        bond = nei_mol.GetBondWithIdx(0)
-        bond_val = int(bond.GetBondTypeAsDouble())
-        b1, b2 = bond.GetBeginAtom(), bond.GetEndAtom()
-        for atom in ctr_atoms:
-            # Optimize if atom is carbon (other atoms may change valence)
-            if atom.GetAtomicNum() == 6 and atom.GetTotalNumHs() < bond_val:
-                continue
-            if atom_equal(atom, b1):
-                new_amap = amap + [(nei_idx, atom.GetIdx(), b1.GetIdx())]
-                att_confs.append(new_amap)
-            elif atom_equal(atom, b2):
-                new_amap = amap + [(nei_idx, atom.GetIdx(), b2.GetIdx())]
-                att_confs.append(new_amap)
-    else:
-        # intersection is an atom
-        for a1 in ctr_atoms:
-            for a2 in nei_mol.GetAtoms():
-                if atom_equal(a1, a2):
-                    # Optimize if atom is carbon (other atoms may change valence)
-                    if a1.GetAtomicNum() == 6 and a1.GetTotalNumHs() + a2.GetTotalNumHs() < 4:
-                        continue
-                    new_amap = amap + [(nei_idx, a1.GetIdx(), a2.GetIdx())]
-                    att_confs.append(new_amap)
-        # intersection is an bond
-        if ctr_mol.GetNumBonds() > 1:
-            for b1 in ctr_bonds:
-                for b2 in nei_mol.GetBonds():
-                    if ring_bond_equal(b1, b2):
-                        new_amap = amap + [(nei_idx, b1.GetBeginAtom().GetIdx(), b2.GetBeginAtom(
-                        ).GetIdx()), (nei_idx, b1.GetEndAtom().GetIdx(), b2.GetEndAtom().GetIdx())]
-                        att_confs.append(new_amap)
-                    if ring_bond_equal(b1, b2, reverse=True):
-                        new_amap = amap + [(nei_idx, b1.GetBeginAtom().GetIdx(), b2.GetEndAtom(
-                        ).GetIdx()), (nei_idx, b1.GetEndAtom().GetIdx(), b2.GetBeginAtom().GetIdx())]
-                        att_confs.append(new_amap)
-    return att_confs
-# Try rings first: Speed-Up
-def enum_assemble_nx(node, neighbors, prev_nodes=[], prev_amap=[]):
-    all_attach_confs = []
-    singletons = [nei_node['nid'] for nei_node in neighbors +
-                  prev_nodes if nei_node['mol'].GetNumAtoms() == 1]
-    def search(cur_amap, depth):
-        if len(all_attach_confs) > MAX_NCAND:
-            return
-        if depth == len(neighbors):
-            all_attach_confs.append(cur_amap)
-            return
-        nei_node = neighbors[depth]
-        cand_amap = enum_attach_nx(node['mol'], nei_node, cur_amap, singletons)
-        cand_smiles = set()
-        candidates = []
-        for amap in cand_amap:
-            cand_mol = local_attach_nx(
-                node['mol'], neighbors[:depth+1], prev_nodes, amap)
-            cand_mol = sanitize(cand_mol)
-            if cand_mol is None:
-                continue
-            smiles = get_smiles(cand_mol)
-            if smiles in cand_smiles:
-                continue
-            cand_smiles.add(smiles)
-            candidates.append(amap)
-        if len(candidates) == 0:
-            return []
-        for new_amap in candidates:
-            search(new_amap, depth + 1)
-    search(prev_amap, 0)
-    cand_smiles = set()
-    candidates = []
-    for amap in all_attach_confs:
-        cand_mol = local_attach_nx(node['mol'], neighbors, prev_nodes, amap)
-        cand_mol = Chem.MolFromSmiles(Chem.MolToSmiles(cand_mol))
-        smiles = Chem.MolToSmiles(cand_mol)
-        if smiles in cand_smiles:
-            continue
-        cand_smiles.add(smiles)
-        Chem.Kekulize(cand_mol)
-        candidates.append((smiles, cand_mol, amap))
-    return candidates
-# Only used for debugging purpose
-def dfs_assemble_nx(graph, cur_mol, global_amap, fa_amap, cur_node_id, fa_node_id):
-    cur_node = graph.nodes_dict[cur_node_id]
-    fa_node = graph.nodes_dict[fa_node_id] if fa_node_id is not None else None
-    fa_nid = fa_node['nid'] if fa_node is not None else -1
-    prev_nodes = [fa_node] if fa_node is not None else []
-    children_id = [nei for nei in graph[cur_node_id]
-                   if graph.nodes_dict[nei]['nid'] != fa_nid]
-    children = [graph.nodes_dict[nei] for nei in children_id]
-    neighbors = [nei for nei in children if nei['mol'].GetNumAtoms() > 1]
-    neighbors = sorted(
-        neighbors, key=lambda x: x['mol'].GetNumAtoms(), reverse=True)
-    singletons = [nei for nei in children if nei['mol'].GetNumAtoms() == 1]
-    neighbors = singletons + neighbors
-    cur_amap = [(fa_nid, a2, a1)
-                for nid, a1, a2 in fa_amap if nid == cur_node['nid']]
-    cands = enum_assemble_nx(
-        graph.nodes_dict[cur_node_id], neighbors, prev_nodes, cur_amap)
-    if len(cands) == 0:
-        return
-    cand_smiles, _, cand_amap = zip(*cands)
-    label_idx = cand_smiles.index(cur_node['label'])
-    label_amap = cand_amap[label_idx]
-    for nei_id, ctr_atom, nei_atom in label_amap:
-        if nei_id == fa_nid:
-            continue
-        global_amap[nei_id][nei_atom] = global_amap[cur_node['nid']][ctr_atom]
-    # father is already attached
-    cur_mol = attach_mols_nx(cur_mol, children, [], global_amap)
-    for nei_node_id, nei_node in zip(children_id, children):
-        if not nei_node['is_leaf']:
-            dfs_assemble_nx(graph, cur_mol, global_amap,
-                            label_amap, nei_node_id, cur_node_id)
-def mol2dgl_dec(cand_batch):
-    # Note that during graph decoding they don't predict stereochemistry-related
-    # characteristics (i.e. Chiral Atoms, E-Z, Cis-Trans).  Instead, they decode
-    # the 2-D graph first, then enumerate all possible 3-D forms and find the
-    # one with highest score.
-    def atom_features(atom):
-        return (torch.Tensor(onek_encoding_unk(atom.GetSymbol(), ELEM_LIST)
-                             + onek_encoding_unk(atom.GetDegree(),
-                                                 [0, 1, 2, 3, 4, 5])
-                             + onek_encoding_unk(atom.GetFormalCharge(),
-                                                 [-1, -2, 1, 2, 0])
-                             + [atom.GetIsAromatic()]))
-    def bond_features(bond):
-        bt = bond.GetBondType()
-        return (torch.Tensor([bt == Chem.rdchem.BondType.SINGLE, bt == Chem.rdchem.BondType.DOUBLE,
-                              bt == Chem.rdchem.BondType.TRIPLE, bt == Chem.rdchem.BondType.AROMATIC,
-                              bond.IsInRing()]))
-    cand_graphs = []
-    tree_mess_source_edges = []  # map these edges from trees to...
-    tree_mess_target_edges = []  # these edges on candidate graphs
-    tree_mess_target_nodes = []
-    n_nodes = 0
-    atom_x = []
-    bond_x = []
-    for mol, mol_tree, ctr_node_id in cand_batch:
-        n_atoms = mol.GetNumAtoms()
-        g = DGLGraph()
-        for i, atom in enumerate(mol.GetAtoms()):
-            assert i == atom.GetIdx()
-            atom_x.append(atom_features(atom))
-        g.add_nodes(n_atoms)
-        bond_src = []
-        bond_dst = []
-        for i, bond in enumerate(mol.GetBonds()):
-            a1 = bond.GetBeginAtom()
-            a2 = bond.GetEndAtom()
-            begin_idx = a1.GetIdx()
-            end_idx = a2.GetIdx()
-            features = bond_features(bond)
-            bond_src.append(begin_idx)
-            bond_dst.append(end_idx)
-            bond_x.append(features)
-            bond_src.append(end_idx)
-            bond_dst.append(begin_idx)
-            bond_x.append(features)
-            x_nid, y_nid = a1.GetAtomMapNum(), a2.GetAtomMapNum()
-            # Tree node ID in the batch
-            x_bid = mol_tree.nodes_dict[x_nid - 1]['idx'] if x_nid > 0 else -1
-            y_bid = mol_tree.nodes_dict[y_nid - 1]['idx'] if y_nid > 0 else -1
-            if x_bid >= 0 and y_bid >= 0 and x_bid != y_bid:
-                if mol_tree.has_edge_between(x_bid, y_bid):
-                    tree_mess_target_edges.append(
-                        (begin_idx + n_nodes, end_idx + n_nodes))
-                    tree_mess_source_edges.append((x_bid, y_bid))
-                    tree_mess_target_nodes.append(end_idx + n_nodes)
-                if mol_tree.has_edge_between(y_bid, x_bid):
-                    tree_mess_target_edges.append(
-                        (end_idx + n_nodes, begin_idx + n_nodes))
-                    tree_mess_source_edges.append((y_bid, x_bid))
-                    tree_mess_target_nodes.append(begin_idx + n_nodes)
-        n_nodes += n_atoms
-        g.add_edges(bond_src, bond_dst)
-        cand_graphs.append(g)
-    return cand_graphs, torch.stack(atom_x), \
-        torch.stack(bond_x) if len(bond_x) > 0 else torch.zeros(0), \
-        torch.LongTensor(tree_mess_source_edges), \
-        torch.LongTensor(tree_mess_target_edges), \
-        torch.LongTensor(tree_mess_target_nodes)
-def mol2dgl_enc(smiles):
-    def atom_features(atom):
-        return (torch.Tensor(onek_encoding_unk(atom.GetSymbol(), ELEM_LIST)
-                             + onek_encoding_unk(atom.GetDegree(),
-                                                 [0, 1, 2, 3, 4, 5])
-                             + onek_encoding_unk(atom.GetFormalCharge(), [-1, -2, 1, 2, 0])
-                             + onek_encoding_unk(int(atom.GetChiralTag()), [0, 1, 2, 3])
-                             + [atom.GetIsAromatic()]))
-    def bond_features(bond):
-        bt = bond.GetBondType()
-        stereo = int(bond.GetStereo())
-        fbond = [bt == Chem.rdchem.BondType.SINGLE, bt == Chem.rdchem.BondType.DOUBLE, bt ==
-                 Chem.rdchem.BondType.TRIPLE, bt == Chem.rdchem.BondType.AROMATIC, bond.IsInRing()]
-        fstereo = onek_encoding_unk(stereo, [0, 1, 2, 3, 4, 5])
-        return (torch.Tensor(fbond + fstereo))
-    n_edges = 0
-    atom_x = []
-    bond_x = []
-    mol = get_mol(smiles)
-    n_atoms = mol.GetNumAtoms()
-    n_bonds = mol.GetNumBonds()
-    graph = DGLGraph()
-    for i, atom in enumerate(mol.GetAtoms()):
-        assert i == atom.GetIdx()
-        atom_x.append(atom_features(atom))
-    graph.add_nodes(n_atoms)
-    bond_src = []
-    bond_dst = []
-    for i, bond in enumerate(mol.GetBonds()):
-        begin_idx = bond.GetBeginAtom().GetIdx()
-        end_idx = bond.GetEndAtom().GetIdx()
-        features = bond_features(bond)
-        bond_src.append(begin_idx)
-        bond_dst.append(end_idx)
-        bond_x.append(features)
-        # set up the reverse direction
-        bond_src.append(end_idx)
-        bond_dst.append(begin_idx)
-        bond_x.append(features)
-    graph.add_edges(bond_src, bond_dst)
-    n_edges += n_bonds
-    return graph, torch.stack(atom_x), \
-        torch.stack(bond_x) if len(bond_x) > 0 else torch.zeros(0)
--- a/apps/life_sci/examples/generative_models/jtnn/jtnn/datautils.py
+++ b/apps/life_sci/examples/generative_models/jtnn/jtnn/datautils.py
-import dgl
-import os
-import torch
-from dgl.data.utils import download, extract_archive, get_download_dir
-from torch.utils.data import Dataset
-from .mol_tree import Vocab, DGLMolTree
-from .chemutils import mol2dgl_dec, mol2dgl_enc
-ELEM_LIST = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca',
-             'Fe', 'Al', 'I', 'B', 'K', 'Se', 'Zn', 'H', 'Cu', 'Mn', 'unknown']
-ATOM_FDIM_DEC = len(ELEM_LIST) + 6 + 5 + 1
-BOND_FDIM_DEC = 5
-MAX_NB = 10
-PAPER = os.getenv('PAPER', False)
-_url = 'https://s3-ap-southeast-1.amazonaws.com/dgl-data-cn/dataset/jtnn.zip'
-def _unpack_field(examples, field):
-    return [e[field] for e in examples]
-def _set_node_id(mol_tree, vocab):
-    wid = []
-    for i, node in enumerate(mol_tree.nodes_dict):
-        mol_tree.nodes_dict[node]['idx'] = i
-        wid.append(vocab.get_index(mol_tree.nodes_dict[node]['smiles']))
-    return wid
-class JTNNDataset(Dataset):
-    def __init__(self, data, vocab, training=True):
-        self.dir = get_download_dir()
-        self.zip_file_path='{}/jtnn.zip'.format(self.dir)
-        download(_url, path=self.zip_file_path)
-        extract_archive(self.zip_file_path, '{}/jtnn'.format(self.dir))
-        print('Loading data...')
-        if data in ['train', 'test']:
-            data_file = '{}/jtnn/{}.txt'.format(self.dir, data)
-        else:
-            data_file = data
-        with open(data_file) as f:
-            self.data = [line.strip("\r\n ").split()[0] for line in f]
-        self.vocab_file = '{}/jtnn/{}.txt'.format(self.dir, vocab)
-        print('Loading finished.')
-        print('\tNum samples:', len(self.data))
-        print('\tVocab file:', self.vocab_file)
-        self.training = training
-        self.vocab = Vocab([x.strip("\r\n ") for x in open(self.vocab_file)])
-    def __len__(self):
-        return len(self.data)
-    def __getitem__(self, idx):
-        smiles = self.data[idx]
-        mol_tree = DGLMolTree(smiles)
-        mol_tree.recover()
-        mol_tree.assemble()
-        wid = _set_node_id(mol_tree, self.vocab)
-        # prebuild the molecule graph
-        mol_graph, atom_x_enc, bond_x_enc = mol2dgl_enc(mol_tree.smiles)
-        result = {
-                'mol_tree': mol_tree,
-                'mol_graph': mol_graph,
-                'atom_x_enc': atom_x_enc,
-                'bond_x_enc': bond_x_enc,
-                'wid': wid,
-                }
-        if not self.training:
-            return result
-        # prebuild the candidate graph list
-        cands = []
-        for node_id, node in mol_tree.nodes_dict.items():
-            # fill in ground truth
-            if node['label'] not in node['cands']:
-                node['cands'].append(node['label'])
-                node['cand_mols'].append(node['label_mol'])
-            if node['is_leaf'] or len(node['cands']) == 1:
-                continue
-            cands.extend([(cand, mol_tree, node_id)
-                         for cand in node['cand_mols']])
-        if len(cands) > 0:
-            cand_graphs, atom_x_dec, bond_x_dec, tree_mess_src_e, \
-                    tree_mess_tgt_e, tree_mess_tgt_n = mol2dgl_dec(cands)
-        else:
-            cand_graphs = []
-            atom_x_dec = torch.zeros(0, ATOM_FDIM_DEC)
-            bond_x_dec = torch.zeros(0, BOND_FDIM_DEC)
-            tree_mess_src_e = torch.zeros(0, 2).long()
-            tree_mess_tgt_e = torch.zeros(0, 2).long()
-            tree_mess_tgt_n = torch.zeros(0).long()
-        # prebuild the stereoisomers
-        cands = mol_tree.stereo_cands
-        if len(cands) > 1:
-            if mol_tree.smiles3D not in cands:
-                cands.append(mol_tree.smiles3D)
-            stereo_graphs = [mol2dgl_enc(c) for c in cands]
-            stereo_cand_graphs, stereo_atom_x_enc, stereo_bond_x_enc = \
-                    zip(*stereo_graphs)
-            stereo_atom_x_enc = torch.cat(stereo_atom_x_enc)
-            stereo_bond_x_enc = torch.cat(stereo_bond_x_enc)
-            stereo_cand_label = [(cands.index(mol_tree.smiles3D), len(cands))]
-        else:
-            stereo_cand_graphs = []
-            stereo_atom_x_enc = torch.zeros(0, atom_x_enc.shape[1])
-            stereo_bond_x_enc = torch.zeros(0, bond_x_enc.shape[1])
-            stereo_cand_label = []
-        result.update({
-            'cand_graphs': cand_graphs,
-            'atom_x_dec': atom_x_dec,
-            'bond_x_dec': bond_x_dec,
-            'tree_mess_src_e': tree_mess_src_e,
-            'tree_mess_tgt_e': tree_mess_tgt_e,
-            'tree_mess_tgt_n': tree_mess_tgt_n,
-            'stereo_cand_graphs': stereo_cand_graphs,
-            'stereo_atom_x_enc': stereo_atom_x_enc,
-            'stereo_bond_x_enc': stereo_bond_x_enc,
-            'stereo_cand_label': stereo_cand_label,
-            })
-        return result
-class JTNNCollator(object):
-    def __init__(self, vocab, training):
-        self.vocab = vocab
-        self.training = training
-    @staticmethod
-    def _batch_and_set(graphs, atom_x, bond_x, flatten):
-        if flatten:
-            graphs = [g for f in graphs for g in f]
-        graph_batch = dgl.batch(graphs)
-        graph_batch.ndata['x'] = atom_x
-        graph_batch.edata.update({
-            'x': bond_x,
-            'src_x': atom_x.new(bond_x.shape[0], atom_x.shape[1]).zero_(),
-            })
-        return graph_batch
-    def __call__(self, examples):
-        # get list of trees
-        mol_trees = _unpack_field(examples, 'mol_tree')
-        wid = _unpack_field(examples, 'wid')
-        for _wid, mol_tree in zip(wid, mol_trees):
-            mol_tree.ndata['wid'] = torch.LongTensor(_wid)
-        # TODO: either support pickling or get around ctypes pointers using scipy
-        # batch molecule graphs
-        mol_graphs = _unpack_field(examples, 'mol_graph')
-        atom_x = torch.cat(_unpack_field(examples, 'atom_x_enc'))
-        bond_x = torch.cat(_unpack_field(examples, 'bond_x_enc'))
-        mol_graph_batch = self._batch_and_set(mol_graphs, atom_x, bond_x, False)
-        result = {
-                'mol_trees': mol_trees,
-                'mol_graph_batch': mol_graph_batch,
-                }
-        if not self.training:
-            return result
-        # batch candidate graphs
-        cand_graphs = _unpack_field(examples, 'cand_graphs')
-        cand_batch_idx = []
-        atom_x = torch.cat(_unpack_field(examples, 'atom_x_dec'))
-        bond_x = torch.cat(_unpack_field(examples, 'bond_x_dec'))
-        tree_mess_src_e = _unpack_field(examples, 'tree_mess_src_e')
-        tree_mess_tgt_e = _unpack_field(examples, 'tree_mess_tgt_e')
-        tree_mess_tgt_n = _unpack_field(examples, 'tree_mess_tgt_n')
-        n_graph_nodes = 0
-        n_tree_nodes = 0
-        for i in range(len(cand_graphs)):
-            tree_mess_tgt_e[i] += n_graph_nodes
-            tree_mess_src_e[i] += n_tree_nodes
-            tree_mess_tgt_n[i] += n_graph_nodes
-            n_graph_nodes += sum(g.number_of_nodes() for g in cand_graphs[i])
-            n_tree_nodes += mol_trees[i].number_of_nodes()
-            cand_batch_idx.extend([i] * len(cand_graphs[i]))
-        tree_mess_tgt_e = torch.cat(tree_mess_tgt_e)
-        tree_mess_src_e = torch.cat(tree_mess_src_e)
-        tree_mess_tgt_n = torch.cat(tree_mess_tgt_n)
-        cand_graph_batch = self._batch_and_set(cand_graphs, atom_x, bond_x, True)
-        # batch stereoisomers
-        stereo_cand_graphs = _unpack_field(examples, 'stereo_cand_graphs')
-        atom_x = torch.cat(_unpack_field(examples, 'stereo_atom_x_enc'))
-        bond_x = torch.cat(_unpack_field(examples, 'stereo_bond_x_enc'))
-        stereo_cand_batch_idx = []
-        for i in range(len(stereo_cand_graphs)):
-            stereo_cand_batch_idx.extend([i] * len(stereo_cand_graphs[i]))
-        if len(stereo_cand_batch_idx) > 0:
-            stereo_cand_labels = [
-                    (label, length)
-                    for ex in _unpack_field(examples, 'stereo_cand_label')
-                    for label, length in ex
-                    ]
-            stereo_cand_labels, stereo_cand_lengths = zip(*stereo_cand_labels)
-            stereo_cand_graph_batch = self._batch_and_set(
-                    stereo_cand_graphs, atom_x, bond_x, True)
-        else:
-            stereo_cand_labels = []
-            stereo_cand_lengths = []
-            stereo_cand_graph_batch = None
-            stereo_cand_batch_idx = []
-        result.update({
-            'cand_graph_batch': cand_graph_batch,
-            'cand_batch_idx': cand_batch_idx,
-            'tree_mess_tgt_e': tree_mess_tgt_e,
-            'tree_mess_src_e': tree_mess_src_e,
-            'tree_mess_tgt_n': tree_mess_tgt_n,
-            'stereo_cand_graph_batch': stereo_cand_graph_batch,
-            'stereo_cand_batch_idx': stereo_cand_batch_idx,
-            'stereo_cand_labels': stereo_cand_labels,
-            'stereo_cand_lengths': stereo_cand_lengths,
-            })
-        return result
--- a/apps/life_sci/examples/generative_models/jtnn/jtnn/mol_tree.py
+++ b/apps/life_sci/examples/generative_models/jtnn/jtnn/mol_tree.py
-import copy
-import numpy as np
-import rdkit.Chem as Chem
-from dgl import DGLGraph
-from .chemutils import get_clique_mol, tree_decomp, get_mol, get_smiles, \
-    set_atommap, enum_assemble_nx, decode_stereo
-def get_slots(smiles):
-    mol = Chem.MolFromSmiles(smiles)
-    return [(atom.GetSymbol(), atom.GetFormalCharge(), atom.GetTotalNumHs()) for atom in mol.GetAtoms()]
-class Vocab(object):
-    def __init__(self, smiles_list):
-        self.vocab = smiles_list
-        self.vmap = {x: i for i, x in enumerate(self.vocab)}
-        self.slots = [get_slots(smiles) for smiles in self.vocab]
-    def get_index(self, smiles):
-        return self.vmap[smiles]
-    def get_smiles(self, idx):
-        return self.vocab[idx]
-    def get_slots(self, idx):
-        return copy.deepcopy(self.slots[idx])
-    def size(self):
-        return len(self.vocab)
-class DGLMolTree(DGLGraph):
-    """Junction tree of a molecule, stored as a DGLGraph over its cliques.
-
-    Per-node data lives in ``self.nodes_dict``, keyed by node ID. ``__init__``
-    fills ``smiles``, ``mol``, ``clique``, ``nid`` and ``is_leaf``;
-    ``recover`` adds ``label`` and ``label_mol``; ``assemble`` adds ``cands``
-    and ``cand_mols``.
-    """
-    def __init__(self, smiles):
-        """Decompose ``smiles`` into cliques and build the tree over them.
-
-        Parameters
-        ----------
-        smiles : str or None
-            SMILES string of the molecule. With ``None`` only an empty graph
-            and an empty ``nodes_dict`` are created.
-        """
-        DGLGraph.__init__(self)
-        self.nodes_dict = {}
-        if smiles is None:
-            return
-        self.smiles = smiles
-        self.mol = get_mol(smiles)
-        # Stereo generation uses a freshly parsed molecule, not self.mol.
-        parsed = Chem.MolFromSmiles(smiles)
-        self.smiles3D = Chem.MolToSmiles(parsed, isomericSmiles=True)
-        self.smiles2D = Chem.MolToSmiles(parsed)
-        self.stereo_cands = decode_stereo(self.smiles2D)
-        # cliques: a list of list of atom indices
-        cliques, edges = tree_decomp(self.mol)
-        root = 0
-        for idx, atoms in enumerate(cliques):
-            clique_smiles = get_smiles(get_clique_mol(self.mol, atoms))
-            self.nodes_dict[idx] = {
-                'smiles': clique_smiles,
-                'mol': get_mol(clique_smiles),
-                'clique': atoms,
-            }
-            if min(atoms) == 0:
-                root = idx
-        self.add_nodes(len(cliques))
-        # The clique with atom ID 0 becomes root: exchange the attributes of
-        # node 0 and node ``root``.
-        if root > 0:
-            first, chosen = self.nodes_dict[0], self.nodes_dict[root]
-            for key in first:
-                first[key], chosen[key] = chosen[key], first[key]
-        def relabel(node_id):
-            # Mirror the 0 <-> root exchange above on an edge endpoint.
-            if node_id == root:
-                return 0
-            if node_id == 0:
-                return root
-            return node_id
-        num_directed = len(edges) * 2
-        src = np.zeros((num_directed,), dtype='int')
-        dst = np.zeros((num_directed,), dtype='int')
-        for k, (u, v) in enumerate(edges):
-            u, v = relabel(u), relabel(v)
-            # Every tree edge is stored once in each direction.
-            src[2 * k], dst[2 * k] = u, v
-            src[2 * k + 1], dst[2 * k + 1] = v, u
-        self.add_edges(src, dst)
-        for idx, info in self.nodes_dict.items():
-            degree = self.out_degree(idx)
-            info['nid'] = idx + 1
-            if degree > 1:  # Leaf node mol is not marked
-                set_atommap(info['mol'], info['nid'])
-            info['is_leaf'] = (degree == 1)
-    def treesize(self):
-        """Return the number of nodes in the tree."""
-        return self.number_of_nodes()
-    def _recover_node(self, i, original_mol):
-        """Compute, store and return the label SMILES of node ``i``.
-
-        Atoms of node ``i`` (unless it is a leaf) and of its non-leaf
-        successors are tagged in ``original_mol`` with the owning node's
-        ``nid`` as atom-map number. The sub-molecule over the union of those
-        cliques is then extracted, and all touched atoms are reset to map
-        number 0 before returning.
-
-        Parameters
-        ----------
-        i : int
-            Node ID.
-        original_mol
-            Molecule whose atom indices the cliques refer to; it is mutated
-            temporarily.
-
-        Returns
-        -------
-        str
-            The value stored in ``nodes_dict[i]['label']``.
-        """
-        node = self.nodes_dict[i]
-        atoms = list(node['clique'])
-        if not node['is_leaf']:
-            for atom_idx in node['clique']:
-                original_mol.GetAtomWithIdx(atom_idx).SetAtomMapNum(node['nid'])
-        for j in self.successors(i).numpy():
-            neighbor = self.nodes_dict[j]
-            atoms.extend(neighbor['clique'])
-            if neighbor['is_leaf']:  # Leaf node, no need to mark
-                continue
-            is_singleton = len(neighbor['clique']) == 1
-            for atom_idx in neighbor['clique']:
-                # allow singleton node override the atom mapping
-                if is_singleton or atom_idx not in node['clique']:
-                    original_mol.GetAtomWithIdx(atom_idx).SetAtomMapNum(neighbor['nid'])
-        atoms = list(set(atoms))
-        label_mol = get_clique_mol(original_mol, atoms)
-        label = Chem.MolToSmiles(Chem.MolFromSmiles(get_smiles(label_mol)))
-        node['label'] = label
-        node['label_mol'] = get_mol(label)
-        # Undo the temporary atom-map numbers.
-        for atom_idx in atoms:
-            original_mol.GetAtomWithIdx(atom_idx).SetAtomMapNum(0)
-        return label
-    def _assemble_node(self, i):
-        """Enumerate assembly candidates for node ``i`` and store them.
-
-        Successors are passed to ``enum_assemble_nx`` with single-atom nodes
-        first, followed by multi-atom nodes in decreasing atom count. The
-        results go to ``nodes_dict[i]['cands']`` and
-        ``nodes_dict[i]['cand_mols']`` (both empty lists if there are none).
-        """
-        multi_atom, single_atom = [], []
-        for j in self.successors(i).numpy():
-            info = self.nodes_dict[j]
-            size = info['mol'].GetNumAtoms()
-            if size > 1:
-                multi_atom.append(info)
-            elif size == 1:
-                single_atom.append(info)
-        multi_atom.sort(key=lambda entry: entry['mol'].GetNumAtoms(), reverse=True)
-        node = self.nodes_dict[i]
-        cands = enum_assemble_nx(node, single_atom + multi_atom)
-        if len(cands) > 0:
-            # Each candidate is a triple; the third component is not kept.
-            first_parts, second_parts, _ = zip(*cands)
-            node['cands'] = list(first_parts)
-            node['cand_mols'] = list(second_parts)
-        else:
-            node['cands'] = []
-            node['cand_mols'] = []
-    def recover(self):
-        """Run ``_recover_node`` on every node, using ``self.mol``."""
-        for node_id in self.nodes_dict:
-            self._recover_node(node_id, self.mol)
-    def assemble(self):
-        """Run ``_assemble_node`` on every node."""
-        for node_id in self.nodes_dict:
-            self._assemble_node(node_id)
--- a/apps/life_sci/examples/generative_models/jtnn/reconstruct_eval.py
+++ b/apps/life_sci/examples/generative_models/jtnn/reconstruct_eval.py
-import argparse
-import rdkit
-import torch
-from dgllife.model import DGLJTNNVAE, load_pretrained
-from dgllife.model.model_zoo.jtnn.nnutils import cuda
-from torch.utils.data import DataLoader
-from jtnn import *
-def worker_init_fn(id_):
-    """Set the RDKit logger level to CRITICAL; ``id_`` is not used."""
-    rdkit.RDLogger.logger().setLevel(rdkit.RDLogger.CRITICAL)
-# Set the RDKit log level for this process.
-worker_init_fn(None)
-# Command-line options; the size options must match the loaded checkpoint.
-parser = argparse.ArgumentParser(description="Evaluation for JTNN",
-                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument("-t", "--train", dest="train",
-                    default='test', help='Training file name')
-parser.add_argument("-v", "--vocab", dest="vocab",
-                    default='vocab', help='Vocab file name')
-parser.add_argument("-m", "--model", dest="model_path", default=None,
-                    help="Pre-trained model to be loaded for evaluation. If not specified,"
-                         " would use pre-trained model from model zoo")
-# type=int makes argparse reject non-integer values at parse time.
-parser.add_argument("-w", "--hidden", dest="hidden_size", type=int, default=450,
-                    help="Hidden size of representation vector, "
-                         "should be consistent with pre-trained model")
-parser.add_argument("-l", "--latent", dest="latent_size", type=int, default=56,
-                    help="Latent Size of node(atom) features and edge(atom) features, "
-                         "should be consistent with pre-trained model")
-parser.add_argument("-d", "--depth", dest="depth", type=int, default=3,
-                    help="Depth of message passing hops, "
-                         "should be consistent with pre-trained model")
-args = parser.parse_args()
-dataset = JTNNDataset(data=args.train, vocab=args.vocab, training=False)
-vocab_file = dataset.vocab_file
-hidden_size = args.hidden_size
-latent_size = args.latent_size
-depth = args.depth
-model = DGLJTNNVAE(vocab_file=vocab_file,
-                   hidden_size=hidden_size,
-                   latent_size=latent_size,
-                   depth=depth)
-if args.model_path is not None:
-    model.load_state_dict(torch.load(args.model_path))
-else:
-    # NOTE: without --model the instance built above is replaced by the
-    # pre-trained model from the model zoo.
-    model = load_pretrained("JTNN_ZINC")
-model = cuda(model)
-model.eval()
-print("Model #Params: %dK" %
-      (sum(x.nelement() for x in model.parameters()) / 1000,))
-MAX_EPOCH = 100
-PRINT_ITER = 20
-def reconstruct():
-    """Encode and decode each molecule in ``dataset`` and score exact matches.
-
-    Molecules are processed one at a time with the module-level ``model``.
-    A molecule whose encoding or decoding raises is reported and left out of
-    the score.
-
-    Returns
-    -------
-    float
-        Fraction of processed molecules whose decoded SMILES equals the
-        input SMILES, or 0.0 if no molecule could be processed.
-    """
-    dataset.training = False
-    dataloader = DataLoader(
-        dataset,
-        batch_size=1,
-        shuffle=False,
-        num_workers=0,
-        collate_fn=JTNNCollator(dataset.vocab, False),
-        drop_last=True,
-        worker_init_fn=worker_init_fn)
-    # Just an example of molecule decoding; in reality you may want to sample
-    # tree and molecule vectors.
-    acc = 0.0
-    tot = 0
-    with torch.no_grad():
-        for it, batch in enumerate(dataloader):
-            gt_smiles = batch['mol_trees'][0].smiles
-            model.move_to_cuda(batch)
-            try:
-                _, tree_vec, mol_vec = model.encode(batch)
-                tree_mean = model.T_mean(tree_vec)
-                # Following Mueller et al.
-                tree_log_var = -torch.abs(model.T_var(tree_vec))
-                mol_mean = model.G_mean(mol_vec)
-                # Following Mueller et al.
-                mol_log_var = -torch.abs(model.G_var(mol_vec))
-                # std = exp(log_var / 2). This has to be true division:
-                # floor division would round the exponent to a whole number.
-                # The noise is moved to wherever the means live instead of
-                # unconditionally to CUDA.
-                epsilon = torch.randn(1, model.latent_size // 2).to(tree_mean.device)
-                tree_vec = tree_mean + torch.exp(tree_log_var / 2) * epsilon
-                epsilon = torch.randn(1, model.latent_size // 2).to(mol_mean.device)
-                mol_vec = mol_mean + torch.exp(mol_log_var / 2) * epsilon
-                dec_smiles = model.decode(tree_vec, mol_vec)
-                if dec_smiles == gt_smiles:
-                    acc += 1
-                tot += 1
-            except Exception as e:
-                # Best effort: report the failure and continue with the rest.
-                print("Failed to encode: {}".format(gt_smiles))
-                print(e)
-            if it % PRINT_ITER == 1:
-                # tot is still 0 if every molecule so far has failed.
-                running_acc = acc / tot if tot > 0 else 0.0
-                print("Progress {}/{}; Current Reconstruction Accuracy: {:.4f}".format(
-                    it, len(dataloader), running_acc))
-    return acc / tot if tot > 0 else 0.0
-if __name__ == '__main__':
-    # Run the evaluation once and report the resulting accuracy.
-    print("Reconstruction Accuracy: {}".format(reconstruct()))
--- a/apps/life_sci/examples/generative_models/jtnn/train.py
+++ b/apps/life_sci/examples/generative_models/jtnn/train.py
--- a/apps/life_sci/examples/generative_models/jtnn/vocab.py
+++ b/apps/life_sci/examples/generative_models/jtnn/vocab.py
-"""Generate vocabulary for a new dataset."""
-if __name__ == '__main__':
-    import argparse
-    import os
-    import rdkit
-    from dgl.data.utils import _get_dgl_url, download, get_download_dir, extract_archive
-    from jtnn.mol_tree import DGLMolTree
-    parser = argparse.ArgumentParser('Generate vocabulary for a molecule dataset')
-    # Both paths are opened unconditionally below, so require them up front
-    # rather than failing later on open(None).
-    parser.add_argument('-d', '--data-path', type=str, required=True,
-                        help='Path to the dataset')
-    parser.add_argument('-v', '--vocab', type=str, required=True,
-                        help='Path to the vocabulary file to save')
-    args = parser.parse_args()
-    lg = rdkit.RDLogger.logger()
-    lg.setLevel(rdkit.RDLogger.CRITICAL)
-    # Collect the clique SMILES of every molecule, one molecule per line.
-    vocab = set()
-    with open(args.data_path, 'r') as f:
-        for line in f:
-            smiles = line.strip()
-            if not smiles:
-                # Skip blank lines instead of building a tree from ''.
-                continue
-            mol = DGLMolTree(smiles)
-            vocab.update(info['smiles'] for info in mol.nodes_dict.values())
-    with open(args.vocab, 'w') as f:
-        # Sorted so that the same dataset always yields the same file.
-        f.writelines(v + '\n' for v in sorted(vocab))
-    # Get the vocabulary used for the pre-trained model
-    default_dir = get_download_dir()
-    vocab_file = '{}/jtnn/{}.txt'.format(default_dir, 'vocab')
-    if not os.path.exists(vocab_file):
-        zip_file_path = '{}/jtnn.zip'.format(default_dir)
-        download(_get_dgl_url('dgllife/jtnn.zip'), path=zip_file_path)
-        extract_archive(zip_file_path, '{}/jtnn'.format(default_dir))
-    with open(vocab_file, 'r') as f:
-        default_vocab = {line.strip() for line in f}
-    print('The new vocabulary is a subset of the default vocabulary: {}'.format(
-        vocab.issubset(default_vocab)))
--- a/apps/life_sci/examples/property_prediction/README.md
+++ b/apps/life_sci/examples/property_prediction/README.md
--- a/apps/life_sci/examples/property_prediction/classification.py
+++ b/apps/life_sci/examples/property_prediction/classification.py
--- a/apps/life_sci/examples/property_prediction/configure.py
+++ b/apps/life_sci/examples/property_prediction/configure.py
--- a/apps/life_sci/examples/property_prediction/regression.py
+++ b/apps/life_sci/examples/property_prediction/regression.py
--- a/apps/life_sci/examples/property_prediction/utils.py
+++ b/apps/life_sci/examples/property_prediction/utils.py
--- a/apps/life_sci/examples/reaction_prediction/rexgen_direct/README.md
+++ b/apps/life_sci/examples/reaction_prediction/rexgen_direct/README.md
--- a/apps/life_sci/examples/reaction_prediction/rexgen_direct/candidate_ranking_eval.py
+++ b/apps/life_sci/examples/reaction_prediction/rexgen_direct/candidate_ranking_eval.py
--- a/apps/life_sci/examples/reaction_prediction/rexgen_direct/candidate_ranking_train.py
+++ b/apps/life_sci/examples/reaction_prediction/rexgen_direct/candidate_ranking_train.py
--- a/apps/life_sci/examples/reaction_prediction/rexgen_direct/configure.py
+++ b/apps/life_sci/examples/reaction_prediction/rexgen_direct/configure.py
--- a/apps/life_sci/examples/reaction_prediction/rexgen_direct/find_reaction_center_eval.py
+++ b/apps/life_sci/examples/reaction_prediction/rexgen_direct/find_reaction_center_eval.py
--- a/apps/life_sci/examples/reaction_prediction/rexgen_direct/find_reaction_center_train.py
+++ b/apps/life_sci/examples/reaction_prediction/rexgen_direct/find_reaction_center_train.py
--- a/apps/life_sci/examples/reaction_prediction/rexgen_direct/utils.py
+++ b/apps/life_sci/examples/reaction_prediction/rexgen_direct/utils.py
--- a/apps/life_sci/python/dgllife/__init__.py
+++ b/apps/life_sci/python/dgllife/__init__.py
-"""DGL-based package for applications in life science."""
-from . import data
-from . import model
-from . import utils
-from .libinfo import __version__
--- a/apps/life_sci/python/dgllife/data/__init__.py
+++ b/apps/life_sci/python/dgllife/data/__init__.py
-"""Dataset classes."""
-from .alchemy import *
-from .csv_dataset import *
-from .pdbbind import *
-from .pubchem_aromaticity import *
-from .tox21 import *
-from .uspto import *