rdkit_utils.py 7.19 KB
Newer Older
1
2
"""Utils for RDKit, mostly adapted from DeepChem
(https://github.com/deepchem/deepchem/blob/master/deepchem)."""
3
# pylint: disable= no-member, arguments-differ, invalid-name
4
5
6
7
8
9
10
import warnings

from functools import partial
from multiprocessing import Pool
from rdkit import Chem
from rdkit.Chem import AllChem

11
__all__ = ['get_mol_3d_coordinates',
12
13
14
           'load_molecule',
           'multiprocess_load_molecules']

15
16
# pylint: disable=W0702
def get_mol_3d_coordinates(mol):
17
18
    """Get 3D coordinates of the molecule.

Mufei Li's avatar
Mufei Li committed
19
20
    This function requires that molecular conformation has been initialized.

21
22
23
24
25
26
27
28
29
30
    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule instance.

    Returns
    -------
    numpy.ndarray of shape (N, 3) or None
        The 3D coordinates of atoms in the molecule. N for the number of atoms in
        the molecule. For failures in getting the conformations, None will be returned.
Mufei Li's avatar
Mufei Li committed
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51

    Examples
    --------
    An error will occur in the example below since the molecule object does not
    carry conformation information.

    >>> from rdkit import Chem
    >>> from dgllife.utils import get_mol_3d_coordinates

    >>> mol = Chem.MolFromSmiles('CCO')

    Below we give a working example based on molecule conformation initialized from calculation.

    >>> from rdkit.Chem import AllChem
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> print(coords)
    array([[ 1.20967478, -0.25802181,  0.        ],
           [-0.05021255,  0.57068079,  0.        ],
           [-1.15946223, -0.31265898,  0.        ]])
52
53
54
55
56
57
58
59
60
61
62
63
64
    """
    try:
        conf = mol.GetConformer()
        conf_num_atoms = conf.GetNumAtoms()
        mol_num_atoms = mol.GetNumAtoms()
        assert mol_num_atoms == conf_num_atoms, \
            'Expect the number of atoms in the molecule and its conformation ' \
            'to be the same, got {:d} and {:d}'.format(mol_num_atoms, conf_num_atoms)
        return conf.GetPositions()
    except:
        warnings.warn('Unable to get conformation of the molecule.')
        return None

65
# pylint: disable=E1101
66
67
def load_molecule(molecule_file, sanitize=False, calc_charges=False,
                  remove_hs=False, use_conformation=True):
Mufei Li's avatar
Mufei Li committed
68
    """Load a molecule from a file of format ``.mol2`` or ``.sdf`` or ``.pdbqt`` or ``.pdb``.
69
70
71
72

    Parameters
    ----------
    molecule_file : str
Mufei Li's avatar
Mufei Li committed
73
74
        Path to file for storing a molecule, which can be of format ``.mol2`` or ``.sdf``
        or ``.pdbqt`` or ``.pdb``.
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
    sanitize : bool
        Whether sanitization is performed in initializing RDKit molecule instances. See
        https://www.rdkit.org/docs/RDKit_Book.html for details of the sanitization.
        Default to False.
    calc_charges : bool
        Whether to add Gasteiger charges via RDKit. Setting this to be True will enforce
        ``sanitize`` to be True. Default to False.
    remove_hs : bool
        Whether to remove hydrogens via RDKit. Note that removing hydrogens can be quite
        slow for large molecules. Default to False.
    use_conformation : bool
        Whether we need to extract molecular conformation from proteins and ligands.
        Default to True.

    Returns
    -------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule instance for the loaded molecule.
    coordinates : np.ndarray of shape (N, 3) or None
        The 3D coordinates of atoms in the molecule. N for the number of atoms in
        the molecule. None will be returned if ``use_conformation`` is False or
        we failed to get conformation information.
    """
    if molecule_file.endswith('.mol2'):
        mol = Chem.MolFromMol2File(molecule_file, sanitize=False, removeHs=False)
    elif molecule_file.endswith('.sdf'):
        supplier = Chem.SDMolSupplier(molecule_file, sanitize=False, removeHs=False)
        mol = supplier[0]
    elif molecule_file.endswith('.pdbqt'):
104
105
        with open(molecule_file) as file:
            pdbqt_data = file.readlines()
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
        pdb_block = ''
        for line in pdbqt_data:
            pdb_block += '{}\n'.format(line[:66])
        mol = Chem.MolFromPDBBlock(pdb_block, sanitize=False, removeHs=False)
    elif molecule_file.endswith('.pdb'):
        mol = Chem.MolFromPDBFile(molecule_file, sanitize=False, removeHs=False)
    else:
        return ValueError('Expect the format of the molecule_file to be '
                          'one of .mol2, .sdf, .pdbqt and .pdb, got {}'.format(molecule_file))

    try:
        if sanitize or calc_charges:
            Chem.SanitizeMol(mol)

        if calc_charges:
            # Compute Gasteiger charges on the molecule.
            try:
                AllChem.ComputeGasteigerCharges(mol)
            except:
                warnings.warn('Unable to compute charges for the molecule.')

        if remove_hs:
            mol = Chem.RemoveHs(mol)
    except:
        return None, None

    if use_conformation:
133
        coordinates = get_mol_3d_coordinates(mol)
134
135
136
137
138
139
140
    else:
        coordinates = None

    return mol, coordinates

def multiprocess_load_molecules(files, sanitize=False, calc_charges=False,
                                remove_hs=False, use_conformation=True, num_processes=2):
Mufei Li's avatar
Mufei Li committed
141
142
    """Load molecules from files with multiprocessing, which can be of format ``.mol2`` or
    ``.sdf`` or ``.pdbqt`` or ``.pdb``.
143
144
145
146

    Parameters
    ----------
    files : list of str
Mufei Li's avatar
Mufei Li committed
147
148
        Each element is a path to a file storing a molecule, which can be of format ``.mol2``,
        ``.sdf``, ``.pdbqt``, or ``.pdb``.
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
    sanitize : bool
        Whether sanitization is performed in initializing RDKit molecule instances. See
        https://www.rdkit.org/docs/RDKit_Book.html for details of the sanitization.
        Default to False.
    calc_charges : bool
        Whether to add Gasteiger charges via RDKit. Setting this to be True will enforce
        ``sanitize`` to be True. Default to False.
    remove_hs : bool
        Whether to remove hydrogens via RDKit. Note that removing hydrogens can be quite
        slow for large molecules. Default to False.
    use_conformation : bool
        Whether we need to extract molecular conformation from proteins and ligands.
        Default to True.
    num_processes : int or None
        Number of worker processes to use. If None,
        then we will use the number of CPUs in the systetm. Default to 2.

    Returns
    -------
    list of 2-tuples
        The first element of each 2-tuple is an RDKit molecule instance. The second element
        of each 2-tuple is the 3D atom coordinates of the corresponding molecule if
        use_conformation is True and the coordinates has been successfully loaded. Otherwise,
        it will be None.
    """
    if num_processes == 1:
        mols_loaded = []
176
        for f in files:
177
178
179
180
181
182
183
184
185
186
187
            mols_loaded.append(load_molecule(
                f, sanitize=sanitize, calc_charges=calc_charges,
                remove_hs=remove_hs, use_conformation=use_conformation))
    else:
        with Pool(processes=num_processes) as pool:
            mols_loaded = pool.map_async(partial(
                load_molecule, sanitize=sanitize, calc_charges=calc_charges,
                remove_hs=remove_hs, use_conformation=use_conformation), files)
            mols_loaded = mols_loaded.get()

    return mols_loaded