Unverified Commit cd9fb7ba authored by Mufei Li's avatar Mufei Li Committed by GitHub
Browse files

[Model Zoo] Refactor GCN on Tox21 (#766)

* [Model zoo] Model zoo (#765)

* tox21

* fix ci

* fix ci

* fix urls to url

* add doc

* remove binary

* model zoo

* test

* markdown

* fix typo

* fix typo

* fix typo

* raise error

* fix lint

* remove unnecessary

* fix doc

* fix

* fix

* fix

* fix

* fix

* fix

* Update

* CI

* Fix

* Fix

* Fix

* Fix

* Fix

* CI
parent bdcba9c8
Model Zoo
==========
Here are examples of using the model zoo.
# Property Prediction
## Classification
Classification tasks require assigning discrete labels to a molecule, e.g. molecule toxicity.
### Datasets
- **Tox21**. The ["Toxicology in the 21st Century" (Tox21)](https://tripod.nih.gov/tox21/challenge/) initiative created
a public database measuring toxicity of compounds, which has been used in the 2014 Tox21 Data Challenge. The dataset
contains qualitative toxicity measurements for 8014 compounds on 12 different targets, including nuclear receptors and
stress response pathways. Each target yields a binary prediction problem. MoleculeNet [1] randomly splits the dataset
into training, validation and test sets with an 80/10/10 ratio. By default we follow their split method.
### Models
- **Graph Convolutional Network** [2]. Graph Convolutional Networks (GCN) have been one of the most popular graph neural
networks and they can be easily extended for graph level prediction. MoleculeNet [1] reports baseline results of graph
convolutions over multiple datasets.
### Usage
To train a model from scratch, simply call `python classification.py`. To skip training and use the pre-trained model,
call `python classification.py -p`.
We use GPU whenever it is available.
### Performance
#### GCN on Tox21
| Source | Averaged ROC-AUC Score |
| ---------------- | ---------------------- |
| MoleculeNet [1] | 0.829 |
| [DeepChem example](https://github.com/deepchem/deepchem/blob/master/examples/tox21/tox21_tensorgraph_graph_conv.py) | 0.813 |
| Pretrained model | 0.827 |
Note that due to some possible randomness you may get different numbers for the DeepChem example and our model. To
reproduce the exact result reported for our model, please use the pre-trained model as described in the usage section.
## Dataset Customization
To customize your own dataset, see the instructions
[here](https://github.com/dmlc/dgl/tree/master/python/dgl/data/chem).
### References
[1] Wu et al. (2017) MoleculeNet: a benchmark for molecular machine learning. *Chemical Science* 9, 513-530.
[2] Kipf et al. (2017) Semi-Supervised Classification with Graph Convolutional Networks.
*The International Conference on Learning Representations (ICLR)*.
from dgl.data import Tox21
from dgl.data.utils import split_dataset
from dgl import model_zoo
import torch
from torch.nn import BCEWithLogitsLoss
from torch.optim import Adam
from torch.utils.data import DataLoader
from utils import Meter, EarlyStopping, collate_molgraphs, set_random_seed
def _evaluate(model, data_loader, device, atom_data_field):
    """Score ``model`` on every batch of ``data_loader`` without tracking gradients.

    Parameters
    ----------
    model : torch.nn.Module
        Model called as ``model(atom_feats, bg)`` and returning logits.
    data_loader : DataLoader
        Yields ``(smiles, bg, labels, mask)`` batches.
    device : str
        Device the atom features and labels are moved to.
    atom_data_field : str
        Name of the node feature popped from each batched graph.

    Returns
    -------
    float
        ROC-AUC score averaged over tasks, as computed by ``Meter``.
    """
    meter = Meter()
    model.eval()
    with torch.no_grad():
        for smiles, bg, labels, mask in data_loader:
            atom_feats = bg.ndata.pop(atom_data_field)
            atom_feats, labels = atom_feats.to(device), labels.to(device)
            logits = model(atom_feats, bg)
            meter.update(logits, labels, mask)
    return meter.roc_auc_averaged_over_tasks()


def main(args):
    """Train (or load) a GCN classifier on Tox21 and report the test ROC-AUC.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``pre_trained`` (bool). When true, training is skipped
        and the pre-trained ``'GCN_Tox21'`` model is evaluated instead.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    batch_size = 128
    learning_rate = 0.001
    num_epochs = 100
    set_random_seed()

    # Interchangeable with other Dataset
    dataset = Tox21()
    atom_data_field = 'h'

    trainset, valset, testset = split_dataset(dataset, [0.8, 0.1, 0.1])
    train_loader = DataLoader(
        trainset, batch_size=batch_size, collate_fn=collate_molgraphs)
    val_loader = DataLoader(
        valset, batch_size=batch_size, collate_fn=collate_molgraphs)
    test_loader = DataLoader(
        testset, batch_size=batch_size, collate_fn=collate_molgraphs)

    if args.pre_trained:
        # Zero epochs: the training loop below is skipped entirely.
        num_epochs = 0
        model = model_zoo.chem.load_pretrained('GCN_Tox21')
    else:
        # Interchangeable with other models
        model = model_zoo.chem.GCNClassifier(in_feats=74,
                                             gcn_hidden_feats=[64, 64],
                                             n_tasks=dataset.n_tasks)

    # reduction='none' keeps the per-label losses so that missing labels
    # can be masked out before averaging.
    loss_criterion = BCEWithLogitsLoss(pos_weight=torch.tensor(
        dataset.task_pos_weights).to(device), reduction='none')
    optimizer = Adam(model.parameters(), lr=learning_rate)
    stopper = EarlyStopping(patience=10)
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        print('Start training')
        train_meter = Meter()
        for batch_id, batch_data in enumerate(train_loader):
            smiles, bg, labels, mask = batch_data
            atom_feats = bg.ndata.pop(atom_data_field)
            atom_feats, labels, mask = atom_feats.to(device), labels.to(device), mask.to(device)
            logits = model(atom_feats, bg)
            # Mask non-existing labels
            loss = (loss_criterion(logits, labels)
                    * (mask != 0).float()).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print('epoch {:d}/{:d}, batch {:d}/{:d}, loss {:.4f}'.format(
                epoch + 1, num_epochs, batch_id + 1, len(train_loader), loss.item()))
            train_meter.update(logits, labels, mask)
        train_roc_auc = train_meter.roc_auc_averaged_over_tasks()
        print('epoch {:d}/{:d}, training roc-auc score {:.4f}'.format(
            epoch + 1, num_epochs, train_roc_auc))

        val_roc_auc = _evaluate(model, val_loader, device, atom_data_field)
        if stopper.step(val_roc_auc, model):
            break
        print('epoch {:d}/{:d}, validation roc-auc score {:.4f}, best validation roc-auc score {:.4f}'.format(
            epoch + 1, num_epochs, val_roc_auc, stopper.best_score))

    # Evaluate the best validation checkpoint rather than the weights of the
    # last epoch. A checkpoint only exists if at least one epoch was run,
    # which is exactly when the stopper has recorded a best score.
    if stopper.best_score is not None:
        stopper.load_checkpoint(model)

    test_roc_auc = _evaluate(model, test_loader, device, atom_data_field)
    print('test roc-auc score {:.4f}'.format(test_roc_auc))
if __name__ == '__main__':
    # Command-line entry point: parse the single optional flag and run.
    from argparse import ArgumentParser

    parser = ArgumentParser(description='Molecule Classification')
    parser.add_argument(
        '-p', '--pre-trained',
        action='store_true',
        help='Whether to skip training and use a pre-trained model')
    args = parser.parse_args()
    main(args)
{
"cells": [
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"from dgl import model_zoo\n",
"import torch\n",
"import rdkit\n",
"from rdkit import Chem\n",
"from rdkit.Chem.Draw import IPythonConsole\n",
"from dgl.data.chem.utils import smile2graph\n",
"import dgl"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading pretrained model...\n"
]
},
{
"data": {
"text/plain": [
"GCNClassifier(\n",
" (gcn_layers): ModuleList(\n",
" (0): GCNLayer(\n",
" (graph_conv): GraphConv(in=74, out=64, normalization=False, activation=<function relu at 0x7efd7f46e158>)\n",
" (dropout): Dropout(p=0.0)\n",
" (res_connection): Linear(in_features=74, out_features=64, bias=True)\n",
" (bn_layer): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
" )\n",
" (1): GCNLayer(\n",
" (graph_conv): GraphConv(in=64, out=64, normalization=False, activation=<function relu at 0x7efd7f46e158>)\n",
" (dropout): Dropout(p=0.0)\n",
" (res_connection): Linear(in_features=64, out_features=64, bias=True)\n",
" (bn_layer): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
" )\n",
" )\n",
" (atom_weighting): Sequential(\n",
" (0): Linear(in_features=64, out_features=1, bias=True)\n",
" (1): Sigmoid()\n",
" )\n",
" (soft_classifier): MLPBinaryClassifier(\n",
" (predict): Sequential(\n",
" (0): Dropout(p=0.0)\n",
" (1): Linear(in_features=128, out_features=64, bias=True)\n",
" (2): ReLU()\n",
" (3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
" (4): Linear(in_features=64, out_features=12, bias=True)\n",
" )\n",
" )\n",
")"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = model_zoo.chem.load_pretrained(\"GCN_Tox21\")\n",
"model.eval()\n",
"model"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"tasks = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase',\n",
" 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE',\n",
" 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"smiles = \"CC[NH+](CC)c1ccc(/C=C2\\Oc3c(ccc(OCC(N)=O)c3C)C2=O)cc1\""
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<rdkit.Chem.rdchem.Mol at 0x7efd736958f0>"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m = Chem.MolFromSmiles(smiles)\n",
"m"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"g = smile2graph(smiles)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DGLGraph(num_nodes=28, num_edges=60,\n",
" ndata_schemes={'h': Scheme(shape=(74,), dtype=torch.float32)}\n",
" edata_schemes={})"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"g"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"bg = dgl.batch([g])"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([28, 74])"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bg.ndata['h'].shape"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/playground/mz_dgl/python/dgl/base.py:18: UserWarning: Initializer is not set. Use zero initializer instead. To suppress this warning, use `set_initializer` to explicitly specify which initializer to use.\n",
" warnings.warn(msg, warn_type)\n"
]
}
],
"source": [
"logits = model(bg.ndata['h'], bg)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"preds = logits.data.numpy() > 0.5"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>NR-AR</th>\n",
" <th>NR-AR-LBD</th>\n",
" <th>NR-AhR</th>\n",
" <th>NR-Aromatase</th>\n",
" <th>NR-ER</th>\n",
" <th>NR-ER-LBD</th>\n",
" <th>NR-PPAR-gamma</th>\n",
" <th>SR-ARE</th>\n",
" <th>SR-ATAD5</th>\n",
" <th>SR-HSE</th>\n",
" <th>SR-MMP</th>\n",
" <th>SR-p53</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" NR-AR NR-AR-LBD NR-AhR NR-Aromatase NR-ER NR-ER-LBD NR-PPAR-gamma \\\n",
"0 False False True False True False True \n",
"\n",
" SR-ARE SR-ATAD5 SR-HSE SR-MMP SR-p53 \n",
"0 False True False False True "
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"pd.DataFrame(preds, columns=tasks)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Environment (conda_miniconda3-latest)",
"language": "python",
"name": "conda_miniconda3-latest"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
import dgl
import numpy as np
import os
import random
import torch
from sklearn.metrics import roc_auc_score
def set_random_seed(seed=0):
    """Seed the Python, NumPy and PyTorch random number generators.

    Parameters
    ----------
    seed : int
        Seed value passed to every generator. Default to 0.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    # Also seed the CUDA generator when a GPU is present.
    torch.cuda.manual_seed(seed)
class Meter(object):
    """Accumulate per-batch predictions, labels and masks for epoch-level evaluation."""

    def __init__(self):
        self.mask = []
        self.y_pred = []
        self.y_true = []

    def update(self, y_pred, y_true, mask):
        """Record the results of one batch.

        The tensors are detached and copied to CPU before being stored, so
        the meter neither keeps the autograd graph of every batch alive for
        the whole epoch nor pins accelerator memory. It also guarantees that
        predictions, labels and masks live on the same device at scoring time.

        Parameters
        ----------
        y_pred : Tensor
            Predicted logits for the batch, indexed as (datapoint, task).
        y_true : Tensor
            Ground-truth labels with the same layout as ``y_pred``.
        mask : Tensor
            Non-zero where the corresponding label exists.
        """
        self.y_pred.append(y_pred.detach().cpu())
        self.y_true.append(y_true.detach().cpu())
        self.mask.append(mask.detach().cpu())

    # Todo: Allow different evaluation metrics
    def roc_auc_averaged_over_tasks(self):
        """Compute roc-auc score for each task and return the average.

        Only entries whose mask is non-zero contribute to a task's score.

        Returns
        -------
        float
            Mean of the per-task ROC-AUC scores.
        """
        mask = torch.cat(self.mask, dim=0)
        y_pred = torch.cat(self.y_pred, dim=0)
        y_true = torch.cat(self.y_true, dim=0)
        # Todo: support categorical classes
        # This assumes binary case only
        y_pred = torch.sigmoid(y_pred)
        n_tasks = y_true.shape[1]
        total_score = 0
        for task in range(n_tasks):
            # Boolean selector of the datapoints labelled for this task.
            labelled = mask[:, task] != 0
            task_y_true = y_true[:, task][labelled].numpy()
            task_y_pred = y_pred[:, task][labelled].numpy()
            # NOTE(review): roc_auc_score is expected to fail when a task
            # has a single class among its labelled datapoints -- confirm.
            total_score += roc_auc_score(task_y_true, task_y_pred)
        return total_score / n_tasks
class EarlyStopping(object):
    """Stop training once the validation score has stopped improving.

    A higher score is treated as better. Whenever the score does not drop
    below the best one seen so far, the model's state dict is written to
    ``filename``.

    Parameters
    ----------
    patience : int
        Number of consecutive non-improving steps after which
        ``step`` starts returning True. Default to 10.
    filename : str
        Path of the checkpoint file. It must not exist yet.
    """

    def __init__(self, patience=10, filename="es_checkpoint.pth"):
        assert not os.path.exists(filename), \
            'Filename {} is occupied. Either rename it or delete it.'.format(filename)

        self.filename = filename
        self.patience = patience
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def step(self, acc, model):
        """Register a new validation score and report whether to stop.

        Parameters
        ----------
        acc : float
            Validation score of the current epoch (higher is better).
        model : torch.nn.Module
            Model to checkpoint when the score did not get worse.

        Returns
        -------
        bool
            True once the patience has been exhausted.
        """
        is_first_score = self.best_score is None

        # Todo: this is not true for all metrics.
        if not is_first_score and acc < self.best_score:
            self.counter += 1
            print(
                f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
            return self.early_stop

        # First score, or a score at least as good as the best one so far.
        self.best_score = acc
        self.save_checkpoint(model)
        if not is_first_score:
            self.counter = 0
        return self.early_stop

    def save_checkpoint(self, model):
        '''Write the model's state dict to the checkpoint file.'''
        state = model.state_dict()
        torch.save(state, self.filename)

    def load_checkpoint(self, model):
        '''Restore the model from the state dict saved by early stopping.'''
        state = torch.load(self.filename)
        model.load_state_dict(state)
def collate_molgraphs(data):
    """Collate a list of datapoints into one batch for a dataloader.

    Parameters
    ----------
    data : list of 4-tuples
        Each tuple describes a single datapoint and consists of
        a SMILES string, a DGLGraph, the labels for all tasks and
        the mask for all tasks.

    Returns
    -------
    smiles : list
        SMILES strings of the datapoints, in input order.
    bg : BatchedDGLGraph
        The DGLGraphs merged with ``dgl.batch``; its node and edge
        initializers are set to the zero initializer.
    labels : Tensor
        Per-datapoint labels stacked along a new first dimension,
        so the first dimension has size ``len(data)``.
    mask : Tensor
        Per-datapoint masks stacked the same way as ``labels``.
    """
    # Transpose the list of tuples into four parallel lists.
    columns = [list(column) for column in zip(*data)]
    smiles, graphs, labels, mask = columns

    bg = dgl.batch(graphs)
    for set_initializer in (bg.set_n_initializer, bg.set_e_initializer):
        set_initializer(dgl.init.zero_initializer)

    batched_labels = torch.stack(labels, dim=0)
    batched_mask = torch.stack(mask, dim=0)
    return smiles, bg, batched_labels, batched_mask
# Customize Dataset
Generally we follow the practice of PyTorch.
A Dataset class should implement the `__getitem__(self, index)` and `__len__(self)` methods.
```python
class CustomDataset:
def __init__(self):
# Initialize Dataset and preprocess data
def __getitem__(self, index):
# Return the corresponding DGLGraph/label needed for training/evaluation based on index
return self.graphs[index], self.labels[index]
def __len__(self):
return len(self.graphs)
```
DGL supports various backends such as MXNet and PyTorch, therefore we want our dataset to be also backend agnostic.
We prefer that users store numpy arrays in the dataset and avoid any operator/tensor from a specific backend.
If you want to convert a numpy array to the corresponding tensor, you can use the following code:
```python
import dgl.backend as F
# g is a DGLGraph, h is a numpy array
g.ndata['h'] = F.zerocopy_from_numpy(h)
# Now g.ndata is a PyTorch Tensor or a MXNet NDArray based on backend used
```
If your dataset is in `.csv` format, you may use
[`CSVDataset`](https://github.com/dmlc/dgl/blob/master/python/dgl/data/chem/csv_dataset.py).
......@@ -8,14 +8,15 @@ import sys
from dgl import DGLGraph
from .utils import smile2graph
from ..utils import download, get_download_dir, _get_dgl_url, Subset
class CSVDataset(object):
"""CSVDataset
This is a general class for loading data from csv or pd.DataFrame.
In data pre-processing, we set non-existing labels to be 0, and returning mask with 1 where label exists.
In data pre-processing, we set non-existing labels to be 0,
and returning mask with 1 where label exists.
All molecules are converted into DGLGraphs. After the first-time construction, the
DGLGraphs will be saved for reloading so that we do not need to reconstruct them every time.
......@@ -38,13 +39,16 @@ class CSVDataset(object):
Path to store the preprocessed data
"""
def __init__(self, df, smile2graph=smile2graph, smile_column='smiles', cache_file_path="csvdata_dglgraph.pkl"):
def __init__(self, df, smile2graph=smile2graph, smile_column='smiles',
cache_file_path="csvdata_dglgraph.pkl"):
if 'rdkit' not in sys.modules:
from ...base import dgl_warning
dgl_warning("Please install RDKit (Recommended Version is 2018.09.3)")
dgl_warning(
"Please install RDKit (Recommended Version is 2018.09.3)")
self.df = df
self.smiles = self.df[smile_column].tolist()
self.task_names = self.df.columns.drop([smile_column]).tolist()
self.n_tasks = len(self.task_names)
self.cache_file_path = cache_file_path
self._pre_process(smile2graph)
......@@ -62,17 +66,14 @@ class CSVDataset(object):
with open(self.cache_file_path, 'rb') as f:
self.graphs = pickle.load(f)
else:
self.graphs = []
for id, s in enumerate(self.smiles):
self.graphs.append(smile2graph(s))
self.graphs = [smile2graph(s) for s in self.smiles]
with open(self.cache_file_path, 'wb') as f:
pickle.dump(self.graphs, f)
_label_values = self.df[self.task_names].values
# np.nan_to_num will also turn inf into a very large number
self.labels = F.zerocopy_from_numpy(np.nan_to_num(_label_values))
self.mask = F.zerocopy_from_numpy(~np.isnan(_label_values).astype(np.float32))
self.labels = np.nan_to_num(_label_values).astype(np.float32)
self.mask = (~np.isnan(_label_values)).astype(np.float32)
def __getitem__(self, item):
"""Get the ith datapoint
......@@ -88,7 +89,9 @@ class CSVDataset(object):
Tensor of dtype float32
Weights of the datapoint for all tasks
"""
return self.smiles[item], self.graphs[item], self.labels[item], self.mask[item]
return self.smiles[item], self.graphs[item], \
F.zerocopy_from_numpy(self.labels[item]), \
F.zerocopy_from_numpy(self.mask[item])
def __len__(self):
"""Length of Dataset
......
......@@ -3,15 +3,13 @@ import sys
from .csv_dataset import CSVDataset
from .utils import smile2graph
from ..utils import get_download_dir, download, _get_dgl_url, Subset
from ..utils import get_download_dir, download, _get_dgl_url
try:
import pandas as pd
except ImportError:
pass
class Tox21(CSVDataset):
_url = 'dataset/tox21.csv.gz'
......@@ -49,6 +47,7 @@ class Tox21(CSVDataset):
self.id = df['mol_id']
df = df.drop(columns=['mol_id'])
super().__init__(df, smile2graph, cache_file_path="tox21_dglgraph.pkl")
self._weight_balancing()
......
import dgl.backend as F
import numpy as np
import os
import pickle
from dgl import DGLGraph
......@@ -30,7 +28,6 @@ def one_hot_encoding(x, allowable_set):
"""
return list(map(lambda s: x == s, allowable_set))
class BaseAtomFeaturizer(object):
"""An abstract class for atom featurizers
......@@ -45,8 +42,7 @@ class BaseAtomFeaturizer(object):
def __call__(self, mol):
return NotImplementedError
class DefaultAtomFeaturizer(BaseAtomFeaturizer):
class CanonicalAtomFeaturizer(BaseAtomFeaturizer):
"""A default featurizer for atoms.
The atom features include:
......@@ -76,7 +72,7 @@ class DefaultAtomFeaturizer(BaseAtomFeaturizer):
"""
def __init__(self, atom_data_field='h'):
super(DefaultAtomFeaturizer, self).__init__()
super(CanonicalAtomFeaturizer, self).__init__()
self.atom_data_field = atom_data_field
@property
......@@ -140,8 +136,7 @@ class DefaultAtomFeaturizer(BaseAtomFeaturizer):
return {self.atom_data_field: atom_features}
def smile2graph(smile, add_self_loop=False, atom_featurizer=None, bond_featurizer=None):
def smile2graph(smile, add_self_loop=False, atom_featurizer=CanonicalAtomFeaturizer(), bond_featurizer=None):
"""Convert SMILES into a DGLGraph.
The **i** th atom in the molecule, i.e. ``mol.GetAtomWithIdx(i)``, corresponds to the
......@@ -163,7 +158,7 @@ def smile2graph(smile, add_self_loop=False, atom_featurizer=None, bond_featurize
Whether to add self loops in DGLGraphs.
atom_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
Featurization for atoms in a molecule, which can be used to update
ndata for a DGLGraph.
ndata for a DGLGraph. Default to CanonicalAtomFeaturizer().
bond_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
Featurization for bonds in a molecule, which can be used to update
edata for a DGLGraph.
......
Model Zoo API
==================
We provide two major APIs for the model zoo. For the time being, only PyTorch is supported.
- `model_zoo.chem.[Model_Name]` to load the model skeleton
- `model_zoo.chem.load_pretrained([Pretrained_Model_Name])` to load the model with pretrained weights
Models would be placed in `python/dgl/model_zoo/chem`.
Each Model should contain the following elements:
- Papers related to the model
- Model's input and output
- Dataset compatible with the model
- Documentation for all the customizable configs
- Credits (Contributor information)
"""Package for model zoo."""
from . import chem
# DGL for Chemistry
With atoms being nodes and bonds being edges, molecular graphs are among the core objects for study in drug discovery.
As drug discovery is known to be costly and time consuming, deep learning on graphs can be potentially beneficial for
improving the efficiency of drug discovery [1], [2].
With pre-trained models and training scripts, we hope this model zoo will be helpful for both
the chemistry community and the deep learning community to further their research.
## Dependencies
Before you proceed, make sure you have installed the dependencies below:
- PyTorch 1.2
- Check the [official website](https://pytorch.org/) for installation guide
- pandas 0.24.2
- Install with either `conda install pandas` or `pip install pandas`
- RDKit 2018.09.3
- We recommend installation with `conda install -c conda-forge rdkit==2018.09.3`. For other installation recipes,
see the [official documentation](https://www.rdkit.org/docs/Install.html).
- requests 2.22.0
- Install with `pip install requests`
- scikit-learn 0.21.2
- Install with `pip install -U scikit-learn` or `conda install scikit-learn`
## Property Prediction
[**Get started with our example code!**](https://github.com/dmlc/dgl/tree/master/examples/pytorch/model_zoo/chem/property_prediction)
To evaluate molecules for drug candidates, we need to know their properties and activities. In practice, this is
mostly achieved via wet lab experiments. We can cast the problem as a regression or classification problem.
In practice, this can be quite difficult due to the scarcity of labeled data.
### Featurization and Representation Learning
Fingerprint has been a widely used concept in cheminformatics. Chemists developed hand designed rules to convert
molecules into binary strings where each bit indicates the presence or absence of a particular substructure. The
development of fingerprints makes the comparison of molecules a lot easier. Previous machine learning methods are
mostly developed based on molecule fingerprints.
Graph neural networks make it possible for a data-driven representation of molecules out of the atoms, bonds and
molecular graph topology, which may be viewed as a learned fingerprint [3].
### Models
- **Graph Convolutional Network**: Graph Convolutional Networks (GCN) have been one of the most popular graph neural
networks and they can be easily extended for graph level prediction.
## References
[1] Chen et al. (2018) The rise of deep learning in drug discovery. *Drug Discov Today* 6, 1241-1250.
[2] Vamathevan et al. (2019) Applications of machine learning in drug discovery and development.
*Nature Reviews Drug Discovery* 18, 463-477.
[3] Duvenaud et al. (2015) Convolutional networks on graphs for learning molecular fingerprints. *Advances in neural
information processing systems (NeurIPS)*, 2224-2232.
# pylint: disable=C0111
"""Model Zoo Package"""
from .gcn import GCNClassifier
from .pretrain import load_pretrained
# pylint: disable=C0111, C0103, C0200
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
from dgl.nn.pytorch import GraphConv
class GCNLayer(nn.Module):
    """Single GCN layer with optional residual connection, dropout and batch norm."""

    def __init__(self, in_feats, out_feats, activation=F.relu,
                 residual=True, batchnorm=True, dropout=0.):
        """Single layer GCN for updating node features

        Parameters
        ----------
        in_feats : int
            Size of the incoming atom features
        out_feats : int
            Size of the atom features produced by this layer
        activation : activation function
            Applied by the graph convolution and to the residual branch.
            Default to be ReLU
        residual : bool
            Whether to add a linear residual branch, default to be True
        batchnorm : bool
            Whether to batch-normalize the layer output,
            default to be True
        dropout : float
            Dropout probability. Default to be 0., i.e. no
            dropout is performed.
        """
        super(GCNLayer, self).__init__()

        self.activation = activation
        self.residual = residual
        self.bn = batchnorm

        # Sub-modules are created in a fixed order (conv, dropout, residual,
        # batch norm) so that parameter initialization consumes the random
        # number generator in the same sequence every time.
        self.graph_conv = GraphConv(in_feats=in_feats, out_feats=out_feats,
                                    norm=False, activation=activation)
        self.dropout = nn.Dropout(dropout)
        if residual:
            self.res_connection = nn.Linear(in_feats, out_feats)
        if batchnorm:
            self.bn_layer = nn.BatchNorm1d(out_feats)

    def forward(self, feats, bg):
        """Update atom representations

        Parameters
        ----------
        feats : FloatTensor of shape (N, M1)
            * N is the total number of atoms in the batched graph
            * M1 is the input atom feature size, must match in_feats in initialization
        bg : BatchedDGLGraph
            Batched DGLGraphs for processing multiple molecules in parallel

        Returns
        -------
        new_feats : FloatTensor of shape (N, M2)
            * M2 is the output atom feature size, must match out_feats in initialization
        """
        updated = self.graph_conv(feats, bg)
        if self.residual:
            # The residual branch is a linear map followed by the activation.
            updated = updated + self.activation(self.res_connection(feats))
        updated = self.dropout(updated)
        return self.bn_layer(updated) if self.bn else updated
class MLPBinaryClassifier(nn.Module):
    """Two-layer MLP producing one logit per task from a molecule representation."""

    def __init__(self, in_feats, hidden_feats, n_tasks, dropout=0.):
        """MLP for soft binary classification over multiple tasks from molecule representations.

        Parameters
        ----------
        in_feats : int
            Size of the input molecular graph features
        hidden_feats : int
            Size of the hidden layer
        n_tasks : int
            Number of tasks, which is also the output size
        dropout : float
            Dropout probability applied to the input. Default to be 0.,
            i.e. no dropout is performed.
        """
        super(MLPBinaryClassifier, self).__init__()
        # Stage order: dropout -> linear -> ReLU -> batch norm -> linear.
        stages = [
            nn.Dropout(dropout),
            nn.Linear(in_feats, hidden_feats),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_feats),
            nn.Linear(hidden_feats, n_tasks),
        ]
        self.predict = nn.Sequential(*stages)

    def forward(self, h):
        """Perform soft binary classification over multiple tasks

        Parameters
        ----------
        h : FloatTensor of shape (B, M3)
            * B is the number of molecules in a batch
            * M3 is the input molecule feature size, must match in_feats in initialization

        Returns
        -------
        FloatTensor of shape (B, n_tasks)
        """
        task_scores = self.predict(h)
        return task_scores
class GCNClassifier(nn.Module):
    """Stack of GCN layers followed by a graph readout and an MLP classifier."""

    def __init__(self, in_feats, gcn_hidden_feats, n_tasks, classifier_hidden_feats=128,
                 dropout=0., atom_data_field='h', atom_weight_field='w'):
        """GCN based predictor for multitask prediction on molecular graphs

        We assume each task requires to perform a binary classification.

        Parameters
        ----------
        in_feats : int
            Size of the input atom features
        gcn_hidden_feats : list of int
            gcn_hidden_feats[i] gives the output atom feature size
            of the i+1-th gcn layer
        n_tasks : int
            Number of prediction tasks
        classifier_hidden_feats : int
            Hidden size of the MLP classifier
        dropout : float
            Dropout probability passed to the MLP classifier; the GCN
            layers are built with their own default. Default to be 0.,
            i.e. no dropout is performed.
        atom_data_field : str
            Name under which atom features are stored in DGLGraphs
        atom_weight_field : str
            Name under which atom weights are stored in DGLGraphs
        """
        super(GCNClassifier, self).__init__()

        self.atom_data_field = atom_data_field
        self.atom_weight_field = atom_weight_field

        # Chain the GCN layers: each layer consumes the previous layer's output size.
        self.gcn_layers = nn.ModuleList()
        layer_in = in_feats
        for layer_out in gcn_hidden_feats:
            self.gcn_layers.append(GCNLayer(layer_in, layer_out))
            layer_in = layer_out

        # One scalar weight in (0, 1) per atom, used by the weighted-sum readout.
        self.atom_weighting = nn.Sequential(
            nn.Linear(layer_in, 1),
            nn.Sigmoid()
        )

        # The readout concatenates a weighted sum and a max over atoms.
        self.g_feats = 2 * layer_in
        self.soft_classifier = MLPBinaryClassifier(
            self.g_feats, classifier_hidden_feats, n_tasks, dropout)

    def forward(self, feats, bg):
        """Multi-task prediction for a batch of molecules

        Note that the updated atom features and the atom weights are
        written into ``bg.ndata`` under ``atom_data_field`` and
        ``atom_weight_field``.

        Parameters
        ----------
        feats : FloatTensor of shape (N, M0)
            Initial features for all atoms in the batch of molecules
        bg : BatchedDGLGraph
            B Batched DGLGraphs for processing multiple molecules in parallel

        Returns
        -------
        FloatTensor of shape (B, n_tasks)
            Soft prediction for all tasks on the batch of molecules
        """
        # Update atom features
        for layer in self.gcn_layers:
            feats = layer(feats, bg)

        # Compute molecule features from atom features
        bg.ndata[self.atom_data_field] = feats
        bg.ndata[self.atom_weight_field] = self.atom_weighting(feats)
        weighted_sum = dgl.sum_nodes(
            bg, self.atom_data_field, self.atom_weight_field)
        max_pooled = dgl.max_nodes(bg, self.atom_data_field)
        mol_feats = torch.cat([weighted_sum, max_pooled], dim=1)

        # Multi-task prediction
        return self.soft_classifier(mol_feats)
"""Utilities for using pretrained models."""
import torch
from .gcn import GCNClassifier
from ...data.utils import _get_dgl_url, download
def load_pretrained(model_name):
    """Load a pretrained model

    The checkpoint is downloaded to ``pre_trained.pth`` in the current
    working directory and loaded onto the CPU; move the returned model to
    another device afterwards if needed.

    Parameters
    ----------
    model_name : str
        Name of the pretrained model. Currently only ``"GCN_Tox21"``
        is available.

    Returns
    -------
    model
        The model with the pretrained weights loaded.

    Raises
    ------
    RuntimeError
        If no pretrained model is registered under ``model_name``.
    """
    # Reject unknown names before doing any download work.
    if model_name != "GCN_Tox21":
        raise RuntimeError("Cannot find a pretrained model with name {}".format(model_name))

    print('Loading pretrained model...')
    url_to_pretrained = _get_dgl_url('pre_trained/gcn_tox21.pth')
    local_pretrained_path = 'pre_trained.pth'
    download(url_to_pretrained, path=local_pretrained_path)
    model = GCNClassifier(in_feats=74,
                          gcn_hidden_feats=[64, 64],
                          n_tasks=12,
                          classifier_hidden_feats=64)
    # map_location keeps loading working on CPU-only machines even if the
    # checkpoint tensors were saved from a GPU; the freshly built model
    # lives on the CPU anyway.
    checkpoint = torch.load(local_pretrained_path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    return model
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment