Unverified commit f8c1b24d authored by paoxiaode, committed by GitHub

[Dataset] Add peptides structural dataset in LRGB (#6337)

parent 8da2f832
@@ -70,6 +70,14 @@ from .heterophilous_graphs import (
    RomanEmpireDataset,
    TolokersDataset,
)
# RDKit is required for the Peptides-Structural dataset. The import is
# guarded so that users of other datasets do not crash when RDKit is
# missing.
try:
from .lrgb import PeptidesStructuralDataset
except ImportError:
pass
from .pattern import PATTERNDataset
from .wikics import WikiCSDataset
from .yelp import YelpDataset
...
import hashlib
import os
import pickle

import pandas as pd
from ogb.utils import smiles2graph
from tqdm import tqdm

from .. import backend as F
from ..convert import graph as dgl_graph
from .dgl_dataset import DGLDataset
from .utils import download, load_graphs, save_graphs, Subset


class PeptidesStructuralDataset(DGLDataset):
r"""Peptides structure dataset for the graph regression task.
DGL dataset of 15,535 small peptides represented as their molecular
graph (SMILES) with 11 regression targets derived from the peptide's
3D structure.

    The 11 regression targets were precomputed from the molecules' 3D
    structure:

    - Inertia_mass_[a-c]: The principal components of the inertia of the
      mass, with some normalizations. (Sorted)
    - Inertia_valence_[a-c]: The principal components of the inertia of the
      hydrogen atoms. This is basically a measure of the 3D distribution of
      hydrogens. (Sorted)
    - length_[a-c]: The lengths along the 3 main geometric axes of the 3D
      object (without considering atom types). (Sorted)
    - Spherocity: The SpherocityIndex descriptor computed by
      rdkit.Chem.rdMolDescriptors.CalcSpherocityIndex.
    - Plane_best_fit: The plane of best fit (PBF) descriptor computed by
      rdkit.Chem.rdMolDescriptors.CalcPBF.

    Reference `<https://arxiv.org/abs/2206.08164.pdf>`_

    Statistics:

    - Train examples: 10,873
    - Valid examples: 2,331
    - Test examples: 2,331
    - Average number of nodes: 150.94
    - Average number of edges: 307.30
    - Number of atom types: 9
    - Number of bond types: 3

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/store the input data.
        Default: "~/.dgl/".
force_reload : bool
Whether to reload the dataset.
Default: False.
verbose : bool
Whether to print out progress information.
Default: False.
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access.
    smiles2graph : callable
        A callable function that converts a SMILES string into a graph
        object. The default ``smiles2graph`` (from ``ogb.utils``) requires
        RDKit to be installed.

    Examples
    --------
>>> from dgl.data import PeptidesStructuralDataset
>>> dataset = PeptidesStructuralDataset()
>>> len(dataset)
15535
>>> dataset.num_atom_types
9
>>> graph, label = dataset[0]
>>> graph
Graph(num_nodes=119, num_edges=244,
ndata_schemes={'feat': Scheme(shape=(9,), dtype=torch.int64)}
edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.int64)})
>>> split_dict = dataset.get_idx_split()
>>> trainset = dataset[split_dict["train"]]
>>> graph, label = trainset[0]
>>> graph
Graph(num_nodes=338, num_edges=682,
ndata_schemes={'feat': Scheme(shape=(9,), dtype=torch.int64)}
edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.int64)})
"""

    def __init__(
self,
raw_dir=None,
force_reload=None,
verbose=None,
transform=None,
smiles2graph=smiles2graph,
):
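        # The default ``smiles2graph`` is ``ogb.utils.smiles2graph``, which
        # depends on RDKit (hence the guarded import in ``dgl.data``).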
self.smiles2graph = smiles2graph
# MD5 hash of the dataset file.
self.md5sum_data = "9786061a34298a0684150f2e4ff13f47"
        self.url_stratified_split = "https://www.dropbox.com/s/9dfifzft1hqgow6/splits_random_stratified_peptide_structure.pickle?dl=1"
        self.md5sum_stratified_split = "5a0114bdadc80b94fc7ae974f13ef061"

super(PeptidesStructuralDataset, self).__init__(
name="Peptides-struc",
raw_dir=raw_dir,
            url="https://www.dropbox.com/s/464u3303eu2u4zp/peptide_structure_dataset.csv.gz?dl=1",
force_reload=force_reload,
verbose=verbose,
transform=transform,
)

    @property
def raw_data_path(self):
return os.path.join(self.raw_path, "peptide_structure_dataset.csv.gz")

    @property
def split_data_path(self):
return os.path.join(
self.raw_path, "splits_random_stratified_peptide_structure.pickle"
)

    @property
def graph_path(self):
return os.path.join(self.save_path, "Peptides-struc.bin")

    @property
def num_atom_types(self):
return 9

    @property
def num_bond_types(self):
return 3

    def _md5sum(self, path):
hash_md5 = hashlib.md5()
with open(path, "rb") as f:
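            # Read the whole file at once; the dataset archives are small
            # enough that chunked hashing is unnecessary.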
buffer = f.read()
hash_md5.update(buffer)
return hash_md5.hexdigest()

    def download(self):
path = download(self.url, path=self.raw_data_path)
        # Record the MD5 hash of the downloaded file by creating an empty
        # marker file named after it.
        file_hash = self._md5sum(path)
        if file_hash != self.md5sum_data:
            raise ValueError("Unexpected MD5 hash of the downloaded file")
        open(os.path.join(self.raw_path, file_hash), "w").close()

# Download train/val/test splits.
path_split = download(
self.url_stratified_split, path=self.split_data_path
)
hash_split = self._md5sum(path_split)
if hash_split != self.md5sum_stratified_split:
raise ValueError("Unexpected MD5 hash of the split file")

    def process(self):
data_df = pd.read_csv(self.raw_data_path)
smiles_list = data_df["smiles"]
target_names = [
"Inertia_mass_a",
"Inertia_mass_b",
"Inertia_mass_c",
"Inertia_valence_a",
"Inertia_valence_b",
"Inertia_valence_c",
"length_a",
"length_b",
"length_c",
"Spherocity",
"Plane_best_fit",
]
# Normalize to zero mean and unit standard deviation.
data_df.loc[:, target_names] = data_df.loc[:, target_names].apply(
lambda x: (x - x.mean()) / x.std(), axis=0
)
if self.verbose:
print("Converting SMILES strings into graphs...")
self.graphs = []
self.labels = []
for i in tqdm(range(len(smiles_list))):
smiles = smiles_list[i]
y = data_df.iloc[i][target_names]
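            # ``smiles2graph`` returns a dict with "edge_index" (2, E),
            # "edge_feat" (E, 3), "node_feat" (N, 9) and "num_nodes".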
graph = self.smiles2graph(smiles)
assert len(graph["edge_feat"]) == graph["edge_index"].shape[1]
assert len(graph["node_feat"]) == graph["num_nodes"]
DGLgraph = dgl_graph(
(graph["edge_index"][0], graph["edge_index"][1]),
num_nodes=graph["num_nodes"],
)
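            # Attach the OGB atom/bond features as int64 node/edge features.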
DGLgraph.edata["feat"] = F.zerocopy_from_numpy(
graph["edge_feat"]
).to(F.int64)
DGLgraph.ndata["feat"] = F.zerocopy_from_numpy(
graph["node_feat"]
).to(F.int64)
self.graphs.append(DGLgraph)
self.labels.append(y)
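        # Stack per-molecule targets into a (num_graphs, 11) float tensor.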
self.labels = F.tensor(self.labels, dtype=F.float32)

    def load(self):
self.graphs, label_dict = load_graphs(self.graph_path)
self.labels = label_dict["labels"]

    def save(self):
save_graphs(
self.graph_path, self.graphs, labels={"labels": self.labels}
)

    def has_cache(self):
return os.path.exists(self.graph_path)

    def get_idx_split(self):
        """Get the train/val/test dataset splits.

        Returns
        -------
        dict
            A dict with "train", "val" and "test" keys holding the split
            indices.
        """
with open(self.split_data_path, "rb") as f:
split_dict = pickle.load(f)
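        # The pickle stores numpy index arrays; convert them to backend
        # tensors so they can index the dataset directly.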
for key in split_dict.keys():
split_dict[key] = F.zerocopy_from_numpy(split_dict[key])
return split_dict

    def __len__(self):
return len(self.graphs)

    def __getitem__(self, idx):
"""Get datapoint with index"""
if F.is_tensor(idx) and idx.dim() == 1:
return Subset(self, idx.cpu())
if self._transform is None:
return self.graphs[idx], self.labels[idx]
else:
return self._transform(self.graphs[idx]), self.labels[idx]
@@ -55,6 +55,25 @@ def test_fakenews():
    assert g2.num_edges() - g.num_edges() == g.num_nodes()


@unittest.skipIf(
F._default_context_str == "gpu",
reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(
dgl.backend.backend_name != "pytorch", reason="only supports pytorch"
)
def test_peptides_structural():
transform = dgl.AddSelfLoop(allow_duplicate=True)
dataset1 = data.PeptidesStructuralDataset()
g1, label = dataset1[0]
dataset2 = data.PeptidesStructuralDataset(transform=transform)
g2, _ = dataset2[0]
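    # ``AddSelfLoop`` with allow_duplicate=True adds exactly one self-loop
    # per node, so the edge count grows by the number of nodes.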
assert g2.num_edges() - g1.num_edges() == g1.num_nodes()
    # Each label holds the 11 regression targets for one peptide.
    assert label.shape == (11,)


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
...