OpenDAS / dgl · Commits · f8c1b24d

Commit f8c1b24d (unverified), authored Sep 21, 2023 by paoxiaode, committed by GitHub on Sep 21, 2023.

[Dataset] Add peptides structural dataset in LRGB (#6337)

Parent: 8da2f832

Changes: 3 changed files with 271 additions and 0 deletions (+271 −0)

    python/dgl/data/__init__.py      +8    −0
    python/dgl/data/lrgb.py          +244  −0
    tests/integration/test_data.py   +19   −0
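For orientation before the diffs, a minimal usage sketch of the dataset this commit adds, assuming a DGL install with the PyTorch backend plus the optional ogb and RDKit dependencies; the calls mirror the class docstring and the new integration test below.

from dgl.data import PeptidesStructuralDataset

# Downloads and processes ~15.5k peptide molecular graphs on first use.
dataset = PeptidesStructuralDataset()
graph, label = dataset[0]        # a DGLGraph and its 11 regression targets
print(len(dataset), graph, label.shape)

# Pre-computed stratified train/val/test indices shipped with the dataset.
split_dict = dataset.get_idx_split()
trainset = dataset[split_dict["train"]]  # indexing with a 1-D tensor returns a Subset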
python/dgl/data/__init__.py

@@ -70,6 +70,14 @@ from .heterophilous_graphs import (
    RomanEmpireDataset,
    TolokersDataset,
)

# RDKit is required for Peptides-Structural dataset.
# Exception handling was added to prevent crashes for users who are using other
# datasets.
try:
    from .lrgb import PeptidesStructuralDataset
except ImportError:
    pass
from .pattern import PATTERNDataset
from .wikics import WikiCSDataset
from .yelp import YelpDataset
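Because the export above is guarded, user code that treats the dataset as optional can mirror the same pattern; a user-side sketch (not part of this commit; the HAS_PEPTIDES flag is illustrative):

try:
    from dgl.data import PeptidesStructuralDataset
    HAS_PEPTIDES = True
except ImportError:
    # Raised when ogb/RDKit is missing and the name is therefore not exported.
    HAS_PEPTIDES = False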
python/dgl/data/lrgb.py (new file, 0 → 100644)

import hashlib
import os
import pickle

import pandas as pd
from ogb.utils import smiles2graph
from tqdm import tqdm

from .. import backend as F
from ..convert import graph as dgl_graph
from .dgl_dataset import DGLDataset
from .utils import download, load_graphs, save_graphs, Subset


class PeptidesStructuralDataset(DGLDataset):
    r"""Peptides structure dataset for the graph regression task.

    DGL dataset of 15,535 small peptides represented as their molecular
    graph (SMILES) with 11 regression targets derived from the peptide's
    3D structure.

    The 11 regression targets were precomputed from molecules' 3D structure:

        Inertia_mass_[a-c]: The principal component of the inertia of the
            mass, with some normalizations. (Sorted)
        Inertia_valence_[a-c]: The principal component of the inertia of the
            Hydrogen atoms. This is basically a measure of the 3D
            distribution of hydrogens. (Sorted)
        length_[a-c]: The length around the 3 main geometric axis of
            the 3D objects (without considering atom types). (Sorted)
        Spherocity: SpherocityIndex descriptor computed by
            rdkit.Chem.rdMolDescriptors.CalcSpherocityIndex
        Plane_best_fit: Plane of best fit (PBF) descriptor computed by
            rdkit.Chem.rdMolDescriptors.CalcPBF

    Reference `<https://arxiv.org/abs/2206.08164.pdf>`_

    Statistics:

    - Train examples: 10,873
    - Valid examples: 2,331
    - Test examples: 2,331
    - Average number of nodes: 150.94
    - Average number of edges: 307.30
    - Number of atom types: 9
    - Number of bond types: 3

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: "~/.dgl/".
    force_reload : bool
        Whether to reload the dataset.
        Default: False.
    verbose : bool
        Whether to print out progress information.
        Default: False.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.
    smiles2graph : callable
        A callable function that converts a SMILES string into a graph object.
        * The default smiles2graph requires rdkit to be installed *

    Examples
    --------
    >>> from dgl.data import PeptidesStructuralDataset
    >>> dataset = PeptidesStructuralDataset()
    >>> len(dataset)
    15535
    >>> dataset.num_atom_types
    9
    >>> graph, label = dataset[0]
    >>> graph
    Graph(num_nodes=119, num_edges=244,
          ndata_schemes={'feat': Scheme(shape=(9,), dtype=torch.int64)}
          edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.int64)})
    >>> split_dict = dataset.get_idx_split()
    >>> trainset = dataset[split_dict["train"]]
    >>> graph, label = trainset[0]
    >>> graph
    Graph(num_nodes=338, num_edges=682,
          ndata_schemes={'feat': Scheme(shape=(9,), dtype=torch.int64)}
          edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.int64)})
    """

    def __init__(
        self,
        raw_dir=None,
        force_reload=None,
        verbose=None,
        transform=None,
        smiles2graph=smiles2graph,
    ):
        self.smiles2graph = smiles2graph
        # MD5 hash of the dataset file.
        self.md5sum_data = "9786061a34298a0684150f2e4ff13f47"
        self.url_stratified_split = """
            https://www.dropbox.com/s/9dfifzft1hqgow6/splits_random_stratified_peptide_structure.pickle?dl=1
        """
        self.md5sum_stratified_split = "5a0114bdadc80b94fc7ae974f13ef061"
        super(PeptidesStructuralDataset, self).__init__(
            name="Peptides-struc",
            raw_dir=raw_dir,
            url="""
            https://www.dropbox.com/s/464u3303eu2u4zp/peptide_structure_dataset.csv.gz?dl=1
            """,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    @property
    def raw_data_path(self):
        return os.path.join(self.raw_path, "peptide_structure_dataset.csv.gz")

    @property
    def split_data_path(self):
        return os.path.join(
            self.raw_path, "splits_random_stratified_peptide_structure.pickle"
        )

    @property
    def graph_path(self):
        return os.path.join(self.save_path, "Peptides-struc.bin")

    @property
    def num_atom_types(self):
        return 9

    @property
    def num_bond_types(self):
        return 3

    def _md5sum(self, path):
        hash_md5 = hashlib.md5()
        with open(path, "rb") as f:
            buffer = f.read()
            hash_md5.update(buffer)
        return hash_md5.hexdigest()

    def download(self):
        path = download(self.url, path=self.raw_data_path)
        # Save to disk the MD5 hash of the downloaded file.
        hash = self._md5sum(path)
        if hash != self.md5sum_data:
            raise ValueError("Unexpected MD5 hash of the downloaded file")
        open(os.path.join(self.raw_path, hash), "w").close()
        # Download train/val/test splits.
        path_split = download(
            self.url_stratified_split, path=self.split_data_path
        )
        hash_split = self._md5sum(path_split)
        if hash_split != self.md5sum_stratified_split:
            raise ValueError("Unexpected MD5 hash of the split file")

    def process(self):
        data_df = pd.read_csv(self.raw_data_path)
        smiles_list = data_df["smiles"]
        target_names = [
            "Inertia_mass_a",
            "Inertia_mass_b",
            "Inertia_mass_c",
            "Inertia_valence_a",
            "Inertia_valence_b",
            "Inertia_valence_c",
            "length_a",
            "length_b",
            "length_c",
            "Spherocity",
            "Plane_best_fit",
        ]
        # Normalize to zero mean and unit standard deviation.
        data_df.loc[:, target_names] = data_df.loc[:, target_names].apply(
            lambda x: (x - x.mean()) / x.std(), axis=0
        )

        if self.verbose:
            print("Converting SMILES strings into graphs...")
        self.graphs = []
        self.labels = []
        for i in tqdm(range(len(smiles_list))):
            smiles = smiles_list[i]
            y = data_df.iloc[i][target_names]
            graph = self.smiles2graph(smiles)

            assert len(graph["edge_feat"]) == graph["edge_index"].shape[1]
            assert len(graph["node_feat"]) == graph["num_nodes"]

            DGLgraph = dgl_graph(
                (graph["edge_index"][0], graph["edge_index"][1]),
                num_nodes=graph["num_nodes"],
            )
            DGLgraph.edata["feat"] = F.zerocopy_from_numpy(
                graph["edge_feat"]
            ).to(F.int64)
            DGLgraph.ndata["feat"] = F.zerocopy_from_numpy(
                graph["node_feat"]
            ).to(F.int64)
            self.graphs.append(DGLgraph)
            self.labels.append(y)
        self.labels = F.tensor(self.labels, dtype=F.float32)

    def load(self):
        self.graphs, label_dict = load_graphs(self.graph_path)
        self.labels = label_dict["labels"]

    def save(self):
        save_graphs(
            self.graph_path, self.graphs, labels={"labels": self.labels}
        )

    def has_cache(self):
        return os.path.exists(self.graph_path)

    def get_idx_split(self):
        """Get dataset splits.

        Returns:
            Dict with 'train', 'val', 'test' split indices.
        """
        with open(self.split_data_path, "rb") as f:
            split_dict = pickle.load(f)
        for key in split_dict.keys():
            split_dict[key] = F.zerocopy_from_numpy(split_dict[key])
        return split_dict

    def __len__(self):
        return len(self.graphs)

    def __getitem__(self, idx):
        """Get datapoint with index"""
        if F.is_tensor(idx) and idx.dim() == 1:
            return Subset(self, idx.cpu())
        if self._transform is None:
            return self.graphs[idx], self.labels[idx]
        else:
            return self._transform(self.graphs[idx]), self.labels[idx]
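Not part of the diff, but as a usage note: the processed graphs and their float32 regression labels can be batched for graph regression; a sketch assuming DGL's GraphDataLoader on the PyTorch backend (the batch size and loop body are illustrative):

from dgl.dataloading import GraphDataLoader
from dgl.data import PeptidesStructuralDataset

dataset = PeptidesStructuralDataset()
split_dict = dataset.get_idx_split()
train_loader = GraphDataLoader(
    dataset[split_dict["train"]], batch_size=32, shuffle=True
)

for batched_graph, labels in train_loader:
    node_feat = batched_graph.ndata["feat"]  # atom features, shape (num_nodes, 9)
    edge_feat = batched_graph.edata["feat"]  # bond features, shape (num_edges, 3)
    # labels stacks the 11 regression targets of each graph in the batch.
    break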
tests/integration/test_data.py

@@ -55,6 +55,25 @@ def test_fakenews():
    assert g2.num_edges() - g.num_edges() == g.num_nodes()


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(
    dgl.backend.backend_name != "pytorch", reason="only supports pytorch"
)
def test_peptides_structural():
    transform = dgl.AddSelfLoop(allow_duplicate=True)

    dataset1 = data.PeptidesStructuralDataset()
    g1, label = dataset1[0]
    dataset2 = data.PeptidesStructuralDataset(transform=transform)
    g2, _ = dataset2[0]

    assert g2.num_edges() - g1.num_edges() == g1.num_nodes()
    # return a scalar tensor
    assert not label.shape


@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
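The new test's edge-count assertion hinges on dgl.AddSelfLoop(allow_duplicate=True) appending exactly one self-loop per node; a standalone sketch of that invariant on a toy graph (illustrative only, not part of the commit, assuming the PyTorch backend):

import dgl
import torch

g = dgl.graph((torch.tensor([0, 1]), torch.tensor([1, 2])), num_nodes=3)
transform = dgl.AddSelfLoop(allow_duplicate=True)
g_sl = transform(g)

# One self-loop is appended per node, so the edge count grows by num_nodes().
assert g_sl.num_edges() - g.num_edges() == g.num_nodes()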