Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
ae3102d3
Unverified
Commit
ae3102d3
authored
Oct 21, 2019
by
Mufei Li
Committed by
GitHub
Oct 21, 2019
Browse files
[Dataset] Migration for Chemistry Datasets (#926)
* Update * Update
parent
c37076df
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
47 additions
and
43 deletions
+47
-43
examples/pytorch/model_zoo/chem/property_prediction/classification.py
...orch/model_zoo/chem/property_prediction/classification.py
+2
-2
python/dgl/data/chem/__init__.py
python/dgl/data/chem/__init__.py
+1
-1
python/dgl/data/chem/alchemy.py
python/dgl/data/chem/alchemy.py
+16
-15
python/dgl/data/chem/csv_dataset.py
python/dgl/data/chem/csv_dataset.py
+22
-20
python/dgl/data/chem/tox21.py
python/dgl/data/chem/tox21.py
+6
-5
No files found.
examples/pytorch/model_zoo/chem/property_prediction/classification.py
View file @
ae3102d3
...
...
@@ -76,8 +76,8 @@ def main(args):
classifier_hidden_feats
=
args
[
'classifier_hidden_feats'
],
n_tasks
=
dataset
.
n_tasks
)
loss_criterion
=
BCEWithLogitsLoss
(
pos_weight
=
torch
.
tensor
(
dataset
.
task_pos_weights
).
to
(
args
[
'device'
]),
reduction
=
'none'
)
loss_criterion
=
BCEWithLogitsLoss
(
pos_weight
=
dataset
.
task_pos_weights
.
to
(
args
[
'device'
]),
reduction
=
'none'
)
optimizer
=
Adam
(
model
.
parameters
(),
lr
=
args
[
'lr'
])
stopper
=
EarlyStopping
(
patience
=
args
[
'patience'
])
model
.
to
(
args
[
'device'
])
...
...
python/dgl/data/chem/__init__.py
View file @
ae3102d3
from
.utils
import
*
from
.csv_dataset
import
CSVDataset
from
.csv_dataset
import
Molecule
CSVDataset
from
.tox21
import
Tox21
from
.alchemy
import
TencentAlchemyDataset
python/dgl/data/chem/alchemy.py
View file @
ae3102d3
...
...
@@ -11,7 +11,7 @@ import zipfile
from
collections
import
defaultdict
from
.utils
import
mol_to_complete_graph
from
..utils
import
download
,
get_download_dir
,
_get_dgl_url
from
..utils
import
download
,
get_download_dir
,
_get_dgl_url
,
save_graphs
,
load_graphs
from
...
import
backend
as
F
try
:
...
...
@@ -172,7 +172,7 @@ class TencentAlchemyDataset(object):
file_dir
=
osp
.
join
(
get_download_dir
(),
'Alchemy_data'
)
if
not
from_raw
:
file_name
=
"%s_processed"
%
(
mode
)
file_name
=
"%s_processed
_dgl
"
%
(
mode
)
else
:
file_name
=
"%s_single_sdf"
%
(
mode
)
self
.
file_dir
=
pathlib
.
Path
(
file_dir
,
file_name
)
...
...
@@ -189,10 +189,11 @@ class TencentAlchemyDataset(object):
def
_load
(
self
):
if
not
self
.
from_raw
:
with
open
(
osp
.
join
(
self
.
file_dir
,
"%s_graphs.pkl"
%
self
.
mode
),
"rb"
)
as
f
:
self
.
graphs
=
pickle
.
load
(
f
)
with
open
(
osp
.
join
(
self
.
file_dir
,
"%s_labels.pkl"
%
self
.
mode
),
"rb"
)
as
f
:
self
.
labels
=
pickle
.
load
(
f
)
self
.
graphs
,
label_dict
=
load_graphs
(
osp
.
join
(
self
.
file_dir
,
"%s_graphs.bin"
%
self
.
mode
))
self
.
labels
=
label_dict
[
'labels'
]
with
open
(
osp
.
join
(
self
.
file_dir
,
"%s_smiles.txt"
%
self
.
mode
),
'r'
)
as
f
:
smiles_
=
f
.
readlines
()
self
.
smiles
=
[
s
.
strip
()
for
s
in
smiles_
]
else
:
print
(
'Start preprocessing dataset...'
)
target_file
=
pathlib
.
Path
(
self
.
file_dir
,
"%s_target.csv"
%
self
.
mode
)
...
...
@@ -201,7 +202,7 @@ class TencentAlchemyDataset(object):
index_col
=
0
,
usecols
=
[
'gdb_idx'
,]
+
[
'property_%d'
%
x
for
x
in
range
(
12
)])
self
.
target
=
self
.
target
[[
'property_%d'
%
x
for
x
in
range
(
12
)]]
self
.
graphs
,
self
.
labels
=
[],
[]
self
.
graphs
,
self
.
labels
,
self
.
smiles
=
[],
[],
[]
supp
=
Chem
.
SDMolSupplier
(
osp
.
join
(
self
.
file_dir
,
self
.
mode
+
".sdf"
))
cnt
=
0
...
...
@@ -211,16 +212,17 @@ class TencentAlchemyDataset(object):
print
(
'Processing molecule {:d}/{:d}'
.
format
(
cnt
,
dataset_size
))
graph
=
mol_to_complete_graph
(
mol
,
atom_featurizer
=
alchemy_nodes
,
bond_featurizer
=
alchemy_edges
)
smile
=
Chem
.
MolToSmiles
(
mol
)
graph
.
smile
=
smile
smile
s
=
Chem
.
MolToSmiles
(
mol
)
self
.
smile
s
.
append
(
smile
s
)
self
.
graphs
.
append
(
graph
)
label
=
F
.
tensor
(
np
.
array
(
label
[
1
].
tolist
()).
astype
(
np
.
float32
))
self
.
labels
.
append
(
label
)
with
open
(
osp
.
join
(
self
.
file_dir
,
"%s_graphs.pkl"
%
self
.
mode
),
"wb"
)
as
f
:
pickle
.
dump
(
self
.
graphs
,
f
)
with
open
(
osp
.
join
(
self
.
file_dir
,
"%s_labels.pkl"
%
self
.
mode
),
"wb"
)
as
f
:
pickle
.
dump
(
self
.
labels
,
f
)
save_graphs
(
osp
.
join
(
self
.
file_dir
,
"%s_graphs.bin"
%
self
.
mode
),
self
.
graphs
,
labels
=
{
'labels'
:
F
.
stack
(
self
.
labels
,
dim
=
0
)})
with
open
(
osp
.
join
(
self
.
file_dir
,
"%s_smiles.txt"
%
self
.
mode
),
'w'
)
as
f
:
for
s
in
self
.
smiles
:
f
.
write
(
s
+
'
\n
'
)
self
.
set_mean_and_std
()
print
(
len
(
self
.
graphs
),
"loaded!"
)
...
...
@@ -242,8 +244,7 @@ class TencentAlchemyDataset(object):
Tensor of dtype float32
Labels of the datapoint for all tasks
"""
g
,
l
=
self
.
graphs
[
item
],
self
.
labels
[
item
]
return
g
.
smile
,
g
,
l
return
self
.
smiles
[
item
],
self
.
graphs
[
item
],
self
.
labels
[
item
]
def
__len__
(
self
):
"""Length of the dataset
...
...
python/dgl/data/chem/csv_dataset.py
View file @
ae3102d3
...
...
@@ -3,17 +3,17 @@ from __future__ import absolute_import
import
dgl.backend
as
F
import
numpy
as
np
import
os
import
pickle
import
sys
from
dgl
import
DGLGraph
from
.utils
import
smile_to_bigraph
from
..utils
import
save_graphs
,
load_graphs
from
...
import
backend
as
F
from
...graph
import
DGLGraph
class
MoleculeCSVDataset
(
object
):
"""MoleculeCSVDataset
class
CSVDataset
(
object
):
"""CSVDataset
This is a general class for loading data from csv or pd.DataFrame.
This is a general class for loading molecular data from csv or pd.DataFrame.
In data pre-processing, we set non-existing labels to be 0,
and returning mask with 1 where label exists.
...
...
@@ -36,7 +36,7 @@ class CSVDataset(object):
Path to store the preprocessed data
"""
def
__init__
(
self
,
df
,
smile_to_graph
=
smile_to_bigraph
,
smile_column
=
'smiles'
,
cache_file_path
=
"csvdata_dglgraph.
pkl
"
):
cache_file_path
=
"csvdata_dglgraph.
bin
"
):
if
'rdkit'
not
in
sys
.
modules
:
from
...base
import
dgl_warning
dgl_warning
(
...
...
@@ -64,17 +64,21 @@ class CSVDataset(object):
if
os
.
path
.
exists
(
self
.
cache_file_path
):
# DGLGraphs have been constructed before, reload them
print
(
'Loading previously saved dgl graphs...'
)
with
open
(
self
.
cache_file_path
,
'rb'
)
as
f
:
self
.
graphs
=
pickle
.
load
(
f
)
self
.
graphs
,
label_dict
=
load_graphs
(
self
.
cache_file_path
)
self
.
labels
=
label_dict
[
'labels'
]
self
.
mask
=
label_dict
[
'mask'
]
else
:
self
.
graphs
=
[
smile_to_graph
(
s
)
for
s
in
self
.
smiles
]
with
open
(
self
.
cache_file_path
,
'wb'
)
as
f
:
pickle
.
dump
(
self
.
graphs
,
f
)
print
(
'Processing dgl graphs from scratch...'
)
self
.
graphs
=
[]
for
i
,
s
in
enumerate
(
self
.
smiles
):
print
(
'Processing molecule {:d}/{:d}'
.
format
(
i
+
1
,
len
(
self
)))
self
.
graphs
.
append
(
smile_to_graph
(
s
))
_label_values
=
self
.
df
[
self
.
task_names
].
values
# np.nan_to_num will also turn inf into a very large number
self
.
labels
=
np
.
nan_to_num
(
_label_values
).
astype
(
np
.
float32
)
self
.
mask
=
(
~
np
.
isnan
(
_label_values
)).
astype
(
np
.
float32
)
self
.
labels
=
F
.
zerocopy_from_numpy
(
np
.
nan_to_num
(
_label_values
).
astype
(
np
.
float32
))
self
.
mask
=
F
.
zerocopy_from_numpy
((
~
np
.
isnan
(
_label_values
)).
astype
(
np
.
float32
))
save_graphs
(
self
.
cache_file_path
,
self
.
graphs
,
labels
=
{
'labels'
:
self
.
labels
,
'mask'
:
self
.
mask
})
def
__getitem__
(
self
,
item
):
"""Get datapoint with index
...
...
@@ -95,9 +99,7 @@ class CSVDataset(object):
Tensor of dtype float32
Binary masks indicating the existence of labels for all tasks
"""
return
self
.
smiles
[
item
],
self
.
graphs
[
item
],
\
F
.
zerocopy_from_numpy
(
self
.
labels
[
item
]),
\
F
.
zerocopy_from_numpy
(
self
.
mask
[
item
])
return
self
.
smiles
[
item
],
self
.
graphs
[
item
],
self
.
labels
[
item
],
self
.
mask
[
item
]
def
__len__
(
self
):
"""Length of the dataset
...
...
python/dgl/data/chem/tox21.py
View file @
ae3102d3
import
numpy
as
np
import
sys
from
.csv_dataset
import
CSVDataset
from
.csv_dataset
import
Molecule
CSVDataset
from
.utils
import
smile_to_bigraph
from
..utils
import
get_download_dir
,
download
,
_get_dgl_url
from
...
import
backend
as
F
try
:
import
pandas
as
pd
except
ImportError
:
pass
class
Tox21
(
CSVDataset
):
class
Tox21
(
Molecule
CSVDataset
):
"""Tox21 dataset.
The Toxicology in the 21st Century (https://tripod.nih.gov/tox21/challenge/)
...
...
@@ -46,7 +47,7 @@ class Tox21(CSVDataset):
df
=
df
.
drop
(
columns
=
[
'mol_id'
])
super
().
__init__
(
df
,
smile_to_graph
,
cache_file_path
=
"tox21_dglgraph.
pkl
"
)
super
().
__init__
(
df
,
smile_to_graph
,
cache_file_path
=
"tox21_dglgraph.
bin
"
)
self
.
_weight_balancing
()
...
...
@@ -67,8 +68,8 @@ class Tox21(CSVDataset):
* self._task_pos_weights is set, which is a list of positive sample weights
for each task.
"""
num_pos
=
np
.
sum
(
self
.
labels
,
axis
=
0
)
num_indices
=
np
.
sum
(
self
.
mask
,
axis
=
0
)
num_pos
=
F
.
sum
(
self
.
labels
,
dim
=
0
)
num_indices
=
F
.
sum
(
self
.
mask
,
dim
=
0
)
self
.
_task_pos_weights
=
(
num_indices
-
num_pos
)
/
num_pos
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment