Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
8b8fd2c0
Unverified
Commit
8b8fd2c0
authored
Feb 15, 2022
by
Mufei Li
Committed by
GitHub
Feb 15, 2022
Browse files
[Dataset] Add transform argument to built-in datasets (#3733)
* Update * Fix * Update
parent
b3d3a2c4
Changes
22
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
87 additions
and
19 deletions
+87
-19
python/dgl/data/tu.py
python/dgl/data/tu.py
+27
-15
tests/compute/test_data.py
tests/compute/test_data.py
+60
-4
No files found.
python/dgl/data/tu.py
View file @
8b8fd2c0
...
...
@@ -26,6 +26,10 @@ class LegacyTUDataset(DGLBuiltinDataset):
max_allow_node : int
Remove graphs that contains more nodes than ``max_allow_node``.
Default : None
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access.
Attributes
----------
...
...
@@ -73,7 +77,7 @@ class LegacyTUDataset(DGLBuiltinDataset):
def
__init__
(
self
,
name
,
use_pandas
=
False
,
hidden_size
=
10
,
max_allow_node
=
None
,
raw_dir
=
None
,
force_reload
=
False
,
verbose
=
False
):
raw_dir
=
None
,
force_reload
=
False
,
verbose
=
False
,
transform
=
None
):
url
=
self
.
_url
.
format
(
name
)
self
.
hidden_size
=
hidden_size
...
...
@@ -81,7 +85,7 @@ class LegacyTUDataset(DGLBuiltinDataset):
self
.
use_pandas
=
use_pandas
super
(
LegacyTUDataset
,
self
).
__init__
(
name
=
name
,
url
=
url
,
raw_dir
=
raw_dir
,
hash_key
=
(
name
,
use_pandas
,
hidden_size
,
max_allow_node
),
force_reload
=
force_reload
,
verbose
=
verbose
)
force_reload
=
force_reload
,
verbose
=
verbose
,
transform
=
transform
)
def
process
(
self
):
self
.
data_mode
=
None
...
...
@@ -211,6 +215,8 @@ class LegacyTUDataset(DGLBuiltinDataset):
And its label.
"""
g
=
self
.
graph_lists
[
idx
]
if
self
.
_transform
is
not
None
:
g
=
self
.
_transform
(
g
)
return
g
,
self
.
graph_labels
[
idx
]
def
__len__
(
self
):
...
...
@@ -247,6 +253,10 @@ class TUDataset(DGLBuiltinDataset):
name : str
Dataset Name, such as ``ENZYMES``, ``DD``, ``COLLAB``, ``MUTAG``, can be the
datasets name on `<https://chrsmrrs.github.io/datasets/docs/datasets/>`_.
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access.
Attributes
----------
...
...
@@ -304,11 +314,11 @@ class TUDataset(DGLBuiltinDataset):
_url
=
r
"https://www.chrsmrrs.com/graphkerneldatasets/{}.zip"
def
__init__
(
self
,
name
,
raw_dir
=
None
,
force_reload
=
False
,
verbose
=
False
):
def
__init__
(
self
,
name
,
raw_dir
=
None
,
force_reload
=
False
,
verbose
=
False
,
transform
=
None
):
url
=
self
.
_url
.
format
(
name
)
super
(
TUDataset
,
self
).
__init__
(
name
=
name
,
url
=
url
,
raw_dir
=
raw_dir
,
force_reload
=
force_reload
,
verbose
=
verbose
)
verbose
=
verbose
,
transform
=
transform
)
def
process
(
self
):
DS_edge_list
=
self
.
_idx_from_zero
(
...
...
@@ -404,6 +414,8 @@ class TUDataset(DGLBuiltinDataset):
And its label.
"""
g
=
self
.
graph_lists
[
idx
]
if
self
.
_transform
is
not
None
:
g
=
self
.
_transform
(
g
)
return
g
,
self
.
graph_labels
[
idx
]
def
__len__
(
self
):
...
...
tests/compute/test_data.py
View file @
8b8fd2c0
...
...
@@ -7,6 +7,7 @@ import os
import
pandas
as
pd
import
yaml
import
pytest
import
dgl
import
dgl.data
as
data
from
dgl
import
DGLError
import
dgl
...
...
@@ -16,7 +17,11 @@ def test_minigc():
ds
=
data
.
MiniGCDataset
(
16
,
10
,
20
)
g
,
l
=
list
(
zip
(
*
ds
))
print
(
g
,
l
)
g1
=
ds
[
0
][
0
]
transform
=
dgl
.
AddSelfLoop
(
allow_duplicate
=
True
)
ds
=
data
.
MiniGCDataset
(
16
,
10
,
20
,
transform
=
transform
)
g2
=
ds
[
0
][
0
]
assert
g2
.
num_edges
()
-
g1
.
num_edges
()
==
g1
.
num_nodes
()
@
unittest
.
skipIf
(
F
.
_default_context_str
==
'gpu'
,
reason
=
"Datasets don't need to be tested on GPU."
)
def
test_gin
():
...
...
@@ -27,37 +32,64 @@ def test_gin():
'PROTEINS'
:
1113
,
'PTC'
:
344
,
}
transform
=
dgl
.
AddSelfLoop
(
allow_duplicate
=
True
)
for
name
,
n_graphs
in
ds_n_graphs
.
items
():
ds
=
data
.
GINDataset
(
name
,
self_loop
=
False
,
degree_as_nlabel
=
False
)
assert
len
(
ds
)
==
n_graphs
,
(
len
(
ds
),
name
)
g1
=
ds
[
0
][
0
]
ds
=
data
.
GINDataset
(
name
,
self_loop
=
False
,
degree_as_nlabel
=
False
,
transform
=
transform
)
g2
=
ds
[
0
][
0
]
assert
g2
.
num_edges
()
-
g1
.
num_edges
()
==
g1
.
num_nodes
()
@
unittest
.
skipIf
(
F
.
_default_context_str
==
'gpu'
,
reason
=
"Datasets don't need to be tested on GPU."
)
def
test_fraud
():
transform
=
dgl
.
AddSelfLoop
(
allow_duplicate
=
True
)
g
=
data
.
FraudDataset
(
'amazon'
)[
0
]
assert
g
.
num_nodes
()
==
11944
num_edges1
=
g
.
num_edges
()
g2
=
data
.
FraudDataset
(
'amazon'
,
transform
=
transform
)[
0
]
# 3 edge types
assert
g2
.
num_edges
()
-
num_edges1
==
g
.
num_nodes
()
*
3
g
=
data
.
FraudAmazonDataset
()[
0
]
assert
g
.
num_nodes
()
==
11944
g2
=
data
.
FraudAmazonDataset
(
transform
=
transform
)[
0
]
# 3 edge types
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
*
3
g
=
data
.
FraudYelpDataset
()[
0
]
assert
g
.
num_nodes
()
==
45954
g2
=
data
.
FraudYelpDataset
(
transform
=
transform
)[
0
]
# 3 edge types
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
*
3
@
unittest
.
skipIf
(
F
.
_default_context_str
==
'gpu'
,
reason
=
"Datasets don't need to be tested on GPU."
)
def
test_fakenews
():
transform
=
dgl
.
AddSelfLoop
(
allow_duplicate
=
True
)
ds
=
data
.
FakeNewsDataset
(
'politifact'
,
'bert'
)
assert
len
(
ds
)
==
314
g
=
ds
[
0
][
0
]
g2
=
data
.
FakeNewsDataset
(
'politifact'
,
'bert'
,
transform
=
transform
)[
0
][
0
]
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
ds
=
data
.
FakeNewsDataset
(
'gossipcop'
,
'profile'
)
assert
len
(
ds
)
==
5464
g
=
ds
[
0
][
0
]
g2
=
data
.
FakeNewsDataset
(
'gossipcop'
,
'profile'
,
transform
=
transform
)[
0
][
0
]
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
@
unittest
.
skipIf
(
F
.
_default_context_str
==
'gpu'
,
reason
=
"Datasets don't need to be tested on GPU."
)
def
test_tudataset_regression
():
ds
=
data
.
TUDataset
(
'ZINC_test'
,
force_reload
=
True
)
assert
len
(
ds
)
==
5000
g
=
ds
[
0
][
0
]
transform
=
dgl
.
AddSelfLoop
(
allow_duplicate
=
True
)
ds
=
data
.
TUDataset
(
'ZINC_test'
,
force_reload
=
True
,
transform
=
transform
)
g2
=
ds
[
0
][
0
]
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
@
unittest
.
skipIf
(
F
.
_default_context_str
==
'gpu'
,
reason
=
"Datasets don't need to be tested on GPU."
)
def
test_data_hash
():
...
...
@@ -78,12 +110,16 @@ def test_data_hash():
@
unittest
.
skipIf
(
F
.
_default_context_str
==
'gpu'
,
reason
=
"Datasets don't need to be tested on GPU."
)
def
test_citation_graph
():
transform
=
dgl
.
AddSelfLoop
(
allow_duplicate
=
True
)
# cora
g
=
data
.
CoraGraphDataset
()[
0
]
assert
g
.
num_nodes
()
==
2708
assert
g
.
num_edges
()
==
10556
dst
=
F
.
asnumpy
(
g
.
edges
()[
1
])
assert
np
.
array_equal
(
dst
,
np
.
sort
(
dst
))
g2
=
data
.
CoraGraphDataset
(
transform
=
transform
)[
0
]
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
# Citeseer
g
=
data
.
CiteseerGraphDataset
()[
0
]
...
...
@@ -91,6 +127,8 @@ def test_citation_graph():
assert
g
.
num_edges
()
==
9228
dst
=
F
.
asnumpy
(
g
.
edges
()[
1
])
assert
np
.
array_equal
(
dst
,
np
.
sort
(
dst
))
g2
=
data
.
CiteseerGraphDataset
(
transform
=
transform
)[
0
]
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
# Pubmed
g
=
data
.
PubmedGraphDataset
()[
0
]
...
...
@@ -98,16 +136,22 @@ def test_citation_graph():
assert
g
.
num_edges
()
==
88651
dst
=
F
.
asnumpy
(
g
.
edges
()[
1
])
assert
np
.
array_equal
(
dst
,
np
.
sort
(
dst
))
g2
=
data
.
PubmedGraphDataset
(
transform
=
transform
)[
0
]
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
@
unittest
.
skipIf
(
F
.
_default_context_str
==
'gpu'
,
reason
=
"Datasets don't need to be tested on GPU."
)
def
test_gnn_benchmark
():
transform
=
dgl
.
AddSelfLoop
(
allow_duplicate
=
True
)
# AmazonCoBuyComputerDataset
g
=
data
.
AmazonCoBuyComputerDataset
()[
0
]
assert
g
.
num_nodes
()
==
13752
assert
g
.
num_edges
()
==
491722
dst
=
F
.
asnumpy
(
g
.
edges
()[
1
])
assert
np
.
array_equal
(
dst
,
np
.
sort
(
dst
))
g2
=
data
.
AmazonCoBuyComputerDataset
(
transform
=
transform
)[
0
]
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
# AmazonCoBuyPhotoDataset
g
=
data
.
AmazonCoBuyPhotoDataset
()[
0
]
...
...
@@ -115,6 +159,8 @@ def test_gnn_benchmark():
assert
g
.
num_edges
()
==
238163
dst
=
F
.
asnumpy
(
g
.
edges
()[
1
])
assert
np
.
array_equal
(
dst
,
np
.
sort
(
dst
))
g2
=
data
.
AmazonCoBuyPhotoDataset
(
transform
=
transform
)[
0
]
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
# CoauthorPhysicsDataset
g
=
data
.
CoauthorPhysicsDataset
()[
0
]
...
...
@@ -122,6 +168,8 @@ def test_gnn_benchmark():
assert
g
.
num_edges
()
==
495924
dst
=
F
.
asnumpy
(
g
.
edges
()[
1
])
assert
np
.
array_equal
(
dst
,
np
.
sort
(
dst
))
g2
=
data
.
CoauthorPhysicsDataset
(
transform
=
transform
)[
0
]
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
# CoauthorCSDataset
g
=
data
.
CoauthorCSDataset
()[
0
]
...
...
@@ -129,6 +177,8 @@ def test_gnn_benchmark():
assert
g
.
num_edges
()
==
163788
dst
=
F
.
asnumpy
(
g
.
edges
()[
1
])
assert
np
.
array_equal
(
dst
,
np
.
sort
(
dst
))
g2
=
data
.
CoauthorCSDataset
(
transform
=
transform
)[
0
]
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
# CoraFullDataset
g
=
data
.
CoraFullDataset
()[
0
]
...
...
@@ -136,6 +186,8 @@ def test_gnn_benchmark():
assert
g
.
num_edges
()
==
126842
dst
=
F
.
asnumpy
(
g
.
edges
()[
1
])
assert
np
.
array_equal
(
dst
,
np
.
sort
(
dst
))
g2
=
data
.
CoraFullDataset
(
transform
=
transform
)[
0
]
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
@
unittest
.
skipIf
(
F
.
_default_context_str
==
'gpu'
,
reason
=
"Datasets don't need to be tested on GPU."
)
...
...
@@ -147,6 +199,10 @@ def test_reddit():
dst
=
F
.
asnumpy
(
g
.
edges
()[
1
])
assert
np
.
array_equal
(
dst
,
np
.
sort
(
dst
))
transform
=
dgl
.
AddSelfLoop
(
allow_duplicate
=
True
)
g2
=
data
.
RedditDataset
(
transform
=
transform
)[
0
]
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
@
unittest
.
skipIf
(
F
.
_default_context_str
==
'gpu'
,
reason
=
"Datasets don't need to be tested on GPU."
)
def
test_extract_archive
():
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment