Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dgl
Commits
921476c8
Unverified
Commit
921476c8
authored
Jun 15, 2023
by
OlegPlatonov
Committed by
GitHub
Jun 15, 2023
Browse files
[Dataset] Add heterophilous graphs. (#5853)
Co-authored-by:
Mufei Li
<
mufeili1996@gmail.com
>
parent
894ae31f
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
566 additions
and
0 deletions
+566
-0
docs/source/api/python/dgl.data.rst
docs/source/api/python/dgl.data.rst
+5
-0
python/dgl/data/__init__.py
python/dgl/data/__init__.py
+8
-0
python/dgl/data/heterophilous_graphs.py
python/dgl/data/heterophilous_graphs.py
+456
-0
tests/python/common/data/data/test_heterophilous_graphs.py
tests/python/common/data/data/test_heterophilous_graphs.py
+97
-0
No files found.
docs/source/api/python/dgl.data.rst
View file @
921476c8
...
...
@@ -62,6 +62,11 @@ Datasets for node classification/regression tasks
CornellDataset
TexasDataset
WisconsinDataset
RomanEmpireDataset
AmazonRatingsDataset
MinesweeperDataset
TolokersDataset
QuestionsDataset
Edge Prediction Datasets
---------------------------------------
...
...
python/dgl/data/__init__.py
View file @
921476c8
...
...
@@ -61,6 +61,14 @@ from .geom_gcn import (
TexasDataset
,
WisconsinDataset
,
)
from
.heterophilous_graphs
import
(
AmazonRatingsDataset
,
MinesweeperDataset
,
QuestionsDataset
,
RomanEmpireDataset
,
TolokersDataset
,
)
from
.pattern
import
PATTERNDataset
from
.wikics
import
WikiCSDataset
from
.yelp
import
YelpDataset
...
...
python/dgl/data/heterophilous_graphs.py
0 → 100644
View file @
921476c8
"""
Datasets introduced in the 'A Critical Look at the Evaluation of GNNs under Heterophily: Are We
Really Making Progress? <https://arxiv.org/abs/2302.11640>'__ paper.
"""
import
os
import
numpy
as
np
from
..convert
import
graph
from
..transforms.functional
import
to_bidirected
from
.dgl_dataset
import
DGLBuiltinDataset
from
.utils
import
download
class
HeterophilousGraphDataset
(
DGLBuiltinDataset
):
r
"""Datasets introduced in the 'A Critical Look at the Evaluation of GNNs under Heterophily:
Are We Really Making Progress? <https://arxiv.org/abs/2302.11640>'__ paper.
Parameters
----------
name : str
Name of the dataset. One of 'roman-empire', 'amazon-ratings', 'minesweeper', 'tolokers',
'questions'.
raw_dir : str
Raw file directory to store the processed data.
force_reload : bool
Whether to re-download the data source.
verbose : bool
Whether to print progress information.
transform : callable
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access.
"""
def
__init__
(
self
,
name
,
raw_dir
=
None
,
force_reload
=
False
,
verbose
=
True
,
transform
=
None
,
):
name
=
name
.
lower
().
replace
(
"-"
,
"_"
)
url
=
f
"https://github.com/yandex-research/heterophilous-graphs/raw/main/data/
{
name
}
.npz"
super
(
HeterophilousGraphDataset
,
self
).
__init__
(
name
=
name
,
url
=
url
,
raw_dir
=
raw_dir
,
force_reload
=
force_reload
,
verbose
=
verbose
,
transform
=
transform
,
)
def
download
(
self
):
download
(
url
=
self
.
url
,
path
=
os
.
path
.
join
(
self
.
raw_path
,
f
"
{
self
.
name
}
.npz"
)
)
def
process
(
self
):
"""Load and process the data."""
try
:
import
torch
except
ImportError
:
raise
ModuleNotFoundError
(
"This dataset requires PyTorch to be the backend."
)
data
=
np
.
load
(
os
.
path
.
join
(
self
.
raw_path
,
f
"
{
self
.
name
}
.npz"
))
src
=
torch
.
from_numpy
(
data
[
"edges"
][:,
0
])
dst
=
torch
.
from_numpy
(
data
[
"edges"
][:,
1
])
features
=
torch
.
from_numpy
(
data
[
"node_features"
])
labels
=
torch
.
from_numpy
(
data
[
"node_labels"
])
train_masks
=
torch
.
from_numpy
(
data
[
"train_masks"
].
T
)
val_masks
=
torch
.
from_numpy
(
data
[
"val_masks"
].
T
)
test_masks
=
torch
.
from_numpy
(
data
[
"test_masks"
].
T
)
num_nodes
=
len
(
labels
)
num_classes
=
len
(
labels
.
unique
())
self
.
_num_classes
=
num_classes
self
.
_g
=
to_bidirected
(
graph
((
src
,
dst
),
num_nodes
=
num_nodes
))
self
.
_g
.
ndata
[
"feat"
]
=
features
self
.
_g
.
ndata
[
"label"
]
=
labels
self
.
_g
.
ndata
[
"train_mask"
]
=
train_masks
self
.
_g
.
ndata
[
"val_mask"
]
=
val_masks
self
.
_g
.
ndata
[
"test_mask"
]
=
test_masks
def
has_cache
(
self
):
return
os
.
path
.
exists
(
self
.
raw_path
)
def
load
(
self
):
self
.
process
()
def
__getitem__
(
self
,
idx
):
assert
idx
==
0
,
"This dataset has only one graph."
if
self
.
_transform
is
None
:
return
self
.
_g
else
:
return
self
.
_transform
(
self
.
_g
)
def
__len__
(
self
):
return
1
@
property
def
num_classes
(
self
):
return
self
.
_num_classes
class
RomanEmpireDataset
(
HeterophilousGraphDataset
):
r
"""Roman-empire dataset from the 'A Critical Look at the Evaluation of GNNs under Heterophily:
Are We Really Making Progress? <https://arxiv.org/abs/2302.11640>'__ paper.
This dataset is based on the Roman Empire article from English Wikipedia, which was selected
since it is one of the longest articles on Wikipedia. Each node in the graph corresponds to one
(non-unique) word in the text. Thus, the number of nodes in the graph is equal to the article’s
length. Two words are connected with an edge if at least one of the following two conditions
holds: either these words follow each other in the text, or these words are connected in the
dependency tree of the sentence (one word depends on the other). Thus, the graph is a chain
graph with additional shortcut edges corresponding to syntactic dependencies between words. The
class of a node is its syntactic role (17 most frequent roles were selected as unique classes
and all the other roles were grouped into the 18th class). Node features are word embeddings.
Statistics:
- Nodes: 22662
- Edges: 65854
- Classes: 18
- Node features: 300
- 10 train/val/test splits
Parameters
----------
raw_dir : str, optional
Raw file directory to store the processed data. Default: ~/.dgl/
force_reload : bool, optional
Whether to re-download the data source. Default: False
verbose : bool, optional
Whether to print progress information. Default: True
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access. Default: None
Attributes
----------
num_classes : int
Number of node classes
Examples
--------
>>> from dgl.data import RomanEmpireDataset
>>> dataset = RomanEmpireDataset()
>>> g = dataset[0]
>>> num_classes = dataset.num_classes
>>> # get node features
>>> feat = g.ndata["feat"]
>>> # get the first data split
>>> train_mask = g.ndata["train_mask"][:, 0]
>>> val_mask = g.ndata["val_mask"][:, 0]
>>> test_mask = g.ndata["test_mask"][:, 0]
>>> # get labels
>>> label = g.ndata['label']
"""
def
__init__
(
self
,
raw_dir
=
None
,
force_reload
=
False
,
verbose
=
True
,
transform
=
None
):
super
(
RomanEmpireDataset
,
self
).
__init__
(
name
=
"roman-empire"
,
raw_dir
=
raw_dir
,
force_reload
=
force_reload
,
verbose
=
verbose
,
transform
=
transform
,
)
class
AmazonRatingsDataset
(
HeterophilousGraphDataset
):
r
"""Amazon-ratings dataset from the 'A Critical Look at the Evaluation of GNNs under
Heterophily: Are We Really Making Progress? <https://arxiv.org/abs/2302.11640>'__ paper.
This dataset is based on the Amazon product co-purchasing data. Nodes are products (books, music
CDs, DVDs, VHS video tapes), and edges connect products that are frequently bought together. The
task is to predict the average rating given to a product by reviewers. All possible rating
values were grouped into five classes. Node features are the mean of word embeddings for words
in the product description.
Statistics:
- Nodes: 24492
- Edges: 186100
- Classes: 5
- Node features: 300
- 10 train/val/test splits
Parameters
----------
raw_dir : str, optional
Raw file directory to store the processed data. Default: ~/.dgl/
force_reload : bool, optional
Whether to re-download the data source. Default: False
verbose : bool, optional
Whether to print progress information. Default: True
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access. Default: None
Attributes
----------
num_classes : int
Number of node classes
Examples
--------
>>> from dgl.data import AmazonRatingsDataset
>>> dataset = AmazonRatingsDataset()
>>> g = dataset[0]
>>> num_classes = dataset.num_classes
>>> # get node features
>>> feat = g.ndata["feat"]
>>> # get the first data split
>>> train_mask = g.ndata["train_mask"][:, 0]
>>> val_mask = g.ndata["val_mask"][:, 0]
>>> test_mask = g.ndata["test_mask"][:, 0]
>>> # get labels
>>> label = g.ndata['label']
"""
def
__init__
(
self
,
raw_dir
=
None
,
force_reload
=
False
,
verbose
=
True
,
transform
=
None
):
super
(
AmazonRatingsDataset
,
self
).
__init__
(
name
=
"amazon-ratings"
,
raw_dir
=
raw_dir
,
force_reload
=
force_reload
,
verbose
=
verbose
,
transform
=
transform
,
)
class
MinesweeperDataset
(
HeterophilousGraphDataset
):
r
"""Minesweeper dataset from the 'A Critical Look at the Evaluation of GNNs under Heterophily:
Are We Really Making Progress? <https://arxiv.org/abs/2302.11640>'__ paper.
This dataset is inspired by the Minesweeper game. The graph is a regular 100x100 grid where each
node (cell) is connected to eight neighboring nodes (with the exception of nodes at the edge of
the grid, which have fewer neighbors). 20% of the nodes are randomly selected as mines. The task
is to predict which nodes are mines. The node features are one-hot-encoded numbers of
neighboring mines. However, for randomly selected 50% of the nodes, the features are unknown,
which is indicated by a separate binary feature.
Statistics:
- Nodes: 10000
- Edges: 78804
- Classes: 2
- Node features: 7
- 10 train/val/test splits
Parameters
----------
raw_dir : str, optional
Raw file directory to store the processed data. Default: ~/.dgl/
force_reload : bool, optional
Whether to re-download the data source. Default: False
verbose : bool, optional
Whether to print progress information. Default: True
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access. Default: None
Attributes
----------
num_classes : int
Number of node classes
Examples
--------
>>> from dgl.data import MinesweeperDataset
>>> dataset = MinesweeperDataset()
>>> g = dataset[0]
>>> num_classes = dataset.num_classes
>>> # get node features
>>> feat = g.ndata["feat"]
>>> # get the first data split
>>> train_mask = g.ndata["train_mask"][:, 0]
>>> val_mask = g.ndata["val_mask"][:, 0]
>>> test_mask = g.ndata["test_mask"][:, 0]
>>> # get labels
>>> label = g.ndata['label']
"""
def
__init__
(
self
,
raw_dir
=
None
,
force_reload
=
False
,
verbose
=
True
,
transform
=
None
):
super
(
MinesweeperDataset
,
self
).
__init__
(
name
=
"minesweeper"
,
raw_dir
=
raw_dir
,
force_reload
=
force_reload
,
verbose
=
verbose
,
transform
=
transform
,
)
class
TolokersDataset
(
HeterophilousGraphDataset
):
r
"""Tolokers dataset from the 'A Critical Look at the Evaluation of GNNs under Heterophily:
Are We Really Making Progress? <https://arxiv.org/abs/2302.11640>'__ paper.
This dataset is based on data from the Toloka crowdsourcing platform. The nodes represent
tolokers (workers). An edge connects two tolokers if they have worked on the same task. The goal
is to predict which tolokers have been banned in one of the projects. Node features are based on
the worker’s profile information and task performance statistics.
Statistics:
- Nodes: 11758
- Edges: 1038000
- Classes: 2
- Node features: 10
- 10 train/val/test splits
Parameters
----------
raw_dir : str, optional
Raw file directory to store the processed data. Default: ~/.dgl/
force_reload : bool, optional
Whether to re-download the data source. Default: False
verbose : bool, optional
Whether to print progress information. Default: True
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access. Default: None
Attributes
----------
num_classes : int
Number of node classes
Examples
--------
>>> from dgl.data import TolokersDataset
>>> dataset = TolokersDataset()
>>> g = dataset[0]
>>> num_classes = dataset.num_classes
>>> # get node features
>>> feat = g.ndata["feat"]
>>> # get the first data split
>>> train_mask = g.ndata["train_mask"][:, 0]
>>> val_mask = g.ndata["val_mask"][:, 0]
>>> test_mask = g.ndata["test_mask"][:, 0]
>>> # get labels
>>> label = g.ndata['label']
"""
def
__init__
(
self
,
raw_dir
=
None
,
force_reload
=
False
,
verbose
=
True
,
transform
=
None
):
super
(
TolokersDataset
,
self
).
__init__
(
name
=
"tolokers"
,
raw_dir
=
raw_dir
,
force_reload
=
force_reload
,
verbose
=
verbose
,
transform
=
transform
,
)
class
QuestionsDataset
(
HeterophilousGraphDataset
):
r
"""Questions dataset from the 'A Critical Look at the Evaluation of GNNs under Heterophily:
Are We Really Making Progress? <https://arxiv.org/abs/2302.11640>'__ paper.
This dataset is based on data from the question-answering website Yandex Q. Nodes are users, and
an edge connects two nodes if one user answered the other user’s question. The task is to
predict which users remained active on the website (were not deleted or blocked). Node features
are the mean of word embeddings for words in the user description. Users that do not have
description are indicated by a separate binary feature.
Statistics:
- Nodes: 48921
- Edges: 307080
- Classes: 2
- Node features: 301
- 10 train/val/test splits
Parameters
----------
raw_dir : str, optional
Raw file directory to store the processed data. Default: ~/.dgl/
force_reload : bool, optional
Whether to re-download the data source. Default: False
verbose : bool, optional
Whether to print progress information. Default: True
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access. Default: None
Attributes
----------
num_classes : int
Number of node classes
Examples
--------
>>> from dgl.data import QuestionsDataset
>>> dataset = QuestionsDataset()
>>> g = dataset[0]
>>> num_classes = dataset.num_classes
>>> # get node features
>>> feat = g.ndata["feat"]
>>> # get the first data split
>>> train_mask = g.ndata["train_mask"][:, 0]
>>> val_mask = g.ndata["val_mask"][:, 0]
>>> test_mask = g.ndata["test_mask"][:, 0]
>>> # get labels
>>> label = g.ndata['label']
"""
def
__init__
(
self
,
raw_dir
=
None
,
force_reload
=
False
,
verbose
=
True
,
transform
=
None
):
super
(
QuestionsDataset
,
self
).
__init__
(
name
=
"questions"
,
raw_dir
=
raw_dir
,
force_reload
=
force_reload
,
verbose
=
verbose
,
transform
=
transform
,
)
tests/python/common/data/data/test_heterophilous_graphs.py
0 → 100644
View file @
921476c8
import
unittest
import
backend
as
F
import
dgl
@
unittest
.
skipIf
(
F
.
_default_context_str
==
"gpu"
,
reason
=
"Datasets don't need to be tested on GPU."
,
)
@
unittest
.
skipIf
(
dgl
.
backend
.
backend_name
!=
"pytorch"
,
reason
=
"Only supports PyTorch backend."
,
)
def
test_roman_empire
():
transform
=
dgl
.
AddSelfLoop
(
allow_duplicate
=
True
)
g
=
dgl
.
data
.
RomanEmpireDataset
(
force_reload
=
True
)[
0
]
assert
g
.
num_nodes
()
==
22662
assert
g
.
num_edges
()
==
65854
g2
=
dgl
.
data
.
RomanEmpireDataset
(
force_reload
=
True
,
transform
=
transform
)[
0
]
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
@
unittest
.
skipIf
(
F
.
_default_context_str
==
"gpu"
,
reason
=
"Datasets don't need to be tested on GPU."
,
)
@
unittest
.
skipIf
(
dgl
.
backend
.
backend_name
!=
"pytorch"
,
reason
=
"Only supports PyTorch backend."
,
)
def
test_amazon_ratings
():
transform
=
dgl
.
AddSelfLoop
(
allow_duplicate
=
True
)
g
=
dgl
.
data
.
AmazonRatingsDataset
(
force_reload
=
True
)[
0
]
assert
g
.
num_nodes
()
==
24492
assert
g
.
num_edges
()
==
186100
g2
=
dgl
.
data
.
AmazonRatingsDataset
(
force_reload
=
True
,
transform
=
transform
)[
0
]
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
@
unittest
.
skipIf
(
F
.
_default_context_str
==
"gpu"
,
reason
=
"Datasets don't need to be tested on GPU."
,
)
@
unittest
.
skipIf
(
dgl
.
backend
.
backend_name
!=
"pytorch"
,
reason
=
"Only supports PyTorch backend."
,
)
def
test_minesweeper
():
transform
=
dgl
.
AddSelfLoop
(
allow_duplicate
=
True
)
g
=
dgl
.
data
.
MinesweeperDataset
(
force_reload
=
True
)[
0
]
assert
g
.
num_nodes
()
==
10000
assert
g
.
num_edges
()
==
78804
g2
=
dgl
.
data
.
MinesweeperDataset
(
force_reload
=
True
,
transform
=
transform
)[
0
]
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
@
unittest
.
skipIf
(
F
.
_default_context_str
==
"gpu"
,
reason
=
"Datasets don't need to be tested on GPU."
,
)
@
unittest
.
skipIf
(
dgl
.
backend
.
backend_name
!=
"pytorch"
,
reason
=
"Only supports PyTorch backend."
,
)
def
test_tolokers
():
transform
=
dgl
.
AddSelfLoop
(
allow_duplicate
=
True
)
g
=
dgl
.
data
.
TolokersDataset
(
force_reload
=
True
)[
0
]
assert
g
.
num_nodes
()
==
11758
assert
g
.
num_edges
()
==
1038000
g2
=
dgl
.
data
.
TolokersDataset
(
force_reload
=
True
,
transform
=
transform
)[
0
]
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
@
unittest
.
skipIf
(
F
.
_default_context_str
==
"gpu"
,
reason
=
"Datasets don't need to be tested on GPU."
,
)
@
unittest
.
skipIf
(
dgl
.
backend
.
backend_name
!=
"pytorch"
,
reason
=
"Only supports PyTorch backend."
,
)
def
test_questions
():
transform
=
dgl
.
AddSelfLoop
(
allow_duplicate
=
True
)
g
=
dgl
.
data
.
QuestionsDataset
(
force_reload
=
True
)[
0
]
assert
g
.
num_nodes
()
==
48921
assert
g
.
num_edges
()
==
307080
g2
=
dgl
.
data
.
QuestionsDataset
(
force_reload
=
True
,
transform
=
transform
)[
0
]
assert
g2
.
num_edges
()
-
g
.
num_edges
()
==
g
.
num_nodes
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment