Unverified commit dd4c74ff, authored by Mufei Li, committed by GitHub
Browse files

[Dataset] CornellDataset and TexasDataset (#5513)


Co-authored-by: Ubuntu <ubuntu@ip-172-31-36-188.ap-northeast-1.compute.internal>
parent 10c43456
...@@ -59,6 +59,8 @@ Datasets for node classification/regression tasks ...@@ -59,6 +59,8 @@ Datasets for node classification/regression tasks
ChameleonDataset ChameleonDataset
SquirrelDataset SquirrelDataset
ActorDataset ActorDataset
CornellDataset
TexasDataset
Edge Prediction Datasets Edge Prediction Datasets
--------------------------------------- ---------------------------------------
......
...@@ -54,8 +54,13 @@ from .tree import SST, SSTDataset ...@@ -54,8 +54,13 @@ from .tree import SST, SSTDataset
from .tu import LegacyTUDataset, TUDataset from .tu import LegacyTUDataset, TUDataset
from .utils import * from .utils import *
from .cluster import CLUSTERDataset from .cluster import CLUSTERDataset
from .geom_gcn import (
ChameleonDataset,
CornellDataset,
SquirrelDataset,
TexasDataset,
)
from .pattern import PATTERNDataset from .pattern import PATTERNDataset
from .wiki_network import ChameleonDataset, SquirrelDataset
from .wikics import WikiCSDataset from .wikics import WikiCSDataset
from .yelp import YelpDataset from .yelp import YelpDataset
from .zinc import ZINCDataset from .zinc import ZINCDataset
......
""" """Datasets introduced in the Geom-GCN paper."""
Wikipedia page-page networks on two topics: chameleons and squirrels.
"""
import os import os
import numpy as np import numpy as np
...@@ -10,11 +8,10 @@ from .dgl_dataset import DGLBuiltinDataset ...@@ -10,11 +8,10 @@ from .dgl_dataset import DGLBuiltinDataset
from .utils import _get_dgl_url from .utils import _get_dgl_url
class WikiNetworkDataset(DGLBuiltinDataset): class GeomGCNDataset(DGLBuiltinDataset):
r"""Wikipedia page-page networks from `Multi-scale Attributed r"""Datasets introduced in
Node Embedding <https://arxiv.org/abs/1909.13021>`__ and later modified by
`Geom-GCN: Geometric Graph Convolutional Networks `Geom-GCN: Geometric Graph Convolutional Networks
<https://arxiv.org/abs/2002.05287>` <https://arxiv.org/abs/2002.05287>`__
Parameters Parameters
---------- ----------
...@@ -34,7 +31,7 @@ class WikiNetworkDataset(DGLBuiltinDataset): ...@@ -34,7 +31,7 @@ class WikiNetworkDataset(DGLBuiltinDataset):
def __init__(self, name, raw_dir, force_reload, verbose, transform): def __init__(self, name, raw_dir, force_reload, verbose, transform):
url = _get_dgl_url(f"dataset/{name}.zip") url = _get_dgl_url(f"dataset/{name}.zip")
super(WikiNetworkDataset, self).__init__( super(GeomGCNDataset, self).__init__(
name=name, name=name,
url=url, url=url,
raw_dir=raw_dir, raw_dir=raw_dir,
...@@ -106,11 +103,11 @@ class WikiNetworkDataset(DGLBuiltinDataset): ...@@ -106,11 +103,11 @@ class WikiNetworkDataset(DGLBuiltinDataset):
return self._num_classes return self._num_classes
class ChameleonDataset(WikiNetworkDataset): class ChameleonDataset(GeomGCNDataset):
r"""Wikipedia page-page network on chameleons from `Multi-scale Attributed r"""Wikipedia page-page network on chameleons from `Multi-scale Attributed
Node Embedding <https://arxiv.org/abs/1909.13021>`__ and later modified by Node Embedding <https://arxiv.org/abs/1909.13021>`__ and later modified by
`Geom-GCN: Geometric Graph Convolutional Networks `Geom-GCN: Geometric Graph Convolutional Networks
<https://arxiv.org/abs/2002.05287>` <https://arxiv.org/abs/2002.05287>`__
Nodes represent articles from the English Wikipedia, edges reflect mutual Nodes represent articles from the English Wikipedia, edges reflect mutual
links between them. Node features indicate the presence of particular nouns links between them. Node features indicate the presence of particular nouns
...@@ -182,11 +179,11 @@ class ChameleonDataset(WikiNetworkDataset): ...@@ -182,11 +179,11 @@ class ChameleonDataset(WikiNetworkDataset):
) )
class SquirrelDataset(WikiNetworkDataset): class SquirrelDataset(GeomGCNDataset):
r"""Wikipedia page-page network on squirrels from `Multi-scale Attributed r"""Wikipedia page-page network on squirrels from `Multi-scale Attributed
Node Embedding <https://arxiv.org/abs/1909.13021>`__ and later modified by Node Embedding <https://arxiv.org/abs/1909.13021>`__ and later modified by
`Geom-GCN: Geometric Graph Convolutional Networks `Geom-GCN: Geometric Graph Convolutional Networks
<https://arxiv.org/abs/2002.05287>` <https://arxiv.org/abs/2002.05287>`__
Nodes represent articles from the English Wikipedia, edges reflect mutual Nodes represent articles from the English Wikipedia, edges reflect mutual
links between them. Node features indicate the presence of particular nouns links between them. Node features indicate the presence of particular nouns
...@@ -256,3 +253,155 @@ class SquirrelDataset(WikiNetworkDataset): ...@@ -256,3 +253,155 @@ class SquirrelDataset(WikiNetworkDataset):
verbose=verbose, verbose=verbose,
transform=transform, transform=transform,
) )
class CornellDataset(GeomGCNDataset):
    r"""Cornell subset of
    `WebKB <http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-11/www/wwkb/>`__,
    later modified by `Geom-GCN: Geometric Graph Convolutional Networks
    <https://arxiv.org/abs/2002.05287>`__

    Each node is a web page and each edge is a hyperlink between two pages.
    Node features are the bag-of-words representation of the pages. Every
    page is manually assigned to one of five categories: student, project,
    course, staff, and faculty.

    Statistics:

    - Nodes: 183
    - Edges: 298
    - Number of Classes: 5
    - 10 train/val/test splits

        - Train: 87
        - Val: 59
        - Test: 37

    Parameters
    ----------
    raw_dir : str, optional
        Raw file directory to store the processed data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to re-download the data source. Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Notes
    -----
    The graph does not come with edges for both directions.

    Examples
    --------

    >>> from dgl.data import CornellDataset
    >>> dataset = CornellDataset()
    >>> g = dataset[0]
    >>> num_classes = dataset.num_classes

    >>> # get node features
    >>> feat = g.ndata["feat"]

    >>> # get data split
    >>> train_mask = g.ndata["train_mask"]
    >>> val_mask = g.ndata["val_mask"]
    >>> test_mask = g.ndata["test_mask"]

    >>> # get labels
    >>> label = g.ndata['label']
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=True, transform=None
    ):
        # The shared Geom-GCN base class does the real work; this subclass
        # only pins the dataset name that the base class is given.
        super().__init__(
            name="cornell",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )
class TexasDataset(GeomGCNDataset):
    r"""Texas subset of
    `WebKB <http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-11/www/wwkb/>`__,
    later modified by `Geom-GCN: Geometric Graph Convolutional Networks
    <https://arxiv.org/abs/2002.05287>`__

    Each node is a web page and each edge is a hyperlink between two pages.
    Node features are the bag-of-words representation of the pages. Every
    page is manually assigned to one of five categories: student, project,
    course, staff, and faculty.

    Statistics:

    - Nodes: 183
    - Edges: 325
    - Number of Classes: 5
    - 10 train/val/test splits

        - Train: 87
        - Val: 59
        - Test: 37

    Parameters
    ----------
    raw_dir : str, optional
        Raw file directory to store the processed data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to re-download the data source. Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Notes
    -----
    The graph does not come with edges for both directions.

    Examples
    --------

    >>> from dgl.data import TexasDataset
    >>> dataset = TexasDataset()
    >>> g = dataset[0]
    >>> num_classes = dataset.num_classes

    >>> # get node features
    >>> feat = g.ndata["feat"]

    >>> # get data split
    >>> train_mask = g.ndata["train_mask"]
    >>> val_mask = g.ndata["val_mask"]
    >>> test_mask = g.ndata["test_mask"]

    >>> # get labels
    >>> label = g.ndata['label']
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=True, transform=None
    ):
        # The shared Geom-GCN base class does the real work; this subclass
        # only pins the dataset name that the base class is given.
        super().__init__(
            name="texas",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )
...@@ -37,3 +37,37 @@ def test_squirrel(): ...@@ -37,3 +37,37 @@ def test_squirrel():
assert g.num_edges() == 217073 assert g.num_edges() == 217073
g2 = dgl.data.SquirrelDataset(force_reload=True, transform=transform)[0] g2 = dgl.data.SquirrelDataset(force_reload=True, transform=transform)[0]
assert g2.num_edges() - g.num_edges() == g.num_nodes() assert g2.num_edges() - g.num_edges() == g.num_nodes()
@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(
    dgl.backend.backend_name != "pytorch", reason="only supports pytorch"
)
def test_cornell():
    """Check CornellDataset's graph size and that ``transform`` is applied.

    The untransformed graph must have 183 nodes and 298 edges. Loading the
    dataset again with an ``AddSelfLoop(allow_duplicate=True)`` transform must
    yield a graph whose edge count is larger by exactly the number of nodes.
    """
    self_loop_transform = dgl.AddSelfLoop(allow_duplicate=True)

    # Baseline: graph as shipped, with no transform.
    plain_graph = dgl.data.CornellDataset(force_reload=True)[0]
    assert plain_graph.num_nodes() == 183
    assert plain_graph.num_edges() == 298

    # Same dataset, but every access goes through the self-loop transform.
    looped_graph = dgl.data.CornellDataset(
        force_reload=True, transform=self_loop_transform
    )[0]
    added_edges = looped_graph.num_edges() - plain_graph.num_edges()
    assert added_edges == plain_graph.num_nodes()
@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(
    dgl.backend.backend_name != "pytorch", reason="only supports pytorch"
)
def test_texas():
    """Check TexasDataset's graph size and that ``transform`` is applied.

    The untransformed graph must have 183 nodes and 325 edges. Loading the
    dataset again with an ``AddSelfLoop(allow_duplicate=True)`` transform must
    yield a graph whose edge count is larger by exactly the number of nodes.
    """
    self_loop_transform = dgl.AddSelfLoop(allow_duplicate=True)

    # Baseline: graph as shipped, with no transform.
    plain_graph = dgl.data.TexasDataset(force_reload=True)[0]
    assert plain_graph.num_nodes() == 183
    assert plain_graph.num_edges() == 325

    # Same dataset, but every access goes through the self-loop transform.
    looped_graph = dgl.data.TexasDataset(
        force_reload=True, transform=self_loop_transform
    )[0]
    added_edges = looped_graph.num_edges() - plain_graph.num_edges()
    assert added_edges == plain_graph.num_nodes()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment