Unverified commit defa292b authored by RecLusIve-F, committed by GitHub

[Dataset] Add Flickr and Yelp dataset (#4099)



* Add Flickr and Yelp dataset

* Update flickr.py

* update

* Update yelp.py

* Update yelp.py

* update

* Update yelp.py

* Update test_data.py

* Update yelp.py

* update

* Update test_data.py

* Update yelp.py
Co-authored-by: Mufei Li <mufeili1996@gmail.com>
parent 9501ed6a
@@ -52,6 +52,8 @@ Datasets for node classification/regression tasks
    TreeCycleDataset
    TreeGridDataset
    WikiCSDataset
+   FlickrDataset
+   YelpDataset

Edge Prediction Datasets
---------------------------------------
......
@@ -33,6 +33,8 @@ from .csv_dataset import CSVDataset
from .adapter import *
from .synthetic import BAShapeDataset, BACommunityDataset, TreeCycleDataset, TreeGridDataset, BA2MotifDataset
from .wikics import WikiCSDataset
+from .flickr import FlickrDataset
+from .yelp import YelpDataset

def register_data_args(parser):
    parser.add_argument(
......
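For reference, a minimal sketch of how the newly exported classes are used (illustrative only, not part of this commit; assumes DGL with the PyTorch backend):

```python
from dgl.data import FlickrDataset

# First access downloads and processes the raw files; later runs load the
# cached dgl_graph.bin from the save directory.
dataset = FlickrDataset(reorder=True)
g = dataset[0]

print(dataset.num_classes)           # 7
print(g.num_nodes(), g.num_edges())  # 89250 899756
print(g.ndata['feat'].shape)         # torch.Size([89250, 500])
```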
"""Flickr Dataset"""
import os
import json
import numpy as np
import scipy.sparse as sp
from .. import backend as F
from ..convert import from_scipy
from ..transforms import reorder_graph
from .dgl_dataset import DGLBuiltinDataset
from .utils import generate_mask_tensor, load_graphs, save_graphs, _get_dgl_url
class FlickrDataset(DGLBuiltinDataset):
r"""Flickr dataset for node classification from `GraphSAINT: Graph Sampling Based Inductive
Learning Method <https://arxiv.org/abs/1907.04931>`_
The task of this dataset is to categorize types of images based on the descriptions and common properties of online images.
Flickr dataset statistics:
- Nodes: 89,250
- Edges: 899,756
- Number of classes: 7
- Node feature size: 500
Parameters
----------
raw_dir : str
Raw file directory to download/contain the input data directory.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset.
Default: False
verbose : bool
Whether to print out progress information.
Default: False
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access.
reorder : bool
Whether to reorder the graph using :func:`~dgl.reorder_graph`.
Default: False.
Attributes
----------
num_classes : int
Number of node classes
Examples
--------
>>> dataset = FlickrDataset()
>>> dataset.num_classes
7
>>> g = dataset[0]
>>> # get node feature
>>> feat = g.ndata['feat']
>>> # get node labels
>>> labels = g.ndata['label']
>>> # get data split
>>> train_mask = g.ndata['train_mask']
>>> val_mask = g.ndata['val_mask']
>>> test_mask = g.ndata['test_mask']
"""
def __init__(self, raw_dir=None, force_reload=False, verbose=False, transform=None,
reorder=False):
_url = _get_dgl_url('dataset/flickr.zip')
self._reorder = reorder
super(FlickrDataset, self).__init__(name='flickr',
raw_dir=raw_dir,
url=_url,
force_reload=force_reload,
verbose=verbose,
transform=transform)
def process(self):
"""process raw data to graph, labels and masks"""
coo_adj = sp.load_npz(os.path.join(self.raw_path, "adj_full.npz"))
g = from_scipy(coo_adj)
features = np.load(os.path.join(self.raw_path, 'feats.npy'))
features = F.tensor(features, dtype=F.float32)
y = [-1] * features.shape[0]
with open(os.path.join(self.raw_path, 'class_map.json')) as f:
class_map = json.load(f)
for key, item in class_map.items():
y[int(key)] = item
labels = F.tensor(np.array(y), dtype=F.int64)
with open(os.path.join(self.raw_path, 'role.json')) as f:
role = json.load(f)
train_mask = np.zeros(features.shape[0], dtype=bool)
train_mask[role['tr']] = True
val_mask = np.zeros(features.shape[0], dtype=bool)
val_mask[role['va']] = True
test_mask = np.zeros(features.shape[0], dtype=bool)
test_mask[role['te']] = True
g.ndata['feat'] = features
g.ndata['label'] = labels
g.ndata['train_mask'] = generate_mask_tensor(train_mask)
g.ndata['val_mask'] = generate_mask_tensor(val_mask)
g.ndata['test_mask'] = generate_mask_tensor(test_mask)
if self._reorder:
self._graph = reorder_graph(
g, node_permute_algo='rcmk', edge_permute_algo='dst', store_ids=False)
else:
self._graph = g
def has_cache(self):
graph_path = os.path.join(self.save_path, 'dgl_graph.bin')
return os.path.exists(graph_path)
def save(self):
graph_path = os.path.join(self.save_path, 'dgl_graph.bin')
save_graphs(graph_path, self._graph)
def load(self):
graph_path = os.path.join(self.save_path, 'dgl_graph.bin')
g, _ = load_graphs(graph_path)
self._graph = g[0]
@property
def num_classes(self):
return 7
def __len__(self):
r"""The number of graphs in the dataset."""
return 1
def __getitem__(self, idx):
r""" Get graph object
Parameters
----------
idx : int
Item index, FlickrDataset has only one graph object
Returns
-------
:class:`dgl.DGLGraph`
The graph contains:
- ``ndata['label']``: node label
- ``ndata['feat']``: node feature
- ``ndata['train_mask']``: mask for training node set
- ``ndata['val_mask']``: mask for validation node set
- ``ndata['test_mask']``: mask for test node set
"""
assert idx == 0, "This dataset has only one graph"
if self._transform is None:
return self._graph
else:
return self._transform(self._graph)
\ No newline at end of file
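Downstream training code often wants node-ID index tensors rather than boolean masks; a short sketch of that conversion (illustrative, assumes the PyTorch backend, where the masks load as bool tensors):

```python
from dgl.data import FlickrDataset

g = FlickrDataset()[0]

# The boolean masks encode the fixed GraphSAINT train/val/test split;
# nonzero() turns each mask into a tensor of node IDs.
train_idx = g.ndata['train_mask'].nonzero(as_tuple=True)[0]
val_idx = g.ndata['val_mask'].nonzero(as_tuple=True)[0]
test_idx = g.ndata['test_mask'].nonzero(as_tuple=True)[0]

feat, label = g.ndata['feat'], g.ndata['label']
print(len(train_idx), feat[train_idx].shape, label[train_idx].shape)
```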
@@ -22,7 +22,7 @@ class WikiCSDataset(DGLBuiltinDataset):
    - Nodes: 11,701
    - Edges: 431,726 (note that the original dataset has 216,123 edges but DGL adds
      the reverse edges and removes the duplicate edges, hence with a different number)
-   - Number of Classes: 10
+   - Number of classes: 10
    - Node feature size: 300
    - Number of different train, validation, stopping splits: 20
    - Number of test split: 1
@@ -59,9 +59,9 @@ class WikiCSDataset(DGLBuiltinDataset):
    >>> # get node labels
    >>> labels = g.ndata['label']
    >>> # get data split
-   >>> train_mask = g.ndata['train_masks']
-   >>> val_mask = g.ndata['val_masks']
-   >>> stopping_mask = g.ndata['stopping_masks']
+   >>> train_mask = g.ndata['train_mask']
+   >>> val_mask = g.ndata['val_mask']
+   >>> stopping_mask = g.ndata['stopping_mask']
    >>> test_mask = g.ndata['test_mask']
    >>> # The shape of train, val and stopping masks are (num_nodes, num_splits).
    >>> # The num_splits is the number of different train, validation, stopping splits.
@@ -150,8 +150,7 @@ class WikiCSDataset(DGLBuiltinDataset):
    - ``ndata['label']``: node labels
    - ``ndata['train_mask']``: train mask is for retrieving the nodes for training.
    - ``ndata['val_mask']``: val mask is for retrieving the nodes for hyperparameter tuning.
-   - ``ndata['stopping_mask']``: stopping mask is for retrieving the nodes for early
-     stopping criterion.
+   - ``ndata['stopping_mask']``: stopping mask is for retrieving the nodes for early stopping criterion.
    - ``ndata['test_mask']``: test mask is for retrieving the nodes for testing.
    """
......
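The corrected key names matter when indexing WikiCS's 20 train/validation/stopping splits; a minimal sketch of selecting one split (illustrative, assumes the PyTorch backend):

```python
from dgl.data import WikiCSDataset

g = WikiCSDataset()[0]

# train/val/stopping masks have shape (num_nodes, num_splits); test_mask is 1-D.
split = 0
train_mask = g.ndata['train_mask'][:, split]
val_mask = g.ndata['val_mask'][:, split]
stopping_mask = g.ndata['stopping_mask'][:, split]
test_mask = g.ndata['test_mask']

print(train_mask.sum(), val_mask.sum(), stopping_mask.sum(), test_mask.sum())
```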
"""Yelp Dataset"""
import os
import json
import numpy as np
import scipy.sparse as sp
from .. import backend as F
from ..convert import from_scipy
from ..transforms import reorder_graph
from .dgl_dataset import DGLBuiltinDataset
from .utils import generate_mask_tensor, load_graphs, save_graphs, _get_dgl_url
class YelpDataset(DGLBuiltinDataset):
r"""Yelp dataset for node classification from `GraphSAINT: Graph Sampling Based Inductive
Learning Method <https://arxiv.org/abs/1907.04931>`_
The task of this dataset is to categorize types of businesses based on customer reviewers and friendship.
Yelp dataset statistics:
- Nodes: 716,847
- Edges: 13,954,819
- Number of classes: 100 (multi-label)
- Node feature size: 300
Parameters
----------
raw_dir : str
Raw file directory to download/contain the input data directory.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset.
Default: False
verbose : bool
Whether to print out progress information.
Default: False
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access.
reorder : bool
Whether to reorder the graph using :func:`~dgl.reorder_graph`.
Default: False.
Attributes
----------
num_classes : int
Number of node classes
Examples
--------
>>> dataset = YelpDataset()
>>> dataset.num_classes
100
>>> g = dataset[0]
>>> # get node feature
>>> feat = g.ndata['feat']
>>> # get node labels
>>> labels = g.ndata['label']
>>> # get data split
>>> train_mask = g.ndata['train_mask']
>>> val_mask = g.ndata['val_mask']
>>> test_mask = g.ndata['test_mask']
"""
def __init__(self, raw_dir=None, force_reload=False, verbose=False, transform=None,
reorder=False):
_url = _get_dgl_url('dataset/yelp.zip')
self._reorder = reorder
super(YelpDataset, self).__init__(name='yelp',
raw_dir=raw_dir,
url=_url,
force_reload=force_reload,
verbose=verbose,
transform=transform)
def process(self):
"""process raw data to graph, labels and masks"""
coo_adj = sp.load_npz(os.path.join(self.raw_path, "adj_full.npz"))
g = from_scipy(coo_adj)
features = np.load(os.path.join(self.raw_path, 'feats.npy'))
features = F.tensor(features, dtype=F.float32)
y = [-1] * features.shape[0]
with open(os.path.join(self.raw_path, 'class_map.json')) as f:
class_map = json.load(f)
for key, item in class_map.items():
y[int(key)] = item
labels = F.tensor(np.array(y), dtype=F.int64)
with open(os.path.join(self.raw_path, 'role.json')) as f:
role = json.load(f)
train_mask = np.zeros(features.shape[0], dtype=bool)
train_mask[role['tr']] = True
val_mask = np.zeros(features.shape[0], dtype=bool)
val_mask[role['va']] = True
test_mask = np.zeros(features.shape[0], dtype=bool)
test_mask[role['te']] = True
g.ndata['feat'] = features
g.ndata['label'] = labels
g.ndata['train_mask'] = generate_mask_tensor(train_mask)
g.ndata['val_mask'] = generate_mask_tensor(val_mask)
g.ndata['test_mask'] = generate_mask_tensor(test_mask)
if self._reorder:
self._graph = reorder_graph(
g, node_permute_algo='rcmk', edge_permute_algo='dst', store_ids=False)
else:
self._graph = g
def has_cache(self):
graph_path = os.path.join(self.save_path, 'dgl_graph.bin')
return os.path.exists(graph_path)
def save(self):
graph_path = os.path.join(self.save_path, 'dgl_graph.bin')
save_graphs(graph_path, self._graph)
def load(self):
graph_path = os.path.join(self.save_path, 'dgl_graph.bin')
g, _ = load_graphs(graph_path)
self._graph = g[0]
@property
def num_classes(self):
return 100
def __len__(self):
r"""The number of graphs in the dataset."""
return 1
def __getitem__(self, idx):
r""" Get graph object
Parameters
----------
idx : int
Item index, YelpDataset has only one graph object
Returns
-------
:class:`dgl.DGLGraph`
The graph contains:
- ``ndata['label']``: node label
- ``ndata['feat']``: node feature
- ``ndata['train_mask']``: mask for training node set
- ``ndata['val_mask']``: mask for validation node set
- ``ndata['test_mask']``: mask for test node set
"""
assert idx == 0, "This dataset has only one graph"
if self._transform is None:
return self._graph
else:
return self._transform(self._graph)
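Since the Yelp labels form a (num_nodes, 100) multi-hot matrix (one 0/1 vector of business categories per node), a per-class binary loss is the natural training objective. A minimal sketch, assuming the PyTorch backend, with a plain linear layer as a hypothetical stand-in for a real GNN:

```python
import torch
from torch.nn.functional import binary_cross_entropy_with_logits
from dgl.data import YelpDataset

g = YelpDataset()[0]

feat = g.ndata['feat']            # (716847, 300) node features
label = g.ndata['label'].float()  # (716847, 100) multi-hot category labels
train_mask = g.ndata['train_mask']

# Stand-in predictor; in practice replace with a GNN such as GraphSAGE.
predictor = torch.nn.Linear(feat.shape[1], label.shape[1])
logits = predictor(feat[train_mask])

loss = binary_cross_entropy_with_logits(logits, label[train_mask])
print(loss.item())
```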
@@ -276,6 +276,29 @@ def test_wiki_cs():
    g2 = data.WikiCSDataset(transform=transform)[0]
    assert g2.num_edges() - g.num_edges() == g.num_nodes()
+
+@unittest.skip(reason="Dataset too large to download for the latest CI.")
+def test_yelp():
+    g = data.YelpDataset(reorder=True)[0]
+    assert g.num_nodes() == 716847
+    assert g.num_edges() == 13954819
+    dst = F.asnumpy(g.edges()[1])
+    assert np.array_equal(dst, np.sort(dst))
+
+    transform = dgl.AddSelfLoop(allow_duplicate=True)
+    g2 = data.YelpDataset(reorder=True, transform=transform)[0]
+    assert g2.num_edges() - g.num_edges() == g.num_nodes()
+
+@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
+def test_flickr():
+    g = data.FlickrDataset(reorder=True)[0]
+    assert g.num_nodes() == 89250
+    assert g.num_edges() == 899756
+    dst = F.asnumpy(g.edges()[1])
+    assert np.array_equal(dst, np.sort(dst))
+
+    transform = dgl.AddSelfLoop(allow_duplicate=True)
+    g2 = data.FlickrDataset(reorder=True, transform=transform)[0]
+    assert g2.num_edges() - g.num_edges() == g.num_nodes()
+
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_extract_archive():
......
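A toy reproduction of what the new tests assert: `reorder=True` sorts edges by destination ID and `dgl.AddSelfLoop(allow_duplicate=True)` adds exactly one loop per node. The snippet is an illustrative sketch (assumes the PyTorch backend) and runs without downloading either dataset:

```python
import dgl
import numpy as np
import torch

g = dgl.graph((torch.tensor([0, 2, 1, 2]), torch.tensor([2, 0, 1, 1])))

# edge_permute_algo='dst' (used by the datasets when reorder=True) sorts edges
# by destination node ID, which is what the tests check on the real graphs.
g = dgl.reorder_graph(g, edge_permute_algo='dst', store_ids=False)
dst = g.edges()[1].numpy()
assert np.array_equal(dst, np.sort(dst))

# AddSelfLoop(allow_duplicate=True) appends one self-loop per node unconditionally,
# so the edge count grows by exactly num_nodes().
g2 = dgl.AddSelfLoop(allow_duplicate=True)(g)
assert g2.num_edges() - g.num_edges() == g.num_nodes()
```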