Unverified Commit defa292b authored by RecLusIve-F, committed by GitHub

[Dataset] Add Flickr and Yelp dataset (#4099)



* Add Flickr and Yelp dataset

* Update flickr.py

* update

* Update yelp.py

* Update yelp.py

* update

* Update yelp.py

* Update test_data.py

* Update yelp.py

* update

* Update test_data.py

* Update yelp.py
Co-authored-by: Mufei Li <mufeili1996@gmail.com>
parent 9501ed6a
@@ -52,6 +52,8 @@ Datasets for node classification/regression tasks
TreeCycleDataset
TreeGridDataset
WikiCSDataset
FlickrDataset
YelpDataset
Edge Prediction Datasets
---------------------------------------
@@ -33,6 +33,8 @@ from .csv_dataset import CSVDataset
from .adapter import *
from .synthetic import BAShapeDataset, BACommunityDataset, TreeCycleDataset, TreeGridDataset, BA2MotifDataset
from .wikics import WikiCSDataset
from .flickr import FlickrDataset
from .yelp import YelpDataset
def register_data_args(parser):
parser.add_argument(
"""Flickr Dataset"""
import os
import json
import numpy as np
import scipy.sparse as sp
from .. import backend as F
from ..convert import from_scipy
from ..transforms import reorder_graph
from .dgl_dataset import DGLBuiltinDataset
from .utils import generate_mask_tensor, load_graphs, save_graphs, _get_dgl_url
class FlickrDataset(DGLBuiltinDataset):
r"""Flickr dataset for node classification from `GraphSAINT: Graph Sampling Based Inductive
Learning Method <https://arxiv.org/abs/1907.04931>`_
The task of this dataset is to categorize the types of online images based on their
descriptions and common properties.
Flickr dataset statistics:
- Nodes: 89,250
- Edges: 899,756
- Number of classes: 7
- Node feature size: 500
Parameters
----------
raw_dir : str
Raw file directory to download and store the input data.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset.
Default: False
verbose : bool
Whether to print out progress information.
Default: False
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access.
reorder : bool
Whether to reorder the graph using :func:`~dgl.reorder_graph`.
Default: False.
Attributes
----------
num_classes : int
Number of node classes
Examples
--------
>>> dataset = FlickrDataset()
>>> dataset.num_classes
7
>>> g = dataset[0]
>>> # get node feature
>>> feat = g.ndata['feat']
>>> # get node labels
>>> labels = g.ndata['label']
>>> # get data split
>>> train_mask = g.ndata['train_mask']
>>> val_mask = g.ndata['val_mask']
>>> test_mask = g.ndata['test_mask']
"""
def __init__(self, raw_dir=None, force_reload=False, verbose=False, transform=None,
reorder=False):
_url = _get_dgl_url('dataset/flickr.zip')
self._reorder = reorder
super(FlickrDataset, self).__init__(name='flickr',
raw_dir=raw_dir,
url=_url,
force_reload=force_reload,
verbose=verbose,
transform=transform)
def process(self):
"""process raw data to graph, labels and masks"""
coo_adj = sp.load_npz(os.path.join(self.raw_path, "adj_full.npz"))
g = from_scipy(coo_adj)
features = np.load(os.path.join(self.raw_path, 'feats.npy'))
features = F.tensor(features, dtype=F.float32)
y = [-1] * features.shape[0]
with open(os.path.join(self.raw_path, 'class_map.json')) as f:
class_map = json.load(f)
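            # class_map.json maps each node id (a string key) to its integer class id.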
for key, item in class_map.items():
y[int(key)] = item
labels = F.tensor(np.array(y), dtype=F.int64)
with open(os.path.join(self.raw_path, 'role.json')) as f:
role = json.load(f)
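        # role.json lists the node ids of the train ('tr'), validation ('va') and test ('te') splits.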
train_mask = np.zeros(features.shape[0], dtype=bool)
train_mask[role['tr']] = True
val_mask = np.zeros(features.shape[0], dtype=bool)
val_mask[role['va']] = True
test_mask = np.zeros(features.shape[0], dtype=bool)
test_mask[role['te']] = True
g.ndata['feat'] = features
g.ndata['label'] = labels
g.ndata['train_mask'] = generate_mask_tensor(train_mask)
g.ndata['val_mask'] = generate_mask_tensor(val_mask)
g.ndata['test_mask'] = generate_mask_tensor(test_mask)
if self._reorder:
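            # 'rcmk' applies a reverse Cuthill-McKee node permutation; 'dst' sorts edges by destination.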
self._graph = reorder_graph(
g, node_permute_algo='rcmk', edge_permute_algo='dst', store_ids=False)
else:
self._graph = g
def has_cache(self):
graph_path = os.path.join(self.save_path, 'dgl_graph.bin')
return os.path.exists(graph_path)
def save(self):
graph_path = os.path.join(self.save_path, 'dgl_graph.bin')
save_graphs(graph_path, self._graph)
def load(self):
graph_path = os.path.join(self.save_path, 'dgl_graph.bin')
g, _ = load_graphs(graph_path)
self._graph = g[0]
@property
def num_classes(self):
return 7
def __len__(self):
r"""The number of graphs in the dataset."""
return 1
def __getitem__(self, idx):
r""" Get graph object
Parameters
----------
idx : int
Item index; FlickrDataset has only one graph object
Returns
-------
:class:`dgl.DGLGraph`
The graph contains:
- ``ndata['label']``: node label
- ``ndata['feat']``: node feature
- ``ndata['train_mask']``: mask for training node set
- ``ndata['val_mask']``: mask for validation node set
- ``ndata['test_mask']``: mask for test node set
"""
assert idx == 0, "This dataset has only one graph"
if self._transform is None:
return self._graph
else:
return self._transform(self._graph)
\ No newline at end of file
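For reviewers who want to exercise the new class, a minimal usage sketch (not part of the diff; it assumes the PyTorch backend, and the shapes in the comments restate the statistics documented above):

import torch
from dgl.data import FlickrDataset

dataset = FlickrDataset(reorder=True)   # reorder=True permutes nodes/edges for locality
g = dataset[0]                          # the dataset holds a single graph

feat = g.ndata['feat']                  # (89250, 500) float32 node features
label = g.ndata['label']                # (89250,) int64 class ids in [0, 7)
train_idx = g.ndata['train_mask'].nonzero(as_tuple=True)[0]
print(dataset.num_classes, tuple(feat.shape), train_idx.numel())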
@@ -22,7 +22,7 @@ class WikiCSDataset(DGLBuiltinDataset):
- Nodes: 11,701
- Edges: 431,726 (note that the original dataset has 216,123 edges but DGL adds
the reverse edges and removes the duplicate edges, hence with a different number)
- - Number of Classes: 10
+ - Number of classes: 10
- Node feature size: 300
- Number of different train, validation, stopping splits: 20
- Number of test split: 1
@@ -59,9 +59,9 @@ class WikiCSDataset(DGLBuiltinDataset):
>>> # get node labels
>>> labels = g.ndata['label']
>>> # get data split
- >>> train_mask = g.ndata['train_masks']
- >>> val_mask = g.ndata['val_masks']
- >>> stopping_mask = g.ndata['stopping_masks']
+ >>> train_mask = g.ndata['train_mask']
+ >>> val_mask = g.ndata['val_mask']
+ >>> stopping_mask = g.ndata['stopping_mask']
>>> test_mask = g.ndata['test_mask']
>>> # The shape of train, val and stopping masks are (num_nodes, num_splits).
>>> # The num_splits is the number of different train, validation, stopping splits.
@@ -150,8 +150,7 @@ class WikiCSDataset(DGLBuiltinDataset):
- ``ndata['label']``: node labels
- ``ndata['train_mask']``: train mask is for retrieving the nodes for training.
- ``ndata['val_mask']``: val mask is for retrieving the nodes for hyperparameter tuning.
- - ``ndata['stopping_mask']``: stopping mask is for retrieving the nodes for early
-   stopping criterion.
+ - ``ndata['stopping_mask']``: stopping mask is for retrieving the nodes for early stopping criterion.
- ``ndata['test_mask']``: test mask is for retrieving the nodes for testing.
"""
"""Yelp Dataset"""
import os
import json
import numpy as np
import scipy.sparse as sp
from .. import backend as F
from ..convert import from_scipy
from ..transforms import reorder_graph
from .dgl_dataset import DGLBuiltinDataset
from .utils import generate_mask_tensor, load_graphs, save_graphs, _get_dgl_url
class YelpDataset(DGLBuiltinDataset):
r"""Yelp dataset for node classification from `GraphSAINT: Graph Sampling Based Inductive
Learning Method <https://arxiv.org/abs/1907.04931>`_
The task of this dataset is to categorize the types of businesses based on customer
reviews and friendships.
Yelp dataset statistics:
- Nodes: 716,847
- Edges: 13,954,819
- Number of classes: 100 (multi-label)
- Node feature size: 300
Parameters
----------
raw_dir : str
Raw file directory to download and store the input data.
Default: ~/.dgl/
force_reload : bool
Whether to reload the dataset.
Default: False
verbose : bool
Whether to print out progress information.
Default: False
transform : callable, optional
A transform that takes in a :class:`~dgl.DGLGraph` object and returns
a transformed version. The :class:`~dgl.DGLGraph` object will be
transformed before every access.
reorder : bool
Whether to reorder the graph using :func:`~dgl.reorder_graph`.
Default: False.
Attributes
----------
num_classes : int
Number of node classes
Examples
--------
>>> dataset = YelpDataset()
>>> dataset.num_classes
100
>>> g = dataset[0]
>>> # get node feature
>>> feat = g.ndata['feat']
>>> # get node labels
>>> labels = g.ndata['label']
>>> # get data split
>>> train_mask = g.ndata['train_mask']
>>> val_mask = g.ndata['val_mask']
>>> test_mask = g.ndata['test_mask']
"""
def __init__(self, raw_dir=None, force_reload=False, verbose=False, transform=None,
reorder=False):
_url = _get_dgl_url('dataset/yelp.zip')
self._reorder = reorder
super(YelpDataset, self).__init__(name='yelp',
raw_dir=raw_dir,
url=_url,
force_reload=force_reload,
verbose=verbose,
transform=transform)
def process(self):
"""process raw data to graph, labels and masks"""
coo_adj = sp.load_npz(os.path.join(self.raw_path, "adj_full.npz"))
g = from_scipy(coo_adj)
features = np.load(os.path.join(self.raw_path, 'feats.npy'))
features = F.tensor(features, dtype=F.float32)
y = [-1] * features.shape[0]
with open(os.path.join(self.raw_path, 'class_map.json')) as f:
class_map = json.load(f)
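            # For Yelp, each class_map entry is a binary multi-label list over the
            # 100 classes, so the resulting label tensor has shape (num_nodes, 100).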
for key, item in class_map.items():
y[int(key)] = item
labels = F.tensor(np.array(y), dtype=F.int64)
with open(os.path.join(self.raw_path, 'role.json')) as f:
role = json.load(f)
train_mask = np.zeros(features.shape[0], dtype=bool)
train_mask[role['tr']] = True
val_mask = np.zeros(features.shape[0], dtype=bool)
val_mask[role['va']] = True
test_mask = np.zeros(features.shape[0], dtype=bool)
test_mask[role['te']] = True
g.ndata['feat'] = features
g.ndata['label'] = labels
g.ndata['train_mask'] = generate_mask_tensor(train_mask)
g.ndata['val_mask'] = generate_mask_tensor(val_mask)
g.ndata['test_mask'] = generate_mask_tensor(test_mask)
if self._reorder:
self._graph = reorder_graph(
g, node_permute_algo='rcmk', edge_permute_algo='dst', store_ids=False)
else:
self._graph = g
def has_cache(self):
graph_path = os.path.join(self.save_path, 'dgl_graph.bin')
return os.path.exists(graph_path)
def save(self):
graph_path = os.path.join(self.save_path, 'dgl_graph.bin')
save_graphs(graph_path, self._graph)
def load(self):
graph_path = os.path.join(self.save_path, 'dgl_graph.bin')
g, _ = load_graphs(graph_path)
self._graph = g[0]
@property
def num_classes(self):
return 100
def __len__(self):
r"""The number of graphs in the dataset."""
return 1
def __getitem__(self, idx):
r""" Get graph object
Parameters
----------
idx : int
Item index; YelpDataset has only one graph object
Returns
-------
:class:`dgl.DGLGraph`
The graph contains:
- ``ndata['label']``: node label
- ``ndata['feat']``: node feature
- ``ndata['train_mask']``: mask for training node set
- ``ndata['val_mask']``: mask for validation node set
- ``ndata['test_mask']``: mask for test node set
"""
assert idx == 0, "This dataset has only one graph"
if self._transform is None:
return self._graph
else:
return self._transform(self._graph)
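Because each Yelp node carries a 100-dimensional binary label vector rather than a single class id, downstream training would typically use an independent binary cross-entropy term per class. A minimal sketch, assuming the PyTorch backend (the cast to float is an assumption, since the diff stores labels as int64):

import torch
from dgl.data import YelpDataset

g = YelpDataset()[0]
labels = g.ndata['label'].float()         # (716847, 100) 0/1 indicators per class
mask = g.ndata['train_mask']
logits = torch.randn(g.num_nodes(), 100)  # stand-in for a GNN's output
# Multi-label objective: a sigmoid/BCE term for each of the 100 classes.
loss = torch.nn.functional.binary_cross_entropy_with_logits(logits[mask], labels[mask])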
@@ -276,6 +276,29 @@ def test_wiki_cs():
g2 = data.WikiCSDataset(transform=transform)[0]
assert g2.num_edges() - g.num_edges() == g.num_nodes()
@unittest.skip(reason="Dataset too large to download for the latest CI.")
def test_yelp():
g = data.YelpDataset(reorder=True)[0]
assert g.num_nodes() == 716847
assert g.num_edges() == 13954819
dst = F.asnumpy(g.edges()[1])
assert np.array_equal(dst, np.sort(dst))
transform = dgl.AddSelfLoop(allow_duplicate=True)
g2 = data.YelpDataset(reorder=True, transform=transform)[0]
assert g2.num_edges() - g.num_edges() == g.num_nodes()
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_flickr():
g = data.FlickrDataset(reorder=True)[0]
assert g.num_nodes() == 89250
assert g.num_edges() == 899756
dst = F.asnumpy(g.edges()[1])
assert np.array_equal(dst, np.sort(dst))
transform = dgl.AddSelfLoop(allow_duplicate=True)
g2 = data.FlickrDataset(reorder=True, transform=transform)[0]
assert g2.num_edges() - g.num_edges() == g.num_nodes()
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_extract_archive():