"git@developer.sourcefind.cn:OpenDAS/bitsandbytes.git" did not exist on "94168d79d74174ee4ba7c183e2cfc7dacc89c939"
Unverified Commit 39ed0966 authored by Tong He's avatar Tong He Committed by GitHub
Browse files

[Dataset] Builtin LegacyTUDataset and TUDataset (#1891)



* add LegacyTUDataset and TUDataset

* update docstrings and url

* update docstrings

* add hash to fix save/load

* update docstring and use hash
Co-authored-by: default avatarxiang song(charlie.song) <classicxsong@gmail.com>
parent 65e9644e
...@@ -3,62 +3,90 @@ import numpy as np ...@@ -3,62 +3,90 @@ import numpy as np
import os import os
import random import random
from .utils import download, extract_archive, get_download_dir, loadtxt
from ..utils import retry_method_with_fix from .dgl_dataset import DGLBuiltinDataset
from ..convert import graph from .utils import loadtxt, save_graphs, load_graphs, save_info, load_info
from .. import backend as F from .. import backend as F
from ..utils import retry_method_with_fix
from ..convert import graph as dgl_graph
class LegacyTUDataset(object): class LegacyTUDataset(DGLBuiltinDataset):
""" r"""LegacyTUDataset contains lots of graph kernel datasets for graph classification.
TUDataset contains lots of graph kernel datasets for graph classification.
Use provided node feature by default. If no feature provided, use one-hot node label instead.
If neither labels provided, use constant for node feature.
:param name: Dataset Name, such as `ENZYMES`, `DD`, `COLLAB` Parameters
:param use_pandas: Default: False. ----------
name : str
Dataset Name, such as `ENZYMES`, `DD`, `COLLAB`
use_pandas : bool
Numpy's file read function has performance issue when file is large, Numpy's file read function has performance issue when file is large,
using pandas can be faster. using pandas can be faster.
:param hidden_size: Default 10. Some dataset doesn't contain features. Default: False
hidden_size : int
Some dataset doesn't contain features.
Use constant node features initialization instead, with hidden size as `hidden_size`. Use constant node features initialization instead, with hidden size as `hidden_size`.
Default : 10
max_allow_node : int
Remove graphs that contains more nodes than `max_allow_node`.
Default : None
Attributes
----------
max_num_node : int
Maximum number of nodes
num_labels : int
Number of classes
Examples
--------
>>> data = LegacyTUDataset('DD')
**The dataset instance is an iterable**
>>> len(data)
1178
>>> g, label = data[1024]
>>> g
Graph(num_nodes=88, num_edges=410,
ndata_schemes={'feat': Scheme(shape=(89,), dtype=torch.float64), '_ID': Scheme(shape=(), dtype=torch.int64)}
edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)})
>>> label
tensor(1)
**Batch the graphs and labels for mini-batch training**
>>> graphs, labels = zip(*[data[i] for i in range(16)])
>>> batched_graphs = dgl.batch(graphs)
>>> batched_labels = torch.tensor(labels)
>>> batched_graphs
Graph(num_nodes=9539, num_edges=47382,
ndata_schemes={'feat': Scheme(shape=(89,), dtype=torch.float64), '_ID': Scheme(shape=(), dtype=torch.int64)}
edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)})
Notes
-----
LegacyTUDataset uses provided node feature by default. If no feature provided, it uses one-hot node label instead.
If neither labels provided, it uses constant for node feature.
""" """
_url = r"https://ls11-www.cs.tu-dortmund.de/people/morris/graphkerneldatasets/{}.zip" _url = r"https://www.chrsmrrs.com/graphkerneldatasets/{}.zip"
def __init__(self, name, use_pandas=False, def __init__(self, name, use_pandas=False,
hidden_size=10, max_allow_node=None): hidden_size=10, max_allow_node=None,
raw_dir=None, force_reload=False, verbose=False):
self.name = name url = self._url.format(name)
self.hidden_size = hidden_size self.hidden_size = hidden_size
self.extract_dir = self._get_extract_dir()
self._load(use_pandas, max_allow_node)
def _get_extract_dir(self):
download_dir = get_download_dir()
zip_file_path = os.path.join(
download_dir,
"tu_{}.zip".format(
self.name))
extract_dir = os.path.join(download_dir, "tu_{}".format(self.name))
return extract_dir
def _download(self):
download_dir = get_download_dir()
zip_file_path = os.path.join(
download_dir,
"tu_{}.zip".format(
self.name))
download(self._url.format(self.name), path=zip_file_path)
extract_dir = os.path.join(download_dir, "tu_{}".format(self.name))
extract_archive(zip_file_path, extract_dir)
@retry_method_with_fix(_download)
def _load(self, use_pandas, max_allow_node):
self.data_mode = None
self.max_allow_node = max_allow_node self.max_allow_node = max_allow_node
self.use_pandas = use_pandas
self.hash = abs(hash((name, use_pandas, hidden_size, max_allow_node)))
super(LegacyTUDataset, self).__init__(name=name, url=url, raw_dir=raw_dir,
hash_key=(name, use_pandas, hidden_size, max_allow_node),
force_reload=force_reload, verbose=verbose)
if use_pandas: def process(self):
self.data_mode = None
if self.use_pandas:
import pandas as pd import pandas as pd
DS_edge_list = self._idx_from_zero( DS_edge_list = self._idx_from_zero(
pd.read_csv(self._file_path("A"), delimiter=",", dtype=int, header=None).values) pd.read_csv(self._file_path("A"), delimiter=",", dtype=int, header=None).values)
...@@ -71,7 +99,7 @@ class LegacyTUDataset(object): ...@@ -71,7 +99,7 @@ class LegacyTUDataset(object):
DS_graph_labels = self._idx_from_zero( DS_graph_labels = self._idx_from_zero(
np.genfromtxt(self._file_path("graph_labels"), dtype=int)) np.genfromtxt(self._file_path("graph_labels"), dtype=int))
g = graph([]) g = dgl_graph([])
g.add_nodes(int(DS_edge_list.max()) + 1) g.add_nodes(int(DS_edge_list.max()) + 1)
g.add_edges(DS_edge_list[:, 0], DS_edge_list[:, 1]) g.add_edges(DS_edge_list[:, 0], DS_edge_list[:, 1])
...@@ -113,35 +141,63 @@ class LegacyTUDataset(object): ...@@ -113,35 +141,63 @@ class LegacyTUDataset(object):
for idxs, g in zip(node_idx_list, self.graph_lists): for idxs, g in zip(node_idx_list, self.graph_lists):
g.ndata['feat'] = np.ones((g.number_of_nodes(), hidden_size)) g.ndata['feat'] = np.ones((g.number_of_nodes(), hidden_size))
self.data_mode = "constant" self.data_mode = "constant"
print( if self.verbose:
"Use Constant one as Feature with hidden size {}".format(hidden_size)) print("Use Constant one as Feature with hidden size {}".format(hidden_size))
# remove graphs that are too large by user given standard # remove graphs that are too large by user given standard
# optional pre-processing step in conformity with Rex Ying's original # optional pre-processing step in conformity with Rex Ying's original
# DiffPool implementation # DiffPool implementation
if self.max_allow_node: if self.max_allow_node:
preserve_idx = [] preserve_idx = []
print("original dataset length : ", len(self.graph_lists)) if self.verbose:
print("original dataset length : ", len(self.graph_lists))
for (i, g) in enumerate(self.graph_lists): for (i, g) in enumerate(self.graph_lists):
if g.number_of_nodes() <= self.max_allow_node: if g.number_of_nodes() <= self.max_allow_node:
preserve_idx.append(i) preserve_idx.append(i)
self.graph_lists = [self.graph_lists[i] for i in preserve_idx] self.graph_lists = [self.graph_lists[i] for i in preserve_idx]
print( if self.verbose:
"after pruning graphs that are too big : ", len( print("after pruning graphs that are too big : ", len(self.graph_lists))
self.graph_lists))
self.graph_labels = [self.graph_labels[i] for i in preserve_idx] self.graph_labels = [self.graph_labels[i] for i in preserve_idx]
self.max_num_node = self.max_allow_node self.max_num_node = self.max_allow_node
self.graph_labels = F.tensor(self.graph_labels)
def save(self):
    """Persist the processed graphs, their labels, and dataset metadata
    to the cache directory so later runs can skip reprocessing."""
    prefix = 'legacy_tu_{}_{}'.format(self.name, self.hash)
    graph_path = os.path.join(self.save_path, prefix + '.bin')
    info_path = os.path.join(self.save_path, prefix + '.pkl')
    # Graphs and labels go into the binary file; scalar metadata is pickled.
    save_graphs(str(graph_path), self.graph_lists,
                {'labels': self.graph_labels})
    save_info(str(info_path), {'max_num_node': self.max_num_node,
                               'num_labels': self.num_labels})
def load(self):
    """Restore graphs, labels, and metadata previously written by save()."""
    prefix = 'legacy_tu_{}_{}'.format(self.name, self.hash)
    graph_path = os.path.join(self.save_path, prefix + '.bin')
    info_path = os.path.join(self.save_path, prefix + '.pkl')
    graphs, label_dict = load_graphs(str(graph_path))
    info = load_info(str(info_path))
    self.graph_lists = graphs
    self.graph_labels = label_dict['labels']
    self.max_num_node = info['max_num_node']
    self.num_labels = info['num_labels']
def has_cache(self):
    """Check whether a processed copy of this dataset is cached on disk.

    Returns
    -------
    bool
        True if both the cached graph file (.bin) and the info file
        (.pkl) exist under ``self.save_path``.
    """
    prefix = 'legacy_tu_{}_{}'.format(self.name, self.hash)
    graph_path = os.path.join(self.save_path, prefix + '.bin')
    info_path = os.path.join(self.save_path, prefix + '.pkl')
    # Return the boolean expression directly instead of the
    # if/return True/return False chain.
    return os.path.exists(graph_path) and os.path.exists(info_path)
def __getitem__(self, idx): def __getitem__(self, idx):
"""Get the i^th sample. """Get the idx-th sample.
Parameters Parameters
---------- ----------
idx : int idx : int
The sample index. The sample index.
Returns Returns
------- -------
(dgl.DGLGraph, int) (dgl.Graph, int)
DGLGraph with node feature stored in `feat` field and node label in `node_label` if available. Graph with node feature stored in `feat` field and node label in `node_label` if available.
And its label. And its label.
""" """
g = self.graph_lists[idx] g = self.graph_lists[idx]
...@@ -151,7 +207,7 @@ class LegacyTUDataset(object): ...@@ -151,7 +207,7 @@ class LegacyTUDataset(object):
return len(self.graph_lists) return len(self.graph_lists)
def _file_path(self, category):
    """Return the path of the raw data file ``<name>_<category>.txt``
    inside the extracted archive directory."""
    fname = "{}_{}.txt".format(self.name, category)
    return os.path.join(self.raw_path, self.name, fname)
@staticmethod @staticmethod
...@@ -171,24 +227,64 @@ class LegacyTUDataset(object): ...@@ -171,24 +227,64 @@ class LegacyTUDataset(object):
self.num_labels,\ self.num_labels,\
self.max_num_node self.max_num_node
class TUDataset(DGLBuiltinDataset):
class TUDataset(object): r"""
"""
TUDataset contains lots of graph kernel datasets for graph classification. TUDataset contains lots of graph kernel datasets for graph classification.
Graphs may have node labels, node attributes, edge labels, and edge attributes,
varying from different dataset.
:param name: Dataset Name, such as `ENZYMES`, `DD`, `COLLAB`, `MUTAG`, can be the Parameters
datasets name on https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets. ----------
name : str
Dataset Name, such as `ENZYMES`, `DD`, `COLLAB`, `MUTAG`, can be the
datasets name on https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets.
Attributes
----------
max_num_node : int
Maximum number of nodes
num_labels : int
Number of classes
Examples
--------
>>> data = TUDataset('DD')
**The dataset instance is an iterable**
>>> len(data)
188
>>> g, label = data[1024]
>>> g
Graph(num_nodes=88, num_edges=410,
ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64), 'node_labels': Scheme(shape=(1,), dtype=torch.int64)}
edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)})
>>> label
tensor([1])
**Batch the graphs and labels for mini-batch training**
>>> graphs, labels = zip(*[data[i] for i in range(16)])
>>> batched_graphs = dgl.batch(graphs)
>>> batched_labels = torch.tensor(labels)
>>> batched_graphs
Graph(num_nodes=9539, num_edges=47382,
ndata_schemes={'node_labels': Scheme(shape=(1,), dtype=torch.int64), '_ID': Scheme(shape=(), dtype=torch.int64)}
edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)})
Notes
-----
Graphs may have node labels, node attributes, edge labels, and edge attributes,
varying from different dataset. This class does not perform additional processing. varying from different dataset. This class does not perform additional processing.
""" """
_url = r"https://ls11-www.cs.tu-dortmund.de/people/morris/graphkerneldatasets/{}.zip" _url = r"https://www.chrsmrrs.com/graphkerneldatasets/{}.zip"
def __init__(self, name):
self.name = name
self.extract_dir = self._download()
def __init__(self, name, raw_dir=None, force_reload=False, verbose=False):
    """Initialize the dataset; download/processing is driven by the
    DGLBuiltinDataset base class."""
    super(TUDataset, self).__init__(
        name=name,
        url=self._url.format(name),
        raw_dir=raw_dir,
        force_reload=force_reload,
        verbose=verbose)
def process(self):
DS_edge_list = self._idx_from_zero( DS_edge_list = self._idx_from_zero(
loadtxt(self._file_path("A"), delimiter=",").astype(int)) loadtxt(self._file_path("A"), delimiter=",").astype(int))
DS_indicator = self._idx_from_zero( DS_indicator = self._idx_from_zero(
...@@ -196,7 +292,7 @@ class TUDataset(object): ...@@ -196,7 +292,7 @@ class TUDataset(object):
DS_graph_labels = self._idx_from_zero( DS_graph_labels = self._idx_from_zero(
loadtxt(self._file_path("graph_labels"), delimiter=",").astype(int)) loadtxt(self._file_path("graph_labels"), delimiter=",").astype(int))
g = DGLGraph() g = dgl_graph([])
g.add_nodes(int(DS_edge_list.max()) + 1) g.add_nodes(int(DS_edge_list.max()) + 1)
g.add_edges(DS_edge_list[:, 0], DS_edge_list[:, 1]) g.add_edges(DS_edge_list[:, 0], DS_edge_list[:, 1])
...@@ -209,7 +305,7 @@ class TUDataset(object): ...@@ -209,7 +305,7 @@ class TUDataset(object):
self.max_num_node = len(node_idx[0]) self.max_num_node = len(node_idx[0])
self.num_labels = max(DS_graph_labels) + 1 self.num_labels = max(DS_graph_labels) + 1
self.graph_labels = DS_graph_labels self.graph_labels = F.tensor(DS_graph_labels)
self.attr_dict = { self.attr_dict = {
'node_labels': ('ndata', 'node_labels'), 'node_labels': ('ndata', 'node_labels'),
...@@ -223,25 +319,50 @@ class TUDataset(object): ...@@ -223,25 +319,50 @@ class TUDataset(object):
data = loadtxt(self._file_path(filename), data = loadtxt(self._file_path(filename),
delimiter=',').astype(int) delimiter=',').astype(int)
if 'label' in filename: if 'label' in filename:
data = self._idx_from_zero(data) data = F.tensor(self._idx_from_zero(data))
getattr(g, field_name[0])[field_name[1]] = data getattr(g, field_name[0])[field_name[1]] = data
except IOError: except IOError:
pass pass
self.graph_lists = g.subgraphs(node_idx_list) self.graph_lists = [g.subgraph(node_idx) for node_idx in node_idx_list]
for g in self.graph_lists:
def save(self):
    """Persist the processed graphs, their labels, and dataset metadata
    to the cache directory so later runs can skip reprocessing."""
    graph_file = os.path.join(self.save_path, 'tu_{}.bin'.format(self.name))
    info_file = os.path.join(self.save_path, 'tu_{}.pkl'.format(self.name))
    # Graphs and labels go into the binary file; scalar metadata is pickled.
    save_graphs(str(graph_file), self.graph_lists,
                {'labels': self.graph_labels})
    save_info(str(info_file), {'max_num_node': self.max_num_node,
                               'num_labels': self.num_labels})
def load(self):
    """Restore graphs, labels, and metadata previously written by save()."""
    graph_file = os.path.join(self.save_path, 'tu_{}.bin'.format(self.name))
    info_file = os.path.join(self.save_path, 'tu_{}.pkl'.format(self.name))
    graphs, label_dict = load_graphs(str(graph_file))
    info = load_info(str(info_file))
    self.graph_lists = graphs
    self.graph_labels = label_dict['labels']
    self.max_num_node = info['max_num_node']
    self.num_labels = info['num_labels']
def has_cache(self):
    """Check whether a processed copy of this dataset is cached on disk.

    Returns
    -------
    bool
        True if both the cached graph file (.bin) and the info file
        (.pkl) exist under ``self.save_path``.
    """
    # BUG FIX: must use the same 'tu_{name}' prefix that save()/load()
    # use. The original looked for 'legacy_tu_{name}.pkl', so the info
    # file was never found and the dataset was reprocessed on every run.
    graph_path = os.path.join(self.save_path, 'tu_{}.bin'.format(self.name))
    info_path = os.path.join(self.save_path, 'tu_{}.pkl'.format(self.name))
    return os.path.exists(graph_path) and os.path.exists(info_path)
def __getitem__(self, idx): def __getitem__(self, idx):
"""Get the i^th sample. """Get the idx-th sample.
Parameters Parameters
---------- ----------
idx : int idx : int
The sample index. The sample index.
Returns Returns
------- -------
(dgl.DGLGraph, int) (dgl.Graph, int)
DGLGraph with node feature stored in `feat` field and node label in `node_label` if available. Graph with node feature stored in `feat` field and node label in `node_label` if available.
And its label. And its label.
""" """
g = self.graph_lists[idx] g = self.graph_lists[idx]
...@@ -250,19 +371,8 @@ class TUDataset(object): ...@@ -250,19 +371,8 @@ class TUDataset(object):
def __len__(self):
    """Return the number of graphs in the dataset."""
    return len(self.graph_lists)
def _download(self):
download_dir = get_download_dir()
zip_file_path = os.path.join(
download_dir,
"tu_{}.zip".format(
self.name))
download(self._url.format(self.name), path=zip_file_path)
extract_dir = os.path.join(download_dir, "tu_{}".format(self.name))
extract_archive(zip_file_path, extract_dir)
return extract_dir
def _file_path(self, category):
    """Return the path of the raw data file ``<name>_<category>.txt``
    inside the extracted archive directory."""
    fname = "{}_{}.txt".format(self.name, category)
    return os.path.join(self.raw_path, self.name, fname)
@staticmethod @staticmethod
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment