Unverified Commit 39ed0966 authored by Tong He's avatar Tong He Committed by GitHub
Browse files

[Dataset] Builtin LegacyTUDataset and TUDataset (#1891)



* add LegacyTUDataset and TUDataset

* update docstrings and url

* update docstrings

* add hash to fix save/load

* update docstring and use hash
Co-authored-by: default avatarxiang song(charlie.song) <classicxsong@gmail.com>
parent 65e9644e
......@@ -3,62 +3,90 @@ import numpy as np
import os
import random
from .utils import download, extract_archive, get_download_dir, loadtxt
from ..utils import retry_method_with_fix
from ..convert import graph
from .dgl_dataset import DGLBuiltinDataset
from .utils import loadtxt, save_graphs, load_graphs, save_info, load_info
from .. import backend as F
from ..utils import retry_method_with_fix
from ..convert import graph as dgl_graph
class LegacyTUDataset(object):
"""
TUDataset contains lots of graph kernel datasets for graph classification.
Use provided node feature by default. If no feature provided, use one-hot node label instead.
If neither labels provided, use constant for node feature.
class LegacyTUDataset(DGLBuiltinDataset):
r"""LegacyTUDataset contains lots of graph kernel datasets for graph classification.
:param name: Dataset Name, such as `ENZYMES`, `DD`, `COLLAB`
:param use_pandas: Default: False.
Parameters
----------
name : str
Dataset Name, such as `ENZYMES`, `DD`, `COLLAB`
use_pandas : bool
Numpy's file read function has performance issue when file is large,
using pandas can be faster.
:param hidden_size: Default 10. Some dataset doesn't contain features.
Default: False
hidden_size : int
Some dataset doesn't contain features.
Use constant node features initialization instead, with hidden size as `hidden_size`.
Default : 10
max_allow_node : int
Remove graphs that contains more nodes than `max_allow_node`.
Default : None
Attributes
----------
max_num_node : int
Maximum number of nodes
num_labels : int
Number of classes
Examples
--------
>>> data = LegacyTUDataset('DD')
**The dataset instance is an iterable**
>>> len(data)
1178
>>> g, label = data[1024]
>>> g
Graph(num_nodes=88, num_edges=410,
ndata_schemes={'feat': Scheme(shape=(89,), dtype=torch.float64), '_ID': Scheme(shape=(), dtype=torch.int64)}
edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)})
>>> label
tensor(1)
**Batch the graphs and labels for mini-batch training**
>>> graphs, labels = zip(*[data[i] for i in range(16)])
>>> batched_graphs = dgl.batch(graphs)
>>> batched_labels = torch.tensor(labels)
>>> batched_graphs
Graph(num_nodes=9539, num_edges=47382,
ndata_schemes={'feat': Scheme(shape=(89,), dtype=torch.float64), '_ID': Scheme(shape=(), dtype=torch.int64)}
edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)})
Notes
-----
LegacyTUDataset uses provided node feature by default. If no feature provided, it uses one-hot node label instead.
If neither labels provided, it uses constant for node feature.
"""
_url = r"https://ls11-www.cs.tu-dortmund.de/people/morris/graphkerneldatasets/{}.zip"
_url = r"https://www.chrsmrrs.com/graphkerneldatasets/{}.zip"
def __init__(self, name, use_pandas=False,
hidden_size=10, max_allow_node=None):
hidden_size=10, max_allow_node=None,
raw_dir=None, force_reload=False, verbose=False):
self.name = name
url = self._url.format(name)
self.hidden_size = hidden_size
self.extract_dir = self._get_extract_dir()
self._load(use_pandas, max_allow_node)
def _get_extract_dir(self):
    """Return the directory the zipped dataset is extracted into.

    The directory lives under the global download dir and is named
    ``tu_<dataset name>`` (e.g. ``tu_DD``).

    Returns
    -------
    str
        Path of the extraction directory (it may not exist yet).
    """
    # The original body also built the zip path here but never used it;
    # only the extraction directory is needed.
    download_dir = get_download_dir()
    return os.path.join(download_dir, "tu_{}".format(self.name))
def _download(self):
    """Download the zipped dataset archive and unpack it.

    The archive is stored as ``tu_<name>.zip`` under the global download
    directory and extracted into a sibling ``tu_<name>`` folder.
    """
    base_dir = get_download_dir()
    archive_path = os.path.join(base_dir, "tu_{}.zip".format(self.name))
    download(self._url.format(self.name), path=archive_path)
    target_dir = os.path.join(base_dir, "tu_{}".format(self.name))
    extract_archive(archive_path, target_dir)
@retry_method_with_fix(_download)
def _load(self, use_pandas, max_allow_node):
self.data_mode = None
self.max_allow_node = max_allow_node
self.use_pandas = use_pandas
self.hash = abs(hash((name, use_pandas, hidden_size, max_allow_node)))
super(LegacyTUDataset, self).__init__(name=name, url=url, raw_dir=raw_dir,
hash_key=(name, use_pandas, hidden_size, max_allow_node),
force_reload=force_reload, verbose=verbose)
if use_pandas:
def process(self):
self.data_mode = None
if self.use_pandas:
import pandas as pd
DS_edge_list = self._idx_from_zero(
pd.read_csv(self._file_path("A"), delimiter=",", dtype=int, header=None).values)
......@@ -71,7 +99,7 @@ class LegacyTUDataset(object):
DS_graph_labels = self._idx_from_zero(
np.genfromtxt(self._file_path("graph_labels"), dtype=int))
g = graph([])
g = dgl_graph([])
g.add_nodes(int(DS_edge_list.max()) + 1)
g.add_edges(DS_edge_list[:, 0], DS_edge_list[:, 1])
......@@ -113,35 +141,63 @@ class LegacyTUDataset(object):
for idxs, g in zip(node_idx_list, self.graph_lists):
g.ndata['feat'] = np.ones((g.number_of_nodes(), hidden_size))
self.data_mode = "constant"
print(
"Use Constant one as Feature with hidden size {}".format(hidden_size))
if self.verbose:
print("Use Constant one as Feature with hidden size {}".format(hidden_size))
# remove graphs that are too large by user given standard
# optional pre-processing steop in conformity with Rex Ying's original
# DiffPool implementation
if self.max_allow_node:
preserve_idx = []
if self.verbose:
print("original dataset length : ", len(self.graph_lists))
for (i, g) in enumerate(self.graph_lists):
if g.number_of_nodes() <= self.max_allow_node:
preserve_idx.append(i)
self.graph_lists = [self.graph_lists[i] for i in preserve_idx]
print(
"after pruning graphs that are too big : ", len(
self.graph_lists))
if self.verbose:
print("after pruning graphs that are too big : ", len(self.graph_lists))
self.graph_labels = [self.graph_labels[i] for i in preserve_idx]
self.max_num_node = self.max_allow_node
self.graph_labels = F.tensor(self.graph_labels)
def save(self):
    """Persist the processed graphs, labels and dataset statistics to disk.

    Writes two files under ``self.save_path``, both keyed by the dataset
    name and the configuration hash so different configurations do not
    collide: a ``.bin`` file with the graphs/labels and a ``.pkl`` file
    with scalar metadata.
    """
    prefix = 'legacy_tu_{}_{}'.format(self.name, self.hash)
    bin_path = os.path.join(self.save_path, prefix + '.bin')
    pkl_path = os.path.join(self.save_path, prefix + '.pkl')
    save_graphs(str(bin_path), self.graph_lists, {'labels': self.graph_labels})
    save_info(str(pkl_path), {'max_num_node': self.max_num_node,
                              'num_labels': self.num_labels})
def load(self):
    """Restore graphs, labels and statistics written by :meth:`save`."""
    prefix = 'legacy_tu_{}_{}'.format(self.name, self.hash)
    bin_path = os.path.join(self.save_path, prefix + '.bin')
    pkl_path = os.path.join(self.save_path, prefix + '.pkl')
    graphs, label_dict = load_graphs(str(bin_path))
    meta = load_info(str(pkl_path))
    self.graph_lists = graphs
    self.graph_labels = label_dict['labels']
    self.max_num_node = meta['max_num_node']
    self.num_labels = meta['num_labels']
def has_cache(self):
graph_path = os.path.join(self.save_path, 'legacy_tu_{}_{}.bin'.format(self.name, self.hash))
info_path = os.path.join(self.save_path, 'legacy_tu_{}_{}.pkl'.format(self.name, self.hash))
if os.path.exists(graph_path) and os.path.exists(info_path):
return True
return False
def __getitem__(self, idx):
"""Get the i^th sample.
"""Get the idx-th sample.
Parameters
----------
idx : int
The sample index.
Returns
-------
(dgl.DGLGraph, int)
DGLGraph with node feature stored in `feat` field and node label in `node_label` if available.
(dgl.Graph, int)
Graph with node feature stored in `feat` field and node label in `node_label` if available.
And its label.
"""
g = self.graph_lists[idx]
......@@ -151,7 +207,7 @@ class LegacyTUDataset(object):
return len(self.graph_lists)
def _file_path(self, category):
return os.path.join(self.extract_dir, self.name,
return os.path.join(self.raw_path, self.name,
"{}_{}.txt".format(self.name, category))
@staticmethod
......@@ -171,24 +227,64 @@ class LegacyTUDataset(object):
self.num_labels,\
self.max_num_node
class TUDataset(object):
"""
class TUDataset(DGLBuiltinDataset):
r"""
TUDataset contains lots of graph kernel datasets for graph classification.
Graphs may have node labels, node attributes, edge labels, and edge attributes,
varying from dataset to dataset.
:param name: Dataset Name, such as `ENZYMES`, `DD`, `COLLAB`, `MUTAG`, can be the
Parameters
----------
name : str
Dataset Name, such as `ENZYMES`, `DD`, `COLLAB`, `MUTAG`, can be the
datasets name on https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets.
"""
_url = r"https://ls11-www.cs.tu-dortmund.de/people/morris/graphkerneldatasets/{}.zip"
Attributes
----------
max_num_node : int
Maximum number of nodes
num_labels : int
Number of classes
Examples
--------
>>> data = TUDataset('DD')
**The dataset instance is an iterable**
>>> len(data)
188
>>> g, label = data[1024]
>>> g
Graph(num_nodes=88, num_edges=410,
ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64), 'node_labels': Scheme(shape=(1,), dtype=torch.int64)}
edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)})
>>> label
tensor([1])
**Batch the graphs and labels for mini-batch training**
>>> graphs, labels = zip(*[data[i] for i in range(16)])
>>> batched_graphs = dgl.batch(graphs)
>>> batched_labels = torch.tensor(labels)
>>> batched_graphs
Graph(num_nodes=9539, num_edges=47382,
ndata_schemes={'node_labels': Scheme(shape=(1,), dtype=torch.int64), '_ID': Scheme(shape=(), dtype=torch.int64)}
edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)})
Notes
-----
Graphs may have node labels, node attributes, edge labels, and edge attributes,
varying from dataset to dataset. This class does not perform additional processing.
"""
def __init__(self, name):
_url = r"https://www.chrsmrrs.com/graphkerneldatasets/{}.zip"
self.name = name
self.extract_dir = self._download()
def __init__(self, name, raw_dir=None, force_reload=False, verbose=False):
url = self._url.format(name)
super(TUDataset, self).__init__(name=name, url=url,
raw_dir=raw_dir, force_reload=force_reload,
verbose=verbose)
def process(self):
DS_edge_list = self._idx_from_zero(
loadtxt(self._file_path("A"), delimiter=",").astype(int))
DS_indicator = self._idx_from_zero(
......@@ -196,7 +292,7 @@ class TUDataset(object):
DS_graph_labels = self._idx_from_zero(
loadtxt(self._file_path("graph_labels"), delimiter=",").astype(int))
g = DGLGraph()
g = dgl_graph([])
g.add_nodes(int(DS_edge_list.max()) + 1)
g.add_edges(DS_edge_list[:, 0], DS_edge_list[:, 1])
......@@ -209,7 +305,7 @@ class TUDataset(object):
self.max_num_node = len(node_idx[0])
self.num_labels = max(DS_graph_labels) + 1
self.graph_labels = DS_graph_labels
self.graph_labels = F.tensor(DS_graph_labels)
self.attr_dict = {
'node_labels': ('ndata', 'node_labels'),
......@@ -223,25 +319,50 @@ class TUDataset(object):
data = loadtxt(self._file_path(filename),
delimiter=',').astype(int)
if 'label' in filename:
data = self._idx_from_zero(data)
data = F.tensor(self._idx_from_zero(data))
getattr(g, field_name[0])[field_name[1]] = data
except IOError:
pass
self.graph_lists = g.subgraphs(node_idx_list)
for g in self.graph_lists:
g.copy_from_parent()
self.graph_lists = [g.subgraph(node_idx) for node_idx in node_idx_list]
def save(self):
    """Persist the processed graphs, labels and dataset statistics to disk.

    Writes ``tu_<name>.bin`` (graphs plus labels) and ``tu_<name>.pkl``
    (scalar metadata) under ``self.save_path``.
    """
    prefix = 'tu_{}'.format(self.name)
    bin_path = os.path.join(self.save_path, prefix + '.bin')
    pkl_path = os.path.join(self.save_path, prefix + '.pkl')
    save_graphs(str(bin_path), self.graph_lists, {'labels': self.graph_labels})
    save_info(str(pkl_path), {'max_num_node': self.max_num_node,
                              'num_labels': self.num_labels})
def load(self):
    """Restore graphs, labels and statistics written by :meth:`save`."""
    prefix = 'tu_{}'.format(self.name)
    bin_path = os.path.join(self.save_path, prefix + '.bin')
    pkl_path = os.path.join(self.save_path, prefix + '.pkl')
    graphs, label_dict = load_graphs(str(bin_path))
    meta = load_info(str(pkl_path))
    self.graph_lists = graphs
    self.graph_labels = label_dict['labels']
    self.max_num_node = meta['max_num_node']
    self.num_labels = meta['num_labels']
def has_cache(self):
graph_path = os.path.join(self.save_path, 'tu_{}.bin'.format(self.name))
info_path = os.path.join(self.save_path, 'legacy_tu_{}.pkl'.format(self.name))
if os.path.exists(graph_path) and os.path.exists(info_path):
return True
return False
def __getitem__(self, idx):
"""Get the i^th sample.
"""Get the idx-th sample.
Parameters
----------
idx : int
The sample index.
Returns
-------
(dgl.DGLGraph, int)
DGLGraph with node feature stored in `feat` field and node label in `node_label` if available.
(dgl.Graph, int)
Graph with node feature stored in `feat` field and node label in `node_label` if available.
And its label.
"""
g = self.graph_lists[idx]
......@@ -250,19 +371,8 @@ class TUDataset(object):
def __len__(self):
return len(self.graph_lists)
def _download(self):
    """Download the zipped dataset, unpack it, and return the folder path.

    Returns
    -------
    str
        Directory the archive was extracted into
        (``<download dir>/tu_<name>``).
    """
    base_dir = get_download_dir()
    archive_path = os.path.join(base_dir, "tu_{}.zip".format(self.name))
    download(self._url.format(self.name), path=archive_path)
    out_dir = os.path.join(base_dir, "tu_{}".format(self.name))
    extract_archive(archive_path, out_dir)
    return out_dir
def _file_path(self, category):
return os.path.join(self.extract_dir, self.name,
return os.path.join(self.raw_path, self.name,
"{}_{}.txt".format(self.name, category))
@staticmethod
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment