"vscode:/vscode.git/clone" did not exist on "c16fab7eca2dd89cc9116accdc8491e3323c1d7d"
Unverified Commit 65b0b9e8 authored by Mufei Li's avatar Mufei Li Committed by GitHub
Browse files

[Dataset & Transform] Synthetic Datasets for Explainability and SIGNDiffusion Transform (#3982)

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Update

* Fix

* Update

* Update

* Update
parent 03024f95
......@@ -47,6 +47,11 @@ Datasets for node classification/regression tasks
FraudDataset
FraudYelpDataset
FraudAmazonDataset
BAShapeDataset
BACommunityDataset
TreeCycleDataset
TreeGridDataset
BA2MotifDataset
Edge Prediction Datasets
---------------------------------------
......
......@@ -32,3 +32,4 @@ dgl.transforms
LaplacianPE
FeatMask
RowFeatNormalizer
SIGNDiffusion
......@@ -31,6 +31,7 @@ from .fraud import FraudDataset, FraudYelpDataset, FraudAmazonDataset
from .fakenews import FakeNewsDataset
from .csv_dataset import CSVDataset
from .adapter import AsNodePredDataset, AsLinkPredDataset
from .synthetic import BAShapeDataset, BACommunityDataset, TreeCycleDataset, TreeGridDataset, BA2MotifDataset
def register_data_args(parser):
parser.add_argument(
......
......@@ -17,7 +17,7 @@ class DGLDataset(object):
1. Check whether there is a dataset cache on disk
(already processed and stored on the disk) by
invoking ``has_cache()``. If true, goto 5.
2. Call ``download()`` to download the data.
2. Call ``download()`` to download the data if ``url`` is not None.
3. Call ``process()`` to process the data.
4. Call ``save()`` to save the processed dataset on disk and goto 6.
5. Call ``load()`` to load the processed dataset from disk.
......@@ -31,7 +31,7 @@ class DGLDataset(object):
name : str
Name of the dataset
url : str
Url to download the raw dataset
Url to download the raw dataset. Default: None
raw_dir : str
Specifying the directory that will store the
downloaded data or the directory that
......@@ -313,6 +313,7 @@ class DGLBuiltinDataset(DGLDataset):
def download(self):
r""" Automatically download data and extract it.
"""
zip_file_path = os.path.join(self.raw_dir, self.name + '.zip')
download(self.url, path=zip_file_path)
extract_archive(zip_file_path, self.raw_path)
if self.url is not None:
zip_file_path = os.path.join(self.raw_dir, self.name + '.zip')
download(self.url, path=zip_file_path)
extract_archive(zip_file_path, self.raw_path)
"""Synthetic graph datasets."""
import math
import networkx as nx
import numpy as np
import os
import pickle
import random
from .dgl_dataset import DGLBuiltinDataset
from .utils import save_graphs, load_graphs, _get_dgl_url, download
from .. import backend as F
from ..batch import batch
from ..convert import graph
from ..transforms import reorder_graph
class BAShapeDataset(DGLBuiltinDataset):
    r"""BA-SHAPES dataset from `GNNExplainer: Generating Explanations for Graph Neural Networks
    <https://arxiv.org/abs/1903.03894>`__

    This is a synthetic dataset for node classification. It is generated by performing the
    following steps in order.

    - Construct a base Barabási–Albert (BA) graph.
    - Construct a set of five-node house-structured network motifs.
    - Attach the motifs to evenly spaced nodes of the base graph.
    - Perturb the graph by adding random edges.
    - Nodes are assigned to 4 classes. Nodes of label 0 belong to the base BA graph. Nodes of
      label 1, 2, 3 are separately at the middle, bottom, or top of houses.
    - Generate constant feature for all nodes, which is 1.

    Parameters
    ----------
    num_base_nodes : int, optional
        Number of nodes in the base BA graph. Default: 300
    num_base_edges_per_node : int, optional
        Number of edges to attach from a new node to existing nodes in constructing the base BA
        graph. Default: 5
    num_motifs : int, optional
        Number of house-structured network motifs to use. Default: 80
    perturb_ratio : float, optional
        Number of random edges to add in perturbation divided by the number of edges in the
        original graph. Default: 0.01
    seed : integer, random_state, or None, optional
        Indicator of random number generation state. Default: None
    raw_dir : str, optional
        Raw file directory to store the processed data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to always generate the data from scratch rather than load a cached version.
        Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Examples
    --------

    >>> from dgl.data import BAShapeDataset
    >>> dataset = BAShapeDataset()
    >>> dataset.num_classes
    4
    >>> g = dataset[0]
    >>> label = g.ndata['label']
    >>> feat = g.ndata['feat']
    """
    def __init__(self,
                 num_base_nodes=300,
                 num_base_edges_per_node=5,
                 num_motifs=80,
                 perturb_ratio=0.01,
                 seed=None,
                 raw_dir=None,
                 force_reload=False,
                 verbose=True,
                 transform=None):
        # Generation hyperparameters are stored so process() can rebuild the
        # graph deterministically; url=None marks the dataset as synthetic
        # (nothing to download).
        self.num_base_nodes = num_base_nodes
        self.num_base_edges_per_node = num_base_edges_per_node
        self.num_motifs = num_motifs
        self.perturb_ratio = perturb_ratio
        self.seed = seed
        super(BAShapeDataset, self).__init__(name='BA-SHAPES',
                                             url=None,
                                             raw_dir=raw_dir,
                                             force_reload=force_reload,
                                             verbose=verbose,
                                             transform=transform)

    def process(self):
        """Generate the BA-SHAPES graph from scratch (no raw files involved)."""
        # The seed is forwarded to networkx for a reproducible base graph.
        g = nx.barabasi_albert_graph(self.num_base_nodes, self.num_base_edges_per_node, self.seed)
        edges = list(g.edges())
        src, dst = map(list, zip(*edges))
        n = self.num_base_nodes
        # Nodes in the base BA graph belong to class 0
        node_labels = [0] * n
        # The motifs will be evenly attached to the nodes in the base graph.
        spacing = math.floor(n / self.num_motifs)
        for motif_id in range(self.num_motifs):
            # Construct a five-node house-structured network motif; the new
            # motif nodes get consecutive IDs starting at the current n.
            motif_edges = [
                (n, n + 1),
                (n + 1, n + 2),
                (n + 2, n + 3),
                (n + 3, n),
                (n + 4, n),
                (n + 4, n + 1)
            ]
            motif_src, motif_dst = map(list, zip(*motif_edges))
            src.extend(motif_src)
            dst.extend(motif_dst)
            # Nodes at the middle of a house belong to class 1
            # Nodes at the bottom of a house belong to class 2
            # Nodes at the top of a house belong to class 3
            node_labels.extend([1, 1, 2, 2, 3])
            # Attach the motif to the base BA graph
            src.append(n)
            dst.append(int(motif_id * spacing))
            n += 5
        g = graph((src, dst), num_nodes=n)
        # Perturb the graph by adding non-self-loop random edges
        num_real_edges = g.num_edges()
        # Upper bound on the ratio so enough non-existing directed, non-loop
        # edge slots remain for rejection sampling below to terminate.
        max_ratio = (n * (n - 1) - num_real_edges) / num_real_edges
        assert self.perturb_ratio <= max_ratio, \
            'perturb_ratio cannot exceed {:.4f}'.format(max_ratio)
        num_random_edges = int(num_real_edges * self.perturb_ratio)
        if self.seed is not None:
            np.random.seed(self.seed)
        for _ in range(num_random_edges):
            # Rejection-sample a pair (u, v) that is neither a self-loop nor
            # an existing edge.
            while True:
                u = np.random.randint(0, n)
                v = np.random.randint(0, n)
                if (not g.has_edges_between(u, v)) and (u != v):
                    break
            g.add_edges(u, v)
        g.ndata['label'] = F.tensor(node_labels, F.int64)
        # Constant scalar feature 1. for every node.
        g.ndata['feat'] = F.ones((n, 1), F.float32, F.cpu())
        self._graph = reorder_graph(
            g, node_permute_algo='rcmk', edge_permute_algo='dst', store_ids=False)

    @property
    def graph_path(self):
        """Path of the cached processed graph on disk."""
        return os.path.join(self.save_path, '{}_dgl_graph.bin'.format(self.name))

    def save(self):
        """Cache the processed graph to disk."""
        save_graphs(str(self.graph_path), self._graph)

    def has_cache(self):
        """Whether a cached processed graph exists on disk."""
        return os.path.exists(self.graph_path)

    def load(self):
        """Load the cached processed graph from disk."""
        graphs, _ = load_graphs(str(self.graph_path))
        self._graph = graphs[0]

    def __getitem__(self, idx):
        """Return the (possibly transformed) single graph of the dataset."""
        assert idx == 0, "This dataset has only one graph."
        if self._transform is None:
            return self._graph
        else:
            return self._transform(self._graph)

    def __len__(self):
        """Number of graphs in the dataset, always 1."""
        return 1

    @property
    def num_classes(self):
        """Number of node classes."""
        return 4
class BACommunityDataset(DGLBuiltinDataset):
    r"""BA-COMMUNITY dataset from `GNNExplainer: Generating Explanations for Graph Neural Networks
    <https://arxiv.org/abs/1903.03894>`__

    This is a synthetic dataset for node classification. It is generated by performing the
    following steps in order.

    - Construct a base Barabási–Albert (BA) graph.
    - Construct a set of five-node house-structured network motifs.
    - Attach the motifs to evenly spaced nodes of the base graph.
    - Perturb the graph by adding random edges.
    - Nodes are assigned to 4 classes. Nodes of label 0 belong to the base BA graph. Nodes of
      label 1, 2, 3 are separately at the middle, bottom, or top of houses.
    - Generate normally distributed features of length 10
    - Repeat the above steps to generate another graph. Its nodes are assigned to class
      4, 5, 6, 7. Its node features are generated with a distinct normal distribution.
    - Join the two graphs by randomly adding edges between them.

    Parameters
    ----------
    num_base_nodes : int, optional
        Number of nodes in each base BA graph. Default: 300
    num_base_edges_per_node : int, optional
        Number of edges to attach from a new node to existing nodes in constructing a base BA
        graph. Default: 4
    num_motifs : int, optional
        Number of house-structured network motifs to use in constructing each graph. Default: 80
    perturb_ratio : float, optional
        Number of random edges to add to a graph in perturbation divided by the number of original
        edges in it. Default: 0.01
    num_inter_edges : int, optional
        Number of random edges to add between the two graphs. Default: 350
    seed : integer, random_state, or None, optional
        Indicator of random number generation state. Default: None
    raw_dir : str, optional
        Raw file directory to store the processed data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to always generate the data from scratch rather than load a cached version.
        Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Examples
    --------

    >>> from dgl.data import BACommunityDataset
    >>> dataset = BACommunityDataset()
    >>> dataset.num_classes
    8
    >>> g = dataset[0]
    >>> label = g.ndata['label']
    >>> feat = g.ndata['feat']
    """
    def __init__(self,
                 num_base_nodes=300,
                 num_base_edges_per_node=4,
                 num_motifs=80,
                 perturb_ratio=0.01,
                 num_inter_edges=350,
                 seed=None,
                 raw_dir=None,
                 force_reload=False,
                 verbose=True,
                 transform=None):
        # Generation hyperparameters are stored so process() can rebuild the
        # graph deterministically; url=None marks the dataset as synthetic.
        self.num_base_nodes = num_base_nodes
        self.num_base_edges_per_node = num_base_edges_per_node
        self.num_motifs = num_motifs
        self.perturb_ratio = perturb_ratio
        self.num_inter_edges = num_inter_edges
        self.seed = seed
        super(BACommunityDataset, self).__init__(name='BA-COMMUNITY',
                                                 url=None,
                                                 raw_dir=raw_dir,
                                                 force_reload=force_reload,
                                                 verbose=verbose,
                                                 transform=transform)

    def process(self):
        """Generate the BA-COMMUNITY graph from two BA-SHAPES graphs."""
        if self.seed is not None:
            # Both global RNGs are seeded here: the inner BAShapeDataset
            # instances are constructed with their default seed=None, so
            # reproducibility relies on these global states (random for the
            # networkx BA generator, numpy for perturbation/features) —
            # presumably networkx falls back to the global random module
            # when its seed argument is None; verify against networkx docs.
            random.seed(self.seed)
            np.random.seed(self.seed)

        # Construct two BA-SHAPES graphs
        g1 = BAShapeDataset(self.num_base_nodes,
                            self.num_base_edges_per_node,
                            self.num_motifs,
                            self.perturb_ratio,
                            force_reload=True,
                            verbose=False)[0]
        g2 = BAShapeDataset(self.num_base_nodes,
                            self.num_base_edges_per_node,
                            self.num_motifs,
                            self.perturb_ratio,
                            force_reload=True,
                            verbose=False)[0]

        # Join them and randomly add edges between them
        g = batch([g1, g2])
        num_nodes = g.num_nodes() // 2
        # Inter-graph edges always point from the first community (IDs
        # [0, num_nodes)) to the second ([num_nodes, 2 * num_nodes)).
        src = np.random.randint(0, num_nodes, (self.num_inter_edges,))
        dst = np.random.randint(num_nodes, 2 * num_nodes, (self.num_inter_edges,))
        src = F.astype(F.zerocopy_from_numpy(src), g.idtype)
        dst = F.astype(F.zerocopy_from_numpy(dst), g.idtype)
        g.add_edges(src, dst)
        # Shift the second graph's labels by 4 so classes do not overlap.
        g.ndata['label'] = F.cat([g1.ndata['label'], g2.ndata['label'] + 4], dim=0)

        # feature generation: the two communities draw from normal
        # distributions that differ only in the first two dimensions.
        random_mu = [0.0] * 8
        random_sigma = [1.0] * 8
        mu_1, sigma_1 = np.array([-1.0] * 2 + random_mu), np.array([0.5] * 2 + random_sigma)
        feat1 = np.random.multivariate_normal(mu_1, np.diag(sigma_1), num_nodes)
        mu_2, sigma_2 = np.array([1.0] * 2 + random_mu), np.array([0.5] * 2 + random_sigma)
        feat2 = np.random.multivariate_normal(mu_2, np.diag(sigma_2), num_nodes)
        feat = np.concatenate([feat1, feat2])
        g.ndata['feat'] = F.zerocopy_from_numpy(feat)
        self._graph = reorder_graph(
            g, node_permute_algo='rcmk', edge_permute_algo='dst', store_ids=False)

    @property
    def graph_path(self):
        """Path of the cached processed graph on disk."""
        return os.path.join(self.save_path, '{}_dgl_graph.bin'.format(self.name))

    def save(self):
        """Cache the processed graph to disk."""
        save_graphs(str(self.graph_path), self._graph)

    def has_cache(self):
        """Whether a cached processed graph exists on disk."""
        return os.path.exists(self.graph_path)

    def load(self):
        """Load the cached processed graph from disk."""
        graphs, _ = load_graphs(str(self.graph_path))
        self._graph = graphs[0]

    def __getitem__(self, idx):
        """Return the (possibly transformed) single graph of the dataset."""
        assert idx == 0, "This dataset has only one graph."
        if self._transform is None:
            return self._graph
        else:
            return self._transform(self._graph)

    def __len__(self):
        """Number of graphs in the dataset, always 1."""
        return 1

    @property
    def num_classes(self):
        """Number of node classes."""
        return 8
class TreeCycleDataset(DGLBuiltinDataset):
    r"""TREE-CYCLES dataset from `GNNExplainer: Generating Explanations for Graph Neural Networks
    <https://arxiv.org/abs/1903.03894>`__

    This is a synthetic dataset for node classification. It is generated by performing the
    following steps in order.

    - Construct a balanced binary tree as the base graph.
    - Construct a set of cycle motifs.
    - Attach the motifs to evenly spaced nodes of the base graph.
    - Perturb the graph by adding random edges.
    - Generate constant feature for all nodes, which is 1.
    - Nodes in the tree belong to class 0 and nodes in cycles belong to class 1.

    Parameters
    ----------
    tree_height : int, optional
        Height of the balanced binary tree. Default: 8
    num_motifs : int, optional
        Number of cycle motifs to use. Default: 60
    cycle_size : int, optional
        Number of nodes in a cycle motif. Default: 6
    perturb_ratio : float, optional
        Number of random edges to add in perturbation divided by the
        number of original edges in the graph. Default: 0.01
    seed : integer, random_state, or None, optional
        Indicator of random number generation state. Default: None
    raw_dir : str, optional
        Raw file directory to store the processed data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to always generate the data from scratch rather than load a cached version.
        Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Examples
    --------

    >>> from dgl.data import TreeCycleDataset
    >>> dataset = TreeCycleDataset()
    >>> dataset.num_classes
    2
    >>> g = dataset[0]
    >>> label = g.ndata['label']
    >>> feat = g.ndata['feat']
    """
    def __init__(self,
                 tree_height=8,
                 num_motifs=60,
                 cycle_size=6,
                 perturb_ratio=0.01,
                 seed=None,
                 raw_dir=None,
                 force_reload=False,
                 verbose=True,
                 transform=None):
        # Generation hyperparameters are stored so process() can rebuild the
        # graph deterministically; url=None marks the dataset as synthetic.
        self.tree_height = tree_height
        self.num_motifs = num_motifs
        self.cycle_size = cycle_size
        self.perturb_ratio = perturb_ratio
        self.seed = seed
        super(TreeCycleDataset, self).__init__(name='TREE-CYCLES',
                                               url=None,
                                               raw_dir=raw_dir,
                                               force_reload=force_reload,
                                               verbose=verbose,
                                               transform=transform)

    def process(self):
        """Generate the TREE-CYCLES graph from scratch (no raw files involved)."""
        if self.seed is not None:
            np.random.seed(self.seed)
        g = nx.balanced_tree(r=2, h=self.tree_height)
        edges = list(g.edges())
        src, dst = map(list, zip(*edges))
        n = nx.number_of_nodes(g)
        # Nodes in the base tree graph belong to class 0
        node_labels = [0] * n
        # The motifs will be evenly attached to the nodes in the base graph.
        spacing = math.floor(n / self.num_motifs)
        for motif_id in range(self.num_motifs):
            # Construct a cycle with self.cycle_size nodes. The previous
            # implementation hard-coded a six-node ring (range(5) plus edge
            # (n + 5, n)) while labels and node-ID offsets used
            # self.cycle_size, producing a corrupt graph for any other
            # cycle_size. Build the ring from the parameter instead; this is
            # identical for the default cycle_size=6.
            motif_edges = [(n + i, n + i + 1) for i in range(self.cycle_size - 1)]
            motif_edges.append((n + self.cycle_size - 1, n))
            motif_src, motif_dst = map(list, zip(*motif_edges))
            src.extend(motif_src)
            dst.extend(motif_dst)
            # Nodes in cycles belong to class 1
            node_labels.extend([1] * self.cycle_size)
            # Attach the motif to the base tree graph
            anchor = int(motif_id * spacing)
            src.append(n)
            dst.append(anchor)
            # With probability 0.5, add one extra edge between a random motif
            # node (offset 1..3 — assumes cycle_size >= 4; TODO confirm) and
            # a tree node near the anchor.
            if np.random.random() > 0.5:
                a = np.random.randint(1, 4)
                b = np.random.randint(1, 4)
                src.append(n + a)
                dst.append(anchor + b)
            n += self.cycle_size
        g = graph((src, dst), num_nodes=n)
        # Perturb the graph by adding non-self-loop random edges
        num_real_edges = g.num_edges()
        # Upper bound on the ratio so enough non-existing directed, non-loop
        # edge slots remain for rejection sampling below to terminate.
        max_ratio = (n * (n - 1) - num_real_edges) / num_real_edges
        assert self.perturb_ratio <= max_ratio, \
            'perturb_ratio cannot exceed {:.4f}'.format(max_ratio)
        num_random_edges = int(num_real_edges * self.perturb_ratio)
        for _ in range(num_random_edges):
            # Rejection-sample a pair (u, v) that is neither a self-loop nor
            # an existing edge.
            while True:
                u = np.random.randint(0, n)
                v = np.random.randint(0, n)
                if (not g.has_edges_between(u, v)) and (u != v):
                    break
            g.add_edges(u, v)
        g.ndata['label'] = F.tensor(node_labels, F.int64)
        # Constant scalar feature 1. for every node.
        g.ndata['feat'] = F.ones((n, 1), F.float32, F.cpu())
        self._graph = reorder_graph(
            g, node_permute_algo='rcmk', edge_permute_algo='dst', store_ids=False)

    @property
    def graph_path(self):
        """Path of the cached processed graph on disk."""
        return os.path.join(self.save_path, '{}_dgl_graph.bin'.format(self.name))

    def save(self):
        """Cache the processed graph to disk."""
        save_graphs(str(self.graph_path), self._graph)

    def has_cache(self):
        """Whether a cached processed graph exists on disk."""
        return os.path.exists(self.graph_path)

    def load(self):
        """Load the cached processed graph from disk."""
        graphs, _ = load_graphs(str(self.graph_path))
        self._graph = graphs[0]

    def __getitem__(self, idx):
        """Return the (possibly transformed) single graph of the dataset."""
        assert idx == 0, "This dataset has only one graph."
        if self._transform is None:
            return self._graph
        else:
            return self._transform(self._graph)

    def __len__(self):
        """Number of graphs in the dataset, always 1."""
        return 1

    @property
    def num_classes(self):
        """Number of node classes."""
        return 2
class TreeGridDataset(DGLBuiltinDataset):
    r"""TREE-GRIDS dataset from `GNNExplainer: Generating Explanations for Graph Neural Networks
    <https://arxiv.org/abs/1903.03894>`__

    This is a synthetic dataset for node classification. It is generated by performing the
    following steps in order.

    - Construct a balanced binary tree as the base graph.
    - Construct a set of n-by-n grid motifs.
    - Attach the motifs to evenly spaced nodes of the base graph.
    - Perturb the graph by adding random edges.
    - Generate constant feature for all nodes, which is 1.
    - Nodes in the tree belong to class 0 and nodes in grids belong to class 1.

    Parameters
    ----------
    tree_height : int, optional
        Height of the balanced binary tree. Default: 8
    num_motifs : int, optional
        Number of grid motifs to use. Default: 80
    grid_size : int, optional
        The number of nodes in a grid motif will be grid_size ^ 2. Default: 3
    perturb_ratio : float, optional
        Number of random edges to add in perturbation divided by the
        number of original edges in the graph. Default: 0.1
    seed : integer, random_state, or None, optional
        Indicator of random number generation state. Default: None
    raw_dir : str, optional
        Raw file directory to store the processed data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to always generate the data from scratch rather than load a cached version.
        Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Examples
    --------

    >>> from dgl.data import TreeGridDataset
    >>> dataset = TreeGridDataset()
    >>> dataset.num_classes
    2
    >>> g = dataset[0]
    >>> label = g.ndata['label']
    >>> feat = g.ndata['feat']
    """
    def __init__(self,
                 tree_height=8,
                 num_motifs=80,
                 grid_size=3,
                 perturb_ratio=0.1,
                 seed=None,
                 raw_dir=None,
                 force_reload=False,
                 verbose=True,
                 transform=None):
        # Generation hyperparameters are stored so process() can rebuild the
        # graph deterministically; url=None marks the dataset as synthetic.
        self.tree_height = tree_height
        self.num_motifs = num_motifs
        self.grid_size = grid_size
        self.perturb_ratio = perturb_ratio
        self.seed = seed
        super(TreeGridDataset, self).__init__(name='TREE-GRIDS',
                                              url=None,
                                              raw_dir=raw_dir,
                                              force_reload=force_reload,
                                              verbose=verbose,
                                              transform=transform)

    def process(self):
        """Generate the TREE-GRIDS graph from scratch (no raw files involved)."""
        if self.seed is not None:
            np.random.seed(self.seed)
        g = nx.balanced_tree(r=2, h=self.tree_height)
        edges = list(g.edges())
        src, dst = map(list, zip(*edges))
        n = nx.number_of_nodes(g)
        # Nodes in the base tree graph belong to class 0
        node_labels = [0] * n
        # The motifs will be evenly attached to the nodes in the base graph.
        spacing = math.floor(n / self.num_motifs)
        # Construct an n-by-n grid once; every motif reuses its edge list
        # shifted by the current node-ID offset.
        motif_g = nx.grid_graph([self.grid_size, self.grid_size])
        # NOTE: this local grid_size is the motif's NODE COUNT
        # (self.grid_size ** 2), not the side length self.grid_size.
        grid_size = nx.number_of_nodes(motif_g)
        motif_g = nx.convert_node_labels_to_integers(motif_g, first_label=0)
        motif_edges = list(motif_g.edges())
        motif_src, motif_dst = map(list, zip(*motif_edges))
        motif_src, motif_dst = np.array(motif_src), np.array(motif_dst)
        for motif_id in range(self.num_motifs):
            src.extend((motif_src + n).tolist())
            dst.extend((motif_dst + n).tolist())
            # Nodes in grids belong to class 1
            node_labels.extend([1] * grid_size)
            # Attach the motif to the base tree graph
            src.append(n)
            dst.append(int(motif_id * spacing))
            n += grid_size
        g = graph((src, dst), num_nodes=n)
        # Perturb the graph by adding non-self-loop random edges
        num_real_edges = g.num_edges()
        # Upper bound on the ratio so enough non-existing directed, non-loop
        # edge slots remain for rejection sampling below to terminate.
        max_ratio = (n * (n - 1) - num_real_edges) / num_real_edges
        assert self.perturb_ratio <= max_ratio, \
            'perturb_ratio cannot exceed {:.4f}'.format(max_ratio)
        num_random_edges = int(num_real_edges * self.perturb_ratio)
        for _ in range(num_random_edges):
            # Rejection-sample a pair (u, v) that is neither a self-loop nor
            # an existing edge.
            while True:
                u = np.random.randint(0, n)
                v = np.random.randint(0, n)
                if (not g.has_edges_between(u, v)) and (u != v):
                    break
            g.add_edges(u, v)
        g.ndata['label'] = F.tensor(node_labels, F.int64)
        # Constant scalar feature 1. for every node.
        g.ndata['feat'] = F.ones((n, 1), F.float32, F.cpu())
        self._graph = reorder_graph(
            g, node_permute_algo='rcmk', edge_permute_algo='dst', store_ids=False)

    @property
    def graph_path(self):
        """Path of the cached processed graph on disk."""
        return os.path.join(self.save_path, '{}_dgl_graph.bin'.format(self.name))

    def save(self):
        """Cache the processed graph to disk."""
        save_graphs(str(self.graph_path), self._graph)

    def has_cache(self):
        """Whether a cached processed graph exists on disk."""
        return os.path.exists(self.graph_path)

    def load(self):
        """Load the cached processed graph from disk."""
        graphs, _ = load_graphs(str(self.graph_path))
        self._graph = graphs[0]

    def __getitem__(self, idx):
        """Return the (possibly transformed) single graph of the dataset."""
        assert idx == 0, "This dataset has only one graph."
        if self._transform is None:
            return self._graph
        else:
            return self._transform(self._graph)

    def __len__(self):
        """Number of graphs in the dataset, always 1."""
        return 1

    @property
    def num_classes(self):
        """Number of node classes."""
        return 2
class BA2MotifDataset(DGLBuiltinDataset):
    r"""BA-2motifs dataset from `Parameterized Explainer for Graph Neural Network
    <https://arxiv.org/abs/2011.04573>`__

    This is a synthetic dataset for graph classification. It was generated by
    performing the following steps in order.

    - Construct 1000 base Barabási–Albert (BA) graphs.
    - Attach house-structured network motifs to half of the base BA graphs.
    - Attach five-node cycle motifs to the rest base BA graphs.
    - Assign each graph to one of two classes according to the type of the attached motif.

    Parameters
    ----------
    raw_dir : str, optional
        Raw file directory to download and store the data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to reload the dataset. Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of graph classes

    Examples
    --------

    >>> from dgl.data import BA2MotifDataset
    >>> dataset = BA2MotifDataset()
    >>> dataset.num_classes
    2
    >>> # Get the first graph and its label
    >>> g, label = dataset[0]
    >>> feat = g.ndata['feat']
    """
    def __init__(self,
                 raw_dir=None,
                 force_reload=False,
                 verbose=True,
                 transform=None):
        super(BA2MotifDataset, self).__init__(name='BA-2motifs',
                                              url=_get_dgl_url('dataset/BA-2motif.pkl'),
                                              raw_dir=raw_dir,
                                              force_reload=force_reload,
                                              verbose=verbose,
                                              transform=transform)

    def _pkl_path(self):
        # Location of the raw pickle file inside raw_dir.
        return os.path.join(self.raw_dir, self.name + '.pkl')

    def download(self):
        r"""Fetch the raw pickle file; no archive extraction is needed."""
        download(self.url, path=self._pkl_path())

    def process(self):
        """Build one DGLGraph per adjacency matrix in the raw pickle file."""
        with open(self._pkl_path(), 'rb') as f:
            adjs, features, labels = pickle.load(f)
        self.graphs = []
        self.labels = F.tensor(labels, F.int64)
        for adj, feat in zip(adjs, features):
            g = graph(adj.nonzero())
            g.ndata['feat'] = F.zerocopy_from_numpy(feat)
            self.graphs.append(g)

    @property
    def graph_path(self):
        """Path of the cached processed graphs on disk."""
        return os.path.join(self.save_path, '{}_dgl_graph.bin'.format(self.name))

    def save(self):
        """Cache the processed graphs and their labels to disk."""
        save_graphs(str(self.graph_path), self.graphs, {'labels': self.labels})

    def has_cache(self):
        """Whether a cached processed version exists on disk."""
        return os.path.exists(self.graph_path)

    def load(self):
        """Load the cached graphs and labels from disk."""
        self.graphs, label_dict = load_graphs(str(self.graph_path))
        self.labels = label_dict['labels']

    def __getitem__(self, idx):
        """Return the idx-th (possibly transformed) graph and its label."""
        g = self.graphs[idx]
        label = self.labels[idx]
        if self._transform is None:
            return g, label
        return self._transform(g), label

    def __len__(self):
        """Number of graphs in the dataset."""
        return len(self.graphs)

    @property
    def num_classes(self):
        """Number of graph classes."""
        return 2
......@@ -14,13 +14,14 @@
# limitations under the License.
#
"""Modules for transform"""
# pylint: disable= no-member, arguments-differ, invalid-name
# pylint: disable= no-member, arguments-differ, invalid-name, missing-function-docstring
from scipy.linalg import expm
from .. import convert
from .. import backend as F
from .. import function as fn
from ..base import DGLError
from . import functional
try:
......@@ -50,7 +51,8 @@ __all__ = [
'NodeShuffle',
'DropNode',
'DropEdge',
'AddEdge'
'AddEdge',
'SIGNDiffusion'
]
def update_graph_structure(g, data_dict, copy_edata=True):
......@@ -1492,3 +1494,181 @@ class AddEdge(BaseTransform):
dst = F.randint([num_edges_to_add], idtype, device, low=0, high=g.num_nodes(vtype))
g.add_edges(src, dst, etype=c_etype)
return g
class SIGNDiffusion(BaseTransform):
    r"""The diffusion operator from `SIGN: Scalable Inception Graph Neural Networks
    <https://arxiv.org/abs/2004.11198>`__

    It performs node feature diffusion with :math:`TX, \cdots, T^{k}X`, where :math:`T`
    is a diffusion matrix and :math:`X` is the input node features.

    Specifically, this module provides four options for :math:`T`.

    **raw**: raw adjacency matrix :math:`A`

    **rw**: random walk (row-normalized) adjacency matrix :math:`D^{-1}A`, where
    :math:`D` is the degree matrix.

    **gcn**: symmetrically normalized adjacency matrix used by
    `GCN <https://arxiv.org/abs/1609.02907>`__, :math:`D^{-1/2}AD^{-1/2}`

    **ppr**: approximate personalized PageRank used by
    `APPNP <https://arxiv.org/abs/1810.05997>`__

    .. math::
        H^{0} &= X

        H^{l+1} &= (1-\alpha)\left(D^{-1/2}AD^{-1/2} H^{l}\right) + \alpha X

    This module only works for homogeneous graphs.

    Parameters
    ----------
    k : int
        The maximum number of times for node feature diffusion.
    in_feat_name : str, optional
        :attr:`g.ndata[{in_feat_name}]` should store the input node features. Default: 'feat'
    out_feat_name : str, optional
        :attr:`g.ndata[{out_feat_name}_i]` will store the result of diffusing
        input node features for i times. Default: 'out_feat'
    eweight_name : str, optional
        Name to retrieve edge weights from :attr:`g.edata`. Default: None,
        treating the graph as unweighted.
    diffuse_op : str, optional
        The diffusion operator to use, which can be 'raw', 'rw', 'gcn', or 'ppr'.
        Default: 'raw'
    alpha : float, optional
        Restart probability if :attr:`diffuse_op` is :attr:`'ppr'`,
        which commonly lies in :math:`[0.05, 0.2]`. Default: 0.2

    Example
    -------

    >>> import dgl
    >>> import torch
    >>> from dgl import SIGNDiffusion

    >>> transform = SIGNDiffusion(k=2, eweight_name='w')
    >>> num_nodes = 5
    >>> num_edges = 20
    >>> g = dgl.rand_graph(num_nodes, num_edges)
    >>> g.ndata['feat'] = torch.randn(num_nodes, 10)
    >>> g.edata['w'] = torch.randn(num_edges)
    >>> transform(g)
    Graph(num_nodes=5, num_edges=20,
          ndata_schemes={'feat': Scheme(shape=(10,), dtype=torch.float32),
                         'out_feat_1': Scheme(shape=(10,), dtype=torch.float32),
                         'out_feat_2': Scheme(shape=(10,), dtype=torch.float32)}
          edata_schemes={'w': Scheme(shape=(), dtype=torch.float32)})
    """
    def __init__(self,
                 k,
                 in_feat_name='feat',
                 out_feat_name='out_feat',
                 eweight_name=None,
                 diffuse_op='raw',
                 alpha=0.2):
        self.k = k
        self.in_feat_name = in_feat_name
        self.out_feat_name = out_feat_name
        self.eweight_name = eweight_name
        self.diffuse_op = diffuse_op
        self.alpha = alpha

        # Bind the diffusion routine once here so __call__ need not
        # re-dispatch on every invocation.
        if diffuse_op == 'raw':
            self.diffuse = self.raw
        elif diffuse_op == 'rw':
            self.diffuse = self.rw
        elif diffuse_op == 'gcn':
            self.diffuse = self.gcn
        elif diffuse_op == 'ppr':
            self.diffuse = self.ppr
        else:
            # Fix: the message used to be one literal spanning two source
            # lines, embedding a newline and a long indentation run in the
            # error text; implicit concatenation keeps it on one clean line.
            raise DGLError("Expect diffuse_op to be from ['raw', 'rw', 'gcn', 'ppr'], "
                           "got {}".format(diffuse_op))

    def __call__(self, g):
        """Store T^i X in g.ndata for i in [1, k] and return the graph.

        The input graph is mutated in place; it is also returned so the
        transform composes with others and matches the doctest above.
        """
        feat_list = self.diffuse(g)
        for i in range(1, self.k + 1):
            g.ndata[self.out_feat_name + '_' + str(i)] = feat_list[i - 1]
        # Fix: the original returned None, contradicting the class doctest
        # and the convention of the sibling transforms (e.g. AddEdge).
        return g

    def raw(self, g):
        """Diffuse with the raw (optionally edge-weighted) adjacency matrix."""
        use_eweight = False
        if (self.eweight_name is not None) and self.eweight_name in g.edata:
            use_eweight = True

        feat_list = []
        # local_scope keeps the intermediate overwrites of in_feat_name from
        # leaking back into the caller's graph.
        with g.local_scope():
            if use_eweight:
                message_func = fn.u_mul_e(self.in_feat_name, self.eweight_name, 'm')
            else:
                message_func = fn.copy_u(self.in_feat_name, 'm')

            for _ in range(self.k):
                g.update_all(message_func, fn.sum('m', self.in_feat_name))
                feat_list.append(g.ndata[self.in_feat_name])
        return feat_list

    def rw(self, g):
        """Diffuse with the row-normalized (random-walk) adjacency matrix."""
        use_eweight = False
        if (self.eweight_name is not None) and self.eweight_name in g.edata:
            use_eweight = True

        feat_list = []
        with g.local_scope():
            g.ndata['h'] = g.ndata[self.in_feat_name]
            if use_eweight:
                message_func = fn.u_mul_e('h', self.eweight_name, 'm')
                reduce_func = fn.sum('m', 'h')
                # Compute the diagonal entries of D from the weighted A
                g.update_all(fn.copy_e(self.eweight_name, 'm'), fn.sum('m', 'z'))
            else:
                message_func = fn.copy_u('h', 'm')
                # Unweighted case: mean aggregation is exactly D^{-1} A.
                reduce_func = fn.mean('m', 'h')

            for _ in range(self.k):
                g.update_all(message_func, reduce_func)
                if use_eweight:
                    # Normalize by the weighted in-degree computed above.
                    g.ndata['h'] = g.ndata['h'] / F.reshape(g.ndata['z'], (g.num_nodes(), 1))
                feat_list.append(g.ndata['h'])
        return feat_list

    def gcn(self, g):
        """Diffuse with the GCN-style symmetrically normalized adjacency."""
        feat_list = []
        with g.local_scope():
            if self.eweight_name is None:
                # GCNNorm writes its normalization weights under this name;
                # drop any pre-existing entry so it starts from the
                # unweighted adjacency.
                eweight_name = 'w'
                if eweight_name in g.edata:
                    g.edata.pop(eweight_name)
            else:
                eweight_name = self.eweight_name
            transform = GCNNorm(eweight_name=eweight_name)
            transform(g)

            for _ in range(self.k):
                g.update_all(fn.u_mul_e(self.in_feat_name, eweight_name, 'm'),
                             fn.sum('m', self.in_feat_name))
                feat_list.append(g.ndata[self.in_feat_name])
        return feat_list

    def ppr(self, g):
        """Diffuse with approximate personalized PageRank (APPNP-style)."""
        feat_list = []
        with g.local_scope():
            if self.eweight_name is None:
                # See gcn(): reserve 'w' for GCNNorm's output.
                eweight_name = 'w'
                if eweight_name in g.edata:
                    g.edata.pop(eweight_name)
            else:
                eweight_name = self.eweight_name
            transform = GCNNorm(eweight_name=eweight_name)
            transform(g)
            # Keep the original features for the restart term alpha * X.
            in_feat = g.ndata[self.in_feat_name]

            for _ in range(self.k):
                g.update_all(fn.u_mul_e(self.in_feat_name, eweight_name, 'm'),
                             fn.sum('m', self.in_feat_name))
                g.ndata[self.in_feat_name] = (1 - self.alpha) * g.ndata[self.in_feat_name] +\
                    self.alpha * in_feat
                feat_list.append(g.ndata[self.in_feat_name])
        return feat_list
......@@ -203,6 +203,64 @@ def test_reddit():
g2 = data.RedditDataset(transform=transform)[0]
assert g2.num_edges() - g.num_edges() == g.num_nodes()
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_explain_syn():
    """Smoke-test the synthetic explainability datasets.

    For each node-classification dataset: check num_classes and the presence
    of 'label'/'feat' node data, then verify that two constructions with the
    same seed produce identical edge lists.
    """
    node_dataset_cases = [
        (data.BAShapeDataset, 4),
        (data.BACommunityDataset, 8),
        (data.TreeCycleDataset, 2),
        (data.TreeGridDataset, 2),
    ]
    for dataset_cls, expected_num_classes in node_dataset_cases:
        dataset = dataset_cls()
        assert dataset.num_classes == expected_num_classes
        g = dataset[0]
        assert 'label' in g.ndata
        assert 'feat' in g.ndata

        # Reproducibility: identical seeds must yield identical structure.
        g1 = dataset_cls(force_reload=True, seed=0)[0]
        src1, dst1 = g1.edges()
        g2 = dataset_cls(force_reload=True, seed=0)[0]
        src2, dst2 = g2.edges()
        assert F.allclose(src1, src2)
        assert F.allclose(dst1, dst2)

    # Graph-classification dataset: items are (graph, label) pairs.
    dataset = data.BA2MotifDataset()
    assert dataset.num_classes == 2
    g, label = dataset[0]
    assert 'feat' in g.ndata
@unittest.skipIf(F._default_context_str == 'gpu', reason="Datasets don't need to be tested on GPU.")
def test_extract_archive():
......
......@@ -24,6 +24,8 @@ import dgl.partition
import backend as F
import unittest
import math
import pytest
from test_utils.graph_cases import get_cases
from utils import parametrize_dtype
from test_heterograph import create_test_heterograph3, create_test_heterograph4, create_test_heterograph5
......@@ -2350,6 +2352,75 @@ def test_module_laplacian_pe(idtype):
else:
assert F.allclose(new_g.ndata['lappe'].abs(), tgt)
@unittest.skipIf(dgl.backend.backend_name != 'pytorch', reason='Only support PyTorch for now')
@pytest.mark.parametrize('g', get_cases(['has_scalar_e_feature']))
def test_module_sign(g):
    """Check SIGNDiffusion with k=1 against dense matrix products, for each
    diffusion operator ('raw', 'rw', 'gcn', 'ppr'), both without edge
    weights and with the 'scalar_w' edge weight.

    NOTE(review): the edata mutations below are order-sensitive — the gcn
    section pops 'w', overwrites and then restores 'scalar_w', and the ppr
    section reuses the gcn-normalized dense matrices. Keep the ordering.
    """
    import torch
    ctx = F.ctx()
    g = g.to(ctx)
    # Dense transposed adjacency (adj[dst, src] = 1), so one diffusion step
    # equals the matrix product adj @ h.
    adj = g.adj(transpose=True, scipy_fmt='coo').todense()
    adj = torch.tensor(adj).float().to(ctx)
    # Weighted counterpart with entries taken from the 'scalar_w' edge feature.
    weight_adj = g.adj(transpose=True, scipy_fmt='coo').astype(float).todense()
    weight_adj = torch.tensor(weight_adj).float().to(ctx)
    src, dst = g.edges()
    src, dst = src.long(), dst.long()
    weight_adj[dst, src] = g.edata['scalar_w']
    # raw: plain (weighted) adjacency multiplication.
    transform = dgl.SIGNDiffusion(k=1, in_feat_name='h', diffuse_op='raw')
    transform(g)
    assert torch.allclose(g.ndata['out_feat_1'], torch.matmul(adj, g.ndata['h']))
    transform = dgl.SIGNDiffusion(k=1, in_feat_name='h', eweight_name='scalar_w', diffuse_op='raw')
    transform(g)
    assert torch.allclose(g.ndata['out_feat_1'], torch.matmul(weight_adj, g.ndata['h']))
    # rw: row-normalized (random-walk) adjacency, D^-1 A.
    adj_rw = torch.matmul(torch.diag(1 / adj.sum(dim=1)), adj)
    transform = dgl.SIGNDiffusion(k=1, in_feat_name='h', diffuse_op='rw')
    transform(g)
    assert torch.allclose(g.ndata['out_feat_1'], torch.matmul(adj_rw, g.ndata['h']))
    weight_adj_rw = torch.matmul(torch.diag(1 / weight_adj.sum(dim=1)), weight_adj)
    transform = dgl.SIGNDiffusion(k=1, in_feat_name='h', eweight_name='scalar_w', diffuse_op='rw')
    transform(g)
    assert torch.allclose(g.ndata['out_feat_1'], torch.matmul(weight_adj_rw, g.ndata['h']))
    # gcn: build the reference matrix from GCNNorm's output weights.
    # Save the raw edge weights first — GCNNorm('scalar_w') below normalizes
    # 'scalar_w' in place, and we must restore it afterwards.
    raw_eweight = g.edata['scalar_w']
    gcn_norm = dgl.GCNNorm()
    gcn_norm(g)
    adj_gcn = adj.clone()
    # Unweighted GCNNorm stores its normalized weights under 'w'; pop so it
    # does not leak into later sections.
    adj_gcn[dst, src] = g.edata.pop('w')
    transform = dgl.SIGNDiffusion(k=1, in_feat_name='h', diffuse_op='gcn')
    transform(g)
    assert torch.allclose(g.ndata['out_feat_1'], torch.matmul(adj_gcn, g.ndata['h']))
    gcn_norm = dgl.GCNNorm('scalar_w')
    gcn_norm(g)
    weight_adj_gcn = weight_adj.clone()
    weight_adj_gcn[dst, src] = g.edata['scalar_w']
    # Restore the raw weights so SIGNDiffusion normalizes them itself.
    g.edata['scalar_w'] = raw_eweight
    transform = dgl.SIGNDiffusion(k=1, in_feat_name='h',
                                  eweight_name='scalar_w', diffuse_op='gcn')
    transform(g)
    assert torch.allclose(g.ndata['out_feat_1'], torch.matmul(weight_adj_gcn, g.ndata['h']))
    # ppr: one step of personalized PageRank on the gcn-normalized
    # adjacency, (1 - alpha) * A_gcn @ h + alpha * h. Reuses adj_gcn and
    # weight_adj_gcn computed above.
    alpha = 0.2
    transform = dgl.SIGNDiffusion(k=1, in_feat_name='h', diffuse_op='ppr', alpha=alpha)
    transform(g)
    target = (1 - alpha) * torch.matmul(adj_gcn, g.ndata['h']) + alpha * g.ndata['h']
    assert torch.allclose(g.ndata['out_feat_1'], target)
    transform = dgl.SIGNDiffusion(k=1, in_feat_name='h', eweight_name='scalar_w',
                                  diffuse_op='ppr', alpha=alpha)
    transform(g)
    target = (1 - alpha) * torch.matmul(weight_adj_gcn, g.ndata['h']) + alpha * g.ndata['h']
    assert torch.allclose(g.ndata['out_feat_1'], target)
@unittest.skipIf(dgl.backend.backend_name != 'pytorch', reason='Only support PyTorch for now')
@parametrize_dtype
def test_module_row_feat_normalizer(idtype):
......@@ -2416,8 +2487,6 @@ def test_module_feat_mask(idtype):
assert g.edata['w'][('user', 'follows', 'user')].shape == (2, 5)
assert g.edata['w'][('player', 'plays', 'game')].shape == (2, 5)
if __name__ == '__main__':
    # Ad-hoc entry point for running a couple of tests directly without
    # pytest; deliberately not an exhaustive run of this module's tests.
    test_partition_with_halo()
    test_module_heat_kernel(F.int32)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment