"git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "70ad508367bd15d0e41822b5ec77095349f05aa9"
Unverified commit ae3316c8, authored by Minjie Wang, committed by GitHub

[Doc] Update doc of dataloading package. (#3886)

* wip: dataloading doc

* update dataloading package doc and many others

* lint
parent 6752bd45
@@ -3,24 +3,31 @@

dgl.dataloading
=================================

.. automodule:: dgl.dataloading
.. currentmodule:: dgl.dataloading
The ``dgl.dataloading`` package provides two primitives for composing a data pipeline
that loads samples from graph data. ``Sampler`` represents algorithms
to generate subgraph samples from the original graph, and ``DataLoader``
represents the iterable over these samples.

DGL provides a number of built-in samplers that subclass :class:`~dgl.dataloading.Sampler`.
Creating new samplers follows the same paradigm. Read our user guide chapter
:ref:`guide-minibatch` for more examples and explanations.

The entire package only works with the PyTorch backend.
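For example, a minimal sketch of composing the two primitives (``g`` is an assumed
homogeneous graph and ``train_nids`` an assumed tensor of seed node IDs):

.. code:: python

    import dgl

    # A built-in Sampler: sample 10 neighbors per node for each of two GNN layers.
    sampler = dgl.dataloading.NeighborSampler([10, 10])

    # A DataLoader iterates over mini-batches of seed nodes and applies the sampler.
    dataloader = dgl.dataloading.DataLoader(
        g, train_nids, sampler, batch_size=1024, shuffle=True)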
DataLoaders
-----------

DGL's DataLoader for mini-batch training works similarly to PyTorch's DataLoader:
it has a generator interface that returns mini-batches sampled from the given graphs.
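Iterating over a ``DataLoader`` composed with a neighbor sampler (as in the sketch
above) yields, for node prediction tasks, the input nodes, the seed nodes, and the
list of MFGs:

.. code:: python

    for input_nodes, output_nodes, blocks in dataloader:
        # input_nodes: all nodes needed to compute this batch's outputs
        # output_nodes: the seed nodes of this batch
        # blocks: one message flow graph (MFG) per GNN layer
        train_on(input_nodes, output_nodes, blocks)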
.. autosummary::
    :toctree: ../../generated/
    :nosignatures:
    :template: classtemplate.rst

    DataLoader
    GraphDataLoader
    NodeDataLoader
    EdgeDataLoader
    DistNodeDataLoader
    DistEdgeDataLoader
@@ -31,21 +38,16 @@ Samplers

.. autosummary::
    :toctree: ../../generated/
    :nosignatures:
    :template: classtemplate.rst
    Sampler
    NeighborSampler
    MultiLayerFullNeighborSampler
    ClusterGCNSampler
    ShaDowKHopSampler
    as_edge_prediction_sampler
    BlockSampler
.. _api-dataloading-negative-sampling:

@@ -53,10 +55,10 @@

Negative Samplers for Link Prediction
-------------------------------------

.. currentmodule:: dgl.dataloading.negative_sampler
Negative samplers are classes that control the behavior of the edge prediction samplers.
.. autosummary::
    :toctree: ../../generated/
    :nosignatures:
    :template: classtemplate.rst

    Uniform
    PerSourceUniform
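For instance, a sketch of pairing a negative sampler with an edge prediction sampler
(``g`` and ``train_eids`` are assumed placeholders):

.. code:: python

    import dgl

    # Sample 5 negative destination nodes per positive edge.
    neg_sampler = dgl.dataloading.negative_sampler.Uniform(5)

    sampler = dgl.dataloading.NeighborSampler([10, 10])
    sampler = dgl.dataloading.as_edge_prediction_sampler(
        sampler, negative_sampler=neg_sampler)

    dataloader = dgl.dataloading.DataLoader(
        g, train_eids, sampler, batch_size=1024, shuffle=True)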
@@ -68,9 +70,11 @@ Utility Class and Functions for Feature Prefetching

.. autosummary::
    :toctree: ../../generated/
    :nosignatures:
    :template: classtemplate.rst

    LazyFeature
    set_node_lazy_features
    set_edge_lazy_features
    set_src_lazy_features
    set_dst_lazy_features
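For illustration, a hedged sketch of how a custom sampler might mark node features
for prefetching (the feature name ``'feat'`` is an assumption):

.. code:: python

    from dgl.dataloading import Sampler, set_node_lazy_features

    class PrefetchingSubgraphSampler(Sampler):
        def sample(self, g, indices):
            sg = g.subgraph(indices)
            # Mark 'feat' so the DataLoader fetches it lazily after sampling.
            set_node_lazy_features(sg, ['feat'])
            return sg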
.. _guide-minibatch-customizing-neighborhood-sampler:

6.4 Implementing Custom Graph Samplers
----------------------------------------------

Implementing custom samplers involves subclassing the :class:`dgl.dataloading.Sampler`
base class and overriding its ``sample`` method.
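As a sketch (``g`` and ``train_nids`` are assumed placeholders), such a sampler can be
plugged directly into :class:`~dgl.dataloading.DataLoader`:

.. code:: python

    import dgl

    class SubgraphSampler(dgl.dataloading.Sampler):
        def sample(self, g, indices):
            # Return the subgraph induced by the current mini-batch of seed nodes.
            return g.subgraph(indices)

    dataloader = dgl.dataloading.DataLoader(
        g, train_nids, SubgraphSampler(), batch_size=64, shuffle=True)
    for subgraph in dataloader:
        ...  # train on the sampled subgraph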
@@ -142,4 +142,4 @@ subgraph:

Further Readings
~~~~~~~~~~~~~~~~~~

See :ref:`guide-minibatch-prefetching` for how to write a custom graph sampler
with feature prefetching.
@@ -6,7 +6,7 @@ from networkx.readwrite import json_graph
import os

from .dgl_dataset import DGLBuiltinDataset
from .utils import _get_dgl_url, save_graphs, save_info, load_info, load_graphs
from .. import backend as F
from ..convert import from_networkx
@@ -14,24 +14,6 @@ from ..convert import from_networkx

class PPIDataset(DGLBuiltinDataset):
    r"""Protein-Protein Interaction dataset for inductive node classification

    A toy Protein-Protein Interaction network dataset. The dataset contains
    24 graphs. The average number of nodes per graph is 2372. Each node has
    50 features and 121 labels. 20 graphs for training, 2 for validation

@@ -154,16 +136,6 @@ class PPIDataset(DGLBuiltinDataset):
    def num_labels(self):
        return 121

    def __len__(self):
        """Return number of samples in this dataset."""
        return len(self.graphs)
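    # A usage sketch (not from the original source; the 'feat' and 'label'
    # keys follow the dataset's documented ndata layout):
    #
    #     dataset = PPIDataset()
    #     g = dataset[0]
    #     feats = g.ndata['feat']     # node features, shape (N, 50)
    #     labels = g.ndata['label']   # multi-hot labels, shape (N, 121)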
"""The ``dgl.dataloading`` package contains: """Package for dataloaders and samplers."""
* Data loader classes for iterating over a set of nodes or edges in a graph and generates
computation dependency via neighborhood sampling methods.
* Various sampler classes that perform neighborhood sampling for multi-layer GNNs.
* Negative samplers for link prediction.
For a holistic explanation on how different components work together.
Read the user guide :ref:`guide-minibatch`.
.. note::
This package is experimental and the interfaces may be subject
to changes in future releases. It currently only has implementations in PyTorch.
"""
from .. import backend as F
from .neighbor_sampler import *
from .cluster_gcn import *
@@ -18,7 +18,7 @@ def _set_lazy_features(x, xdata, feature_names):
    x[type_].data.update({k: LazyFeature(k) for k in names})

def set_node_lazy_features(g, feature_names):
    """Assign lazy features to the ``ndata`` of the input graph for prefetching optimization.

    When used in a :class:`~dgl.dataloading.Sampler`, lazy features mark which data
    should be fetched before computation in the model. See :ref:`guide-minibatch-prefetching`

@@ -52,7 +52,7 @@ def set_node_lazy_features(g, feature_names):
    return _set_lazy_features(g.nodes, g.ndata, feature_names)

def set_edge_lazy_features(g, feature_names):
    """Assign lazy features to the ``edata`` of the input graph for prefetching optimization.

    When used in a :class:`~dgl.dataloading.Sampler`, lazy features mark which data
    should be fetched before computation in the model. See :ref:`guide-minibatch-prefetching`

@@ -87,7 +87,7 @@ def set_edge_lazy_features(g, feature_names):
    return _set_lazy_features(g.edges, g.edata, feature_names)

def set_src_lazy_features(g, feature_names):
    """Assign lazy features to the ``srcdata`` of the input graph for prefetching optimization.

    When used in a :class:`~dgl.dataloading.Sampler`, lazy features mark which data
    should be fetched before computation in the model. See :ref:`guide-minibatch-prefetching`

@@ -121,7 +121,7 @@ def set_src_lazy_features(g, feature_names):
    return _set_lazy_features(g.srcnodes, g.srcdata, feature_names)

def set_dst_lazy_features(g, feature_names):
    """Assign lazy features to the ``dstdata`` of the input graph for prefetching optimization.

    When used in a :class:`~dgl.dataloading.Sampler`, lazy features mark which data
    should be fetched before computation in the model. See :ref:`guide-minibatch-prefetching`

@@ -155,7 +155,22 @@ def set_dst_lazy_features(g, feature_names):
    return _set_lazy_features(g.dstnodes, g.dstdata, feature_names)
class Sampler(object):
    """Base class for graph samplers.

    All graph samplers must subclass this class and override the ``sample``
    method.

    .. code:: python

        from dgl.dataloading import Sampler

        class SubgraphSampler(Sampler):
            def __init__(self):
                super().__init__()

            def sample(self, g, indices):
                return g.subgraph(indices)
    """

    def sample(self, g, indices):
        """Abstract sample method.

@@ -169,12 +184,11 @@ class Sampler(object):
        raise NotImplementedError
class BlockSampler(Sampler):
    """Base class for sampling mini-batches in the form of message flow graphs (MFGs).

    It provides prefetching options to fetch the node features for the first MFG's
    ``srcdata``, the node labels for the last MFG's ``dstdata``, and the edge features
    for all MFGs' ``edata``.

    Parameters
    ----------

@@ -217,29 +231,6 @@ class BlockSampler(Sampler):
    def assign_lazy_features(self, result):
        """Assign lazy features for prefetching."""
        input_nodes, output_nodes, blocks = result
        set_src_lazy_features(blocks[0], self.prefetch_node_feats)
        set_dst_lazy_features(blocks[-1], self.prefetch_labels)
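        # Note (illustrative sketch, not from the original source): concrete
        # subclasses such as NeighborSampler expose these prefetching hooks as
        # constructor arguments, e.g.
        #
        #     sampler = NeighborSampler(
        #         [15, 10, 5],
        #         prefetch_node_feats=['feat'],   # fetched into the first MFG's srcdata
        #         prefetch_labels=['label'],      # fetched into the last MFG's dstdata
        #     )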
@@ -9,7 +9,9 @@ from ..partition import metis_partition_assignment
from .base import set_node_lazy_features, set_edge_lazy_features, Sampler

class ClusterGCNSampler(Sampler):
    """Cluster sampler from `Cluster-GCN: An Efficient Algorithm for Training
    Deep and Large Graph Convolutional Networks
    <https://arxiv.org/abs/1905.07953>`__

    This sampler first partitions the graph with METIS partitioning, then it caches the nodes of
    each partition to a file within the given cache directory.
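    A usage sketch (the argument names here are assumptions, not verified against
    this version): partition IDs, rather than node IDs, are the indices fed to the
    DataLoader.

    >>> num_parts = 100
    >>> sampler = dgl.dataloading.ClusterGCNSampler(g, num_parts)
    >>> dataloader = dgl.dataloading.DataLoader(
    ...     g, torch.arange(num_parts), sampler, batch_size=20, shuffle=True)
    >>> for subgraph in dataloader:
    ...     train_on(subgraph)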
@@ -565,10 +565,13 @@ def _get_device(device):
    return device
class DataLoader(torch.utils.data.DataLoader):
    """Sampled graph data loader. Wrap a :class:`~dgl.DGLGraph` and a
    :class:`~dgl.dataloading.Sampler` into an iterable over mini-batches of samples.

    DGL's ``DataLoader`` extends PyTorch's ``DataLoader`` by handling creation
    and transmission of graph samples. It supports iterating over a set of nodes,
    edges or any kinds of indices to get samples in the form of ``DGLGraph``, message
    flow graphs (MFGs), or any other structures necessary to train a graph neural network.
    Parameters
    ----------

@@ -624,11 +627,14 @@ class DataLoader(torch.utils.data.DataLoader):
        Whether to pin the feature tensors into pinned memory.
        Default: True if the graph is on CPU and :attr:`device` is CUDA. False otherwise.
    kwargs : dict
        Keyword arguments passed to the parent PyTorch
        :py:class:`torch.utils.data.DataLoader` class. Common arguments are:

        - ``batch_size`` (int): The number of indices in each batch.
        - ``drop_last`` (bool): Whether to drop the last incomplete batch.
        - ``shuffle`` (bool): Whether to randomly shuffle the indices at each epoch.
    Examples
    --------
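    A minimal sketch (``g`` is an assumed homogeneous graph and ``train_nid``
    an assumed tensor of training node IDs):

    >>> sampler = dgl.dataloading.NeighborSampler([15, 10, 5])
    >>> dataloader = dgl.dataloading.DataLoader(
    ...     g, train_nid, sampler,
    ...     batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
    >>> for input_nodes, output_nodes, blocks in dataloader:
    ...     train_on(input_nodes, output_nodes, blocks)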
@@ -817,302 +823,45 @@ class DataLoader(torch.utils.data.DataLoader):

# Alias
class NodeDataLoader(DataLoader):
    """(DEPRECATED) Sampled graph data loader over a set of nodes.

    .. deprecated:: 0.8

        The class is deprecated since v0.8, replaced by :class:`~dgl.dataloading.DataLoader`.
    """
class EdgeDataLoader(DataLoader):
    """(DEPRECATED) Sampled graph data loader over a set of edges.

    .. deprecated:: 0.8

        The class is deprecated since v0.8 -- its function has been covered by
        :class:`~dgl.dataloading.DataLoader` and :func:`~dgl.as_edge_prediction_sampler`.

        To migrate, change the legacy usage like:

        .. code:: python

            sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 10, 5])
            dataloader = dgl.dataloading.EdgeDataLoader(
                g, train_eid, sampler, exclude='reverse_id',
                reverse_eids=reverse_eids,
                negative_sampler=dgl.dataloading.negative_sampler.Uniform(5),
                batch_size=1024, shuffle=True, drop_last=False, num_workers=4)

        to:

        .. code:: python

            sampler = dgl.dataloading.MultiLayerNeighborSampler([15, 10, 5])
            sampler = dgl.dataloading.as_edge_prediction_sampler(
                sampler, exclude='reverse_id',
                reverse_eids=reverse_eids,
                negative_sampler=dgl.dataloading.negative_sampler.Uniform(5))
            dataloader = dgl.dataloading.DataLoader(
                g, train_eid, sampler,
                batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
    """
    def __init__(self, graph, indices, graph_sampler, device='cpu', use_ddp=False,
                 ddp_seed=0, batch_size=1, drop_last=False, shuffle=False,

@@ -1247,11 +996,15 @@ class GraphCollator(object):
        raise TypeError(self.graph_collate_err_msg_format.format(elem_type))
class GraphDataLoader(torch.utils.data.DataLoader):
    """Batched graph data loader.

    PyTorch dataloader for batch-iterating over a set of graphs, generating the batched
    graph and corresponding label tensor (if provided) of the said minibatch.

    Parameters
    ----------
    dataset : torch.utils.data.Dataset
        The dataset to load graphs from.
    collate_fn : Function, default is None
        The customized collate function. Will use the default collate
        function if not given.

@@ -1267,19 +1020,23 @@ class GraphDataLoader(torch.utils.data.DataLoader):
        Only effective when :attr:`use_ddp` is True.
    kwargs : dict
        Keyword arguments passed to the parent PyTorch
        :py:class:`torch.utils.data.DataLoader` class. Common arguments are:

        - ``batch_size`` (int): The number of indices in each batch.
        - ``drop_last`` (bool): Whether to drop the last incomplete batch.
        - ``shuffle`` (bool): Whether to randomly shuffle the indices at each epoch.
    Examples
    --------
    To train a GNN for graph classification on a set of graphs in ``dataset``:

    >>> dataloader = dgl.dataloading.GraphDataLoader(
    ...     dataset, batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
    >>> for batched_graph, labels in dataloader:
    ...     train_on(batched_graph, labels)

    **With Distributed Data Parallel**

    If you are using PyTorch's distributed training (e.g. when using
    :mod:`torch.nn.parallel.DistributedDataParallel`), you can train the model by
@@ -14,12 +14,14 @@ def _remove_kwargs_dist(kwargs):
    return kwargs

class DistNodeDataLoader(DistDataLoader):
    """Sampled graph data loader over nodes for distributed graph storage.

    It wraps an iterable over a set of nodes, generating the list
    of message flow graphs (MFGs) as computation dependency of the said minibatch, on
    a distributed graph.

    All the arguments have the same meaning as the single-machine counterpart
    :class:`dgl.dataloading.DataLoader` except the first argument
    :attr:`g` which must be a :class:`dgl.distributed.DistGraph`.

    Parameters
    ----------
@@ -28,11 +30,11 @@ class DistNodeDataLoader(DistDataLoader):
        The distributed graph.
    nids, graph_sampler, device, kwargs :
        See :class:`dgl.dataloading.DataLoader`.

    See also
    --------
    dgl.dataloading.DataLoader
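    Examples
    --------
    A sketch under assumptions (a graph already partitioned and published as
    ``'graph_name'``, plus an initialized distributed runtime; names are illustrative):

    >>> import dgl
    >>> dgl.distributed.initialize('ip_config.txt')
    >>> g = dgl.distributed.DistGraph('graph_name')
    >>> train_nids = dgl.distributed.node_split(g.ndata['train_mask'])
    >>> sampler = dgl.dataloading.NeighborSampler([10, 25])
    >>> dataloader = dgl.dataloading.DistNodeDataLoader(
    ...     g, train_nids, sampler, batch_size=1024, shuffle=True)
    >>> for input_nodes, output_nodes, blocks in dataloader:
    ...     train_on(input_nodes, output_nodes, blocks)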
""" """
def __init__(self, g, nids, graph_sampler, device=None, **kwargs): def __init__(self, g, nids, graph_sampler, device=None, **kwargs):
collator_kwargs = {} collator_kwargs = {}
...@@ -57,13 +59,15 @@ class DistNodeDataLoader(DistDataLoader): ...@@ -57,13 +59,15 @@ class DistNodeDataLoader(DistDataLoader):
self.device = device self.device = device
class DistEdgeDataLoader(DistDataLoader):
    """Sampled graph data loader over edges for distributed graph storage.

    It wraps an iterable over a set of edges, generating the list
    of message flow graphs (MFGs) as computation dependency of the said minibatch for
    edge classification, edge regression, and link prediction, on a distributed
    graph.

    All the arguments have the same meaning as the single-machine counterpart
    :class:`dgl.dataloading.EdgeDataLoader` except the first argument
    :attr:`g` which must be a :class:`dgl.distributed.DistGraph`.

    Parameters
    ----------
@@ -72,11 +76,11 @@ class DistEdgeDataLoader(DistDataLoader):
        The distributed graph.
    eids, graph_sampler, device, kwargs :
        See :class:`dgl.dataloading.EdgeDataLoader`.

    See also
    --------
    dgl.dataloading.EdgeDataLoader
    """

    def __init__(self, g, eids, graph_sampler, device=None, **kwargs):
        collator_kwargs = {}
@@ -5,8 +5,8 @@ from ..base import NID
from .base import set_node_lazy_features, set_edge_lazy_features, Sampler

class ShaDowKHopSampler(Sampler):
    """K-hop subgraph sampler from `Deep Graph Neural Networks with Shallow
    Subgraph Samplers <https://arxiv.org/abs/2012.01380>`__.

    It performs node-wise neighbor sampling and returns the subgraph induced by
    all the sampled nodes. The seed nodes from which the neighbors are sampled
@@ -19,6 +19,11 @@ def merge(graphs):
    graphs : list[DGLGraph]
        Input graphs.

    Returns
    -------
    DGLGraph
        The merged graph.

    Notes
    -----
    * Inplace updates are applied to a new, empty graph.

@@ -36,15 +41,17 @@ def merge(graphs):
    >>> g.ndata["x"] = torch.zeros(4)
    >>> h = dgl.graph((torch.tensor([1,2]), torch.tensor([0,4])))
    >>> h.ndata["x"] = torch.ones(5)
    >>> m = dgl.merge([g, h])

    ``m`` now contains edges and nodes from ``h`` and ``g``.

    >>> m.edges()
    (tensor([0, 1, 1, 2]), tensor([2, 3, 0, 4]))
    >>> m.nodes()
    tensor([0, 1, 2, 3, 4])

    ``g``'s data has been updated with ``h``'s in ``m``.

    >>> m.ndata["x"]
    tensor([1., 1., 1., 1., 1.])