"src/vscode:/vscode.git/clone" did not exist on "ec1c7a793f9cdcb924d302f121348d9bb5256597"
Unverified Commit f5eb80d2 authored by Quan (Andy) Gan, committed by GitHub

[Feature] Edge DataLoader for edge classification & link prediction (#1828)

* clean commit

* oops forgot the most important files

* use einsum

* copy feature from frontier to block

* Revert "copy feature from frontier to block"

This reverts commit 5224ec963eb6a3ef1b6ab74d8ecbd44e4e42f285.

* temp fix

* unit test

* fix

* revert jtnn

* lint

* fix win64

* docstring fixes and doc indexing

* revert einsum in sparse bidecoder

* fix some examples

* lint

* fix due to some tediousness in remove_edges

* addresses comments

* fix

* more jtnn fixes

* fix
parent d340ea3a
......@@ -70,8 +70,8 @@ class SAGE(nn.Module):
for l, layer in enumerate(self.layers):
y = th.zeros(g.number_of_nodes(), self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
sampler = dgl.sampling.MultiLayerNeighborSampler([None])
dataloader = dgl.sampling.NodeDataLoader(
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
g,
th.arange(g.number_of_nodes()),
sampler,
......@@ -132,9 +132,9 @@ def run(args, device, data):
train_nid, val_nid, test_nid, in_feats, labels, n_classes, g = data
# Create PyTorch DataLoader for constructing blocks
sampler = dgl.sampling.MultiLayerNeighborSampler(
sampler = dgl.dataloading.MultiLayerNeighborSampler(
[int(fanout) for fanout in args.fan_out.split(',')])
dataloader = dgl.sampling.NodeDataLoader(
dataloader = dgl.dataloading.NodeDataLoader(
g,
train_nid,
sampler,
......
......@@ -6,42 +6,25 @@ import scipy.sparse as ssp
# This is the train-test split method that most recommender system papers running on MovieLens
# take. It essentially follows the intuition of "training on the past and predicting the future".
# One can also change the threshold so that the validation and test sets take larger proportions.
def train_test_split_by_time(g, column, etype, itype):
n_edges = g.number_of_edges(etype)
with g.local_scope():
def splits(edges):
num_edges, count = edges.data['train_mask'].shape
# sort by timestamp
_, sorted_idx = edges.data[column].sort(1)
train_mask = edges.data['train_mask']
val_mask = edges.data['val_mask']
test_mask = edges.data['test_mask']
x = torch.arange(num_edges)
# If one user has more than one interaction, select the latest one for test.
if count > 1:
train_mask[x, sorted_idx[:, -1]] = False
test_mask[x, sorted_idx[:, -1]] = True
# If one user has more than two interactions, select the second latest one for validation.
if count > 2:
train_mask[x, sorted_idx[:, -2]] = False
val_mask[x, sorted_idx[:, -2]] = True
return {'train_mask': train_mask, 'val_mask': val_mask, 'test_mask': test_mask}
g.edges[etype].data['train_mask'] = torch.ones(n_edges, dtype=torch.bool)
g.edges[etype].data['val_mask'] = torch.zeros(n_edges, dtype=torch.bool)
g.edges[etype].data['test_mask'] = torch.zeros(n_edges, dtype=torch.bool)
g.nodes[itype].data['count'] = g.in_degrees(etype=etype)
g.group_apply_edges('src', splits, etype=etype)
train_indices = g.filter_edges(lambda edges: edges.data['train_mask'], etype=etype)
val_indices = g.filter_edges(lambda edges: edges.data['val_mask'], etype=etype)
test_indices = g.filter_edges(lambda edges: edges.data['test_mask'], etype=etype)
return train_indices, val_indices, test_indices
def train_test_split_by_time(df, timestamp, item):
df = df.copy()
df['train_mask'] = np.ones((len(df),), dtype=np.bool)
df['val_mask'] = np.zeros((len(df),), dtype=np.bool)
df['test_mask'] = np.zeros((len(df),), dtype=np.bool)
df = df.sort_values([item, timestamp])
for track_id in df[item].unique():
idx = (df[item] == track_id).to_numpy().nonzero()[0]
idx = df.index[idx]
if len(idx) > 1:
df.loc[idx[-1], 'train_mask'] = False
df.loc[idx[-1], 'test_mask'] = True
if len(idx) > 2:
df.loc[idx[-2], 'train_mask'] = False
df.loc[idx[-2], 'val_mask'] = True
df = df.sort_index()
return df['train_mask'].to_numpy().nonzero()[0], \
df['val_mask'].to_numpy().nonzero()[0], \
df['test_mask'].to_numpy().nonzero()[0]
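# A minimal, hypothetical sketch of how the split above behaves (not part of
# this commit; assumes pandas is imported as ``pd``). With three interactions
# on one track and two on another:
#
# >>> events = pd.DataFrame({'track_id':   [10, 10, 10, 20, 20],
# ...                        'created_at': [ 3,  1,  2,  5,  4]})
# >>> train_test_split_by_time(events, 'created_at', 'track_id')
# (array([1, 4]), array([2]), array([0, 3]))
#
# The latest interaction per track goes to test, the second latest (when one
# exists) to validation, and the rest stay in training.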
def build_train_graph(g, train_indices, utype, itype, etype, etype_rev):
train_g = g.edge_subgraph(
......
......@@ -115,7 +115,7 @@ g.edges['watched-by'].data['timestamp'] = torch.LongTensor(ratings['timestamp'].
# Train-validation-test split
# This is a little bit tricky as we want to select the last interaction for test, and the
# second-to-last interaction for validation.
train_indices, val_indices, test_indices = train_test_split_by_time(g, 'timestamp', 'watched', 'movie')
train_indices, val_indices, test_indices = train_test_split_by_time(ratings, 'timestamp', 'movie_id')
# Build the graph with training interactions only.
train_g = build_train_graph(g, train_indices, 'user', 'movie', 'watched', 'watched-by')
......
......@@ -52,7 +52,7 @@ g.edges['listened'].data['created_at'] = torch.LongTensor(events['created_at'].v
g.edges['listened-by'].data['created_at'] = torch.LongTensor(events['created_at'].values)
n_edges = g.number_of_edges('listened')
train_indices, val_indices, test_indices = train_test_split_by_time(g, 'created_at', 'listened', 'track')
train_indices, val_indices, test_indices = train_test_split_by_time(events, 'created_at', 'track_id')
train_g = build_train_graph(g, train_indices, 'user', 'track', 'listened', 'listened-by')
val_matrix, test_matrix = build_val_test_matrix(
g, val_indices, test_indices, 'user', 'track', 'listened')
......
......@@ -98,15 +98,15 @@ def main(args):
model.cuda()
# train sampler
sampler = dgl.sampling.MultiLayerNeighborSampler([args.fanout] * args.n_layers)
loader = dgl.sampling.NodeDataLoader(
sampler = dgl.dataloading.MultiLayerNeighborSampler([args.fanout] * args.n_layers)
loader = dgl.dataloading.NodeDataLoader(
g, {category: train_idx}, sampler,
batch_size=args.batch_size, shuffle=True, num_workers=0)
# validation sampler
# we do not use full neighbor to save computation resources
val_sampler = dgl.sampling.MultiLayerNeighborSampler([args.fanout] * args.n_layers)
val_loader = dgl.sampling.NodeDataLoader(
val_sampler = dgl.dataloading.MultiLayerNeighborSampler([args.fanout] * args.n_layers)
val_loader = dgl.dataloading.NodeDataLoader(
g, {category: val_idx}, val_sampler,
batch_size=args.batch_size, shuffle=True, num_workers=0)
......
......@@ -106,7 +106,7 @@ class RelGraphConvLayer(nn.Module):
inputs_src = inputs
inputs_dst = {k: v[:g.number_of_dst_nodes(k)] for k, v in inputs.items()}
else:
inputs_src, inputs_dst = inputs
inputs_src = inputs_dst = inputs
hs = self.conv(g, inputs, mod_kwargs=wdict)
......@@ -232,8 +232,8 @@ class EntityClassify(nn.Module):
self.h_dim if l != len(self.layers) - 1 else self.out_dim)
for k in g.ntypes}
sampler = dgl.sampling.MultiLayerNeighborSampler([None])
dataloader = dgl.sampling.NodeDataLoader(
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
dataloader = dgl.dataloading.NodeDataLoader(
g,
{k: th.arange(g.number_of_nodes(k)) for k in g.ntypes},
sampler,
......
......@@ -13,6 +13,7 @@ from . import container
from . import distributed
from . import random
from . import sampling
from . import dataloading
from . import ops
from ._ffi.runtime_ctypes import TypeCode
......
......@@ -908,7 +908,7 @@ def ones(shape, dtype, ctx):
pass
def uniform(shape, dtype, ctx, low, high):
"""Crear a tensor with random value in an uniform
"""Create a tensor with random value in a uniform
distribution between low (inclusive) and high (exclusive).
Parameters
......@@ -927,6 +927,26 @@ def uniform(shape, dtype, ctx, low, high):
"""
pass
def randint(shape, dtype, ctx, low, high):
"""Create a tensor with random value in a uniform integer
distribution between low (inclusive) and high (exclusive)
Parameters
----------
shape : tuple of int
The tensor shape.
dtype : data type
It should be one of the values in the data type dict.
ctx : context
The device of the result tensor.
Returns
-------
Tensor
The random tensor.
"""
pass
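# A hedged illustration (not part of the original spec file): with the
# PyTorch backend implementation added later in this diff, the call
#
# >>> F.randint((4,), torch.int64, torch.device('cpu'), 0, 10)
#
# yields a length-4 int64 tensor of uniform integers drawn from [0, 10).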
def pad_packed_tensor(input, lengths, value, l_min=None):
r"""Pads a packed batch of variable length tensors with given value.
......
......@@ -276,6 +276,9 @@ def ones(shape, dtype, ctx):
def uniform(shape, dtype, ctx, low, high):
return nd.random.uniform(low, high, ctx=ctx, dtype=dtype, shape=shape)
def randint(shape, dtype, ctx, low, high):
return nd.random.randint(low, high, ctx=ctx, dtype=dtype, shape=shape)
def pad_packed_tensor(input, lengths, value, l_min=None):
old_shape = input.shape
if isinstance(lengths, nd.NDArray):
......
......@@ -216,6 +216,9 @@ def ones(shape, dtype, ctx):
def uniform(shape, dtype, ctx, low, high):
return th.empty(shape, dtype=dtype, device=ctx).uniform_(low, high)
def randint(shape, dtype, ctx, low, high):
return th.randint(low, high, shape, dtype=dtype, device=ctx)
def pad_packed_tensor(input, lengths, value, l_min=None):
old_shape = input.shape
if isinstance(lengths, th.Tensor):
......
......@@ -336,6 +336,12 @@ def uniform(shape, dtype, ctx, low, high):
return t
def randint(shape, dtype, ctx, low, high):
with tf.device(ctx):
t = tf.random.uniform(shape, dtype=dtype, minval=low, maxval=high)
return t
def pad_packed_tensor(input, lengths, value, l_min=None):
old_shape = input.shape
if isinstance(lengths, tf.Tensor):
......
"""Classes that involves iterating over nodes or edges in a graph and generates
computation dependency of necessary nodes with neighborhood sampling methods.
This includes
* :py:class:`~dgl.dataloading.pytorch.NodeDataLoader` for iterating over the nodes in
a graph in minibatches.
* :py:class:`~dgl.dataloading.pytorch.EdgeDataLoader` for iterating over the edges in
a graph in minibatches.
* Various sampler classes that perform neighborhood sampling for multi-layer GNNs.
* Negative samplers for link prediction.
NOTE: this module is experimental and the interfaces may be subject to changes in
future releases.
"""
from .neighbor import *
from .dataloader import *
from . import negative_sampler
from .. import backend as F
if F.get_preferred_backend() == 'pytorch':
from .pytorch import *
"""Data loaders"""
from collections.abc import Mapping
from abc import ABC, abstractproperty, abstractmethod
import numpy as np
from .. import transform
from ..base import NID, EID
from .. import backend as F
from .. import utils
from ..convert import heterograph
# pylint: disable=unused-argument
def assign_block_eids(block, frontier):
"""Assigns edge IDs from the original graph to the block.
See also
--------
BlockSampler
"""
for etype in block.canonical_etypes:
block.edges[etype].data[EID] = frontier.edges[etype].data[EID][
block.edges[etype].data[EID]]
return block
def _tensor_or_dict_to_numpy(ids):
if isinstance(ids, Mapping):
return {k: F.zerocopy_to_numpy(v) for k, v in ids.items()}
else:
return F.zerocopy_to_numpy(ids)
def _locate_eids_to_exclude(frontier_parent_eids, exclude_eids):
"""Find the edges whose IDs in parent graph appeared in exclude_eids.
Note that both arguments are numpy arrays or numpy dicts.
"""
if isinstance(frontier_parent_eids, Mapping):
result = {
k: np.isin(frontier_parent_eids[k], exclude_eids[k]).nonzero()[0]
for k in frontier_parent_eids.keys() if k in exclude_eids.keys()}
return {k: F.zerocopy_from_numpy(v) for k, v in result.items()}
else:
result = np.isin(frontier_parent_eids, exclude_eids).nonzero()[0]
return F.zerocopy_from_numpy(result)
def _find_exclude_eids_with_reverse_id(g, eids, reverse_eid_map):
if isinstance(eids, Mapping):
eids = {g.to_canonical_etype(k): v for k, v in eids.items()}
exclude_eids = {
k: F.cat([v, F.gather_row(reverse_eid_map[k], v)], 0)
for k, v in eids.items()}
else:
exclude_eids = F.cat([eids, F.gather_row(reverse_eid_map, eids)], 0)
return exclude_eids
def _find_exclude_eids_with_reverse_types(g, eids, reverse_etype_map):
exclude_eids = {g.to_canonical_etype(k): v for k, v in eids.items()}
reverse_etype_map = {
g.to_canonical_etype(k): g.to_canonical_etype(v)
for k, v in reverse_etype_map.items()}
exclude_eids.update({reverse_etype_map[k]: v for k, v in exclude_eids.items()})
return exclude_eids
def _find_exclude_eids(g, exclude_mode, eids, **kwargs):
"""Find all edge IDs to exclude according to ``exclude_mode``.
Parameters
----------
g : DGLHeteroGraph
The graph.
exclude_mode : str, optional
Can be either of the following,
None (default)
Does not exclude any edge.
'reverse_id'
Exclude all edges specified in ``eids``, as well as their reverse edges
of the same edge type.
The mapping from each edge ID to its reverse edge ID is specified in
the keyword argument ``reverse_eid_map``.
This mode assumes that the reverse of an edge with ID ``e`` and type
``etype`` will have ID ``reverse_eid_map[e]`` and type ``etype``.
'reverse_types'
Exclude all edges specified in ``eids``, as well as their reverse
edges of the corresponding edge types.
The mapping from each edge type to its reverse edge type is specified
in the keyword argument ``reverse_etype_map``.
This mode assumes that the reverse of an edge with ID ``e`` and type ``etype``
will have ID ``e`` and type ``reverse_etype_map[etype]``.
eids : Tensor or dict[etype, Tensor]
The edge IDs.
reverse_eid_map : Tensor or dict[etype, Tensor]
The mapping from edge ID to its reverse edge ID.
reverse_etype_map : dict[etype, etype]
The mapping from edge etype to its reverse edge type.
"""
if exclude_mode is None:
return None
elif exclude_mode == 'reverse_id':
return _find_exclude_eids_with_reverse_id(g, eids, kwargs['reverse_eid_map'])
elif exclude_mode == 'reverse_types':
return _find_exclude_eids_with_reverse_types(g, eids, kwargs['reverse_etype_map'])
else:
raise ValueError('unsupported mode {}'.format(exclude_mode))
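# A hedged illustration of the two exclusion modes on toy inputs (hypothetical,
# for clarity only). On a homogeneous graph whose edges 0..E-1 are mirrored by
# their reverses E..2E-1:
#
#     _find_exclude_eids(g, 'reverse_id', eids,
#                        reverse_eid_map=torch.cat([torch.arange(E, 2 * E),
#                                                   torch.arange(0, E)]))
#
# returns ``eids`` concatenated with the IDs of their reverse edges. On a
# user-item click graph with a reverse relation:
#
#     _find_exclude_eids(g, 'reverse_types', {'click': eids},
#                        reverse_etype_map={'click': 'clicked-by'})
#
# returns {('user', 'click', 'item'): eids, ('item', 'clicked-by', 'user'): eids}.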
class BlockSampler(object):
"""Abstract class specifying the neighborhood sampling strategy for DGL data loaders.
The main method for BlockSampler is :func:`~dgl.dataloading.BlockSampler.sample_blocks`,
which generates a list of blocks for a multi-layer GNN given a set of seed nodes to
have their outputs computed.
The default implementation of :py:meth:`~dgl.dataloading.BlockSampler.sample_blocks` is
to repeat the following procedure ``num_layers`` times, from the last layer to the
first layer:
* Obtain a frontier. The frontier is defined as a graph with the same nodes as the
original graph but only the edges involved in message passing on the current layer.
Customizable via :py:meth:`~dgl.dataloading.BlockSampler.sample_frontier`.
* Optionally, if the task is link prediction or edge classification, remove edges
connecting training node pairs. If the graph is undirected, also remove the
reverse edges. This is controlled by the argument :attr:`exclude_eids` in the
:py:meth:`~dgl.dataloading.BlockSampler.sample_blocks` method.
* Convert the frontier into a block.
* Optionally assign the IDs of the edges in the original graph selected in the first step
to the block, controlled by the argument ``return_eids`` in the
:py:meth:`~dgl.dataloading.BlockSampler.sample_blocks` method.
* Prepend the block to the block list to be returned.
All subclasses should override the :py:meth:`~dgl.dataloading.BlockSampler.sample_frontier`
method while specifying the number of layers to sample in the :attr:`num_layers` argument.
Parameters
----------
num_layers : int
The number of layers to sample.
return_eids : bool, default False
Whether to return the edge IDs involved in message passing in the block.
If True, the edge IDs will be stored as an edge feature named ``dgl.EID``.
Notes
-----
For the concept of frontiers and blocks, please refer to User Guide Section 6.
"""
def __init__(self, num_layers, return_eids):
self.num_layers = num_layers
self.return_eids = return_eids
def sample_frontier(self, block_id, g, seed_nodes):
"""Generate the frontier given the output nodes.
Parameters
----------
block_id : int
Represents which GNN layer the frontier is generated for.
g : DGLHeteroGraph
The original graph.
seed_nodes : Tensor or dict[ntype, Tensor]
The output nodes by node type.
If the graph only has one node type, one can just specify a single tensor
of node IDs.
Returns
-------
DGLHeteroGraph
The frontier generated for the current layer.
See also
--------
For the concept of frontiers and blocks, please refer to User Guide Section 6.
"""
raise NotImplementedError
def sample_blocks(self, g, seed_nodes, exclude_eids=None):
"""Generate the a list of blocks given the output nodes.
Parameters
----------
g : DGLHeteroGraph
The original graph.
seed_nodes : Tensor or dict[ntype, Tensor]
The output nodes by node type.
If the graph only has one node type, one can just specify a single tensor
of node IDs.
exclude_eids : Tensor or dict[etype, Tensor]
The edges to exclude from computation dependency.
Returns
-------
list[DGLHeteroGraph]
The blocks generated for computing the multi-layer GNN output.
See also
--------
For the concept of frontiers and blocks, please refer to User Guide Section 6.
"""
blocks = []
exclude_eids = (
_tensor_or_dict_to_numpy(exclude_eids) if exclude_eids is not None else None)
for block_id in reversed(range(self.num_layers)):
frontier = self.sample_frontier(block_id, g, seed_nodes)
# Removing edges from the frontier for link prediction training falls
# into the category of frontier postprocessing
if exclude_eids is not None:
parent_eids = frontier.edata[EID]
parent_eids_np = _tensor_or_dict_to_numpy(parent_eids)
located_eids = _locate_eids_to_exclude(parent_eids_np, exclude_eids)
if not isinstance(located_eids, Mapping):
# (BarclayII) If frontier already has an EID field and located_eids is empty,
# the returned graph will keep EID intact. Otherwise, EID will change
# to the mapping from the new graph to the old frontier.
# So we need to test if located_eids is empty, and do the remapping ourselves.
if len(located_eids) > 0:
frontier = transform.remove_edges(frontier, located_eids)
frontier.edata[EID] = F.gather_row(parent_eids, frontier.edata[EID])
else:
# (BarclayII) remove_edges only accepts removing one type of edges,
# so I need to keep track of the edge IDs left one by one.
new_eids = parent_eids.copy()
for k, v in located_eids.items():
if len(v) > 0:
frontier = transform.remove_edges(frontier, v, etype=k)
new_eids[k] = F.gather_row(parent_eids[k], frontier.edges[k].data[EID])
frontier.edata[EID] = new_eids
block = transform.to_block(frontier, seed_nodes)
if self.return_eids:
assign_block_eids(block, frontier)
seed_nodes = {ntype: block.srcnodes[ntype].data[NID] for ntype in block.srctypes}
# Pre-generate CSR format so that it can be used in training directly
block.create_format_()
blocks.insert(0, block)
return blocks
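# A minimal sketch of the subclassing contract described above (hypothetical
# sampler, not part of this commit): override ``sample_frontier`` only and let
# the base class handle edge exclusion and block conversion. Taking the full
# inbound neighborhood at every layer behaves like the
# MultiLayerFullNeighborSampler defined later in this change:
#
#     from .. import subgraph
#
#     class FullFrontierSampler(BlockSampler):
#         def __init__(self, num_layers, return_eids=False):
#             super().__init__(num_layers, return_eids)
#
#         def sample_frontier(self, block_id, g, seed_nodes):
#             # Keep every inbound edge of the seed nodes as the frontier.
#             return subgraph.in_subgraph(g, seed_nodes)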
class Collator(ABC):
"""Abstract DGL collator for training GNNs on downstream tasks stochastically.
Provides a ``dataset`` object containing the collection of all nodes or edges,
as well as a ``collate`` method that combines a set of items from ``dataset`` and
obtains the blocks.
See also
--------
For the concept of blocks, please refer to User Guide Section 6.
"""
@abstractproperty
def dataset(self):
"""Returns the dataset object of the collator."""
raise NotImplementedError
@abstractmethod
def collate(self, items):
"""Combines the items from the dataset object and obtains the list of blocks.
Parameters
----------
items : list[str, int]
The list of node or edge type-ID pairs.
See also
--------
For the concept of blocks, please refer to User Guide Section 6.
"""
raise NotImplementedError
class NodeCollator(Collator):
"""DGL collator to combine nodes and their computation dependencies within a minibatch for
training node classification or regression on a single graph with neighborhood sampling.
Parameters
----------
g : DGLHeteroGraph
The graph.
nids : Tensor or dict[ntype, Tensor]
The node set to compute outputs.
block_sampler : dgl.dataloading.BlockSampler
The neighborhood sampler.
Examples
--------
To train a 3-layer GNN for node classification on a set of nodes ``train_nid`` on
a homogeneous graph where each node takes messages from all neighbors (assume
the backend is PyTorch):
>>> sampler = dgl.dataloading.MultiLayerNeighborSampler([None, None, None])
>>> collator = dgl.dataloading.NodeCollator(g, train_nid, sampler)
>>> dataloader = torch.utils.data.DataLoader(
... collator.dataset, collate_fn=collator.collate,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, output_nodes, blocks in dataloader:
... train_on(input_nodes, output_nodes, blocks)
"""
def __init__(self, g, nids, block_sampler):
self.g = g
if not isinstance(nids, Mapping):
assert len(g.ntypes) == 1, \
"nids should be a dict of node type and ids for graph with multiple node types"
self.nids = nids
self.block_sampler = block_sampler
if isinstance(nids, Mapping):
self._dataset = utils.FlattenedDict(nids)
else:
self._dataset = nids
@property
def dataset(self):
return self._dataset
def collate(self, items):
"""Find the list of blocks necessary for computing the representation of given
nodes for a node classification/regression task.
Returns
-------
input_nodes : Tensor or dict[ntype, Tensor]
The input nodes necessary for computation in this minibatch.
If the original graph has multiple node types, return a dictionary of
node type names and node ID tensors. Otherwise, return a single tensor.
output_nodes : Tensor or dict[ntype, Tensor]
The nodes whose representations are to be computed in this minibatch.
If the original graph has multiple node types, return a dictionary of
node type names and node ID tensors. Otherwise, return a single tensor.
blocks : list[DGLHeteroGraph]
The list of blocks necessary for computing the representation.
"""
if isinstance(items[0], tuple):
# returns a list of pairs: group them by node types into a dict
items = utils.group_as_dict(items)
blocks = self.block_sampler.sample_blocks(self.g, items)
output_nodes = blocks[-1].dstdata[NID]
input_nodes = blocks[0].srcdata[NID]
return input_nodes, output_nodes, blocks
class EdgeCollator(Collator):
"""DGL collator to combine edges and their computation dependencies within a minibatch for
training edge classification, edge regression, or link prediction on a single graph
with neighborhood sampling.
Given a set of edges, the collate function will yield
* A tensor of input nodes necessary for computing the representation on edges, or
a dictionary of node type names and such tensors.
* A subgraph that contains only the edges in the minibatch and their incident nodes.
Note that this graph has the same metagraph as the original graph.
* If a negative sampler is given, another graph that contains the "negative edges",
connecting the source and destination nodes yielded from the given negative sampler.
* A list of blocks necessary for computing the representation of the incident nodes
of the edges in the minibatch.
Parameters
----------
g : DGLHeteroGraph
The graph from which the edges are iterated in minibatches and the subgraphs
are generated.
eids : Tensor or dict[etype, Tensor]
The edge set in graph :attr:`g` to compute outputs.
block_sampler : dgl.dataloading.BlockSampler
The neighborhood sampler.
g_sampling : DGLHeteroGraph, optional
The graph where neighborhood sampling and message passing is performed.
Note that this is not necessarily the same as :attr:`g`.
If None, assume to be the same as :attr:`g`.
exclude : str, optional
Whether and how to exclude dependencies related to the sampled edges in the
minibatch. Possible values are
* None, which excludes nothing.
* ``'reverse_id'``, which excludes the reverse edges of the sampled edges. The said
reverse edges have the same edge type as the sampled edges. Only works
on edge types whose source node type is the same as its destination node type.
* ``'reverse_types'``, which excludes the reverse edges of the sampled edges. The
said reverse edges have different edge types from the sampled edges.
If ``g_sampling`` is given, ``exclude`` is ignored and will always be ``None``.
reverse_eids : Tensor or dict[etype, Tensor], optional
The mapping from original edge ID to its reverse edge ID.
Required and only used when ``exclude`` is set to ``reverse_id``.
For heterogeneous graph this will be a dict of edge type and edge IDs. Note that
only the edge types whose source node type is the same as destination node type
are needed.
reverse_etypes : dict[etype, etype], optional
The mapping from the edge type to its reverse edge type.
Required and only used when ``exclude`` is set to ``reverse_types``.
negative_sampler : callable, optional
The negative sampler. Can be omitted if no negative sampling is needed.
The negative sampler must be a callable that takes in the following arguments:
* The original (heterogeneous) graph.
* The ID array of sampled edges in the minibatch, or the dictionary of edge
types and ID array of sampled edges in the minibatch if the graph is
heterogeneous.
It should return
* A pair of source and destination node ID arrays as negative samples,
or a dictionary of edge types and such pairs if the graph is heterogeneous.
A set of builtin negative samplers are provided in
:py:mod:`dgl.dataloading.negative_sampler`.
Examples
--------
The following example shows how to train a 3-layer GNN for edge classification on a
set of edges ``train_eid`` on a homogeneous undirected graph. Each node takes
messages from all neighbors.
Say that you have an array of source node IDs ``src`` and another array of destination
node IDs ``dst``. One can make it bidirectional by adding another set of edges
that connects from ``dst`` to ``src``:
>>> g = dgl.graph((torch.cat([src, dst]), torch.cat([dst, src])))
The ID of an edge and that of its reverse edge then differ by exactly ``|E|``,
where ``|E|`` is the length of your source/destination array. The reverse edge
mapping can be obtained by
>>> E = len(src)
>>> reverse_eids = torch.cat([torch.arange(E, 2 * E), torch.arange(0, E)])
Note that the sampled edges as well as their reverse edges are removed from
computation dependencies of the incident nodes. This is a common trick to avoid
information leakage.
>>> sampler = dgl.dataloading.MultiLayerNeighborSampler([None, None, None])
>>> collator = dgl.dataloading.EdgeCollator(
... g, train_eid, sampler, exclude='reverse_id',
... reverse_eids=reverse_eids)
>>> dataloader = torch.utils.data.DataLoader(
... collator.dataset, collate_fn=collator.collate,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, pair_graph, blocks in dataloader:
... train_on(input_nodes, pair_graph, blocks)
To train a 3-layer GNN for link prediction on a set of edges ``train_eid`` on a
homogeneous graph where each node takes messages from all neighbors (assume the
backend is PyTorch), with 5 uniformly chosen negative samples per edge:
>>> sampler = dgl.dataloading.MultiLayerNeighborSampler([None, None, None])
>>> neg_sampler = dgl.dataloading.negative_sampler.Uniform(5)
>>> collator = dgl.dataloading.EdgeCollator(
... g, train_eid, sampler, exclude='reverse_id',
... reverse_eids=reverse_eids, negative_sampler=neg_sampler)
>>> dataloader = torch.utils.data.DataLoader(
... collator.dataset, collate_fn=collator.collate,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, pos_pair_graph, neg_pair_graph, blocks in dataloader:
... train_on(input_nodes, pos_pair_graph, neg_pair_graph, blocks)
For heterogeneous graphs, the reverse of an edge may have a different edge type
from the original edge. For instance, consider that you have an array of
user-item clicks, represented by a user array ``user`` and an item array ``item``.
You may want to build a heterogeneous graph with a user-click-item relation and an
item-clicked-by-user relation.
>>> g = dgl.heterograph({
... ('user', 'click', 'item'): (user, item),
... ('item', 'clicked-by', 'user'): (item, user)})
To train a 3-layer GNN for edge classification on a set of edges ``train_eid`` with
type ``click``, you can write
>>> sampler = dgl.dataloading.MultiLayerNeighborSampler([None, None, None])
>>> collator = dgl.dataloading.EdgeCollator(
... g, {'click': train_eid}, sampler, exclude='reverse_types',
... reverse_etypes={'click': 'clicked-by', 'clicked-by': 'click'})
>>> dataloader = torch.utils.data.DataLoader(
... collator.dataset, collate_fn=collator.collate,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, pair_graph, blocks in dataloader:
... train_on(input_nodes, pair_graph, blocks)
To train a 3-layer GNN for link prediction on a set of edges ``train_eid`` with type
``click``, you can write
>>> sampler = dgl.dataloading.MultiLayerNeighborSampler([None, None, None])
>>> neg_sampler = dgl.dataloading.negative_sampler.Uniform(5)
>>> collator = dgl.dataloading.EdgeCollator(
... g, {'click': train_eid}, sampler, exclude='reverse_types',
... reverse_etypes={'click': 'clicked-by', 'clicked-by': 'click'},
... negative_sampler=neg_sampler)
>>> dataloader = torch.utils.data.DataLoader(
... collator.dataset, collate_fn=collator.collate,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, pos_pair_graph, neg_pair_graph, blocks in dataloader:
... train_on(input_nodes, pos_pair_graph, neg_pair_graph, blocks)
"""
def __init__(self, g, eids, block_sampler, g_sampling=None, exclude=None,
reverse_eids=None, reverse_etypes=None, negative_sampler=None):
self.g = g
if not isinstance(eids, Mapping):
assert len(g.etypes) == 1, \
"eids should be a dict of etype and ids for graph with multiple etypes"
self.eids = eids
self.block_sampler = block_sampler
# One may wish to iterate over the edges in one graph while performing sampling in
# another graph. This may be the case when iterating over the validation and test
# edge sets while performing neighborhood sampling on the graph formed by only
# the training edge set.
# See GCMC for an example usage.
if g_sampling is not None:
self.g_sampling = g_sampling
self.exclude = None
else:
self.g_sampling = self.g
self.exclude = exclude
self.reverse_eids = reverse_eids
self.reverse_etypes = reverse_etypes
self.negative_sampler = negative_sampler
if isinstance(eids, Mapping):
self._dataset = utils.FlattenedDict(eids)
else:
self._dataset = eids
@property
def dataset(self):
return self._dataset
def _collate(self, items):
if isinstance(items[0], tuple):
items = utils.group_as_dict(items)
items = {k: F.zerocopy_from_numpy(np.asarray(v)) for k, v in items.items()}
else:
items = F.zerocopy_from_numpy(np.asarray(items))
pair_graph = self.g.edge_subgraph(items)
seed_nodes = pair_graph.ndata[NID]
exclude_eids = _find_exclude_eids(
self.g,
self.exclude,
items,
reverse_eid_map=self.reverse_eids,
reverse_etype_map=self.reverse_etypes)
blocks = self.block_sampler.sample_blocks(
self.g_sampling, seed_nodes, exclude_eids=exclude_eids)
input_nodes = blocks[0].srcdata[NID]
return input_nodes, pair_graph, blocks
def _collate_with_negative_sampling(self, items):
if isinstance(items[0], tuple):
items = utils.group_as_dict(items)
items = {k: F.zerocopy_from_numpy(np.asarray(v)) for k, v in items.items()}
else:
items = F.zerocopy_from_numpy(np.asarray(items))
pair_graph = self.g.edge_subgraph(items, preserve_nodes=True)
induced_edges = pair_graph.edata[EID]
neg_srcdst = self.negative_sampler(self.g, items)
if not isinstance(neg_srcdst, Mapping):
assert len(self.g.etypes) == 1, \
'graph has multiple or no edge types; '\
'please return a dict in negative sampler.'
neg_srcdst = {self.g.canonical_etypes[0]: neg_srcdst}
neg_edges = {
etype: neg_srcdst.get(etype, []) for etype in self.g.canonical_etypes}
neg_pair_graph = heterograph(
neg_edges, {ntype: self.g.number_of_nodes(ntype) for ntype in self.g.ntypes})
pair_graph, neg_pair_graph = transform.compact_graphs([pair_graph, neg_pair_graph])
pair_graph.edata[EID] = induced_edges
seed_nodes = pair_graph.ndata[NID]
exclude_eids = _find_exclude_eids(
self.g,
self.exclude,
items,
reverse_eid_map=self.reverse_eids,
reverse_etype_map=self.reverse_etypes)
blocks = self.block_sampler.sample_blocks(
self.g_sampling, seed_nodes, exclude_eids=exclude_eids)
input_nodes = blocks[0].srcdata[NID]
return input_nodes, pair_graph, neg_pair_graph, blocks
def collate(self, items):
"""Combines the sampled edges into a minibatch for edge classification, edge
regression, and link prediction tasks.
Returns
-------
Either ``(input_nodes, pair_graph, blocks)``, or
``(input_nodes, pair_graph, negative_pair_graph, blocks)`` if negative sampling is
enabled.
input_nodes : Tensor or dict[ntype, Tensor]
The input nodes necessary for computation in this minibatch.
If the original graph has multiple node types, return a dictionary of
node type names and node ID tensors. Otherwise, return a single tensor.
pair_graph : DGLHeteroGraph
The graph that contains only the edges in the minibatch as well as their incident
nodes.
Note that the metagraph of this graph will be identical to that of the original
graph.
negative_pair_graph : DGLHeteroGraph
The graph that contains only the edges connecting the source and destination nodes
yielded from the given negative sampler, if negative sampling is enabled.
Note that the metagraph of this graph will be identical to that of the original
graph.
blocks : list[DGLHeteroGraph]
The list of blocks necessary for computing the representation of the edges.
"""
if self.negative_sampler is None:
return self._collate(items)
else:
return self._collate_with_negative_sampling(items)
"""Negative samplers"""
from collections.abc import Mapping
from .. import backend as F
class _BaseNegativeSampler(object):
def _generate(self, g, eids, canonical_etype):
raise NotImplementedError
def __call__(self, g, eids):
"""Returns negative examples.
Parameters
----------
g : DGLHeteroGraph
The graph.
eids : Tensor or dict[etype, Tensor]
The sampled edges in the minibatch.
Returns
-------
tuple[Tensor, Tensor] or dict[etype, tuple[Tensor, Tensor]]
The returned source-destination pairs as negative examples.
"""
if isinstance(eids, Mapping):
eids = {g.to_canonical_etype(k): v for k, v in eids.items()}
neg_pair = {k: self._generate(g, v, k) for k, v in eids.items()}
else:
assert len(g.etypes) == 1, \
'please specify a dict of etypes and ids for graphs with multiple edge types'
neg_pair = self._generate(g, eids, g.canonical_etypes[0])
return neg_pair
class Uniform(_BaseNegativeSampler):
"""Negative sampler that randomly chooses negative destination nodes
for each source node according to a uniform distribution.
For each edge ``(u, v)`` of type ``(srctype, etype, dsttype)``, DGL generates
:attr:`k` pairs of negative edges ``(u, v')``, where ``v'`` is chosen
uniformly from all the nodes of type ``dsttype``. The resulting edges will
also have type ``(srctype, etype, dsttype)``.
Parameters
----------
k : int
The number of negative examples per edge.
Examples
--------
>>> g = dgl.graph(([0, 1, 2], [1, 2, 3]))
>>> neg_sampler = dgl.dataloading.negative_sampler.Uniform(2)
>>> neg_sampler(g, [0, 1])
(tensor([0, 0, 1, 1]), tensor([1, 0, 2, 3]))
"""
def __init__(self, k):
self.k = k
def _generate(self, g, eids, canonical_etype):
_, _, vtype = canonical_etype
shape = F.shape(eids)
dtype = F.dtype(eids)
ctx = F.context(eids)
shape = (shape[0] * self.k,)
src, _ = g.find_edges(eids, etype=canonical_etype)
src = F.repeat(src, self.k, 0)
dst = F.randint(shape, dtype, ctx, 0, g.number_of_nodes(vtype))
return src, dst
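# A hedged sketch of a custom negative sampler following the same callable
# protocol (hypothetical class, not part of this commit): corrupt the source
# side of each edge instead of the destination side.
#
#     class UniformSrcCorrupter(_BaseNegativeSampler):
#         def __init__(self, k):
#             self.k = k
#
#         def _generate(self, g, eids, canonical_etype):
#             utype, _, _ = canonical_etype
#             shape = (F.shape(eids)[0] * self.k,)
#             # Keep the true destinations, resample the sources uniformly.
#             _, dst = g.find_edges(eids, etype=canonical_etype)
#             dst = F.repeat(dst, self.k, 0)
#             src = F.randint(shape, F.dtype(eids), F.context(eids),
#                             0, g.number_of_nodes(utype))
#             return src, dst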
"""Data loading components for neighbor sampling"""
from .dataloader import BlockSampler
from .. import sampling, subgraph
class MultiLayerNeighborSampler(BlockSampler):
"""Sampler that builds computational dependency of node representations via
neighbor sampling for multilayer GNN.
This sampler will make every node gather messages from a fixed number of neighbors
per edge type. The neighbors are picked uniformly.
Parameters
----------
fanouts : list[int] or list[dict[etype, int] or None]
List of neighbors to sample per edge type for each GNN layer, starting from the
first layer.
If the graph is homogeneous, only an integer is needed for each layer.
If None is provided for one layer, all neighbors will be included regardless of
edge types.
If -1 is provided for one edge type on one layer, then all inbound edges
of that edge type will be included.
replace : bool, default False
Whether to sample with replacement.
return_eids : bool, default False
Whether to return the edge IDs involved in message passing in the block.
If True, the edge IDs will be stored as an edge feature named ``dgl.EID``.
Examples
--------
To train a 3-layer GNN for node classification on a set of nodes ``train_nid`` on
a homogeneous graph where each node takes messages from 5, 10, 15 neighbors for
the first, second, and third layer respectively (assuming the backend is PyTorch):
>>> sampler = dgl.dataloading.MultiLayerNeighborSampler([5, 10, 15])
>>> collator = dgl.dataloading.NodeCollator(g, train_nid, sampler)
>>> dataloader = torch.utils.data.DataLoader(
... collator.dataset, collate_fn=collator.collate,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for blocks in dataloader:
... train_on(blocks)
If you are training on a heterogeneous graph and want a different number of neighbors
for each edge type, provide a list of dicts instead. Each dict specifies the
number of neighbors to pick per edge type.
>>> sampler = dgl.dataloading.MultiLayerNeighborSampler([
... {('user', 'follows', 'user'): 5,
... ('user', 'plays', 'game'): 4,
... ('game', 'played-by', 'user'): 3}] * 3)
"""
def __init__(self, fanouts, replace=False, return_eids=False):
super().__init__(len(fanouts), return_eids)
self.fanouts = fanouts
self.replace = replace
def sample_frontier(self, block_id, g, seed_nodes):
fanout = self.fanouts[block_id]
if fanout is None:
frontier = subgraph.in_subgraph(g, seed_nodes)
else:
frontier = sampling.sample_neighbors(g, seed_nodes, fanout, replace=self.replace)
return frontier
class MultiLayerFullNeighborSampler(MultiLayerNeighborSampler):
"""Sampler that builds computational dependency of node representations by taking messages
from all neighbors for multilayer GNN.
This sampler will make every node gather messages from every single neighbor per edge type.
Parameters
----------
n_layers : int
The number of GNN layers to sample.
return_eids : bool, default False
Whether to return the edge IDs involved in message passing in the block.
If True, the edge IDs will be stored as an edge feature named ``dgl.EID``.
Examples
--------
To train a 3-layer GNN for node classification on a set of nodes ``train_nid`` on
a homogeneous graph where each node takes messages from all neighbors on each of the
three layers (assuming the backend is PyTorch):
>>> sampler = dgl.dataloading.MultiLayerFullNeighborSampler(3)
>>> collator = dgl.dataloading.NodeCollator(g, train_nid, sampler)
>>> dataloader = torch.utils.data.DataLoader(
... collator.dataset, collate_fn=collator.collate,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for blocks in dataloader:
... train_on(blocks)
"""
def __init__(self, n_layers, return_eids=False):
super().__init__([None] * n_layers, return_eids=return_eids)
"""DGL PyTorch DataLoaders"""
import inspect
from torch.utils.data import DataLoader
from ..dataloader import NodeCollator, EdgeCollator
class NodeDataLoader(DataLoader):
"""PyTorch dataloader for batch-iterating over a set of nodes, generating the list
of blocks as computation dependency of the said minibatch.
Parameters
----------
g : DGLHeteroGraph
The graph.
nids : Tensor or dict[ntype, Tensor]
The node set to compute outputs.
block_sampler : :py:class:`~dgl.dataloading.BlockSampler`
The neighborhood sampler.
kwargs : dict
Arguments being passed to ``torch.utils.data.DataLoader``.
Examples
--------
To train a 3-layer GNN for node classification on a set of nodes ``train_nid`` on
a homogeneous graph where each node takes messages from all neighbors (assume
the backend is PyTorch):
>>> sampler = dgl.dataloading.MultiLayerNeighborSampler([None, None, None])
>>> dataloader = dgl.dataloading.NodeDataLoader(
... g, train_nid, sampler,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, output_nodes, blocks in dataloader:
... train_on(input_nodes, output_nodes, blocks)
"""
collator_arglist = inspect.getfullargspec(NodeCollator).args
def __init__(self, g, nids, block_sampler, **kwargs):
collator_kwargs = {}
dataloader_kwargs = {}
for k, v in kwargs.items():
if k in self.collator_arglist:
collator_kwargs[k] = v
else:
dataloader_kwargs[k] = v
self.collator = NodeCollator(g, nids, block_sampler, **collator_kwargs)
super().__init__(
self.collator.dataset, collate_fn=self.collator.collate, **dataloader_kwargs)
class EdgeDataLoader(DataLoader):
"""PyTorch dataloader for batch-iterating over a set of edges, generating the list
of blocks as computation dependency of the said minibatch for edge classification,
edge regression, and link prediction.
Parameters
----------
g : DGLHeteroGraph
The graph.
eids : Tensor or dict[etype, Tensor]
The edge set in graph :attr:`g` to compute outputs.
block_sampler : :py:class:`~dgl.dataloading.BlockSampler`
The neighborhood sampler.
g_sampling : DGLHeteroGraph, optional
The graph where neighborhood sampling is performed.
One may wish to iterate over the edges in one graph while performing sampling in
another graph. This may be the case when iterating over the validation and test
edge sets while performing neighborhood sampling on the graph formed by only
the training edge set.
If None, assume to be the same as ``g``.
exclude : str, optional
Whether and how to exclude dependencies related to the sampled edges in the
minibatch. Possible values are
* None,
* ``reverse_id``,
* ``reverse_types``
See the docstring in :py:class:`~dgl.dataloading.EdgeCollator`.
reverse_eids : Tensor or dict[etype, Tensor], optional
See the docstring in :py:class:`~dgl.dataloading.EdgeCollator`.
reverse_etypes : dict[etype, etype], optional
See the docstring in :py:class:`~dgl.dataloading.EdgeCollator`.
negative_sampler : callable, optional
The negative sampler.
See the docstring in :py:class:`~dgl.dataloading.EdgeCollator`.
kwargs : dict
Arguments being passed to ``torch.utils.data.DataLoader``.
Examples
--------
The following example shows how to train a 3-layer GNN for edge classification on a
set of edges ``train_eid`` on a homogeneous undirected graph. Each node takes
messages from all neighbors.
Say that you have an array of source node IDs ``src`` and another array of destination
node IDs ``dst``. One can make it bidirectional by adding another set of edges
that connects from ``dst`` to ``src``:
>>> g = dgl.graph((torch.cat([src, dst]), torch.cat([dst, src])))
The ID of an edge and that of its reverse edge then differ by exactly ``|E|``,
where ``|E|`` is the length of your source/destination array. The reverse edge
mapping can be obtained by
>>> E = len(src)
>>> reverse_eids = torch.cat([torch.arange(E, 2 * E), torch.arange(0, E)])
Note that the sampled edges as well as their reverse edges are removed from
computation dependencies of the incident nodes. This is a common trick to avoid
information leakage.
>>> sampler = dgl.dataloading.MultiLayerNeighborSampler([None, None, None])
>>> dataloader = dgl.dataloading.EdgeDataLoader(
... g, train_eid, sampler, exclude='reverse_id',
... reverse_eids=reverse_eids,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, pair_graph, blocks in dataloader:
... train_on(input_nodes, pair_graph, blocks)
To train a 3-layer GNN for link prediction on a set of edges ``train_eid`` on a
homogeneous graph where each node takes messages from all neighbors (assume the
backend is PyTorch), with 5 uniformly chosen negative samples per edge:
>>> sampler = dgl.dataloading.MultiLayerNeighborSampler([None, None, None])
>>> neg_sampler = dgl.dataloading.negative_sampler.Uniform(5)
>>> dataloader = dgl.dataloading.EdgeDataLoader(
... g, train_eid, sampler, exclude='reverse_id',
... reverse_eids=reverse_eids, negative_sampler=neg_sampler,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, pos_pair_graph, neg_pair_graph, blocks in dataloader:
... train_on(input_nodes, pos_pair_graph, neg_pair_graph, blocks)
For heterogeneous graphs, the reverse of an edge may have a different edge type
from the original edge. For instance, consider that you have an array of
user-item clicks, represented by a user array ``user`` and an item array ``item``.
You may want to build a heterogeneous graph with a user-click-item relation and an
item-clicked-by-user relation.
>>> g = dgl.heterograph({
... ('user', 'click', 'item'): (user, item),
... ('item', 'clicked-by', 'user'): (item, user)})
To train a 3-layer GNN for edge classification on a set of edges ``train_eid`` with
type ``click``, you can write
>>> sampler = dgl.dataloading.MultiLayerNeighborSampler([None, None, None])
>>> dataloader = dgl.dataloading.EdgeDataLoader(
... g, {'click': train_eid}, sampler, exclude='reverse_types',
... reverse_etypes={'click': 'clicked-by', 'clicked-by': 'click'},
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, pair_graph, blocks in dataloader:
... train_on(input_nodes, pair_graph, blocks)
To train a 3-layer GNN for link prediction on a set of edges ``train_eid`` with type
``click``, you can write
>>> sampler = dgl.dataloading.MultiLayerNeighborSampler([None, None, None])
>>> neg_sampler = dgl.dataloading.negative_sampler.Uniform(5)
>>> dataloader = dgl.dataloading.EdgeDataLoader(
... g, {'click': train_eid}, sampler, exclude='reverse_types',
... reverse_etypes={'click': 'clicked-by', 'clicked-by': 'click'},
... negative_sampler=neg_sampler,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, pos_pair_graph, neg_pair_graph, blocks in dataloader:
... train_on(input_nodes, pos_pair_graph, neg_pair_graph, blocks)
See also
--------
:py:class:`~dgl.dataloading.EdgeCollator`
For end-to-end usages, please refer to the following tutorial/examples:
* Edge classification on heterogeneous graph: GCMC
* Link prediction on homogeneous graph: GraphSAGE for unsupervised learning
* Link prediction on heterogeneous graph: RGCN for link prediction.
"""
collator_arglist = inspect.getfullargspec(EdgeCollator).args
def __init__(self, g, eids, block_sampler, **kwargs):
collator_kwargs = {}
dataloader_kwargs = {}
for k, v in kwargs.items():
if k in self.collator_arglist:
collator_kwargs[k] = v
else:
dataloader_kwargs[k] = v
self.collator = EdgeCollator(g, eids, block_sampler, **collator_kwargs)
super().__init__(
self.collator.dataset, collate_fn=self.collator.collate, **dataloader_kwargs)
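# A hedged end-to-end sketch of consuming the EdgeDataLoader output for edge
# classification (hypothetical model, optimizer, and feature/label tensors;
# for illustration only):
#
#     sampler = MultiLayerFullNeighborSampler(3)
#     dataloader = EdgeDataLoader(g, train_eid, sampler,
#                                 batch_size=1024, shuffle=True)
#     for input_nodes, pair_graph, blocks in dataloader:
#         x = node_feats[input_nodes]       # features of all required inputs
#         h = model(blocks, x)              # representations of pair_graph's nodes
#         with pair_graph.local_scope():
#             pair_graph.ndata['h'] = h
#             # Score each minibatch edge from its incident node representations.
#             pair_graph.apply_edges(dgl.function.u_dot_v('h', 'h', 'score'))
#             loss = loss_fn(pair_graph.edata['score'],
#                            edge_labels[pair_graph.edata[dgl.EID]])
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()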
......@@ -5,9 +5,3 @@ This module contains the implementations of various sampling operators.
from .randomwalks import *
from .pinsage import *
from .neighbor import *
from .dataloader import *
from .. import backend as F
if F.get_preferred_backend() == 'pytorch':
from .pytorch import *
"""Data loaders"""
from collections.abc import Mapping
from abc import ABC, abstractproperty, abstractmethod
from .. import transform
from ..base import NID, EID
from .. import utils
# pylint: disable=unused-argument
def assign_block_eids(block, frontier, block_id, g, seed_nodes, *args, **kwargs):
"""Assigns edge IDs from the original graph to the block.
This is the default block postprocessor for samplers created with
``return_eids`` as True.
See also
--------
BlockSampler
MultiLayerNeighborSampler
"""
for etype in block.canonical_etypes:
block.edges[etype].data[EID] = frontier.edges[etype].data[EID][
block.edges[etype].data[EID]]
return block
def _default_frontier_postprocessor(frontier, block_id, g, seed_nodes, *args, **kwargs):
return frontier
def _default_block_postprocessor(block, frontier, block_id, g, seed_nodes, *args, **kwargs):
return block
class BlockSampler(object):
"""Abstract class specifying the neighborhood sampling strategy for DGL data loaders.
The main method for BlockSampler is :func:`~dgl.sampling.BlockSampler.sample_blocks`,
which generates a list of blocks for a multi-layer GNN given a set of seed nodes to
have their outputs computed.
The default implementation of :py:meth:`~dgl.sampling.BlockSampler.sample_blocks` is
to repeat ``num_hops`` times the following:
* Obtain a frontier with the same nodes as the original graph but only the edges
involved in message passing on the last layer.
Customizable via :py:meth:`~dgl.sampling.BlockSampler.sample_frontier`.
* Optionally, post-process the obtained frontier (e.g. by removing edges connecting training
node pairs). One can add such postprocessors via
:py:meth:`~dgl.sampling.BlockSampler.add_frontier_postprocessor`.
* Convert the frontier into a block.
* Optionally, post-process the block (e.g. by assigning edge IDs). One can add such
postprocessors via
:py:meth:`~dgl.sampling.BlockSampler.add_block_postprocessor`.
* Prepend the block to the block list to be returned.
All subclasses should either
* Override :py:meth:`~dgl.sampling.BlockSampler.sample_blocks` method, or
* Override
:py:meth:`~dgl.sampling.BlockSampler.sample_frontier` method while specifying
the number of layers to sample in ``num_hops`` argument.
See also
--------
For the concept of frontiers and blocks, please refer to User Guide Section 6.
"""
def __init__(self, num_hops):
self.num_hops = num_hops
self._frontier_postprocessor = _default_frontier_postprocessor
self._block_postprocessor = _default_block_postprocessor
@property
def frontier_postprocessor(self):
"""Frontier postprocessor."""
return self._frontier_postprocessor
@property
def block_postprocessor(self):
"""B;pcl postprocessor."""
return self._block_postprocessor
def set_frontier_postprocessor(self, postprocessor):
"""Set a frontier postprocessor.
The postprocessor must have the following signature:
.. code::
postprocessor(frontier, block_id, g, seed_nodes, *args, **kwargs)
where
* ``frontier`` represents the frontier obtained by
:py:meth:`~dgl.sampling.BlockSampler.sample_frontier` method.
* ``block_id`` represents which GNN layer the block is currently generated for.
* ``g`` represents the original graph.
* ``seed_nodes`` represents the output nodes on the current layer.
* Other arguments are the same ones passed into
:py:meth:`~dgl.sampling.BlockSampler.sample_blocks` method.
Parameters
----------
postprocessor : callable
The postprocessor.
"""
self._frontier_postprocessor = postprocessor
def set_block_postprocessor(self, postprocessor):
"""Set a block postprocessor.
The postprocessor must have the following signature:
.. code::
postprocessor(block, frontier, block_id, g, seed_nodes, *args, **kwargs)
where
* ``block`` represents the block converted from the frontier.
* ``frontier`` represents the frontier the block is generated from.
* ``block_id`` represents which GNN layer the block is currently generated for.
* ``g`` represents the original graph.
* ``seed_nodes`` represents the output nodes on the current layer.
* Other arguments are the same ones passed into
:py:meth:`~dgl.sampling.BlockSampler.sample_blocks` method.
Parameters
----------
postprocessor : callable
The postprocessor.
"""
self._block_postprocessor = postprocessor
def _postprocess_frontier(self, frontier, block_id, g, seed_nodes, *args, **kwargs):
"""Post-processes the generated frontier."""
return self._frontier_postprocessor(
frontier, block_id, g, seed_nodes, *args, **kwargs)
def _postprocess_block(self, block, frontier, block_id, g, seed_nodes, *args, **kwargs):
"""Post-processes the generated block."""
return self._block_postprocessor(
block, frontier, block_id, g, seed_nodes, *args, **kwargs)
def sample_frontier(self, block_id, g, seed_nodes, *args, **kwargs):
"""
Generate the frontier given the output nodes.
Parameters
----------
block_id : int
Represents which GNN layer the frontier is generated for.
g : DGLHeteroGraph
The original graph.
seed_nodes : Tensor or dict[ntype, Tensor]
The output nodes by node type.
If the graph only has one node type, one can just specify a single tensor
of node IDs.
args, kwargs :
Other arguments being passed by
:py:meth:`~dgl.sampling.BlockSampler.sample_blocks`.
Returns
-------
DGLHeteroGraph
The frontier generated for the current layer.
See also
--------
For the concept of frontiers and blocks, please refer to User Guide Section 6.
"""
raise NotImplementedError
def sample_blocks(self, g, seed_nodes, *args, **kwargs):
"""
Generate a list of blocks given the output nodes.
Parameters
----------
g : DGLHeteroGraph
The original graph.
seed_nodes : Tensor or dict[ntype, Tensor]
The output nodes by node type.
If the graph only has one node type, one can just specify a single tensor
of node IDs.
args, kwargs :
Other arguments being passed by
:py:meth:`~dgl.sampling.BlockSampler.sample_blocks`.
Returns
-------
list[DGLHeteroGraph]
The blocks generated for computing the multi-layer GNN output.
See also
--------
For the concept of frontiers and blocks, please refer to User Guide Section 6.
"""
blocks = []
for block_id in reversed(range(self.num_hops)):
frontier = self.sample_frontier(block_id, g, seed_nodes, *args, **kwargs)
# Removing edges from the frontier for link prediction training falls
# into the category of frontier postprocessing
frontier = self._postprocess_frontier(
frontier, block_id, g, seed_nodes, *args, **kwargs)
block = transform.to_block(frontier, seed_nodes)
# Assigning edge IDs and/or node/edge features falls into the category of block
# postprocessing
block = self._postprocess_block(
block, frontier, block_id, g, seed_nodes, *args, **kwargs)
seed_nodes = {ntype: block.srcnodes[ntype].data[NID] for ntype in block.srctypes}
blocks.insert(0, block)
return blocks
class Collator(ABC):
"""
Abstract DGL collator for training GNNs on downstream tasks stochastically.
Provides a ``dataset`` object containing the collection of all nodes or edges,
as well as a ``collate`` method that combines a set of items from ``dataset`` and
obtains the blocks.
See also
--------
For the concept of blocks, please refer to User Guide Section 6.
"""
@abstractproperty
def dataset(self):
"""Returns the dataset object of the collator."""
raise NotImplementedError
@abstractmethod
def collate(self, items):
"""Combines the items from the dataset object and obtains the list of blocks.
Parameters
----------
items : list[str, int]
The list of node or edge type-ID pairs.
See also
--------
For the concept of blocks, please refer to User Guide Section 6.
"""
raise NotImplementedError
class NodeCollator(Collator):
"""
DGL collator to combine nodes and their computation dependencies for training node classification or regression on a single graph.
Parameters
----------
g : DGLHeteroGraph
The graph.
nids : Tensor or dict[ntype, Tensor]
The node set to compute outputs.
block_sampler : :py:class:`~dgl.sampling.BlockSampler`
The neighborhood sampler.
Examples
--------
To train a 3-layer GNN for node classification on a set of nodes ``train_nid`` on
a homogeneous graph where each node takes messages from all neighbors (assume
the backend is PyTorch):
>>> sampler = dgl.sampling.NeighborSampler([None, None, None])
>>> collator = dgl.sampling.NodeCollator(g, train_nid, sampler)
>>> dataloader = torch.utils.data.DataLoader(
... collator.dataset, collate_fn=collator.collate,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, output_nodes, blocks in dataloader:
... train_on(input_nodes, output_nodes, blocks)
"""
def __init__(self, g, nids, block_sampler):
self.g = g
if not isinstance(nids, Mapping):
assert len(g.ntypes) == 1, \
"nids should be a dict of node type and ids for graph with multiple node types"
self.nids = nids
self.block_sampler = block_sampler
if isinstance(nids, Mapping):
self._dataset = utils.FlattenedDict(nids)
else:
self._dataset = nids
@property
def dataset(self):
return self._dataset
def collate(self, items):
"""Find the list of blocks necessary for computing the representation of given
nodes for a node classification/regression task.
Returns
-------
input_nodes : Tensor or dict[ntype, Tensor]
The input nodes necessary for computation in this minibatch.
If the original graph has multiple node types, return a dictionary of
node type names and node ID tensors. Otherwise, return a single tensor.
output_nodes : Tensor or dict[ntype, Tensor]
The nodes whose representations are to be computed in this minibatch.
If the original graph has multiple node types, return a dictionary of
node type names and node ID tensors. Otherwise, return a single tensor.
blocks : list[DGLHeteroGraph]
The list of blocks necessary for computing the representation.
"""
if isinstance(items[0], tuple):
# returns a list of pairs: group them by node types into a dict
items = utils.group_as_dict(items)
blocks = self.block_sampler.sample_blocks(self.g, items)
if len(self.g.ntypes) == 1:
output_nodes = blocks[-1].dstdata[NID]
input_nodes = blocks[0].srcdata[NID]
else:
output_nodes = {
ntype: blocks[-1].dstnodes[ntype].data[NID]
for ntype in blocks[-1].dsttypes}
input_nodes = {
ntype: blocks[0].srcnodes[ntype].data[NID]
for ntype in blocks[0].srctypes}
return input_nodes, output_nodes, blocks
......@@ -6,13 +6,10 @@ from ..base import DGLError, EID
from ..heterograph import DGLHeteroGraph
from .. import ndarray as nd
from .. import utils
from .. import subgraph as subg
from .dataloader import BlockSampler, assign_block_eids
__all__ = [
'sample_neighbors',
'select_topk',
'MultiLayerNeighborSampler']
'select_topk']
def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False):
"""Sample neighboring edges of the given nodes and return the induced subgraph.
......@@ -235,74 +232,4 @@ def select_topk(g, k, weight, nodes=None, edge_dir='in', ascending=False):
ret.edges[etype].data[EID] = induced_edges[i]
return ret
class MultiLayerNeighborSampler(BlockSampler):
"""Sampler that builds computational dependency of node representations via
neighbor sampling for multilayer GNN.
This sampler will make every node gather messages from a fixed number of neighbors
per edge type. The neighbors are picked uniformly.
Parameters
----------
fanouts : list[int] or list[dict[etype, int] or None]
List of neighbors to sample per edge type for each GNN layer, starting from the
first layer.
If the graph is homogeneous, only an integer is needed for each layer.
If None is provided for one layer, all neighbors will be included regardless of
edge types.
If -1 is provided for one edge type on one layer, then all inbound edges
of that edge type will be included.
replace : bool, default False
Whether to sample with replacement
return_eids : bool, default False
Whether to return edge IDs of the original graph in the sampled blocks.
If True, the edge IDs will be stored as ``dgl.EID`` feature for each edge type.
Examples
--------
To train a 3-layer GNN for node classification on a set of nodes ``train_nid`` on
a homogeneous graph where each node takes messages from all neighbors (assume
the backend is PyTorch):
>>> sampler = dgl.sampling.NeighborSampler([None, None, None])
>>> collator = dgl.sampling.NodeCollator(g, train_nid, sampler)
>>> dataloader = torch.utils.data.DataLoader(
... collator.dataset, collate_fn=collator.collate,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for blocks in dataloader:
... train_on(blocks)
If we wish to gather from 5 neighbors on the first layer, 10 neighbors on the second,
and 15 neighbors on the third:
>>> sampler = dgl.sampling.NeighborSampler([5, 10, 15])
If training on a heterogeneous graph and you want different number of neighbors for each
edge type, one should instead provide a list of dicts. Each dict would specify the
number of neighbors to pick per edge type.
>>> sampler = dgl.sampling.NeighborSampler([
... {('user', 'follows', 'user'): 5,
... ('user', 'plays', 'game'): 4,
... ('game', 'played-by', 'user'): 3}] * 3)
"""
def __init__(self, fanouts, replace=False, return_eids=False):
super().__init__(len(fanouts))
self.fanouts = fanouts
self.replace = replace
self.return_eids = return_eids
if return_eids:
self.set_block_postprocessor(assign_block_eids)
def sample_frontier(self, block_id, g, seed_nodes, *args, **kwargs):
fanout = self.fanouts[block_id]
if fanout is None:
frontier = subg.in_subgraph(g, seed_nodes)
else:
frontier = sample_neighbors(g, seed_nodes, fanout, replace=self.replace)
return frontier
_init_api('dgl.sampling.neighbor', __name__)
"""DGL PyTorch DataLoaders"""
from torch.utils.data import DataLoader
from ..dataloader import NodeCollator
class NodeDataLoader(DataLoader):
"""PyTorch dataloader for batch-iterating over a set of nodes, generating the list
of blocks as computation dependency of the said minibatch.
Parameters
----------
g : DGLHeteroGraph
The graph.
nids : Tensor or dict[ntype, Tensor]
The node set to compute outputs.
block_sampler : :py:class:`~dgl.sampling.BlockSampler`
The neighborhood sampler.
kwargs : dict
Arguments being passed to `torch.utils.data.DataLoader`.
Examples
--------
To train a 3-layer GNN for node classification on a set of nodes ``train_nid`` on
a homogeneous graph where each node takes messages from all neighbors (assume
the backend is PyTorch):
>>> sampler = dgl.sampling.NeighborSampler([None, None, None])
>>> dataloader = dgl.sampling.NodeDataLoader(
... g, train_nid, sampler,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, output_nodes, blocks in dataloader:
... train_on(input_nodes, output_nodes, blocks)
"""
def __init__(self, g, nids, block_sampler, **kwargs):
self.collator = NodeCollator(g, nids, block_sampler)
super().__init__(self.collator.dataset, collate_fn=self.collator.collate, **kwargs)