Unverified Commit 701b4fcc authored by Quan (Andy) Gan, committed by GitHub

[Sampling] New sampling pipeline plus asynchronous prefetching (#3665)

* initial update

* more

* more

* multi-gpu example

* cluster gcn, finalize homogeneous

* more explanation

* fix

* bunch of fixes

* fix

* RGAT example and more fixes

* shadow-gnn sampler and some changes in unit test

* fix

* wth

* more fixes

* remove shadow+node/edge dataloader tests for possible ux changes

* lints

* add legacy dataloading import just in case

* fix

* update pylint for f-strings

* fix

* lint

* lint

* lint again

* cherry-picking commit fa9f494

* oops

* fix

* add sample_neighbors in dist_graph

* fix

* lint

* fix

* fix

* fix

* fix tutorial

* fix

* fix

* fix

* fix warning

* remove debug

* add get_foo_storage apis

* lint
parent 5152a879
@@ -101,4 +101,4 @@ class AsyncTransferer(object):
        return Transfer(transfer_id, self._handle)
-_init_api("dgl.dataloading.async_transferer")
+_init_api("dataloading.async_transferer", "dgl._dataloading.async_transferer")
"""Cluster-GCN subgraph iterators."""
import os
import pickle
import numpy as np
from ..transform import metis_partition_assignment
from .. import backend as F
from .dataloader import SubgraphIterator
class ClusterGCNSubgraphIterator(SubgraphIterator):
"""Subgraph sampler following that of ClusterGCN.
This sampler first partitions the graph with METIS partitioning, then it caches the nodes of
each partition to a file within the given cache directory.
This is used in conjunction with :class:`dgl.dataloading.pytorch.GraphDataLoader`.
Notes
-----
The graph must be homogeneous and on CPU.
Parameters
----------
g : DGLGraph
The original graph.
num_partitions : int
The number of partitions.
cache_directory : str
The path to the cache directory for storing the partition result.
refresh : bool
If True, recompute the partition.
Examples
--------
Assuming that you have a graph ``g``:
>>> sgiter = dgl.dataloading.ClusterGCNSubgraphIterator(
... g, num_partitions=100, cache_directory='.', refresh=True)
>>> dataloader = dgl.dataloading.GraphDataLoader(sgiter, batch_size=4, num_workers=0)
>>> for subgraph_batch in dataloader:
... train_on(subgraph_batch)
"""
def __init__(self, g, num_partitions, cache_directory, refresh=False):
if os.name == 'nt':
raise NotImplementedError("METIS partitioning is not supported on Windows yet.")
super().__init__(g)
# First see if the cache is already there. If so, directly read from cache.
if not refresh and self._load_parts(cache_directory):
return
# Otherwise, build the cache.
assignment = F.asnumpy(metis_partition_assignment(g, num_partitions))
self._save_parts(assignment, cache_directory)
def _cache_file_path(self, cache_directory):
return os.path.join(cache_directory, 'cluster_gcn_cache')
def _load_parts(self, cache_directory):
path = self._cache_file_path(cache_directory)
if not os.path.exists(path):
return False
with open(path, 'rb') as file_:
self.part_indptr, self.part_indices = pickle.load(file_)
return True
def _save_parts(self, assignment, cache_directory):
os.makedirs(cache_directory, exist_ok=True)
self.part_indices = np.argsort(assignment)
num_nodes_per_part = np.bincount(assignment)
self.part_indptr = np.insert(np.cumsum(num_nodes_per_part), 0, 0)
with open(self._cache_file_path(cache_directory), 'wb') as file_:
pickle.dump((self.part_indptr, self.part_indices), file_)
def __len__(self):
return self.part_indptr.shape[0] - 1
def __getitem__(self, i):
nodes = self.part_indices[self.part_indptr[i]:self.part_indptr[i+1]]
return self.g.subgraph(nodes)
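The caching above boils down to a CSR-like index over node IDs: ``argsort`` groups node IDs by partition and the cumulative ``bincount`` marks where each partition starts. A small self-contained NumPy sketch with a toy assignment (not taken from the diff):

import numpy as np

assignment = np.array([2, 0, 1, 0, 2, 1])                      # partition ID of each node
part_indices = np.argsort(assignment)                          # node IDs grouped by partition
num_nodes_per_part = np.bincount(assignment)                   # [2, 2, 2]
part_indptr = np.insert(np.cumsum(num_nodes_per_part), 0, 0)   # [0, 2, 4, 6]

# Nodes of partition i are part_indices[part_indptr[i]:part_indptr[i+1]],
# which is exactly what __getitem__ above slices out.
assert set(part_indices[part_indptr[0]:part_indptr[1]]) == {1, 3}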
"""Negative samplers"""
from collections.abc import Mapping
from .. import backend as F
from ..sampling import global_uniform_negative_sampling
class _BaseNegativeSampler(object):
def _generate(self, g, eids, canonical_etype):
raise NotImplementedError
def __call__(self, g, eids):
"""Returns negative samples.
Parameters
----------
g : DGLGraph
The graph.
eids : Tensor or dict[etype, Tensor]
The sampled edges in the minibatch.
Returns
-------
tuple[Tensor, Tensor] or dict[etype, tuple[Tensor, Tensor]]
The returned source-destination pairs as negative samples.
"""
if isinstance(eids, Mapping):
eids = {g.to_canonical_etype(k): v for k, v in eids.items()}
neg_pair = {k: self._generate(g, v, k) for k, v in eids.items()}
else:
assert len(g.etypes) == 1, \
'please specify a dict of etypes and ids for graphs with multiple edge types'
neg_pair = self._generate(g, eids, g.canonical_etypes[0])
return neg_pair
class PerSourceUniform(_BaseNegativeSampler):
"""Negative sampler that randomly chooses negative destination nodes
for each source node according to a uniform distribution.
For each edge ``(u, v)`` of type ``(srctype, etype, dsttype)``, DGL generates
:attr:`k` pairs of negative edges ``(u, v')``, where ``v'`` is chosen
uniformly from all the nodes of type ``dsttype``. The resulting edges will
also have type ``(srctype, etype, dsttype)``.
Parameters
----------
k : int
The number of negative samples per edge.
Examples
--------
>>> g = dgl.graph(([0, 1, 2], [1, 2, 3]))
>>> neg_sampler = dgl.dataloading.negative_sampler.PerSourceUniform(2)
>>> neg_sampler(g, torch.tensor([0, 1]))
(tensor([0, 0, 1, 1]), tensor([1, 0, 2, 3]))
"""
def __init__(self, k):
self.k = k
def _generate(self, g, eids, canonical_etype):
_, _, vtype = canonical_etype
shape = F.shape(eids)
dtype = F.dtype(eids)
ctx = F.context(eids)
shape = (shape[0] * self.k,)
src, _ = g.find_edges(eids, etype=canonical_etype)
src = F.repeat(src, self.k, 0)
dst = F.randint(shape, dtype, ctx, 0, g.number_of_nodes(vtype))
return src, dst
# Alias
Uniform = PerSourceUniform
class GlobalUniform(_BaseNegativeSampler):
"""Negative sampler that randomly chooses negative source-destination pairs according
to a uniform distribution.
For each edge ``(u, v)`` of type ``(srctype, etype, dsttype)``, DGL generates at most
:attr:`k` pairs of negative edges ``(u', v')``, where ``u'`` is chosen uniformly from
all the nodes of type ``srctype`` and ``v'`` is chosen uniformly from all the nodes
of type ``dsttype``. The resulting edges will also have type
``(srctype, etype, dsttype)``. DGL guarantees that the sampled pairs will not have
edges in between.
Parameters
----------
k : int
The desired number of negative samples to generate per edge.
exclude_self_loops : bool, optional
Whether to exclude self-loops from negative samples. (Default: True)
replace : bool, optional
Whether to sample with replacement. Setting it to True will make things
faster. (Default: True)
redundancy : float, optional
Indicates how many more negative samples to generate during rejection sampling
before finding the unique pairs.
Increasing it will increase the likelihood of getting :attr:`k` negative samples
per edge, but will also take more time and memory.
(Default: automatically determined by the density of graph)
Notes
-----
This negative sampler will try to generate as many negative samples as possible, but
it may rarely return less than :attr:`k` negative samples per edge.
This is more likely to happen if a graph is so small or dense that not many unique
negative samples exist.
Examples
--------
>>> g = dgl.graph(([0, 1, 2], [1, 2, 3]))
>>> neg_sampler = dgl.dataloading.negative_sampler.GlobalUniform(2, True)
>>> neg_sampler(g, torch.LongTensor([0, 1]))
(tensor([0, 1, 3, 2]), tensor([2, 0, 2, 1]))
"""
def __init__(self, k, exclude_self_loops=True, replace=False, redundancy=None):
self.k = k
self.exclude_self_loops = exclude_self_loops
self.replace = replace
self.redundancy = redundancy
def _generate(self, g, eids, canonical_etype):
return global_uniform_negative_sampling(
g, len(eids) * self.k, self.exclude_self_loops, self.replace,
canonical_etype, self.redundancy)
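The ``_generate`` hook above is the only thing a new negative sampler needs to implement. A minimal sketch of a per-destination variant, written as if it lived in this module (it reuses ``F`` and ``_BaseNegativeSampler`` from the code above; the class name is illustrative, not part of the diff):

class PerDestinationUniform(_BaseNegativeSampler):
    """For each edge (u, v), draw k negative pairs (u', v) with u' chosen uniformly."""
    def __init__(self, k):
        self.k = k

    def _generate(self, g, eids, canonical_etype):
        utype, _, _ = canonical_etype
        shape = (F.shape(eids)[0] * self.k,)
        dtype = F.dtype(eids)
        ctx = F.context(eids)
        _, dst = g.find_edges(eids, etype=canonical_etype)
        dst = F.repeat(dst, self.k, 0)
        src = F.randint(shape, dtype, ctx, 0, g.number_of_nodes(utype))
        return src, dst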
@@ -10,7 +10,6 @@ from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
from ..dataloader import NodeCollator, EdgeCollator, GraphCollator, SubgraphIterator
from ...distributed import DistGraph
-from ...distributed import DistDataLoader
from ...ndarray import NDArray as DGLNDArray
from ... import backend as F
from ...base import DGLError
@@ -26,6 +25,10 @@ PYTORCH_VER = LooseVersion(th.__version__)
PYTORCH_16 = PYTORCH_VER >= LooseVersion("1.6.0")
PYTORCH_17 = PYTORCH_VER >= LooseVersion("1.7.0")

+def _check_graph_type(g):
+    if isinstance(g, DistGraph):
+        raise TypeError("Please use DistNodeDataLoader or DistEdgeDataLoader for DistGraph")
+
def _create_dist_sampler(dataset, dataloader_kwargs, ddp_seed):
    # Note: will change the content of dataloader_kwargs
    dist_sampler_kwargs = {'shuffle': dataloader_kwargs['shuffle']}
@@ -166,14 +169,6 @@ class _ScalarDataBatcher(th.utils.data.IterableDataset):
        """Set epoch number for distributed training."""
        self.epoch = epoch

-def _remove_kwargs_dist(kwargs):
-    if 'num_workers' in kwargs:
-        del kwargs['num_workers']
-    if 'pin_memory' in kwargs:
-        del kwargs['pin_memory']
-        print('Distributed DataLoader does not support pin_memory')
-    return kwargs
-
# The following code is a fix to the PyTorch-specific issue in
# https://github.com/dmlc/dgl/issues/2137
#
@@ -290,14 +285,14 @@ def _restore_storages(subgs, g):
        _restore_subgraph_storage(subg, g)

class _NodeCollator(NodeCollator):
-    def collate(self, items):
+    def collate(self, items):  # pylint: disable=missing-docstring
        # input_nodes, output_nodes, blocks
        result = super().collate(items)
        _pop_storages(result[-1], self.g)
        return result

class _EdgeCollator(EdgeCollator):
-    def collate(self, items):
+    def collate(self, items):  # pylint: disable=missing-docstring
        if self.negative_sampler is None:
            # input_nodes, pair_graph, blocks
            result = super().collate(items)
@@ -381,10 +376,10 @@ def _background_node_dataloader(dl_iter, g, device, results, load_input, load_ou
class _NodeDataLoaderIter:
-    def __init__(self, node_dataloader):
+    def __init__(self, node_dataloader, iter_):
        self.device = node_dataloader.device
        self.node_dataloader = node_dataloader
-        self.iter_ = iter(node_dataloader.dataloader)
+        self.iter_ = iter_
        self.async_load = node_dataloader.async_load and (
            F.device_type(self.device) == 'cuda')
        if self.async_load:
@@ -418,10 +413,10 @@ class _NodeDataLoaderIter:
        return input_nodes, output_nodes, blocks

class _EdgeDataLoaderIter:
-    def __init__(self, edge_dataloader):
+    def __init__(self, edge_dataloader, iter_):
        self.device = edge_dataloader.device
        self.edge_dataloader = edge_dataloader
-        self.iter_ = iter(edge_dataloader.dataloader)
+        self.iter_ = iter_
    # Make this an iterator for PyTorch Lightning compatibility
    def __iter__(self):
@@ -441,9 +436,9 @@ class _EdgeDataLoaderIter:
        return result

class _GraphDataLoaderIter:
-    def __init__(self, graph_dataloader):
+    def __init__(self, graph_dataloader, iter_):
        self.dataloader = graph_dataloader
-        self.iter_ = iter(graph_dataloader.dataloader)
+        self.iter_ = iter_
    def __iter__(self):
        return self
@@ -490,14 +485,9 @@ def _init_dataloader(collator, device, dataloader_kwargs, use_ddp, ddp_seed):
    else:
        dist_sampler = None

-    dataloader = DataLoader(
-        dataset,
-        collate_fn=collator.collate,
-        **dataloader_kwargs)
-    return use_scalar_batcher, scalar_batcher, dataloader, dist_sampler
+    return use_scalar_batcher, scalar_batcher, dataset, collator, dist_sampler

-class NodeDataLoader:
+class NodeDataLoader(DataLoader):
    """PyTorch dataloader for batch-iterating over a set of nodes, generating the list
    of message flow graphs (MFGs) as computation dependency of the said minibatch.
@@ -600,6 +590,7 @@ class NodeDataLoader:
    def __init__(self, g, nids, graph_sampler, device=None, use_ddp=False, ddp_seed=0,
                 load_input=None, load_output=None, async_load=False, **kwargs):
+        _check_graph_type(g)
        collator_kwargs = {}
        dataloader_kwargs = {}
        for k, v in kwargs.items():
@@ -608,65 +599,42 @@ class NodeDataLoader:
            else:
                dataloader_kwargs[k] = v

-        if isinstance(g, DistGraph):
-            if device is None:
-                # for the distributed case default to the CPU
-                device = 'cpu'
-            assert device == 'cpu', 'Only cpu is supported in the case of a DistGraph.'
-            # Distributed DataLoader currently does not support heterogeneous graphs
-            # and does not copy features. Fallback to normal solution
-            self.collator = NodeCollator(g, nids, graph_sampler, **collator_kwargs)
-            _remove_kwargs_dist(dataloader_kwargs)
-            self.dataloader = DistDataLoader(self.collator.dataset,
-                                             collate_fn=self.collator.collate,
-                                             **dataloader_kwargs)
-            self.is_distributed = True
-        else:
-            if device is None:
-                # default to the same device the graph is on
-                device = th.device(g.device)
-            if not g.is_homogeneous:
-                if load_input or load_output:
-                    raise DGLError('load_input/load_output not supported for heterograph yet.')
-            self.load_input = {} if load_input is None else load_input
-            self.load_output = {} if load_output is None else load_output
-            self.async_load = async_load
-            # if the sampler supports it, tell it to output to the specified device.
-            # But if async_load is enabled, set_output_context should be skipped as
-            # we'd like to avoid any graph/data transfer graphs across devices in
-            # sampler. Such transfer will be handled in dataloader.
-            num_workers = dataloader_kwargs.get('num_workers', 0)
-            if ((not async_load) and
-                    callable(getattr(graph_sampler, "set_output_context", None)) and
-                    num_workers == 0):
-                graph_sampler.set_output_context(to_dgl_context(device))
-            self.collator = _NodeCollator(g, nids, graph_sampler, **collator_kwargs)
-            self.use_scalar_batcher, self.scalar_batcher, self.dataloader, self.dist_sampler = \
-                _init_dataloader(self.collator, device, dataloader_kwargs, use_ddp, ddp_seed)
+        if device is None:
+            # default to the same device the graph is on
+            device = th.device(g.device)
+
+        if not g.is_homogeneous:
+            if load_input or load_output:
+                raise DGLError('load_input/load_output not supported for heterograph yet.')
+        self.load_input = {} if load_input is None else load_input
+        self.load_output = {} if load_output is None else load_output
+        self.async_load = async_load
+
+        # if the sampler supports it, tell it to output to the specified device.
+        # But if async_load is enabled, set_output_context should be skipped as
+        # we'd like to avoid any graph/data transfer graphs across devices in
+        # sampler. Such transfer will be handled in dataloader.
+        num_workers = dataloader_kwargs.get('num_workers', 0)
+        if ((not async_load) and
+                callable(getattr(graph_sampler, "set_output_context", None)) and
+                num_workers == 0):
+            graph_sampler.set_output_context(to_dgl_context(device))
+
+        self.collator = _NodeCollator(g, nids, graph_sampler, **collator_kwargs)
+        self.use_scalar_batcher, self.scalar_batcher, self.dataloader, self.dist_sampler = \
+            _init_dataloader(self.collator, device, dataloader_kwargs, use_ddp, ddp_seed)

        self.use_ddp = use_ddp
        self.is_distributed = False

        # Precompute the CSR and CSC representations so each subprocess does not
        # duplicate.
        if num_workers > 0:
            g.create_formats_()

        self.device = device

    def __iter__(self):
-        """Return the iterator of the data loader."""
-        if self.is_distributed:
-            # Directly use the iterator of DistDataLoader, which doesn't copy features anyway.
-            return iter(self.dataloader)
-        else:
-            return _NodeDataLoaderIter(self)
-
-    def __len__(self):
-        """Return the number of batches of the data loader."""
-        return len(self.dataloader)
+        return _NodeDataLoaderIter(self, super().__iter__())

    def set_epoch(self, epoch):
        """Sets the epoch number for the underlying sampler which ensures all replicas
@@ -689,7 +657,7 @@ class NodeDataLoader:
        else:
            raise DGLError('set_epoch is only available when use_ddp is True.')

-class EdgeDataLoader:
+class EdgeDataLoader(DataLoader):
    """PyTorch dataloader for batch-iterating over a set of edges, generating the list
    of message flow graphs (MFGs) as computation dependency of the said minibatch for
    edge classification, edge regression, and link prediction.
@@ -897,8 +865,9 @@ class EdgeDataLoader:
    * Link prediction on heterogeneous graph: RGCN for link prediction.
    """
    collator_arglist = inspect.getfullargspec(EdgeCollator).args
-    def __init__(self, g, eids, graph_sampler, device='cpu', use_ddp=False, ddp_seed=0, **kwargs):
+    def __init__(self, g, eids, graph_sampler, device='cpu', use_ddp=False, ddp_seed=0,
+                 **kwargs):
+        _check_graph_type(g)
        collator_kwargs = {}
        dataloader_kwargs = {}
        for k, v in kwargs.items():
@@ -907,53 +876,30 @@ class EdgeDataLoader:
            else:
                dataloader_kwargs[k] = v

-        if isinstance(g, DistGraph):
-            if device is None:
-                # for the distributed case default to the CPU
-                device = 'cpu'
-            assert device == 'cpu', 'Only cpu is supported in the case of a DistGraph.'
-            # Distributed DataLoader currently does not support heterogeneous graphs
-            # and does not copy features. Fallback to normal solution
-            self.collator = EdgeCollator(g, eids, graph_sampler, **collator_kwargs)
-            _remove_kwargs_dist(dataloader_kwargs)
-            self.dataloader = DistDataLoader(self.collator.dataset,
-                                             collate_fn=self.collator.collate,
-                                             **dataloader_kwargs)
-            self.is_distributed = True
-        else:
        if device is None:
            # default to the same device the graph is on
            device = th.device(g.device)

        # if the sampler supports it, tell it to output to the
        # specified device
        num_workers = dataloader_kwargs.get('num_workers', 0)
        if callable(getattr(graph_sampler, "set_output_context", None)) and num_workers == 0:
            graph_sampler.set_output_context(to_dgl_context(device))

-        self.collator = _EdgeCollator(g, eids, graph_sampler, **collator_kwargs)
-        self.use_scalar_batcher, self.scalar_batcher, self.dataloader, self.dist_sampler = \
+        self.collator = EdgeCollator(g, eids, graph_sampler, **collator_kwargs)
+        self.use_scalar_batcher, self.scalar_batcher, dataset, collator, self.dist_sampler = \
            _init_dataloader(self.collator, device, dataloader_kwargs, use_ddp, ddp_seed)

        self.use_ddp = use_ddp
-        self.is_distributed = False
+        super().__init__(dataset, collate_fn=collator.collate, **dataloader_kwargs)

        # Precompute the CSR and CSC representations so each subprocess does not duplicate.
        if num_workers > 0:
            g.create_formats_()

        self.device = device

    def __iter__(self):
-        """Return the iterator of the data loader."""
-        if self.is_distributed:
-            # Directly use the iterator of DistDataLoader, which doesn't copy features anyway.
-            return iter(self.dataloader)
-        else:
-            return _EdgeDataLoaderIter(self)
-
-    def __len__(self):
-        """Return the number of batches of the data loader."""
-        return len(self.dataloader)
+        return _EdgeDataLoaderIter(self, super().__iter__())

    def set_epoch(self, epoch):
        """Sets the epoch number for the underlying sampler which ensures all replicas
@@ -976,7 +922,7 @@ class EdgeDataLoader:
        else:
            raise DGLError('set_epoch is only available when use_ddp is True.')

-class GraphDataLoader:
+class GraphDataLoader(DataLoader):
    """PyTorch dataloader for batch-iterating over a set of graphs, generating the batched
    graph and corresponding label tensor (if provided) of the said minibatch.
@@ -1023,7 +969,6 @@ class GraphDataLoader:
    ...     train_on(batched_graph, labels)
    """
    collator_arglist = inspect.getfullargspec(GraphCollator).args
    def __init__(self, dataset, collate_fn=None, use_ddp=False, ddp_seed=0, **kwargs):
        collator_kwargs = {}
        dataloader_kwargs = {}
@@ -1058,14 +1003,11 @@ class GraphDataLoader:
        if use_ddp:
            self.dist_sampler = _create_dist_sampler(dataset, dataloader_kwargs, ddp_seed)
            dataloader_kwargs['sampler'] = self.dist_sampler

-        self.dataloader = DataLoader(dataset=dataset,
-                                     collate_fn=self.collate,
-                                     **dataloader_kwargs)
+        super().__init__(dataset, collate_fn=self.collate, **dataloader_kwargs)

    def __iter__(self):
        """Return the iterator of the data loader."""
-        return _GraphDataLoaderIter(self)
+        return _GraphDataLoaderIter(self, super().__iter__())

    def __len__(self):
        """Return the number of batches of the data loader."""
"""ShaDow-GNN subgraph samplers."""
from ..utils import prepare_tensor_or_dict
from ..base import NID
from .. import transform
from ..sampling import sample_neighbors
from .neighbor import NeighborSamplingMixin
from .dataloader import exclude_edges, Sampler
class ShaDowKHopSampler(NeighborSamplingMixin, Sampler):
"""K-hop subgraph sampler used by
`ShaDow-GNN <https://arxiv.org/abs/2012.01380>`__.
It performs node-wise neighbor sampling, but instead of returning a list of
MFGs, it returns a single subgraph induced by all the sampled nodes. The
seed nodes from which the neighbors are sampled appear first among the
induced nodes of the subgraph.
This is used in conjunction with :class:`dgl.dataloading.pytorch.NodeDataLoader`
and :class:`dgl.dataloading.pytorch.EdgeDataLoader`.
Parameters
----------
fanouts : list[int] or list[dict[etype, int]]
List of neighbors to sample per edge type for each GNN layer, with the i-th
element being the fanout for the i-th GNN layer.
If only a single integer is provided, DGL assumes that every edge type
will have the same fanout.
If -1 is provided for one edge type on one layer, then all inbound edges
of that edge type will be included.
replace : bool, default True
Whether to sample with replacement
prob : str, optional
If given, the probability of each neighbor being sampled is proportional
to the edge feature value with the given name in ``g.edata``. The feature must be
a scalar on each edge.
Examples
--------
To train a 3-layer GNN for node classification on a set of nodes ``train_nid`` on
a homogeneous graph where each node takes messages from 5, 10, 15 neighbors for
the first, second, and third layer respectively (assuming the backend is PyTorch):
>>> g = dgl.data.CoraFullDataset()[0]
>>> sampler = dgl.dataloading.ShaDowKHopSampler([5, 10, 15])
>>> dataloader = dgl.dataloading.NodeDataLoader(
... g, torch.arange(g.num_nodes()), sampler,
... batch_size=5, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, output_nodes, (subgraph,) in dataloader:
... print(subgraph)
... assert torch.equal(input_nodes, subgraph.ndata[dgl.NID])
... assert torch.equal(input_nodes[:output_nodes.shape[0]], output_nodes)
... break
Graph(num_nodes=529, num_edges=3796,
ndata_schemes={'label': Scheme(shape=(), dtype=torch.int64),
'feat': Scheme(shape=(8710,), dtype=torch.float32),
'_ID': Scheme(shape=(), dtype=torch.int64)}
edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)})
If training on a heterogeneous graph and you want different number of neighbors for each
edge type, one should instead provide a list of dicts. Each dict would specify the
number of neighbors to pick per edge type.
>>> sampler = dgl.dataloading.ShaDowKHopSampler([
... {('user', 'follows', 'user'): 5,
... ('user', 'plays', 'game'): 4,
... ('game', 'played-by', 'user'): 3}] * 3)
If you would like non-uniform neighbor sampling:
>>> g.edata['p'] = torch.rand(g.num_edges()) # any non-negative 1D vector works
>>> sampler = dgl.dataloading.MultiLayerNeighborSampler([5, 10, 15], prob='p')
"""
def __init__(self, fanouts, replace=False, prob=None, output_ctx=None):
super().__init__(output_ctx)
self.fanouts = fanouts
self.replace = replace
self.prob = prob
self.set_output_context(output_ctx)
def sample(self, g, seed_nodes, exclude_eids=None):
self._build_fanout(len(self.fanouts), g)
self._build_prob_arrays(g)
seed_nodes = prepare_tensor_or_dict(g, seed_nodes, 'seed nodes')
output_nodes = seed_nodes
for i in range(len(self.fanouts)):
fanout = self.fanouts[i]
frontier = sample_neighbors(
g, seed_nodes, fanout, replace=self.replace, prob=self.prob_arrays)
block = transform.to_block(frontier, seed_nodes)
seed_nodes = block.srcdata[NID]
subg = g.subgraph(seed_nodes, relabel_nodes=True)
subg = exclude_edges(subg, exclude_eids, self.output_device)
return seed_nodes, output_nodes, [subg]
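Because the seed nodes are guaranteed to come first among the induced nodes, the model output for the seeds can be recovered by slicing. A hedged usage sketch (``g``, ``train_nid``, ``model`` and ``loss_fn`` are placeholders; a PyTorch backend and a homogeneous graph with ``feat``/``label`` node data are assumed):

import dgl

sampler = dgl.dataloading.ShaDowKHopSampler([5, 10, 15])
dataloader = dgl.dataloading.NodeDataLoader(
    g, train_nid, sampler, batch_size=1024, shuffle=True)
for input_nodes, output_nodes, (subgraph,) in dataloader:
    h = model(subgraph, subgraph.ndata['feat'])      # one row per induced node
    h_seeds = h[:output_nodes.shape[0]]              # seeds occupy the first rows
    loss = loss_fn(h_seeds, subgraph.ndata['label'][:output_nodes.shape[0]])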
@@ -14,15 +14,12 @@ Read the user guide :ref:`guide-minibatch`.
This package is experimental and the interfaces may be subject
to changes in future releases. It currently only has implementations in PyTorch.
"""
-from .neighbor import *
-from .dataloader import *
+from .. import backend as F
+from .neighbor_sampler import *
from .cluster_gcn import *
from .shadow import *
+from .base import *
from . import negative_sampler
-from .async_transferer import AsyncTransferer
-from .. import backend as F

if F.get_preferred_backend() == 'pytorch':
-    from .pytorch import *
+    from .dataloader import *
+    from .dist_dataloader import *
"""Base classes and functionalities for dataloaders"""
from collections import Mapping
from ..base import NID, EID
from ..convert import heterograph
from .. import backend as F
from ..transform import compact_graphs
from ..frame import LazyFeature
from ..utils import recursive_apply
def _set_lazy_features(x, xdata, feature_names):
if feature_names is None:
return
if not isinstance(feature_names, Mapping):
xdata.update({k: LazyFeature(k) for k in feature_names})
else:
for type_, names in feature_names.items():
x[type_].data.update({k: LazyFeature(k) for k in names})
def set_node_lazy_features(g, feature_names):
"""Set lazy features for ``g.ndata`` if :attr:`feature_names` is a list of strings,
or ``g.nodes[ntype].data`` if :attr:`feature_names` is a dict of list of strings.
"""
return _set_lazy_features(g.nodes, g.ndata, feature_names)
def set_edge_lazy_features(g, feature_names):
"""Set lazy features for ``g.edata`` if :attr:`feature_names` is a list of strings,
or ``g.edges[etype].data`` if :attr:`feature_names` is a dict of list of strings.
"""
return _set_lazy_features(g.edges, g.edata, feature_names)
def set_src_lazy_features(g, feature_names):
"""Set lazy features for ``g.srcdata`` if :attr:`feature_names` is a list of strings,
or ``g.srcnodes[srctype].data`` if :attr:`feature_names` is a dict of list of strings.
"""
return _set_lazy_features(g.srcnodes, g.srcdata, feature_names)
def set_dst_lazy_features(g, feature_names):
"""Set lazy features for ``g.dstdata`` if :attr:`feature_names` is a list of strings,
or ``g.dstnodes[dsttype].data`` if :attr:`feature_names` is a dict of list of strings.
"""
return _set_lazy_features(g.dstnodes, g.dstdata, feature_names)
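A minimal sketch of how a sampler uses these helpers (the feature names ``'feat'`` and ``'weight'`` are illustrative): the calls only attach ``LazyFeature`` placeholders, and the dataloader later materializes them from the parent graph through the subgraph's ``dgl.NID``/``dgl.EID`` mapping.

def tag_for_prefetch(subgraph):
    # Placeholders only; the actual feature tensors are fetched by the dataloader.
    set_node_lazy_features(subgraph, ['feat'])     # ends up in subgraph.ndata['feat']
    set_edge_lazy_features(subgraph, ['weight'])   # ends up in subgraph.edata['weight']
    return subgraph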
class BlockSampler(object):
"""BlockSampler is an abstract class assuming to take in a set of nodes whose
outputs are to compute, and return a list of blocks.
Moreover, it assumes that the input node features will be put in the first block's
``srcdata``, the output node labels will be put in the last block's ``dstdata``, and
the edge data will be put in all the blocks' ``edata``.
"""
def __init__(self, prefetch_node_feats=None, prefetch_labels=None,
prefetch_edge_feats=None, output_device=None):
self.prefetch_node_feats = prefetch_node_feats or []
self.prefetch_labels = prefetch_labels or []
self.prefetch_edge_feats = prefetch_edge_feats or []
self.output_device = output_device
def sample_blocks(self, g, seed_nodes, exclude_eids=None):
"""Generates a list of blocks from the given seed nodes.
This function must return a triplet where the first element is the input node IDs
for the first GNN layer (a tensor or a dict of tensors for heterogeneous graphs),
the second element is the output node IDs for the last GNN layer, and the third
element is the said list of blocks.
"""
raise NotImplementedError
def assign_lazy_features(self, result):
"""Assign lazy features for prefetching."""
# A LazyFeature is a placeholder telling the dataloader where and which IDs
# to prefetch. It has the signature LazyFeature(name, id_). id_ can be None
# if the LazyFeature is set into one of the subgraph's ``xdata``, in which case the
# dataloader will infer from the subgraph's ``xdata[dgl.NID]`` (or ``xdata[dgl.EID]``
# if the LazyFeature is set as edge features).
#
# If you want to prefetch things other than ndata and edata, you can also
# return a LazyFeature(name, id_). If a LazyFeature is returned in places other than
# in a graph's ndata/edata/srcdata/dstdata, the DataLoader will prefetch it
# from its dictionary ``other_data``.
# For instance, you can run
#
# return blocks, LazyFeature('other_feat', id_)
#
# To make it work with the sampler returning the stuff above, your dataloader
# needs to have the following
#
# dataloader.attach_data('other_feat', tensor)
#
# Then you can run
#
# for blocks, other_feat in dataloader:
# train_on(blocks, other_feat)
input_nodes, output_nodes, blocks = result
set_src_lazy_features(blocks[0], self.prefetch_node_feats)
set_dst_lazy_features(blocks[-1], self.prefetch_labels)
for block in blocks:
set_edge_lazy_features(block, self.prefetch_edge_feats)
return input_nodes, output_nodes, blocks
def sample(self, g, seed_nodes):
"""Sample a list of blocks from the given seed nodes."""
result = self.sample_blocks(g, seed_nodes)
return self.assign_lazy_features(result)
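A hedged sketch of the ``sample_blocks`` contract: a custom sampler that takes every in-neighbor for a fixed number of hops. The class name and ``num_layers`` are illustrative; it uses the public ``dgl.in_subgraph``/``dgl.to_block`` APIs and ignores ``exclude_eids`` for brevity.

import dgl

class FullNeighborBlockSampler(BlockSampler):
    def __init__(self, num_layers, **kwargs):
        super().__init__(**kwargs)
        self.num_layers = num_layers

    def sample_blocks(self, g, seed_nodes, exclude_eids=None):
        output_nodes = seed_nodes
        blocks = []
        for _ in range(self.num_layers):
            frontier = dgl.in_subgraph(g, seed_nodes)   # all inbound edges of the seeds
            block = dgl.to_block(frontier, seed_nodes)  # bipartite message flow graph
            seed_nodes = block.srcdata[dgl.NID]
            blocks.insert(0, block)
        return seed_nodes, output_nodes, blocks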
def _find_exclude_eids_with_reverse_id(g, eids, reverse_eid_map):
if isinstance(eids, Mapping):
eids = {g.to_canonical_etype(k): v for k, v in eids.items()}
exclude_eids = {
k: F.cat([v, F.gather_row(reverse_eid_map[k], v)], 0)
for k, v in eids.items()}
else:
exclude_eids = F.cat([eids, F.gather_row(reverse_eid_map, eids)], 0)
return exclude_eids
def _find_exclude_eids_with_reverse_types(g, eids, reverse_etype_map):
exclude_eids = {g.to_canonical_etype(k): v for k, v in eids.items()}
reverse_etype_map = {
g.to_canonical_etype(k): g.to_canonical_etype(v)
for k, v in reverse_etype_map.items()}
exclude_eids.update({reverse_etype_map[k]: v for k, v in exclude_eids.items()})
return exclude_eids
def _find_exclude_eids(g, exclude_mode, eids, **kwargs):
if exclude_mode is None:
return None
elif F.is_tensor(exclude_mode) or (
isinstance(exclude_mode, Mapping) and
all(F.is_tensor(v) for v in exclude_mode.values())):
return exclude_mode
elif exclude_mode == 'self':
return eids
elif exclude_mode == 'reverse_id':
return _find_exclude_eids_with_reverse_id(g, eids, kwargs['reverse_eid_map'])
elif exclude_mode == 'reverse_types':
return _find_exclude_eids_with_reverse_types(g, eids, kwargs['reverse_etype_map'])
else:
raise ValueError('unsupported mode {}'.format(exclude_mode))
def find_exclude_eids(g, seed_edges, exclude, reverse_eids=None, reverse_etypes=None,
output_device=None):
"""Find all edge IDs to exclude according to :attr:`exclude_mode`.
Parameters
----------
g : DGLGraph
The graph.
exclude_mode : str, optional
Can be either of the following,
None (default)
Does not exclude any edge.
Tensor or dict[etype, Tensor]
Exclude the given edge IDs.
'self'
Exclude the given edges themselves but nothing else.
'reverse_id'
Exclude all edges specified in ``eids``, as well as their reverse edges
of the same edge type.
The mapping from each edge ID to its reverse edge ID is specified in
the keyword argument ``reverse_eid_map``.
This mode assumes that the reverse of an edge with ID ``e`` and type
``etype`` will have ID ``reverse_eid_map[e]`` and type ``etype``.
'reverse_types'
Exclude all edges specified in ``eids``, as well as their reverse
edges of the corresponding edge types.
The mapping from each edge type to its reverse edge type is specified
in the keyword argument ``reverse_etype_map``.
This mode assumes that the reverse of an edge with ID ``e`` and type ``etype``
will have ID ``e`` and type ``reverse_etype_map[etype]``.
eids : Tensor or dict[etype, Tensor]
The edge IDs.
reverse_eids : Tensor or dict[etype, Tensor]
The mapping from edge ID to its reverse edge ID.
reverse_etypes : dict[etype, etype]
The mapping from edge etype to its reverse edge type.
output_device : device
The device of the output edge IDs.
"""
exclude_eids = _find_exclude_eids(
g,
exclude,
seed_edges,
reverse_eid_map=reverse_eids,
reverse_etype_map=reverse_etypes)
if exclude_eids is not None:
exclude_eids = recursive_apply(
exclude_eids, lambda x: x.to(output_device))
return exclude_eids
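A hedged, self-contained sketch of the ``'reverse_id'`` mode. The toy graph stores each undirected edge as two directed edges so that edge ``i`` and edge ``i + 3`` are reverses of each other; ``find_exclude_eids`` is the function defined above and is assumed to be in scope.

import torch
import dgl

g = dgl.graph((torch.tensor([0, 1, 2, 1, 2, 0]),
               torch.tensor([1, 2, 0, 0, 1, 2])))
reverse_eids = torch.cat([torch.arange(3, 6), torch.arange(0, 3)])   # eid -> reverse eid

excluded = find_exclude_eids(g, torch.tensor([0, 5]), 'reverse_id',
                             reverse_eids=reverse_eids)
# tensor([0, 5, 3, 2]): the seed edges plus their reverse counterparts, ready to
# be passed to a block sampler as exclude_eids.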
class EdgeBlockSampler(object):
"""Adapts a :class:`BlockSampler` object's :attr:`sample` method for edge
classification and link prediction.
"""
def __init__(self, block_sampler, exclude=None, reverse_eids=None,
reverse_etypes=None, negative_sampler=None, prefetch_node_feats=None,
prefetch_labels=None, prefetch_edge_feats=None):
self.reverse_eids = reverse_eids
self.reverse_etypes = reverse_etypes
self.exclude = exclude
self.block_sampler = block_sampler
self.negative_sampler = negative_sampler
self.prefetch_node_feats = prefetch_node_feats or []
self.prefetch_labels = prefetch_labels or []
self.prefetch_edge_feats = prefetch_edge_feats or []
self.output_device = block_sampler.output_device
def _build_neg_graph(self, g, seed_edges):
neg_srcdst = self.negative_sampler(g, seed_edges)
if not isinstance(neg_srcdst, Mapping):
assert len(g.canonical_etypes) == 1, \
'graph has multiple or no edge types; '\
'please return a dict in negative sampler.'
neg_srcdst = {g.canonical_etypes[0]: neg_srcdst}
dtype = F.dtype(list(neg_srcdst.values())[0][0])
neg_edges = {
etype: neg_srcdst.get(etype, (F.tensor([], dtype), F.tensor([], dtype)))
for etype in g.canonical_etypes}
neg_pair_graph = heterograph(
neg_edges, {ntype: g.num_nodes(ntype) for ntype in g.ntypes})
return neg_pair_graph
def assign_lazy_features(self, result):
"""Assign lazy features for prefetching."""
pair_graph = result[1]
blocks = result[-1]
set_src_lazy_features(blocks[0], self.prefetch_node_feats)
set_edge_lazy_features(pair_graph, self.prefetch_labels)
for block in blocks:
set_edge_lazy_features(block, self.prefetch_edge_feats)
# In-place updates
return result
def sample(self, g, seed_edges):
"""Samples a list of blocks, as well as a subgraph containing the sampled
edges from the original graph.
If :attr:`negative_sampler` is given, also returns another graph containing the
negative pairs as edges.
"""
exclude = self.exclude
pair_graph = g.edge_subgraph(
seed_edges, relabel_nodes=False, output_device=self.output_device)
eids = pair_graph.edata[EID]
if self.negative_sampler is not None:
neg_graph = self._build_neg_graph(g, seed_edges)
pair_graph, neg_graph = compact_graphs([pair_graph, neg_graph])
else:
pair_graph = compact_graphs(pair_graph)
pair_graph.edata[EID] = eids
seed_nodes = pair_graph.ndata[NID]
exclude_eids = find_exclude_eids(
g, seed_edges, exclude, self.reverse_eids, self.reverse_etypes,
self.output_device)
input_nodes, _, blocks = self.block_sampler.sample_blocks(g, seed_nodes, exclude_eids)
if self.negative_sampler is None:
return self.assign_lazy_features((input_nodes, pair_graph, blocks))
else:
return self.assign_lazy_features((input_nodes, pair_graph, neg_graph, blocks))
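A hedged end-to-end sketch of the ``sample`` contract above for link prediction, written as if ``EdgeBlockSampler`` were in scope from this module. ``NeighborSampler`` comes from this PR's ``neighbor_sampler`` module; the toy graph, the batch of seed edge IDs, and the choice of 5 negatives per edge are illustrative.

import torch
import dgl
from dgl.dataloading.neighbor_sampler import NeighborSampler

g = dgl.rand_graph(100, 500)                        # toy homogeneous graph
sampler = EdgeBlockSampler(
    NeighborSampler([10, 10]),
    exclude='self',
    negative_sampler=dgl.dataloading.negative_sampler.Uniform(5))

seed_edges = torch.arange(0, 32)                    # one minibatch of edge IDs
input_nodes, pair_graph, neg_pair_graph, blocks = sampler.sample(g, seed_edges)
# pair_graph holds the positive edges, neg_pair_graph the sampled negatives, and
# blocks the MFGs that compute representations for input_nodes.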
"""Cluster-GCN subgraph iterators.""" """Cluster-GCN samplers."""
import os import os
import pickle import pickle
import numpy as np import numpy as np
from ..transform import metis_partition_assignment
from .. import backend as F from .. import backend as F
from .dataloader import SubgraphIterator from ..base import DGLError
from ..partition import metis_partition_assignment
from .base import set_node_lazy_features, set_edge_lazy_features
class ClusterGCNSubgraphIterator(SubgraphIterator): class ClusterGCNSampler(object):
"""Subgraph sampler following that of ClusterGCN. """Cluster-GCN sampler.
This sampler first partitions the graph with METIS partitioning, then it caches the nodes of This sampler first partitions the graph with METIS partitioning, then it caches the nodes of
each partition to a file within the given cache directory. each partition to a file within the given cache directory.
This is used in conjunction with :class:`dgl.dataloading.pytorch.GraphDataLoader`. This is used in conjunction with :class:`dgl.dataloading.DataLoader`.
Notes Notes
----- -----
...@@ -23,61 +24,53 @@ class ClusterGCNSubgraphIterator(SubgraphIterator): ...@@ -23,61 +24,53 @@ class ClusterGCNSubgraphIterator(SubgraphIterator):
---------- ----------
g : DGLGraph g : DGLGraph
The original graph. The original graph.
num_partitions : int k : int
The number of partitions. The number of partitions.
cache_directory : str cache_path : str
The path to the cache directory for storing the partition result. The path to the cache directory for storing the partition result.
refresh : bool
If True, recompute the partition.
Examples
--------
Assuming that you have a graph ``g``:
>>> sgiter = dgl.dataloading.ClusterGCNSubgraphIterator(
... g, num_partitions=100, cache_directory='.', refresh=True)
>>> dataloader = dgl.dataloading.GraphDataLoader(sgiter, batch_size=4, num_workers=0)
>>> for subgraph_batch in dataloader:
... train_on(subgraph_batch)
""" """
def __init__(self, g, num_partitions, cache_directory, refresh=False): def __init__(self, g, k, balance_ntypes=None, balance_edges=False, mode='k-way',
if os.name == 'nt': prefetch_node_feats=None, prefetch_edge_feats=None, output_device=None,
raise NotImplementedError("METIS partitioning is not supported on Windows yet.") cache_path='cluster_gcn.pkl'):
super().__init__(g) if os.path.exists(cache_path):
try:
# First see if the cache is already there. If so, directly read from cache. with open(cache_path, 'rb') as f:
if not refresh and self._load_parts(cache_directory): self.partition_offset, self.partition_node_ids = pickle.load(f)
return except (EOFError, TypeError, ValueError):
raise DGLError(
# Otherwise, build the cache. f'The contents in the cache file {cache_path} is invalid. '
assignment = F.asnumpy(metis_partition_assignment(g, num_partitions)) f'Please remove the cache file {cache_path} or specify another path.')
self._save_parts(assignment, cache_directory) if len(self.partition_offset) != k + 1:
raise DGLError(
def _cache_file_path(self, cache_directory): f'Number of partitions in the cache does not match the value of k. '
return os.path.join(cache_directory, 'cluster_gcn_cache') f'Please remove the cache file {cache_path} or specify another path.')
if len(self.partition_node_ids) != g.num_nodes():
def _load_parts(self, cache_directory): raise DGLError(
path = self._cache_file_path(cache_directory) f'Number of nodes in the cache does not match the given graph. '
if not os.path.exists(path): f'Please remove the cache file {cache_path} or specify another path.')
return False else:
partition_ids = metis_partition_assignment(
with open(path, 'rb') as file_: g, k, balance_ntypes=balance_ntypes, balance_edges=balance_edges, mode=mode)
self.part_indptr, self.part_indices = pickle.load(file_) partition_ids = F.asnumpy(partition_ids)
return True partition_node_ids = np.argsort(partition_ids)
partition_size = F.zerocopy_from_numpy(np.bincount(partition_ids, minlength=k))
def _save_parts(self, assignment, cache_directory): partition_offset = F.zerocopy_from_numpy(np.insert(np.cumsum(partition_size), 0, 0))
os.makedirs(cache_directory, exist_ok=True) partition_node_ids = F.zerocopy_from_numpy(partition_ids)
with open(cache_path, 'wb') as f:
self.part_indices = np.argsort(assignment) pickle.dump((partition_offset, partition_node_ids), f)
num_nodes_per_part = np.bincount(assignment) self.partition_offset = partition_offset
self.part_indptr = np.insert(np.cumsum(num_nodes_per_part), 0, 0) self.partition_node_ids = partition_node_ids
with open(self._cache_file_path(cache_directory), 'wb') as file_: self.prefetch_node_feats = prefetch_node_feats or []
pickle.dump((self.part_indptr, self.part_indices), file_) self.prefetch_edge_feats = prefetch_edge_feats or []
self.output_device = output_device
def __len__(self):
return self.part_indptr.shape[0] - 1 def sample(self, g, partition_ids):
"""Samples a subgraph given a list of partition IDs."""
def __getitem__(self, i): node_ids = F.cat([
nodes = self.part_indices[self.part_indptr[i]:self.part_indptr[i+1]] self.partition_node_ids[self.partition_offset[i]:self.partition_offset[i+1]]
return self.g.subgraph(nodes) for i in F.asnumpy(partition_ids)], 0)
sg = g.subgraph(node_ids, relabel_nodes=True, output_device=self.output_device)
set_node_lazy_features(sg, self.prefetch_node_feats)
set_edge_lazy_features(sg, self.prefetch_edge_feats)
return sg
"""Distributed dataloaders.
"""
import inspect
from ..distributed import DistDataLoader
# Still depends on the legacy NodeCollator...
from .._dataloading.dataloader import NodeCollator, EdgeCollator
def _remove_kwargs_dist(kwargs):
if 'num_workers' in kwargs:
del kwargs['num_workers']
if 'pin_memory' in kwargs:
del kwargs['pin_memory']
print('Distributed DataLoaders do not support pin_memory.')
return kwargs
class DistNodeDataLoader(DistDataLoader):
"""PyTorch dataloader for batch-iterating over a set of nodes, generating the list
of message flow graphs (MFGs) as computation dependency of the said minibatch, on
a distributed graph.
All the arguments have the same meaning as the single-machine counterpart
:class:`dgl.dataloading.pytorch.NodeDataLoader` except the first argument
:attr:`g` which must be a :class:`dgl.distributed.DistGraph`.
Parameters
----------
g : DistGraph
The distributed graph.
nids, graph_sampler, device, kwargs :
See :class:`dgl.dataloading.pytorch.NodeDataLoader`.
See also
--------
dgl.dataloading.pytorch.NodeDataLoader
"""
def __init__(self, g, nids, graph_sampler, device=None, **kwargs):
collator_kwargs = {}
dataloader_kwargs = {}
_collator_arglist = inspect.getfullargspec(NodeCollator).args
for k, v in kwargs.items():
if k in _collator_arglist:
collator_kwargs[k] = v
else:
dataloader_kwargs[k] = v
if device is None:
# for the distributed case default to the CPU
device = 'cpu'
assert device == 'cpu', 'Only cpu is supported in the case of a DistGraph.'
# Distributed DataLoader currently does not support heterogeneous graphs
# and does not copy features. Fallback to normal solution
self.collator = NodeCollator(g, nids, graph_sampler, **collator_kwargs)
_remove_kwargs_dist(dataloader_kwargs)
super().__init__(self.collator.dataset,
collate_fn=self.collator.collate,
**dataloader_kwargs)
self.device = device
class DistEdgeDataLoader(DistDataLoader):
"""PyTorch dataloader for batch-iterating over a set of edges, generating the list
of message flow graphs (MFGs) as computation dependency of the said minibatch for
edge classification, edge regression, and link prediction, on a distributed
graph.
All the arguments have the same meaning as the single-machine counterpart
:class:`dgl.dataloading.pytorch.EdgeDataLoader` except the first argument
:attr:`g` which must be a :class:`dgl.distributed.DistGraph`.
Parameters
----------
g : DistGraph
The distributed graph.
eids, graph_sampler, device, kwargs :
See :class:`dgl.dataloading.pytorch.EdgeDataLoader`.
See also
--------
dgl.dataloading.pytorch.EdgeDataLoader
"""
def __init__(self, g, eids, graph_sampler, device=None, **kwargs):
collator_kwargs = {}
dataloader_kwargs = {}
_collator_arglist = inspect.getfullargspec(EdgeCollator).args
for k, v in kwargs.items():
if k in _collator_arglist:
collator_kwargs[k] = v
else:
dataloader_kwargs[k] = v
if device is None:
# for the distributed case default to the CPU
device = 'cpu'
assert device == 'cpu', 'Only cpu is supported in the case of a DistGraph.'
# Distributed DataLoader currently does not support heterogeneous graphs
# and does not copy features. Fallback to normal solution
self.collator = EdgeCollator(g, eids, graph_sampler, **collator_kwargs)
_remove_kwargs_dist(dataloader_kwargs)
super().__init__(self.collator.dataset,
collate_fn=self.collator.collate,
**dataloader_kwargs)
self.device = device
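A hedged sketch of how these loaders are meant to be used in a distributed training script. The IP config file name, graph name, and sampler choice are placeholders; a graph that has already been partitioned with DGL's distributed partitioning tools is assumed.

import dgl

dgl.distributed.initialize('ip_config.txt')
g = dgl.distributed.DistGraph('my_graph')
train_nid = dgl.distributed.node_split(g.ndata['train_mask'], g.get_partition_book())
sampler = dgl.dataloading.MultiLayerNeighborSampler([10, 25])
dataloader = DistNodeDataLoader(g, train_nid, sampler, batch_size=1024, shuffle=True)

for input_nodes, output_nodes, blocks in dataloader:
    batch_feats = g.ndata['feat'][input_nodes]   # features are fetched explicitly
    # forward/backward pass goes here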
"""Negative samplers""" """Negative samplers"""
from collections.abc import Mapping from collections.abc import Mapping
from .. import backend as F from .. import backend as F
from ..sampling import global_uniform_negative_sampling
class _BaseNegativeSampler(object): class _BaseNegativeSampler(object):
def _generate(self, g, eids, canonical_etype): def _generate(self, g, eids, canonical_etype):
...@@ -26,7 +25,7 @@ class _BaseNegativeSampler(object): ...@@ -26,7 +25,7 @@ class _BaseNegativeSampler(object):
eids = {g.to_canonical_etype(k): v for k, v in eids.items()} eids = {g.to_canonical_etype(k): v for k, v in eids.items()}
neg_pair = {k: self._generate(g, v, k) for k, v in eids.items()} neg_pair = {k: self._generate(g, v, k) for k, v in eids.items()}
else: else:
assert len(g.etypes) == 1, \ assert len(g.canonical_etypes) == 1, \
'please specify a dict of etypes and ids for graphs with multiple edge types' 'please specify a dict of etypes and ids for graphs with multiple edge types'
neg_pair = self._generate(g, eids, g.canonical_etypes[0]) neg_pair = self._generate(g, eids, g.canonical_etypes[0])
...@@ -64,7 +63,7 @@ class PerSourceUniform(_BaseNegativeSampler): ...@@ -64,7 +63,7 @@ class PerSourceUniform(_BaseNegativeSampler):
shape = (shape[0] * self.k,) shape = (shape[0] * self.k,)
src, _ = g.find_edges(eids, etype=canonical_etype) src, _ = g.find_edges(eids, etype=canonical_etype)
src = F.repeat(src, self.k, 0) src = F.repeat(src, self.k, 0)
dst = F.randint(shape, dtype, ctx, 0, g.number_of_nodes(vtype)) dst = F.randint(shape, dtype, ctx, 0, g.num_nodes(vtype))
return src, dst return src, dst
# Alias # Alias
...@@ -90,14 +89,6 @@ class GlobalUniform(_BaseNegativeSampler): ...@@ -90,14 +89,6 @@ class GlobalUniform(_BaseNegativeSampler):
replace : bool, optional replace : bool, optional
Whether to sample with replacement. Setting it to True will make things Whether to sample with replacement. Setting it to True will make things
faster. (Default: True) faster. (Default: True)
redundancy : float, optional
Indicates how much more negative samples to actually generate during rejection sampling
before finding the unique pairs.
Increasing it will increase the likelihood of getting :attr:`k` negative samples
per edge, but will also take more time and memory.
(Default: automatically determined by the density of graph)
Notes Notes
----- -----
...@@ -113,13 +104,11 @@ class GlobalUniform(_BaseNegativeSampler): ...@@ -113,13 +104,11 @@ class GlobalUniform(_BaseNegativeSampler):
>>> neg_sampler(g, torch.LongTensor([0, 1])) >>> neg_sampler(g, torch.LongTensor([0, 1]))
(tensor([0, 1, 3, 2]), tensor([2, 0, 2, 1])) (tensor([0, 1, 3, 2]), tensor([2, 0, 2, 1]))
""" """
def __init__(self, k, exclude_self_loops=True, replace=False, redundancy=None): def __init__(self, k, exclude_self_loops=True, replace=False):
self.k = k self.k = k
self.exclude_self_loops = exclude_self_loops self.exclude_self_loops = exclude_self_loops
self.replace = replace self.replace = replace
self.redundancy = redundancy
def _generate(self, g, eids, canonical_etype): def _generate(self, g, eids, canonical_etype):
return global_uniform_negative_sampling( return g.global_uniform_negative_sampling(
g, len(eids) * self.k, self.exclude_self_loops, self.replace, len(eids) * self.k, self.exclude_self_loops, self.replace, canonical_etype)
canonical_etype, self.redundancy)
"""Data loading components for neighbor sampling"""
from ..base import NID, EID
from ..transform import to_block
from .base import BlockSampler
class NeighborSampler(BlockSampler):
"""Sampler that builds computational dependency of node representations via
neighbor sampling for multilayer GNN.
This sampler will make every node gather messages from a fixed number of neighbors
per edge type. The neighbors are picked uniformly.
Parameters
----------
fanouts : list[int] or list[dict[etype, int]]
List of neighbors to sample per edge type for each GNN layer, with the i-th
element being the fanout for the i-th GNN layer.
If only a single integer is provided, DGL assumes that every edge type
will have the same fanout.
If -1 is provided for one edge type on one layer, then all inbound edges
of that edge type will be included.
replace : bool, default False
Whether to sample with replacement
prob : str, optional
If given, the probability of each neighbor being sampled is proportional
to the edge feature value with the given name in ``g.edata``. The feature must be
a scalar on each edge.
Examples
--------
To train a 3-layer GNN for node classification on a set of nodes ``train_nid`` on
a homogeneous graph where each node takes messages from 5, 10, 15 neighbors for
the first, second, and third layer respectively (assuming the backend is PyTorch):
>>> sampler = dgl.dataloading.NeighborSampler([5, 10, 15])
>>> dataloader = dgl.dataloading.NodeDataLoader(
... g, train_nid, sampler,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, output_nodes, blocks in dataloader:
... train_on(blocks)
If training on a heterogeneous graph and you want different number of neighbors for each
edge type, one should instead provide a list of dicts. Each dict would specify the
number of neighbors to pick per edge type.
>>> sampler = dgl.dataloading.NeighborSampler([
... {('user', 'follows', 'user'): 5,
... ('user', 'plays', 'game'): 4,
... ('game', 'played-by', 'user'): 3}] * 3)
If you would like non-uniform neighbor sampling:
>>> g.edata['p'] = torch.rand(g.num_edges()) # any non-negative 1D vector works
>>> sampler = dgl.dataloading.NeighborSampler([5, 10, 15], prob='p')
Notes
-----
For the concept of MFGs, please refer to
:ref:`User Guide Section 6 <guide-minibatch>` and
:doc:`Minibatch Training Tutorials <tutorials/large/L0_neighbor_sampling_overview>`.
"""
def __init__(self, fanouts, edge_dir='in', prob=None, replace=False, **kwargs):
super().__init__(**kwargs)
self.fanouts = fanouts
self.edge_dir = edge_dir
self.prob = prob
self.replace = replace
def sample_blocks(self, g, seed_nodes, exclude_eids=None):
output_nodes = seed_nodes
blocks = []
for fanout in reversed(self.fanouts):
frontier = g.sample_neighbors(
seed_nodes, fanout, edge_dir=self.edge_dir, prob=self.prob,
replace=self.replace, output_device=self.output_device,
exclude_edges=exclude_eids)
eid = frontier.edata[EID]
block = to_block(frontier, seed_nodes)
block.edata[EID] = eid
seed_nodes = block.srcdata[NID]
blocks.insert(0, block)
return seed_nodes, output_nodes, blocks
MultiLayerNeighborSampler = NeighborSampler
class MultiLayerFullNeighborSampler(NeighborSampler):
"""Sampler that builds computational dependency of node representations by taking messages
from all neighbors for multilayer GNN.
This sampler will make every node gather messages from every single neighbor per edge type.
Parameters
----------
num_layers : int
The number of GNN layers to sample.
return_eids : bool, default False
Whether to return the edge IDs involved in message passing in the MFG.
If True, the edge IDs will be stored as an edge feature named ``dgl.EID``.
Examples
--------
To train a 3-layer GNN for node classification on a set of nodes ``train_nid`` on
a homogeneous graph where each node takes messages from all neighbors for the first,
second, and third layer respectively (assuming the backend is PyTorch):
>>> sampler = dgl.dataloading.MultiLayerFullNeighborSampler(3)
>>> dataloader = dgl.dataloading.NodeDataLoader(
... g, train_nid, sampler,
... batch_size=1024, shuffle=True, drop_last=False, num_workers=4)
>>> for input_nodes, output_nodes, blocks in dataloader:
... train_on(blocks)
Notes
-----
For the concept of MFGs, please refer to
:ref:`User Guide Section 6 <guide-minibatch>` and
:doc:`Minibatch Training Tutorials <tutorials/large/L0_neighbor_sampling_overview>`.
"""
def __init__(self, num_layers, edge_dir='in', prob=None, replace=False, **kwargs):
super().__init__([-1] * num_layers, edge_dir=edge_dir, prob=prob, replace=replace,
**kwargs)
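The ``__init__`` above makes the relationship explicit: a ``MultiLayerFullNeighborSampler(n)`` is just a ``NeighborSampler`` whose fanout is ``-1`` (take every neighbor) for each of the ``n`` layers. A tiny sketch using the two classes defined above:

full_sampler = MultiLayerFullNeighborSampler(2)
assert full_sampler.fanouts == [-1, -1]

explicit = NeighborSampler([-1, -1])   # equivalent construction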
"""ShaDow-GNN subgraph samplers.""" """ShaDow-GNN subgraph samplers."""
from ..utils import prepare_tensor_or_dict from ..sampling.utils import EidExcluder
from ..base import NID
from .. import transform from .. import transform
from ..sampling import sample_neighbors from ..base import NID
from .neighbor import NeighborSamplingMixin from .base import set_node_lazy_features, set_edge_lazy_features
from .dataloader import exclude_edges, Sampler
class ShaDowKHopSampler(NeighborSamplingMixin, Sampler): class ShaDowKHopSampler(object):
"""K-hop subgraph sampler used by """K-hop subgraph sampler used by
`ShaDow-GNN <https://arxiv.org/abs/2012.01380>`__. `ShaDow-GNN <https://arxiv.org/abs/2012.01380>`__.
...@@ -70,29 +68,32 @@ class ShaDowKHopSampler(NeighborSamplingMixin, Sampler): ...@@ -70,29 +68,32 @@ class ShaDowKHopSampler(NeighborSamplingMixin, Sampler):
If you would like non-uniform neighbor sampling: If you would like non-uniform neighbor sampling:
>>> g.edata['p'] = torch.rand(g.num_edges()) # any non-negative 1D vector works >>> g.edata['p'] = torch.rand(g.num_edges()) # any non-negative 1D vector works
>>> sampler = dgl.dataloading.MultiLayerNeighborSampler([5, 10, 15], prob='p') >>> sampler = dgl.dataloading.ShaDowKHopSampler([5, 10, 15], prob='p')
""" """
def __init__(self, fanouts, replace=False, prob=None, output_ctx=None): def __init__(self, fanouts, replace=False, prob=None, prefetch_node_feats=None,
super().__init__(output_ctx) prefetch_edge_feats=None, output_device=None):
self.fanouts = fanouts self.fanouts = fanouts
self.replace = replace self.replace = replace
self.prob = prob self.prob = prob
self.set_output_context(output_ctx) self.prefetch_node_feats = prefetch_node_feats
self.prefetch_edge_feats = prefetch_edge_feats
self.output_device = output_device
def sample(self, g, seed_nodes, exclude_eids=None): def sample(self, g, seed_nodes, exclude_edges=None):
self._build_fanout(len(self.fanouts), g) """Sample a subgraph given a tensor of seed nodes."""
self._build_prob_arrays(g)
seed_nodes = prepare_tensor_or_dict(g, seed_nodes, 'seed nodes')
output_nodes = seed_nodes output_nodes = seed_nodes
for fanout in reversed(self.fanouts):
for i in range(len(self.fanouts)): frontier = g.sample_neighbors(
fanout = self.fanouts[i] seed_nodes, fanout, output_device=self.output_device,
frontier = sample_neighbors( replace=self.replace, prob=self.prob, exclude_edges=exclude_edges)
g, seed_nodes, fanout, replace=self.replace, prob=self.prob_arrays)
block = transform.to_block(frontier, seed_nodes) block = transform.to_block(frontier, seed_nodes)
seed_nodes = block.srcdata[NID] seed_nodes = block.srcdata[NID]
subg = g.subgraph(seed_nodes, relabel_nodes=True) subg = g.subgraph(seed_nodes, relabel_nodes=True, output_device=self.output_device)
subg = exclude_edges(subg, exclude_eids, self.output_device) if exclude_edges is not None:
subg = EidExcluder(exclude_edges)(subg)
set_node_lazy_features(subg, self.prefetch_node_feats)
set_edge_lazy_features(subg, self.prefetch_edge_feats)
return seed_nodes, output_nodes, [subg] return seed_nodes, output_nodes, subg
@@ -26,6 +26,7 @@ from . import rpc
from . import role
from .server_state import ServerState
from .rpc_server import start_server
+from . import graph_services
from .graph_services import find_edges as dist_find_edges
from .graph_services import out_degrees as dist_out_degrees
from .graph_services import in_degrees as dist_in_degrees
@@ -1223,6 +1224,20 @@ class DistGraph:
        '''
        self._client.barrier()

+    def sample_neighbors(self, seed_nodes, fanout, edge_dir='in', prob=None,
+                         exclude_edges=None, replace=False,
+                         output_device=None):
+        # pylint: disable=unused-argument
+        """Sample neighbors from a distributed graph."""
+        # Currently prob, exclude_edges, output_device, and edge_dir are ignored.
+        if len(self.etypes) > 1:
+            frontier = graph_services.sample_etype_neighbors(
+                self, seed_nodes, ETYPE, fanout, replace=replace)
+        else:
+            frontier = graph_services.sample_neighbors(
+                self, seed_nodes, fanout, replace=replace)
+        return frontier
+
    def _get_ndata_names(self, ntype=None):
        ''' Get the names of all node data.
        '''
"""Python interfaces to DGL farthest point sampler.""" """Python interfaces to DGL farthest point sampler."""
from dgl._ffi.base import DGLError
import numpy as np import numpy as np
from .._ffi.base import DGLError
from .._ffi.function import _init_api from .._ffi.function import _init_api
from .. import backend as F from .. import backend as F
from .. import ndarray as nd from .. import ndarray as nd
......
@@ -1600,6 +1600,14 @@ class DGLHeteroGraph(object):
    # View
    #################################################################

+    def get_node_storage(self, key, ntype=None):
+        """Get storage object of node feature of type :attr:`ntype` and name :attr:`key`."""
+        return self._node_frames[self.get_ntype_id(ntype)]._columns[key]
+
+    def get_edge_storage(self, key, etype=None):
+        """Get storage object of edge feature of type :attr:`etype` and name :attr:`key`."""
+        return self._edge_frames[self.get_etype_id(etype)]._columns[key]
+
    @property
    def nodes(self):
        """Return a node view