"examples/sampling/graphbolt/vscode:/vscode.git/clone" did not exist on "35b50b61a5c6fdb34bf6ea0df547ec669bfb17de"
Unverified commit d41d07d0, authored by Quan (Andy) Gan, committed by GitHub

[Doc and bugfix] Add docs and user guide and update tutorial for sampling pipeline (#3774)



* huuuuge update

* remove

* lint

* lint

* fix

* what happened to nccl

* update multi-gpu unsupervised graphsage example

* replace most of the dgl.mp.process with torch.mp.spawn

* update if condition for use_uva case

* update user guide

* address comments

* incorporating suggestions from @jermainewang

* oops

* fix tutorial to pass CI

* oops

* fix again
Co-authored-by: Xin Yao <xiny@nvidia.com>
parent 3bd5a9b6
@@ -40,7 +40,45 @@ class _LazyIndex(object):
         return flat_index


 class LazyFeature(object):
-    """Placeholder for prefetching from DataLoader.
+    """Placeholder for feature prefetching.
+
+    One can assign this object to the ``ndata`` or ``edata`` of the graphs returned by
+    various samplers' :attr:`sample` method.  When DGL's dataloader receives the
+    subgraphs returned by the sampler, it automatically looks up every ``ndata`` and
+    ``edata`` entry whose value is a LazyFeature and replaces it with the actual data
+    of the corresponding nodes/edges from the original graph.  In particular, if a
+    subgraph returned by the sampler has a LazyFeature with name ``k`` in
+    ``subgraph.ndata[key]``:
+
+    .. code:: python
+
+       subgraph.ndata[key] = LazyFeature(k)
+
+    then, assuming that ``graph`` is the original graph, DGL's dataloader will perform
+
+    .. code:: python
+
+       subgraph.ndata[key] = graph.ndata[k][subgraph.ndata[dgl.NID]]
+
+    DGL's dataloader performs a similar replacement for ``edata``.
+
+    For heterogeneous graphs, the replacement is:
+
+    .. code:: python
+
+       subgraph.nodes[ntype].data[key] = graph.nodes[ntype].data[k][
+           subgraph.nodes[ntype].data[dgl.NID]]
+
+    For MFGs' ``srcdata`` (and similarly ``dstdata``), the replacement is
+
+    .. code:: python
+
+       mfg.srcdata[key] = graph.ndata[k][mfg.srcdata[dgl.NID]]
+
+    Parameters
+    ----------
+    name : str
+        The name of the data in the original graph.
+    id_ : Tensor, optional
+        The ID tensor.
     """
     __slots__ = ['name', 'id_']

     def __init__(self, name=None, id_=None):
...
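
For orientation, here is a minimal sketch of how a custom sampler might hand feature fetching off to the dataloader with LazyFeature. The subclass, the feature names, and the exact import path are illustrative assumptions, not part of this commit:

    import dgl
    from dgl.dataloading import LazyFeature, NeighborSampler  # import path assumed

    class PrefetchingSampler(NeighborSampler):
        """Ask the dataloader to attach 'feat' to the first MFG lazily."""
        def sample(self, g, seed_nodes):
            input_nodes, output_nodes, blocks = super().sample(g, seed_nodes)
            # The dataloader later replaces this placeholder with
            # g.ndata['feat'][blocks[0].srcdata[dgl.NID]] when producing the batch.
            blocks[0].srcdata['x'] = LazyFeature('feat')
            return input_nodes, output_nodes, blocks

The built-in samplers also grow prefetch arguments (e.g. ``prefetch_node_feats``) that set these placeholders for you; the subclass above only illustrates the mechanism.
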
@@ -910,8 +910,73 @@ def alias_func(func):
     _fn.__doc__ = """Alias of :func:`dgl.{}`.""".format(func.__name__)
     return _fn

+
+def apply_each(data, fn, *args, **kwargs):
+    """Apply a function to every element in a container.
+
+    If the input data is a list or any sequence other than a string, this returns a
+    list whose elements are the results of applying the function to each element.
+    If the input data is a dict or any mapping, this returns a dict with the same
+    keys, whose values are the results of applying the function to the corresponding
+    values.
+
+    The function receives each individual element of the input data as its first
+    argument, followed by the arguments in :attr:`args` and :attr:`kwargs`.
+
+    Parameters
+    ----------
+    data : any
+        Any object.
+    fn : callable
+        Any function.
+    args, kwargs :
+        Additional arguments and keyword arguments passed to the function.
+
+    Examples
+    --------
+    Applying a ReLU function to a dictionary of tensors:
+
+    >>> h = {k: torch.randn(3) for k in ['A', 'B', 'C']}
+    >>> h = apply_each(h, torch.nn.functional.relu)
+    >>> assert all((v >= 0).all() for v in h.values())
+    """
+    if isinstance(data, Mapping):
+        return {k: fn(v, *args, **kwargs) for k, v in data.items()}
+    elif isinstance(data, Sequence):
+        return [fn(v, *args, **kwargs) for v in data]
+    else:
+        return fn(data, *args, **kwargs)
+
+
 def recursive_apply(data, fn, *args, **kwargs):
     """Recursively apply a function to every element in a container.
+
+    If the input data is a list or any sequence other than a string, this returns a
+    list whose elements are the results of applying the function to each element.
+    If the input data is a dict or any mapping, this returns a dict with the same
+    keys, whose values are the results of applying the function to the corresponding
+    values.
+    If the input data is a nested container, the result has the same nested
+    structure, with each element transformed recursively.
+
+    The function receives each individual element of the input data as its first
+    argument, followed by the arguments in :attr:`args` and :attr:`kwargs`.
+
+    Parameters
+    ----------
+    data : any
+        Any object.
+    fn : callable
+        Any function.
+    args, kwargs :
+        Additional arguments and keyword arguments passed to the function.
+
+    Examples
+    --------
+    Applying a ReLU function to a dictionary of tensors:
+
+    >>> h = {k: torch.randn(3) for k in ['A', 'B', 'C']}
+    >>> h = recursive_apply(h, torch.nn.functional.relu)
+    >>> assert all((v >= 0).all() for v in h.values())
+    """
     if isinstance(data, str):  # str is a Sequence
         return fn(data, *args, **kwargs)
...
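
A small usage sketch of the two helpers on per-node-type feature dicts. The tensors and type names are made up; ``apply_each`` is exposed at the package level, while ``recursive_apply`` is an internal utility whose exact import path is an assumption here:

    import torch
    import dgl
    from dgl.utils import recursive_apply  # internal helper; path assumed

    # Per-node-type hidden states, e.g. the output of a heterogeneous GNN layer.
    h = {'user': torch.randn(4, 16), 'item': torch.randn(6, 16)}

    # Extra positional/keyword arguments are forwarded after each element.
    h = dgl.apply_each(h, torch.nn.functional.leaky_relu, negative_slope=0.1)

    # recursive_apply also descends into nested containers, e.g. a list of dicts.
    stack = [{'user': torch.randn(4, 16)}, {'user': torch.randn(4, 16)}]
    stack = recursive_apply(stack, lambda x: x.relu())
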
@@ -79,6 +79,8 @@ HeteroSubgraph SampleNeighbors(
   CHECK_EQ(prob.size(), hg->NumEdgeTypes())
     << "Number of probability tensors must match the number of edge types.";

+  DLContext ctx = aten::GetContextOf(nodes);
+
   std::vector<HeteroGraphPtr> subrels(hg->NumEdgeTypes());
   std::vector<IdArray> induced_edges(hg->NumEdgeTypes());
   for (dgl_type_t etype = 0; etype < hg->NumEdgeTypes(); ++etype) {
@@ -93,8 +95,8 @@ HeteroSubgraph SampleNeighbors(
         hg->GetRelationGraph(etype)->NumVertexTypes(),
         hg->NumVertices(src_vtype),
         hg->NumVertices(dst_vtype),
-        hg->DataType(), hg->Context());
-      induced_edges[etype] = aten::NullArray(hg->DataType(), hg->Context());
+        hg->DataType(), ctx);
+      induced_edges[etype] = aten::NullArray(hg->DataType(), ctx);
     } else if (fanouts[etype] == -1) {
       const auto &earr = (dir == EdgeDir::kOut) ?
         hg->OutEdges(etype, nodes_ntype) :
...
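
This C++ change makes the per-edge-type placeholder graphs follow the device of the seed nodes rather than the graph's own context, which appears to matter for GPU/UVA sampling where the graph stays in pinned host memory while the seeds live on the GPU (the commit message mentions the ``use_uva`` case). A hedged sketch of the Python-side pipeline this supports; flag names follow the 0.8 dataloading API but should be treated as assumptions:

    import torch
    import dgl

    # `graph` and `train_nids` are assumed to exist; requires a CUDA device.
    device = torch.device('cuda:0')
    sampler = dgl.dataloading.NeighborSampler([10, 10])
    dataloader = dgl.dataloading.DataLoader(
        graph,                  # graph kept in (pinned) host memory
        train_nids.to(device),  # seed nodes already on the GPU
        sampler,
        device=device,          # MFGs are produced on the GPU
        use_uva=True,           # sample through unified virtual addressing
        batch_size=1024,
        shuffle=True)
    for input_nodes, output_nodes, blocks in dataloader:
        pass  # training step goes here
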
@@ -85,11 +85,11 @@ test_nids = idx_split['test']
 # DGL provides tools to iterate over the dataset in minibatches
 # while generating the computation dependencies to compute their outputs
 # with the MFGs above. For node classification, you can use
-# ``dgl.dataloading.NodeDataLoader`` for iterating over the dataset.
+# ``dgl.dataloading.DataLoader`` for iterating over the dataset.
 # It accepts a sampler object to control how to generate the computation
 # dependencies in the form of MFGs. DGL provides
 # implementations of common sampling algorithms such as
-# ``dgl.dataloading.MultiLayerNeighborSampler`` which randomly picks
+# ``dgl.dataloading.NeighborSampler`` which randomly picks
 # a fixed number of neighbors for each node.
 #
 # .. note::
@@ -97,7 +97,7 @@ test_nids = idx_split['test']
 #    To write your own neighbor sampler, please refer to :ref:`this user
 #    guide section <guide-minibatch-customizing-neighborhood-sampler>`.
 #
-# The syntax of ``dgl.dataloading.NodeDataLoader`` is mostly similar to a
+# The syntax of ``dgl.dataloading.DataLoader`` is mostly similar to a
 # PyTorch ``DataLoader``, with the addition that it needs a graph to
 # generate computation dependency from, a set of node IDs to iterate on,
 # and the neighbor sampler you defined.
@@ -107,9 +107,9 @@ test_nids = idx_split['test']
 # like the following.
 #
-sampler = dgl.dataloading.MultiLayerNeighborSampler([4, 4])
-train_dataloader = dgl.dataloading.NodeDataLoader(
-    # The following arguments are specific to NodeDataLoader.
+sampler = dgl.dataloading.NeighborSampler([4, 4])
+train_dataloader = dgl.dataloading.DataLoader(
+    # The following arguments are specific to DGL's DataLoader.
     graph,              # The graph
     train_nids,         # The node IDs to iterate over in minibatches
     sampler,            # The neighbor sampler
@@ -141,7 +141,7 @@ print("To compute {} nodes' outputs, we need {} nodes' input features".format(le
 ######################################################################
-# ``NodeDataLoader`` gives us three items per iteration.
+# DGL's ``DataLoader`` gives us three items per iteration.
 #
 # - An ID tensor for the input nodes, i.e., nodes whose input features
 #   are needed on the first GNN layer for this minibatch.
@@ -262,7 +262,7 @@ opt = torch.optim.Adam(model.parameters())
 # loader.
 #
-valid_dataloader = dgl.dataloading.NodeDataLoader(
+valid_dataloader = dgl.dataloading.DataLoader(
     graph, valid_nids, sampler,
     batch_size=1024,
     shuffle=False,
...
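
The tutorial's training loop itself is outside this diff. For orientation, a minimal sketch of consuming the three items the loader yields; the feature/label keys and the ``model``/``opt`` objects are placeholders from the surrounding tutorial:

    import torch.nn.functional as F

    for input_nodes, output_nodes, mfgs in train_dataloader:
        x = mfgs[0].srcdata['feat']    # inputs needed by the first layer
        y = mfgs[-1].dstdata['label']  # labels of the seed (output) nodes
        y_hat = model(mfgs, x)
        loss = F.cross_entropy(y_hat, y)
        opt.zero_grad()
        loss.backward()
        opt.step()
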
@@ -91,7 +91,7 @@ test_nids = idx_split['test']
 # in a similar fashion introduced in the :doc:`large-scale node classification
 # tutorial <L1_large_node_classification>`.
 #
-# DGL provides ``dgl.dataloading.EdgeDataLoader`` to
+# DGL provides ``dgl.dataloading.as_edge_prediction_sampler`` to
 # iterate over edges for edge classification or link prediction tasks.
 #
 # To perform link prediction, you need to specify a negative sampler. DGL
@@ -105,18 +105,19 @@ negative_sampler = dgl.dataloading.negative_sampler.Uniform(5)
 ######################################################################
 # After defining the negative sampler, one can then define the edge data
-# loader with neighbor sampling. To create an ``EdgeDataLoader`` for
+# loader with neighbor sampling. To create a ``DataLoader`` for
 # link prediction, provide a neighbor sampler object as well as the negative
 # sampler object created above.
 #
-sampler = dgl.dataloading.MultiLayerNeighborSampler([4, 4])
-train_dataloader = dgl.dataloading.EdgeDataLoader(
-    # The following arguments are specific to EdgeDataLoader.
+sampler = dgl.dataloading.NeighborSampler([4, 4])
+sampler = dgl.dataloading.as_edge_prediction_sampler(
+    sampler, negative_sampler=negative_sampler)
+train_dataloader = dgl.dataloading.DataLoader(
+    # The following arguments are specific to DataLoader.
     graph,                                  # The graph
     torch.arange(graph.number_of_edges()),  # The edges to iterate over
     sampler,                                # The neighbor sampler
-    negative_sampler=negative_sampler,      # The negative sampler
     device=device,                          # Put the MFGs on CPU or GPU
     # The following arguments are inherited from PyTorch DataLoader.
     batch_size=1024,    # Batch size
@@ -247,8 +248,8 @@ def inference(model, graph, node_features):
     with torch.no_grad():
         nodes = torch.arange(graph.number_of_nodes())

-        sampler = dgl.dataloading.MultiLayerNeighborSampler([4, 4])
-        train_dataloader = dgl.dataloading.NodeDataLoader(
+        sampler = dgl.dataloading.NeighborSampler([4, 4])
+        train_dataloader = dgl.dataloading.DataLoader(
            graph, torch.arange(graph.number_of_nodes()), sampler,
            batch_size=1024,
            shuffle=False,
@@ -390,80 +391,27 @@ test_neg_dst = torch.randint(0, graph.num_nodes(), (n_test_pos,))
 ######################################################################
-# First you need to construct a graph for ``dgl.dataloading.EdgeDataLoader``
-# to iterate on, i.e. with the testing node pairs as edges.
-# You also need to label the edges, 1 if positive and 0 if negative.
+# First you need to compute the node representations for all the nodes
+# with the ``inference`` method above:
 #
-test_src = torch.cat([test_pos_src, test_pos_dst])
-test_dst = torch.cat([test_neg_src, test_neg_dst])
-test_graph = dgl.graph((test_src, test_dst), num_nodes=graph.num_nodes())
-test_ground_truth = torch.cat(
-    [torch.ones_like(test_pos_src), torch.zeros_like(test_neg_src)])
-
-######################################################################
-# You will need to merge the test graph with the original graph. The
-# testing edges' ID will be starting from ``graph.num_edges()``.
-#
-new_graph = dgl.merge([graph, test_graph])
-test_edge_ids = torch.arange(graph.num_edges(), new_graph.num_edges())
-
-######################################################################
-# Then you could create a new ``EdgeDataLoader`` instance that
-# iterates on the new ``test_graph``, but uses the original ``graph``
-# for neighbor sampling.
-#
-# Note that you do not need negative sampling in this dataloader: the
-# negative pairs are already in the new test graph.
-#
-test_dataloader = dgl.dataloading.EdgeDataLoader(
-    # The following arguments are specific to EdgeDataLoader.
-    new_graph,              # The graph to iterate edges over
-    test_edge_ids,          # The edges to iterate over
-    sampler,                # The neighbor sampler
-    device=device,          # Put the MFGs on CPU or GPU
-    exclude=test_edge_ids,  # Do not sample test edges as neighbors
-    # The following arguments are inherited from PyTorch DataLoader.
-    batch_size=1024,    # Batch size
-    shuffle=True,       # Whether to shuffle the nodes for every epoch
-    drop_last=False,    # Whether to drop the last incomplete batch
-    num_workers=0       # Number of sampler processes
-)
+node_reprs = inference(model, graph, node_features)

 ######################################################################
-# The rest is similar to training except that you no longer compute
-# the gradients, and you collect all the scores and ground truth
-# labels for final metric calculation.
-#
-# .. note::
-#
-#    If the graph does not change, you can also precompute all the
-#    node representations beforehand with ``inference`` function.
-#    You can then feed the precomputed results directly into the
-#    predictor without passing the MFGs into the model.
+# Since the predictor is a dot product, you can now easily compute the
+# scores of the positive and negative test pairs and compute metrics such
+# as AUC:
 #
-test_preds = []
-test_labels = []
-with tqdm.tqdm(test_dataloader) as tq, torch.no_grad():
-    for step, (input_nodes, pair_graph, mfgs) in enumerate(tq):
-        # feature copy from CPU to GPU takes place here
-        inputs = mfgs[0].srcdata['feat']
-        outputs = model(mfgs, inputs)
-        test_preds.append(predictor(pair_graph, outputs))
-        test_labels.append(
-            # Need to map the IDs of test edges in the merged graph back
-            # to that of test_ground_truth.
-            test_ground_truth[pair_graph.edata[dgl.EID] - graph.num_edges()])
-
-test_preds = torch.cat(test_preds).cpu().numpy()
-test_labels = torch.cat(test_labels).cpu().numpy()
+h_pos_src = node_reprs[test_pos_src]
+h_pos_dst = node_reprs[test_pos_dst]
+h_neg_src = node_reprs[test_neg_src]
+h_neg_dst = node_reprs[test_neg_dst]
+score_pos = (h_pos_src * h_pos_dst).sum(1)
+score_neg = (h_neg_src * h_neg_dst).sum(1)
+
+test_preds = torch.cat([score_pos, score_neg]).cpu().numpy()
+test_labels = torch.cat(
+    [torch.ones_like(score_pos), torch.zeros_like(score_neg)]).cpu().numpy()

 auc = sklearn.metrics.roc_auc_score(test_labels, test_preds)
 print('Link Prediction AUC:', auc)
...
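
The evaluation above relies on the predictor being a dot product. For reference, such a predictor can be written roughly as the following sketch (the tutorial's own ``predictor`` may differ in detail):

    import torch.nn as nn
    import dgl.function as fn

    class DotPredictor(nn.Module):
        """Score each edge in a pair graph by the dot product of its endpoint embeddings."""
        def forward(self, pair_graph, h):
            with pair_graph.local_scope():
                pair_graph.ndata['h'] = h
                # u_dot_v computes <h_u, h_v> for every edge (u, v).
                pair_graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
                return pair_graph.edata['score'][:, 0]
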
@@ -74,7 +74,7 @@ import torch.distributed as dist

 def init_process_group(world_size, rank):
     dist.init_process_group(
-        backend='nccl',
+        backend='gloo',     # change to 'nccl' for multiple GPUs
         init_method='tcp://127.0.0.1:12345',
         world_size=world_size,
         rank=rank)
@@ -144,7 +144,10 @@ from torch.nn.parallel import DistributedDataParallel

 def init_model(seed, device):
     torch.manual_seed(seed)
     model = GIN().to(device)
-    model = DistributedDataParallel(model, device_ids=[device], output_device=device)
+    if device.type == 'cpu':
+        model = DistributedDataParallel(model)
+    else:
+        model = DistributedDataParallel(model, device_ids=[device], output_device=device)
     return model
@@ -182,9 +185,11 @@ from torch.optim import Adam

 def main(rank, world_size, dataset, seed=0):
     init_process_group(world_size, rank)
-    # Assume the GPU ID to be the same as the process ID
-    device = torch.device('cuda:{:d}'.format(rank))
-    torch.cuda.set_device(device)
+    if torch.cuda.is_available():
+        device = torch.device('cuda:{:d}'.format(rank))
+        torch.cuda.set_device(device)
+    else:
+        device = torch.device('cpu')

     model = init_model(seed, device)
     criterion = nn.CrossEntropyLoss()
@@ -223,28 +228,16 @@ def main(rank, world_size, dataset, seed=0):
 ###############################################################################
 # Finally we load the dataset and launch the processes.
 #
-# .. note::
-#
-#    You will need to use ``dgl.multiprocessing`` instead of the Python
-#    ``multiprocessing`` package. ``dgl.multiprocessing`` is identical to
-#    Python’s built-in ``multiprocessing`` except that it handles the
-#    subtleties between forking and multithreading in Python.
-#
 if __name__ == '__main__':
-    import dgl.multiprocessing as mp
+    import torch.multiprocessing as mp
     from dgl.data import GINDataset

     num_gpus = 4
     procs = []
     dataset = GINDataset(name='IMDBBINARY', self_loop=False)
-    for rank in range(num_gpus):
-        p = mp.Process(target=main, args=(rank, num_gpus, dataset))
-        p.start()
-        procs.append(p)
-    for p in procs:
-        p.join()
+    mp.spawn(main, args=(num_gpus, dataset), nprocs=num_gpus)

 # Thumbnail credits: DGL
 # sphinx_gallery_thumbnail_path = '_static/blitz_5_graph_classification.png'
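
The move from manually managed processes to ``torch.multiprocessing.spawn`` works because ``spawn`` prepends the process rank to the target's arguments, so ``main`` keeps its ``(rank, world_size, dataset)`` signature. A standalone sketch of the pattern; the address, port, and worker body are illustrative only:

    import torch.distributed as dist
    import torch.multiprocessing as mp

    def worker(rank, world_size, payload):
        # `rank` is injected by mp.spawn; everything else comes from `args` below.
        dist.init_process_group(
            backend='gloo', init_method='tcp://127.0.0.1:23456',
            world_size=world_size, rank=rank)
        print(f'rank {rank}/{world_size} got payload of size {len(payload)}')
        dist.destroy_process_group()

    if __name__ == '__main__':
        world_size = 2
        # Forks `nprocs` processes, calls worker(rank, *args) in each, and joins them.
        mp.spawn(worker, args=(world_size, list(range(10))), nprocs=world_size)
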
@@ -118,8 +118,8 @@ def run(proc_id, devices):
     # Define training and validation dataloader, copied from the previous tutorial
     # but with one line of difference: use_ddp to enable distributed data parallel
     # data loading.
-    sampler = dgl.dataloading.MultiLayerNeighborSampler([4, 4])
-    train_dataloader = dgl.dataloading.NodeDataLoader(
+    sampler = dgl.dataloading.NeighborSampler([4, 4])
+    train_dataloader = dgl.dataloading.DataLoader(
         # The following arguments are specific to NodeDataLoader.
         graph,              # The graph
         train_nids,         # The node IDs to iterate over in minibatches
@@ -133,7 +133,7 @@ def run(proc_id, devices):
         drop_last=False,    # Whether to drop the last incomplete batch
         num_workers=0       # Number of sampler processes
     )
-    valid_dataloader = dgl.dataloading.NodeDataLoader(
+    valid_dataloader = dgl.dataloading.DataLoader(
         graph, valid_nids, sampler,
         device=device,
         use_ddp=False,
@@ -247,16 +247,10 @@ graph.create_formats_()
 #
 # Say you have four GPUs.

-num_gpus = 4
-import dgl.multiprocessing as mp
-devices = list(range(num_gpus))
-procs = []
-for proc_id in range(num_gpus):
-    p = mp.Process(target=run, args=(proc_id, devices))
-    p.start()
-    procs.append(p)
-for p in procs:
-    p.join()
+if __name__ == '__main__':
+    num_gpus = 4
+    import torch.multiprocessing as mp
+    mp.spawn(run, args=(list(range(num_gpus)),), nprocs=num_gpus)

 # Thumbnail credits: Stanford CS224W Notes
 # sphinx_gallery_thumbnail_path = '_static/blitz_1_introduction.png'
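
As the comment in the first hunk notes, the only substantive difference from the single-GPU loader is enabling ``use_ddp`` on the training loader: each rank then iterates over its own shard of ``train_nids``, while the validation loader keeps ``use_ddp=False`` so every rank evaluates the full validation set. Conceptually, the sharding behaves roughly like the sketch below; DGL's actual partitioning may differ:

    import torch

    def shard(nids, rank, world_size):
        # Round-robin shard of the seed nodes; illustrative only.
        return nids[rank::world_size]

    train_nids = torch.arange(1000)
    for rank in range(4):
        print(rank, shard(train_nids, rank, 4).shape)  # roughly 250 seeds per rank
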