Unverified Commit 8651be54 authored by Da Zheng's avatar Da Zheng Committed by GitHub
Browse files

[Perf] Accelerate block_compute when all nodes are invoked. (#434)

* refactor.

* accelerate update_all in nodeflow.

* fix.

* refactor.

* fix lint.

* fix lint.

* reorganize.

* reorg.

* remove.

* add doc.

* impl block_incidence_matrix

* fix lint.

* fix.

* simple fix.

* fix test.

* fix interface.

* fix eid.

* fix comments.
parent ca2a7e1c
......@@ -534,23 +534,27 @@ class ImmutableGraph: public GraphInterface {
return edge_list_;
}
protected:
DGLIdIters GetInEdgeIdRef(dgl_id_t src, dgl_id_t dst) const;
DGLIdIters GetOutEdgeIdRef(dgl_id_t src, dgl_id_t dst) const;
/*!
* \brief Get the CSR array that represents the in-edges.
* This method copies data from std::vector to IdArray.
* \param start the first row to copy.
* \param end the last row to copy (exclusive).
* \return the CSR array.
*/
CSRArray GetInCSRArray() const;
CSRArray GetInCSRArray(size_t start, size_t end) const;
/*!
* \brief Get the CSR array that represents the out-edges.
* This method copies data from std::vector to IdArray.
* \param start the first row to copy.
* \param end the last row to copy (exclusive).
* \return the CSR array.
*/
CSRArray GetOutCSRArray() const;
CSRArray GetOutCSRArray(size_t start, size_t end) const;
protected:
DGLIdIters GetInEdgeIdRef(dgl_id_t src, dgl_id_t dst) const;
DGLIdIters GetOutEdgeIdRef(dgl_id_t src, dgl_id_t dst) const;
/*!
* \brief Compact a subgraph.
......
/*!
* Copyright (c) 2019 by Contributors
* \file dgl/nodeflow.h
* \brief DGL NodeFlow class.
*/
#ifndef DGL_NODEFLOW_H_
#define DGL_NODEFLOW_H_
#include <vector>
#include <string>
#include "graph_interface.h"
namespace dgl {
class ImmutableGraph;
/*!
* \brief A NodeFlow graph stores the sampling results for a sampler that samples
* nodes/edges in layers.
*
* We store multiple layers of the sampling results in a single graph, which results
* in a more compact format. We store extra information,
* such as the node and edge mapping from the NodeFlow graph to the parent graph.
*/
/*!
 * \brief A NodeFlow graph stores the sampling results for a sampler that samples
 * nodes/edges in layers.
 *
 * We store multiple layers of the sampling results in a single graph, which results
 * in a more compact format. We store extra information,
 * such as the node and edge mapping from the NodeFlow graph to the parent graph.
 */
struct NodeFlow {
  /*! \brief The graph that stores all sampled layers compactly. */
  GraphPtr graph;
  /*!
   * \brief The offsets of each layer: layer i occupies node ids
   * [layer_offsets[i], layer_offsets[i+1]) in `graph`.
   */
  IdArray layer_offsets;
  /*!
   * \brief The offsets of each flow (the edge block connecting two
   * consecutive layers).
   */
  IdArray flow_offsets;
  /*!
   * \brief The node mapping from the NodeFlow graph to the parent graph.
   */
  IdArray node_mapping;
  /*!
   * \brief The edge mapping from the NodeFlow graph to the parent graph.
   */
  IdArray edge_mapping;
};
/*!
* \brief Get a slice on a graph that represents a NodeFlow.
*
* The entire block has to be taken as a slice. Users have to specify the
* correct starting and ending location of a layer.
*
* If remap is false, the returned arrays can be viewed as a sub-matrix slice
* of the adjmat of the input graph. Let the adjmat of the input graph be A,
* then the slice is equal to (in numpy syntax):
* A[layer1_start:layer1_end, layer0_start:layer0_end]
*
* If remap is true, the returned arrays represents an adjacency matrix
* of shape NxM, where N is the number of nodes in layer1 and M is
* the number of nodes in layer0. Nodes in layer0 will be remapped to
* [0, M) and nodes in layer1 will be remapped to [0, N).
*
* A row of the returned adjacency matrix represents the destination
* of an edge and the column represents the source.
*
* If fmt == "csr", the function returns three arrays: indptr, indices, eid.
* If fmt == "coo", the function returns two arrays: idx, eid. Here, the idx array
* is the concatenation of src and dst node id arrays.
*
* \param graph An immutable graph.
* \param fmt the format of the returned adjacency matrix.
* \param layer0_size the size of the first layer in the block.
* \param layer1_start the location where the second layer starts.
 * \param layer1_end the location where the second layer ends.
* \param remap Indicates to remap all vertex ids and edge Ids to local Id
* space.
* \return a vector of IdArrays.
*/
std::vector<IdArray> GetNodeFlowSlice(const ImmutableGraph &graph, const std::string &fmt,
size_t layer0_size, size_t layer1_start,
size_t layer1_end, bool remap);
} // namespace dgl
#endif // DGL_NODEFLOW_H_
......@@ -9,40 +9,12 @@
#include <vector>
#include <string>
#include "graph_interface.h"
#include "nodeflow.h"
namespace dgl {
class ImmutableGraph;
/*!
* \brief A NodeFlow graph stores the sampling results for a sampler that samples
* nodes/edges in layers.
*
* We store multiple layers of the sampling results in a single graph, which results
* in a more compact format. We store extra information,
* such as the node and edge mapping from the NodeFlow graph to the parent graph.
*/
struct NodeFlow {
/*! \brief The graph. */
GraphPtr graph;
/*!
* \brief the offsets of each layer.
*/
IdArray layer_offsets;
/*!
* \brief the offsets of each flow.
*/
IdArray flow_offsets;
/*!
* \brief The node mapping from the NodeFlow graph to the parent graph.
*/
IdArray node_mapping;
/*!
* \brief The edge mapping from the NodeFlow graph to the parent graph.
*/
IdArray edge_mapping;
};
class SamplerOp {
public:
/*!
......
......@@ -395,6 +395,161 @@ class NodeFlow(DGLBaseGraph):
assert F.asnumpy(F.sum(ret == -1, 0)) == 0, "The eid in the parent graph is invalid."
return ret
def block_edges(self, block_id):
    """Return the edges in a block.

    Parameters
    ----------
    block_id : int
        The specified block to return the edges.

    Returns
    -------
    Tensor
        The src nodes.
    Tensor
        The dst nodes.
    Tensor
        The edge ids.
    """
    # Size of the source layer; the C API uses it to remap global node ids
    # into ids local to each layer.
    layer0_size = self._layer_offsets[block_id + 1] - self._layer_offsets[block_id]
    rst = _CAPI_NodeFlowGetBlockAdj(self._graph._handle, "coo", layer0_size,
                                    self._layer_offsets[block_id + 1],
                                    self._layer_offsets[block_id + 2])
    idx = utils.toindex(rst(0)).tousertensor()
    eid = utils.toindex(rst(1))
    # The "coo" index array stores dst ids in the first half and src ids in
    # the second half. Use floor division instead of int(len/2) to avoid
    # the needless float round-trip.
    num_edges = len(idx) // 2
    assert len(eid) == num_edges
    return idx[num_edges:], idx[:num_edges], eid.tousertensor()
def block_adjacency_matrix(self, block_id, ctx):
    """Return the adjacency matrix representation for a specific block in a NodeFlow.

    A row of the returned adjacency matrix represents the destination
    of an edge and the column represents the source.

    Parameters
    ----------
    block_id : int
        The specified block to return the adjacency matrix.
    ctx : context
        The context of the returned matrix.

    Returns
    -------
    SparseTensor
        The adjacency matrix.
    Tensor
        A index for data shuffling due to sparse format change. Return None
        if shuffle is not required.
    """
    fmt = F.get_preferred_sparse_format()
    # We need to extract two layers: the C API takes the source-layer size
    # plus the start/end offsets of the destination layer.
    layer0_size = self._layer_offsets[block_id + 1] - self._layer_offsets[block_id]
    rst = _CAPI_NodeFlowGetBlockAdj(self._graph._handle, fmt, layer0_size,
                                    self._layer_offsets[block_id + 1],
                                    self._layer_offsets[block_id + 2])
    num_rows = self.layer_size(block_id + 1)
    num_cols = self.layer_size(block_id)
    if fmt == "csr":
        indptr = F.copy_to(utils.toindex(rst(0)).tousertensor(), ctx)
        indices = F.copy_to(utils.toindex(rst(1)).tousertensor(), ctx)
        shuffle = utils.toindex(rst(2))
        dat = F.ones(indices.shape, dtype=F.float32, ctx=ctx)
        spmat = F.sparse_matrix(dat, ('csr', indices, indptr),
                                (num_rows, num_cols))[0]
        return spmat, shuffle.tousertensor()
    if fmt == "coo":
        ## FIXME(minjie): data type
        idx = F.copy_to(utils.toindex(rst(0)).tousertensor(), ctx)
        nnz = self.block_size(block_id)
        idx = F.reshape(idx, (2, nnz))
        dat = F.ones((nnz,), dtype=F.float32, ctx=ctx)
        adj, shuffle_idx = F.sparse_matrix(dat, ('coo', idx), (num_rows, num_cols))
        return adj, shuffle_idx
    raise Exception("unknown format")
def block_incidence_matrix(self, block_id, typestr, ctx):
    """Return the incidence matrix representation of the block.

    An incidence matrix is an n x m sparse matrix, where n is
    the number of nodes and m is the number of edges. Each nnz
    value indicating whether the edge is incident to the node
    or not.

    There are three types of an incidence matrix `I`:

    * "in":
      - I[v, e] = 1 if e is the in-edge of v (or v is the dst node of e);
      - I[v, e] = 0 otherwise.
    * "out":
      - I[v, e] = 1 if e is the out-edge of v (or v is the src node of e);
      - I[v, e] = 0 otherwise.
    * "both":
      - I[v, e] = 1 if e is the in-edge of v;
      - I[v, e] = -1 if e is the out-edge of v;
      - I[v, e] = 0 otherwise (including self-loop).

    Parameters
    ----------
    block_id : int
        The specified block to return the incidence matrix.
    typestr : str
        Can be either "in", "out" or "both"
    ctx : context
        The context of returned incidence matrix.

    Returns
    -------
    SparseTensor
        The incidence matrix.
    Tensor
        A index for data shuffling due to sparse format change. Return None
        if shuffle is not required.
    """
    src, dst, eid = self.block_edges(block_id)
    src = F.copy_to(src, ctx)  # the index of the ctx will be cached
    dst = F.copy_to(dst, ctx)  # the index of the ctx will be cached
    eid = F.copy_to(eid, ctx)  # the index of the ctx will be cached
    if typestr == 'in':
        n = self.layer_size(block_id + 1)
        m = self.block_size(block_id)
        row = F.unsqueeze(dst, 0)
        col = F.unsqueeze(eid, 0)
        idx = F.cat([row, col], dim=0)
        # FIXME(minjie): data type
        dat = F.ones((m,), dtype=F.float32, ctx=ctx)
        inc, shuffle_idx = F.sparse_matrix(dat, ('coo', idx), (n, m))
    elif typestr == 'out':
        n = self.layer_size(block_id)
        m = self.block_size(block_id)
        row = F.unsqueeze(src, 0)
        col = F.unsqueeze(eid, 0)
        idx = F.cat([row, col], dim=0)
        # FIXME(minjie): data type
        dat = F.ones((m,), dtype=F.float32, ctx=ctx)
        inc, shuffle_idx = F.sparse_matrix(dat, ('coo', idx), (n, m))
    elif typestr == 'both':
        # TODO does it work for bipartite graph?
        # BUGFIX: n and m were previously unbound in this branch, raising a
        # NameError. src ids are local to layer `block_id` and dst ids are
        # local to layer `block_id + 1`; use the larger of the two layer
        # sizes so every row index fits in the matrix.
        n = max(self.layer_size(block_id), self.layer_size(block_id + 1))
        m = self.block_size(block_id)
        # first remove entries for self loops
        mask = F.logical_not(F.equal(src, dst))
        src = F.boolean_mask(src, mask)
        dst = F.boolean_mask(dst, mask)
        eid = F.boolean_mask(eid, mask)
        n_entries = F.shape(src)[0]
        # create index
        row = F.unsqueeze(F.cat([src, dst], dim=0), 0)
        col = F.unsqueeze(F.cat([eid, eid], dim=0), 0)
        idx = F.cat([row, col], dim=0)
        # FIXME(minjie): data type
        x = -F.ones((n_entries,), dtype=F.float32, ctx=ctx)
        y = F.ones((n_entries,), dtype=F.float32, ctx=ctx)
        dat = F.cat([x, y], dim=0)
        inc, shuffle_idx = F.sparse_matrix(dat, ('coo', idx), (n, m))
    else:
        raise DGLError('Invalid incidence matrix type: %s' % str(typestr))
    return inc, shuffle_idx
def set_n_initializer(self, initializer, layer_id=ALL, field=None):
"""Set the initializer for empty node features.
......@@ -651,12 +806,13 @@ class NodeFlow(DGLBaseGraph):
assert reduce_func is not None
if is_all(v):
dest_nodes = utils.toindex(self.layer_nid(block_id + 1))
u, v, _ = self._graph.in_edges(dest_nodes)
u = utils.toindex(self._glb2lcl_nid(u.tousertensor(), block_id))
v = utils.toindex(self._glb2lcl_nid(v.tousertensor(), block_id + 1))
dest_nodes = utils.toindex(F.arange(0, self.layer_size(block_id + 1)))
eid = utils.toindex(F.arange(0, self.block_size(block_id)))
with ir.prog() as prog:
scheduler.schedule_nodeflow_update_all(graph=self,
block_id=block_id,
message_func=message_func,
reduce_func=reduce_func,
apply_func=apply_node_func)
Runtime.run(prog)
else:
dest_nodes = utils.toindex(v)
u, v, eid = self._graph.in_edges(dest_nodes)
......
......@@ -180,6 +180,7 @@ def schedule_update_all(graph,
nodes = utils.toindex(slice(0, graph.number_of_nodes()))
schedule_apply_nodes(graph, nodes, apply_func, inplace=False)
else:
# TODO is the eid here correct?
eid = utils.toindex(slice(0, graph.number_of_edges())) # shortcut for ALL
recv_nodes = utils.toindex(slice(0, graph.number_of_nodes())) # shortcut for ALL
# create vars
......@@ -243,8 +244,8 @@ def schedule_nodeflow_apply_nodes(graph,
Parameters
----------
graph: DGLGraph
The DGLGraph to use
graph: NodeFlow
The NodeFlow to use
layer_id : int
The layer where we apply node update function.
v : utils.Index
......@@ -266,6 +267,7 @@ def schedule_nodeflow_apply_nodes(graph,
return apply_func(nbatch)
afunc = var.FUNC(_afunc_wrapper)
applied_feat = ir.NODE_UDF(afunc, v_nf)
# TODO we need to avoid index_copy here.
if inplace:
ir.WRITE_ROW_INPLACE_(var_nf, var_v, applied_feat)
else:
......@@ -324,8 +326,8 @@ def schedule_nodeflow_apply_edges(graph, block_id,
Parameters
----------
graph: DGLGraph
The DGLGraph to use
graph: NodeFlow
The NodeFlow to use
block_id : int
The block whose edges we apply edge update function.
u : utils.Index
......@@ -359,6 +361,7 @@ def schedule_nodeflow_apply_edges(graph, block_id,
return apply_func(ebatch)
_efunc = var.FUNC(_efunc_wrapper)
new_fdedge = ir.EDGE_UDF(_efunc, fdsrc, fdedge, fddst)
# TODO we need to avoid index_copy here.
if inplace:
ir.WRITE_ROW_INPLACE_(var_ef, var_eid, new_fdedge)
else:
......@@ -490,6 +493,53 @@ def schedule_group_apply_edge(graph,
else:
ir.WRITE_ROW_(var_ef, var_eid, var_out)
def schedule_nodeflow_update_all(graph,
                                 block_id,
                                 message_func,
                                 reduce_func,
                                 apply_func):
    """get update_all schedule in a block.

    Parameters
    ----------
    graph: NodeFlow
        The NodeFlow to use
    block_id : int
        The block where we perform computation.
    message_func: callable or list of callable
        The message function
    reduce_func: callable or list of callable
        The reduce function
    apply_func: callable
        The apply node function
    """
    # A NodeFlow shouldn't have 0 edges.
    assert graph.block_size(block_id) > 0
    # Slices covering every edge of the block and every node of the
    # destination layer (shortcuts for ALL).
    all_eid = utils.toindex(slice(0, graph.block_size(block_id)))
    all_dest = utils.toindex(slice(0, graph.layer_size(block_id + 1)))
    # create vars
    out_frame = var.FEAT_DICT(graph._get_node_frame(block_id + 1), name='out_nf')
    dest_var = var.IDX(all_dest, name='dest_nodes')
    eid_var = var.IDX(all_eid)

    def uv_getter():
        # TODO get all edges in the block.
        src, dst, _ = graph.block_edges(block_id)
        return var.IDX(utils.toindex(src)), var.IDX(utils.toindex(dst))

    def adj_creator():
        return spmv.build_block_adj_matrix_graph(graph, block_id)

    def inc_creator():
        return spmv.build_block_inc_matrix_graph(graph, block_id)

    # generate send + reduce
    reduced_feat = _gen_send_reduce(graph, graph._get_node_frame(block_id),
                                    graph._get_node_frame(block_id + 1),
                                    graph._get_edge_frame(block_id),
                                    message_func, reduce_func,
                                    eid_var, dest_var,
                                    uv_getter, adj_creator, inc_creator)
    # generate optional apply
    final_feat = _apply_with_accum(graph, dest_var, out_frame, reduced_feat, apply_func)
    ir.WRITE_DICT_(out_frame, final_feat)
def schedule_nodeflow_compute(graph,
block_id,
u, v, eid,
......@@ -502,8 +552,8 @@ def schedule_nodeflow_compute(graph,
Parameters
----------
graph: DGLGraph
The DGLGraph to use
graph: NodeFlow
The NodeFlow to use
block_id : int
The block where we perform computation.
u : utils.Index
......@@ -527,7 +577,7 @@ def schedule_nodeflow_compute(graph,
if len(eid) == 0:
# All the nodes are 0deg; downgrades to apply.
if apply_func is not None:
schedule_nodeflow_apply_nodes(graph, block_id + 1, v, apply_func, inplace)
schedule_nodeflow_apply_nodes(graph, block_id + 1, dest_nodes, apply_func, inplace)
else:
# create vars
var_nf = var.FEAT_DICT(graph._get_node_frame(block_id + 1), name='out_nf')
......
......@@ -131,6 +131,30 @@ def gen_e2v_spmv_schedule(inc, spmv_rfunc, mfr, out):
ftdst = ir.SPMV(inc_var, ftmsg)
ir.WRITE_COL_(out, var.STR(rfn.out_field), ftdst)
def build_block_adj_matrix_graph(graph, block_id):
    """Build the adjacency matrix of one block of a NodeFlow.

    Parameters
    ----------
    graph : NodeFlow
        The NodeFlow
    block_id : int
        the block Id

    Returns
    -------
    callable
        Maps a ctx to the block's adjacency matrix on that ctx.
    utils.Index
        A index for data shuffling due to sparse format change. Return None
        if shuffle is not required.
    """
    #TODO why is this constructed twice?
    # The first call is made only to obtain the shuffle index on CPU; the
    # returned lambda rebuilds the matrix on the requested ctx.
    _, shuffle_idx = graph.block_adjacency_matrix(block_id, F.cpu())
    shuffle_idx = utils.toindex(shuffle_idx) if shuffle_idx is not None else None
    return lambda ctx: graph.block_adjacency_matrix(block_id, ctx)[0], shuffle_idx
def build_adj_matrix_graph(graph):
"""Build adjacency matrix of the whole graph.
......@@ -148,6 +172,7 @@ def build_adj_matrix_graph(graph):
if shuffle is not required.
"""
gidx = graph._graph
# TODO Why invoking adjacency_matrix twice?
_, shuffle_idx = gidx.adjacency_matrix(False, F.cpu())
return lambda ctx: gidx.adjacency_matrix(False, ctx)[0], shuffle_idx
......@@ -226,6 +251,28 @@ def build_adj_matrix_uv(edges, reduce_nodes, num_sources):
shuffle_idx = utils.toindex(shuffle_idx) if shuffle_idx is not None else None
return utils.CtxCachedObject(lambda ctx: F.copy_to(mat, ctx)), shuffle_idx
def build_block_inc_matrix_graph(graph, block_id):
    """Build the "in" incidence matrix for one block of a NodeFlow.

    Parameters
    ----------
    graph : NodeFlow
        The NodeFlow.
    block_id : int
        The block Id

    Returns
    -------
    callable
        Maps a ctx to the block's incidence matrix on that ctx.
    None
        The incidence matrix does not use the data tensor, so no
        shuffle index is needed.
    """
    # inc mat will not use data tensor so conversion index is not needed
    def get_inc_matrix(ctx):
        return graph.block_incidence_matrix(block_id, 'in', ctx)[0]
    return get_inc_matrix, None
def build_inc_matrix_graph(graph):
"""Build incidence matrix.
......
......@@ -7,6 +7,7 @@
#include <dgl/immutable_graph.h>
#include <dgl/graph_op.h>
#include <dgl/sampler.h>
#include <dgl/nodeflow.h>
#include "../c_api_common.h"
using dgl::runtime::DGLArgs;
......@@ -419,6 +420,19 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphGetAdj")
*rv = ConvertAdjToPackedFunc(res);
});
// C API entry point: slice one block of a NodeFlow out of an immutable graph
// and return its adjacency arrays (format is "csr" or "coo"); ids are always
// remapped to block-local id space (remap = true).
DGL_REGISTER_GLOBAL("nodeflow._CAPI_NodeFlowGetBlockAdj")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    GraphHandle ghandle = args[0];
    std::string format = args[1];
    int64_t layer0_size = args[2];  // number of nodes in the source layer
    int64_t start = args[3];        // first node id of the destination layer
    int64_t end = args[4];          // one past the last node id of the destination layer
    const GraphInterface *ptr = static_cast<const GraphInterface *>(ghandle);
    const ImmutableGraph* gptr = dynamic_cast<const ImmutableGraph*>(ptr);
    // BUGFIX: the dynamic_cast result was dereferenced unchecked; fail loudly
    // instead of crashing when a mutable graph handle is passed in.
    CHECK(gptr) << "_CAPI_NodeFlowGetBlockAdj only supports ImmutableGraph";
    auto res = GetNodeFlowSlice(*gptr, format, layer0_size, start, end, true);
    *rv = ConvertAdjToPackedFunc(res);
  });
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphRandomWalk")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
......
......@@ -604,43 +604,39 @@ Subgraph ImmutableGraph::EdgeSubgraph(IdArray eids) const {
return subg;
}
ImmutableGraph::CSRArray ImmutableGraph::GetInCSRArray() const {
auto in_csr = GetInCSR();
IdArray indptr = IdArray::Empty({static_cast<int64_t>(in_csr->indptr.size())},
/*
 * Copy rows [start, end) of a CSR structure into freshly allocated IdArrays.
 * The returned indptr is rebased so that indptr[0] == 0; indices and edge
 * ids are copied verbatim.
 */
ImmutableGraph::CSRArray GetCSRArray(ImmutableGraph::CSR::Ptr csr, size_t start, size_t end) {
  const size_t num_rows = end - start;
  const auto row_begin = csr->indptr[start];
  const auto row_end = csr->indptr[end];
  const size_t nnz = row_end - row_begin;
  IdArray indptr = IdArray::Empty({static_cast<int64_t>(num_rows + 1)},
                                  DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
  IdArray indices = IdArray::Empty({static_cast<int64_t>(nnz)},
                                   DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
  IdArray eids = IdArray::Empty({static_cast<int64_t>(nnz)},
                                DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
  int64_t *indptr_out = static_cast<int64_t*>(indptr->data);
  dgl_id_t *indices_out = static_cast<dgl_id_t*>(indices->data);
  dgl_id_t *eids_out = static_cast<dgl_id_t*>(eids->data);
  // Rebase the row offsets relative to the first copied row.
  for (size_t r = 0; r <= num_rows; ++r)
    indptr_out[r] = csr->indptr[start + r] - row_begin;
  std::copy(csr->indices.begin() + row_begin,
            csr->indices.begin() + row_end, indices_out);
  std::copy(csr->edge_ids.begin() + row_begin,
            csr->edge_ids.begin() + row_end, eids_out);
  return ImmutableGraph::CSRArray{indptr, indices, eids};
}
ImmutableGraph::CSRArray ImmutableGraph::GetOutCSRArray() const {
auto out_csr = GetOutCSR();
IdArray indptr = IdArray::Empty({static_cast<int64_t>(out_csr->indptr.size())},
DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
IdArray indices = IdArray::Empty({static_cast<int64_t>(out_csr->NumEdges())},
DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
IdArray eids = IdArray::Empty({static_cast<int64_t>(out_csr->NumEdges())},
DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
int64_t *indptr_data = static_cast<int64_t*>(indptr->data);
dgl_id_t* indices_data = static_cast<dgl_id_t*>(indices->data);
dgl_id_t* eid_data = static_cast<dgl_id_t*>(eids->data);
std::copy(out_csr->indptr.begin(), out_csr->indptr.end(), indptr_data);
std::copy(out_csr->indices.begin(), out_csr->indices.end(), indices_data);
std::copy(out_csr->edge_ids.begin(), out_csr->edge_ids.end(), eid_data);
return CSRArray{indptr, indices, eids};
// Materialize rows [start, end) of the in-edge CSR as IdArrays.
ImmutableGraph::CSRArray ImmutableGraph::GetInCSRArray(size_t start, size_t end) const {
  return GetCSRArray(GetInCSR(), start, end);
}
// Materialize rows [start, end) of the out-edge CSR as IdArrays.
ImmutableGraph::CSRArray ImmutableGraph::GetOutCSRArray(size_t start, size_t end) const {
  return GetCSRArray(GetOutCSR(), start, end);
}
std::vector<IdArray> ImmutableGraph::GetAdj(bool transpose, const std::string &fmt) const {
if (fmt == "csr") {
CSRArray arrs = transpose ? this->GetOutCSRArray() : this->GetInCSRArray();
CSRArray arrs = transpose ? this->GetOutCSRArray(0, NumVertices())
: this->GetInCSRArray(0, NumVertices());
return std::vector<IdArray>{arrs.indptr, arrs.indices, arrs.id};
} else if (fmt == "coo") {
int64_t num_edges = this->NumEdges();
......
/*!
* Copyright (c) 2019 by Contributors
* \file graph/nodeflow.cc
* \brief DGL NodeFlow related functions.
*/
#include <dgl/immutable_graph.h>
#include <dgl/nodeflow.h>
#include <string.h>
#include "../c_api_common.h"
namespace dgl {
std::vector<IdArray> GetNodeFlowSlice(const ImmutableGraph &graph, const std::string &fmt,
                                      size_t layer0_size, size_t layer1_start,
                                      size_t layer1_end, bool remap) {
  CHECK_GE(layer1_start, layer0_size);
  if (fmt == "csr") {
    // First node id of the source layer; subtracting it remaps source ids
    // to [0, layer0_size).
    dgl_id_t first_vid = layer1_start - layer0_size;
    ImmutableGraph::CSRArray arrs = graph.GetInCSRArray(layer1_start, layer1_end);
    if (remap) {
      dgl_id_t *indices_data = static_cast<dgl_id_t*>(arrs.indices->data);
      dgl_id_t *eid_data = static_cast<dgl_id_t*>(arrs.id->data);
      const size_t len = arrs.indices->shape[0];
      // BUGFIX: guard the empty slice — eid_data[0] would read out of bounds.
      if (len > 0) {
        // NOTE(review): remapping uses eid_data[0] as the base, which assumes
        // the slice's edge ids form a contiguous ascending range — TODO confirm
        // this invariant of the sampler.
        dgl_id_t first_eid = eid_data[0];
        for (size_t i = 0; i < len; i++) {
          CHECK_GE(indices_data[i], first_vid);
          indices_data[i] -= first_vid;
          CHECK_GE(eid_data[i], first_eid);
          eid_data[i] -= first_eid;
        }
      }
    }
    return std::vector<IdArray>{arrs.indptr, arrs.indices, arrs.id};
  } else if (fmt == "coo") {
    ImmutableGraph::CSR::Ptr csr = graph.GetInCSR();
    int64_t nnz = csr->indptr[layer1_end] - csr->indptr[layer1_start];
    // idx packs dst ids in the first half and src ids in the second half.
    IdArray idx = IdArray::Empty({2 * nnz}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
    IdArray eid = IdArray::Empty({nnz}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
    int64_t *idx_data = static_cast<int64_t*>(idx->data);
    dgl_id_t *eid_data = static_cast<dgl_id_t*>(eid->data);
    size_t num_edges = 0;
    for (size_t i = layer1_start; i < layer1_end; i++) {
      for (int64_t j = csr->indptr[i]; j < csr->indptr[i + 1]; j++) {
        // These nodes are all in a layer. We need to remap them to the node id
        // local to the layer.
        idx_data[num_edges] = remap ? i - layer1_start : i;
        num_edges++;
      }
    }
    // Cast avoids a signed/unsigned comparison inside CHECK_EQ.
    CHECK_EQ(num_edges, static_cast<size_t>(nnz));
    if (remap) {
      size_t edge_start = csr->indptr[layer1_start];
      dgl_id_t first_vid = layer1_start - layer0_size;
      // BUGFIX: guard the empty slice — edge_ids[edge_start] would read out
      // of bounds when nnz == 0.
      if (nnz > 0) {
        // NOTE(review): assumes the slice's edge ids start at
        // edge_ids[edge_start] and are contiguous — TODO confirm.
        dgl_id_t first_eid = csr->edge_ids[edge_start];
        for (int64_t i = 0; i < nnz; i++) {
          CHECK_GE(csr->indices[edge_start + i], first_vid);
          idx_data[nnz + i] = csr->indices[edge_start + i] - first_vid;
          eid_data[i] = csr->edge_ids[edge_start + i] - first_eid;
        }
      }
    } else {
      std::copy(csr->indices.begin() + csr->indptr[layer1_start],
                csr->indices.begin() + csr->indptr[layer1_end], idx_data + nnz);
      std::copy(csr->edge_ids.begin() + csr->indptr[layer1_start],
                csr->edge_ids.begin() + csr->indptr[layer1_end], eid_data);
    }
    return std::vector<IdArray>{idx, eid};
  } else {
    LOG(FATAL) << "unsupported adjacency matrix format";
    return std::vector<IdArray>();
  }
}
} // namespace dgl
......@@ -250,8 +250,33 @@ def test_copy():
nf.block_compute(i, partial(msg_func, ind=i), partial(reduce_func, ind=i))
def test_block_adj_matrix():
    """Check block_edges and block_adjacency_matrix against the parent graph."""
    num_layers = 3
    g = generate_rand_graph(100)
    nf = create_mini_batch(g, num_layers)
    assert nf.num_layers == num_layers + 1
    for blk in range(nf.num_blocks):
        src, dst, eid = nf.block_edges(blk)
        # Reference edges computed directly from the parent graph, remapped
        # to block-local node ids.
        dest_nodes = utils.toindex(nf.layer_nid(blk + 1))
        par_u, par_v, _ = nf._graph.in_edges(dest_nodes)
        exp_src = nf._glb2lcl_nid(par_u.tousertensor(), blk)
        exp_dst = nf._glb2lcl_nid(par_v.tousertensor(), blk + 1)
        assert F.array_equal(src, exp_src)
        assert F.array_equal(dst, exp_dst)
        # The dense form of the block adjacency must equal a COO matrix with
        # dst as rows and src as columns.
        adj, _ = nf.block_adjacency_matrix(blk, F.cpu())
        dense = F.sparse_to_numpy(adj)
        ones = np.ones((len(exp_src)), dtype=np.float32)
        rows = utils.toindex(exp_dst)
        cols = utils.toindex(exp_src)
        expected = sp.sparse.coo_matrix((ones, (rows.tonumpy(), cols.tonumpy())),
                                        shape=dense.shape).todense()
        assert np.array_equal(dense, expected)
if __name__ == '__main__':
test_basic()
test_block_adj_matrix()
test_copy()
test_apply_nodes()
test_apply_edges()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment