Unverified Commit 605b5185 authored by Minjie Wang's avatar Minjie Wang Committed by GitHub
Browse files

[Refactor] Immutable graph index (#543)

* WIP

* header

* WIP .cc

* WIP

* transpose

* wip

* immutable graph .h and .cc

* WIP: nodeflow.cc

* compile

* remove all tmp dl managed ctx; they caused refcount issue

* one simple test

* WIP: testing

* test_graph

* fix graph index

* fix bug in sampler; pass pytorch utest

* WIP on mxnet

* fix lint

* fix mxnet unittest w/ unfortunate workaround

* fix msvc

* fix lint

* SliceRows and test_nodeflow

* resolve reviews

* resolve reviews

* try fix win ci

* try fix win ci

* poke win ci again

* poke

* lazy multigraph flag; stackoverflow error

* revert node subgraph test

* lazy object

* try fix win build

* try fix win build

* poke ci

* fix build script

* fix compile

* add a todo

* fix reviews

* fix compile
parent b2b8be25
/*!
 * Copyright (c) 2019 by Contributors
 * \file dgl/array.h
 * \brief Array types and common array operations required by DGL.
 *
 * Note that this is not meant for a full support of array library such as ATen.
 * Only a limited set of operators required by DGL are implemented.
 */
#ifndef DGL_ARRAY_H_
#define DGL_ARRAY_H_
#include <dgl/runtime/ndarray.h>
#include <cstdint>
#include <vector>
namespace dgl {
/*! \brief Type for node/edge ids. */
typedef uint64_t dgl_id_t;
/*! \brief Aliases of NDArray used to document intent at call sites. */
typedef dgl::runtime::NDArray IdArray;
typedef dgl::runtime::NDArray DegreeArray;
typedef dgl::runtime::NDArray BoolArray;
typedef dgl::runtime::NDArray IntArray;
typedef dgl::runtime::NDArray FloatArray;
/*! \brief Create a new id array with given length (on CPU) */
IdArray NewIdArray(int64_t length);
/*! \brief Create a new id array with the given vector data (on CPU) */
IdArray VecToIdArray(const std::vector<dgl_id_t>& vec);
/*! \brief Create a copy of the given array */
IdArray Clone(IdArray arr);
/*!
 * \brief Elementwise arithmetic functions.
 *
 * Scalar overloads broadcast the scalar against every array element.
 */
IdArray Add(IdArray lhs, IdArray rhs);
IdArray Sub(IdArray lhs, IdArray rhs);
IdArray Mul(IdArray lhs, IdArray rhs);
IdArray Div(IdArray lhs, IdArray rhs);
IdArray Add(IdArray lhs, dgl_id_t rhs);
IdArray Sub(IdArray lhs, dgl_id_t rhs);
IdArray Mul(IdArray lhs, dgl_id_t rhs);
IdArray Div(IdArray lhs, dgl_id_t rhs);
IdArray Add(dgl_id_t lhs, IdArray rhs);
IdArray Sub(dgl_id_t lhs, IdArray rhs);
IdArray Mul(dgl_id_t lhs, IdArray rhs);
IdArray Div(dgl_id_t lhs, IdArray rhs);
/*! \brief Stack two arrays (of len L) into a 2*L length array */
IdArray HStack(IdArray arr1, IdArray arr2);
/*! \brief Plain CSR matrix */
struct CSRMatrix {
IdArray indptr, indices, data;
};
/*! \brief Plain COO structure */
struct COOMatrix {
IdArray row, col, data;
};
/*! \brief Slice rows of the given matrix and return. */
CSRMatrix SliceRows(const CSRMatrix& csr, int64_t start, int64_t end);
/*! \brief Convert COO matrix to CSR matrix. */
CSRMatrix ToCSR(const COOMatrix);
/*! \brief Convert CSR matrix to COO matrix. */
COOMatrix ToCOO(const CSRMatrix);
} // namespace dgl
#endif // DGL_ARRAY_H_
......@@ -40,8 +40,7 @@ class Graph: public GraphInterface {
explicit Graph(bool multigraph = false) : is_multigraph_(multigraph) {}
/*! \brief construct a graph from the coo format. */
Graph(IdArray src_ids, IdArray dst_ids, IdArray edge_ids, size_t num_nodes,
bool multigraph = false);
Graph(IdArray src_ids, IdArray dst_ids, size_t num_nodes, bool multigraph = false);
/*! \brief default copy constructor */
Graph(const Graph& other) = default;
......
......@@ -9,16 +9,11 @@
#include <string>
#include <vector>
#include <utility>
#include "runtime/ndarray.h"
#include <algorithm>
namespace dgl {
#include "array.h"
typedef uint64_t dgl_id_t;
typedef dgl::runtime::NDArray IdArray;
typedef dgl::runtime::NDArray DegreeArray;
typedef dgl::runtime::NDArray BoolArray;
typedef dgl::runtime::NDArray IntArray;
typedef dgl::runtime::NDArray FloatArray;
namespace dgl {
struct Subgraph;
struct NodeFlow;
......@@ -32,8 +27,10 @@ const dgl_id_t DGL_INVALID_ID = static_cast<dgl_id_t>(-1);
* but it doesn't own the data itself. Instead, it only references data stored in a std::vector.
*/
class DGLIdIters {
const dgl_id_t *begin_, *end_;
public:
/* !\brief default constructor to create an empty range */
DGLIdIters() {}
/* !\brief constructor with given begin and end */
DGLIdIters(const dgl_id_t *begin, const dgl_id_t *end) {
this->begin_ = begin;
this->end_ = end;
......@@ -50,6 +47,8 @@ class DGLIdIters {
size_t size() const {
return this->end_ - this->begin_;
}
private:
const dgl_id_t *begin_{nullptr}, *end_{nullptr};
};
class GraphInterface;
......@@ -118,13 +117,49 @@ class GraphInterface {
virtual bool HasVertex(dgl_id_t vid) const = 0;
/*! \return a 0-1 array indicating whether the given vertices are in the graph.*/
virtual BoolArray HasVertices(IdArray vids) const = 0;
virtual BoolArray HasVertices(IdArray vids) const {
  // Default implementation: a vertex id is valid iff it is smaller than the
  // number of vertices. dgl_id_t is unsigned, so no lower-bound check needed.
  const int64_t len = vids->shape[0];
  BoolArray out = BoolArray::Empty({len}, vids->dtype, vids->ctx);
  const dgl_id_t* ids = static_cast<dgl_id_t*>(vids->data);
  dgl_id_t* flags = static_cast<dgl_id_t*>(out->data);
  const uint64_t num_vertices = NumVertices();
  for (int64_t i = 0; i < len; ++i) {
    flags[i] = (ids[i] < num_vertices) ? 1 : 0;
  }
  return out;
}
/*! \return true if the given edge is in the graph.*/
virtual bool HasEdgeBetween(dgl_id_t src, dgl_id_t dst) const = 0;
/*! \return a 0-1 array indicating whether the given edges are in the graph.*/
virtual BoolArray HasEdgesBetween(IdArray src_ids, IdArray dst_ids) const = 0;
virtual BoolArray HasEdgesBetween(IdArray src_ids, IdArray dst_ids) const {
  // Default implementation with numpy-style broadcasting: either side may
  // have length 1 (one-many / many-one); otherwise lengths must match.
  const int64_t srclen = src_ids->shape[0];
  const int64_t dstlen = dst_ids->shape[0];
  const int64_t outlen = std::max(srclen, dstlen);
  BoolArray out = BoolArray::Empty({outlen}, src_ids->dtype, src_ids->ctx);
  dgl_id_t* flags = static_cast<dgl_id_t*>(out->data);
  const dgl_id_t* src = static_cast<dgl_id_t*>(src_ids->data);
  const dgl_id_t* dst = static_cast<dgl_id_t*>(dst_ids->data);
  if (srclen == 1) {
    // one-many: broadcast the single source against every destination
    for (int64_t i = 0; i < dstlen; ++i) {
      flags[i] = HasEdgeBetween(src[0], dst[i]) ? 1 : 0;
    }
  } else if (dstlen == 1) {
    // many-one: broadcast the single destination against every source
    for (int64_t i = 0; i < srclen; ++i) {
      flags[i] = HasEdgeBetween(src[i], dst[0]) ? 1 : 0;
    }
  } else {
    // many-many: pairwise check
    CHECK(srclen == dstlen) << "Invalid src and dst id array.";
    for (int64_t i = 0; i < srclen; ++i) {
      flags[i] = HasEdgeBetween(src[i], dst[i]) ? 1 : 0;
    }
  }
  return out;
}
/*!
* \brief Find the predecessors of a vertex.
......@@ -329,6 +364,13 @@ class GraphInterface {
*
* By default, a row of returned adjacency matrix represents the destination
* of an edge and the column represents the source.
*
* If the fmt is 'csr', the function should return three arrays, representing
* indptr, indices and edge ids
*
* If the fmt is 'coo', the function should return one array of shape (2, nnz),
* representing a horizontal stack of row and col indices.
*
* \param transpose A flag to transpose the returned adjacency matrix.
* \param fmt the format of the returned adjacency matrix.
* \return a vector of IdArrays.
......
This diff is collapsed.
/*!
 * Copyright (c) 2019 by Contributors
 * \file dgl/lazy.h
 * \brief Lazy object that will be materialized only when being queried.
 */
#ifndef DGL_LAZY_H_
#define DGL_LAZY_H_
#include <memory>
namespace dgl {
/*!
 * \brief Wrapper for a value that is computed only on first access.
 *
 * The wrapped value is immutable -- no mutation once materialized.
 * The class is currently not thread safe.
 */
template <typename T>
class LazyObject {
 public:
  /*!\brief Construct an empty (not yet materialized) lazy object. */
  LazyObject() = default;
  /*!\brief Construct an already-materialized object holding the given value. */
  explicit LazyObject(const T& val): ptr_(std::make_shared<T>(val)) {}
  /*!\brief destructor */
  ~LazyObject() = default;
  /*!
   * \brief Return the stored value, materializing it on first use.
   * \param fn Zero-argument creator; invoked only if the value is absent.
   * \return the object value.
   */
  template <typename Fn>
  const T& Get(Fn fn) {
    if (ptr_ == nullptr) {
      ptr_ = std::make_shared<T>(fn());
    }
    return *ptr_;
  }
 private:
  /*!\brief Pointer to the materialized value; null until created. */
  std::shared_ptr<T> ptr_;
};
} // namespace dgl
#endif // DGL_LAZY_H_
......@@ -92,6 +92,8 @@ class NDArray {
inline int use_count() const;
/*! \return Pointer to content of DLTensor */
inline const DLTensor* operator->() const;
/*! \return True if the ndarray is contiguous. */
bool IsContiguous() const;
/*!
* \brief Copy data content from another array.
* \param other The source array to be copied from.
......@@ -129,10 +131,11 @@ class NDArray {
* \brief Create a NDArray that shares the data memory with the current one.
* \param shape The shape of the new array.
* \param dtype The data type of the new array.
* \param offset The offset (in bytes) of the starting pointer.
* \note The memory size of new array must be smaller than the current one.
*/
DGL_DLL NDArray CreateView(
std::vector<int64_t> shape, DLDataType dtype);
std::vector<int64_t> shape, DLDataType dtype, int64_t offset = 0);
/*!
* \brief Create a reference view of NDArray that
* represents as DLManagedTensor.
......
......@@ -898,6 +898,7 @@ class DGLGraph(DGLBaseGraph):
readonly=False):
# graph
super(DGLGraph, self).__init__(create_graph_index(graph_data, multigraph, readonly))
# node and edge frame
if node_frame is None:
self._node_frame = FrameRef(Frame(num_rows=self.number_of_nodes()))
......@@ -1977,7 +1978,7 @@ class DGLGraph(DGLBaseGraph):
assert func is not None
if is_all(edges):
u, v, _ = self._graph.edges()
u, v, _ = self._graph.edges('eid')
eid = utils.toindex(slice(0, self.number_of_edges()))
elif isinstance(edges, tuple):
u, v = edges
......
......@@ -51,21 +51,19 @@ class GraphIndex(object):
self._multigraph = multigraph
self._readonly = readonly
if readonly:
self._init(src, dst, utils.toindex(F.arange(0, len(src))), n_nodes)
self._init(src, dst, n_nodes)
else:
self._handle = _CAPI_DGLGraphCreateMutable(multigraph)
self.clear()
self.add_nodes(n_nodes)
self.add_edges(src, dst)
def _init(self, src_ids, dst_ids, edge_ids, num_nodes):
def _init(self, src_ids, dst_ids, num_nodes):
"""The actual init function"""
assert len(src_ids) == len(dst_ids)
assert len(src_ids) == len(edge_ids)
self._handle = _CAPI_DGLGraphCreate(
src_ids.todgltensor(),
dst_ids.todgltensor(),
edge_ids.todgltensor(),
self._multigraph,
int(num_nodes),
self._readonly)
......@@ -631,8 +629,9 @@ class GraphIndex(object):
indices = F.copy_to(utils.toindex(rst(1)).tousertensor(), ctx)
shuffle = utils.toindex(rst(2))
dat = F.ones(indices.shape, dtype=F.float32, ctx=ctx)
return F.sparse_matrix(dat, ('csr', indices, indptr),
(self.number_of_nodes(), self.number_of_nodes()))[0], shuffle
spmat = F.sparse_matrix(dat, ('csr', indices, indptr),
(self.number_of_nodes(), self.number_of_nodes()))[0]
return spmat, shuffle
elif fmt == "coo":
## FIXME(minjie): data type
idx = F.copy_to(utils.toindex(rst(0)).tousertensor(), ctx)
......@@ -786,13 +785,11 @@ class GraphIndex(object):
for e in nx_graph.edges:
src.append(e[0])
dst.append(e[1])
eid = np.arange(0, len(src), dtype=np.int64)
num_nodes = nx_graph.number_of_nodes()
# We store edge Ids as an edge attribute.
eid = utils.toindex(eid)
src = utils.toindex(src)
dst = utils.toindex(dst)
self._init(src, dst, eid, num_nodes)
self._init(src, dst, num_nodes)
def from_scipy_sparse_matrix(self, adj):
......@@ -808,9 +805,7 @@ class GraphIndex(object):
adj_coo = adj.tocoo()
src = utils.toindex(adj_coo.row)
dst = utils.toindex(adj_coo.col)
edge_ids = utils.toindex(F.arange(0, len(adj_coo.row)))
self._init(src, dst, edge_ids, num_nodes)
self._init(src, dst, num_nodes)
def from_csr_matrix(self, indptr, indices, edge_dir, shared_mem_name=""):
"""Load a graph from the CSR matrix.
......@@ -881,8 +876,7 @@ class GraphIndex(object):
min_nodes = min(src.min(), dst.min())
if min_nodes != 0:
raise DGLError('Invalid edge list. Nodes must start from 0.')
edge_ids = utils.toindex(F.arange(0, len(src)))
self._init(src_ids, dst_ids, edge_ids, num_nodes)
self._init(src_ids, dst_ids, num_nodes)
def line_graph(self, backtracking=True):
"""Return the line graph of this graph.
......
......@@ -191,7 +191,7 @@ def schedule_update_all(graph,
var_eid = var.IDX(eid)
# generate send + reduce
def uv_getter():
src, dst, _ = graph._graph.edges()
src, dst, _ = graph._graph.edges('eid')
return var.IDX(src), var.IDX(dst)
adj_creator = lambda: spmv.build_adj_matrix_graph(graph)
inc_creator = lambda: spmv.build_inc_matrix_graph(graph)
......
/*!
* Copyright (c) 2019 by Contributors
* \file array.cc
* \brief DGL array utilities implementation
*/
#include <dgl/array.h>
namespace dgl {
// TODO(minjie): currently these operators are only on CPU.
IdArray NewIdArray(int64_t length) {
  // Allocate an uninitialized 1-D int64 array on CPU.
  const DLDataType dtype{kDLInt, 64, 1};
  const DLContext ctx{kDLCPU, 0};
  return IdArray::Empty({length}, dtype, ctx);
}
IdArray VecToIdArray(const std::vector<dgl_id_t>& vec) {
  // Copy the vector contents into a freshly allocated id array.
  IdArray out = NewIdArray(vec.size());
  dgl_id_t* out_data = static_cast<dgl_id_t*>(out->data);
  std::copy(vec.begin(), vec.end(), out_data);
  return out;
}
IdArray Clone(IdArray arr) {
  // Deep-copy the 1-D id array into newly allocated CPU storage.
  IdArray copy = NewIdArray(arr->shape[0]);
  copy.CopyFrom(arr);
  return copy;
}
IdArray Add(IdArray lhs, IdArray rhs) {
  // Elementwise addition; assumes both operands have the same length.
  const int64_t len = lhs->shape[0];
  IdArray out = NewIdArray(len);
  const dgl_id_t* a = static_cast<dgl_id_t*>(lhs->data);
  const dgl_id_t* b = static_cast<dgl_id_t*>(rhs->data);
  dgl_id_t* o = static_cast<dgl_id_t*>(out->data);
  for (int64_t i = 0; i < len; ++i) {
    o[i] = a[i] + b[i];
  }
  return out;
}
IdArray Sub(IdArray lhs, IdArray rhs) {
  // Elementwise subtraction; assumes both operands have the same length.
  const int64_t len = lhs->shape[0];
  IdArray out = NewIdArray(len);
  const dgl_id_t* a = static_cast<dgl_id_t*>(lhs->data);
  const dgl_id_t* b = static_cast<dgl_id_t*>(rhs->data);
  dgl_id_t* o = static_cast<dgl_id_t*>(out->data);
  for (int64_t i = 0; i < len; ++i) {
    o[i] = a[i] - b[i];
  }
  return out;
}
IdArray Mul(IdArray lhs, IdArray rhs) {
  // Elementwise multiplication; assumes both operands have the same length.
  const int64_t len = lhs->shape[0];
  IdArray out = NewIdArray(len);
  const dgl_id_t* a = static_cast<dgl_id_t*>(lhs->data);
  const dgl_id_t* b = static_cast<dgl_id_t*>(rhs->data);
  dgl_id_t* o = static_cast<dgl_id_t*>(out->data);
  for (int64_t i = 0; i < len; ++i) {
    o[i] = a[i] * b[i];
  }
  return out;
}
IdArray Div(IdArray lhs, IdArray rhs) {
  // Elementwise (integer) division; assumes both operands have the same length.
  const int64_t len = lhs->shape[0];
  IdArray out = NewIdArray(len);
  const dgl_id_t* a = static_cast<dgl_id_t*>(lhs->data);
  const dgl_id_t* b = static_cast<dgl_id_t*>(rhs->data);
  dgl_id_t* o = static_cast<dgl_id_t*>(out->data);
  for (int64_t i = 0; i < len; ++i) {
    o[i] = a[i] / b[i];
  }
  return out;
}
IdArray Add(IdArray lhs, dgl_id_t rhs) {
  // Add a scalar to every element of the array.
  const int64_t len = lhs->shape[0];
  IdArray out = NewIdArray(len);
  const dgl_id_t* a = static_cast<dgl_id_t*>(lhs->data);
  dgl_id_t* o = static_cast<dgl_id_t*>(out->data);
  for (int64_t i = 0; i < len; ++i) {
    o[i] = a[i] + rhs;
  }
  return out;
}
IdArray Sub(IdArray lhs, dgl_id_t rhs) {
  // Subtract a scalar from every element of the array.
  const int64_t len = lhs->shape[0];
  IdArray out = NewIdArray(len);
  const dgl_id_t* a = static_cast<dgl_id_t*>(lhs->data);
  dgl_id_t* o = static_cast<dgl_id_t*>(out->data);
  for (int64_t i = 0; i < len; ++i) {
    o[i] = a[i] - rhs;
  }
  return out;
}
IdArray Mul(IdArray lhs, dgl_id_t rhs) {
  // Multiply every element of the array by a scalar.
  const int64_t len = lhs->shape[0];
  IdArray out = NewIdArray(len);
  const dgl_id_t* a = static_cast<dgl_id_t*>(lhs->data);
  dgl_id_t* o = static_cast<dgl_id_t*>(out->data);
  for (int64_t i = 0; i < len; ++i) {
    o[i] = a[i] * rhs;
  }
  return out;
}
IdArray Div(IdArray lhs, dgl_id_t rhs) {
  // Divide every element of the array by a scalar (integer division).
  const int64_t len = lhs->shape[0];
  IdArray out = NewIdArray(len);
  const dgl_id_t* a = static_cast<dgl_id_t*>(lhs->data);
  dgl_id_t* o = static_cast<dgl_id_t*>(out->data);
  for (int64_t i = 0; i < len; ++i) {
    o[i] = a[i] / rhs;
  }
  return out;
}
IdArray Add(dgl_id_t lhs, IdArray rhs) {
  // Addition is commutative; reuse the array + scalar kernel.
  return Add(rhs, lhs);
}
IdArray Sub(dgl_id_t lhs, IdArray rhs) {
  // Scalar-minus-array: subtraction does not commute, so compute directly.
  const int64_t len = rhs->shape[0];
  IdArray out = NewIdArray(len);
  const dgl_id_t* b = static_cast<dgl_id_t*>(rhs->data);
  dgl_id_t* o = static_cast<dgl_id_t*>(out->data);
  for (int64_t i = 0; i < len; ++i) {
    o[i] = lhs - b[i];
  }
  return out;
}
IdArray Mul(dgl_id_t lhs, IdArray rhs) {
  // Multiplication is commutative; reuse the array + scalar kernel.
  return Mul(rhs, lhs);
}
IdArray Div(dgl_id_t lhs, IdArray rhs) {
  // Scalar-over-array: division does not commute, so compute directly.
  const int64_t len = rhs->shape[0];
  IdArray out = NewIdArray(len);
  const dgl_id_t* b = static_cast<dgl_id_t*>(rhs->data);
  dgl_id_t* o = static_cast<dgl_id_t*>(out->data);
  for (int64_t i = 0; i < len; ++i) {
    o[i] = lhs / b[i];
  }
  return out;
}
IdArray HStack(IdArray arr1, IdArray arr2) {
  // Concatenate two equal-length arrays into one: [arr1 | arr2].
  CHECK_EQ(arr1->shape[0], arr2->shape[0]);
  const int64_t len = arr1->shape[0];
  IdArray out = NewIdArray(2 * len);
  const dgl_id_t* first = static_cast<dgl_id_t*>(arr1->data);
  const dgl_id_t* second = static_cast<dgl_id_t*>(arr2->data);
  dgl_id_t* o = static_cast<dgl_id_t*>(out->data);
  std::copy(first, first + len, o);
  std::copy(second, second + len, o + len);
  return out;
}
CSRMatrix SliceRows(const CSRMatrix& csr, int64_t start, int64_t end) {
  // Extract rows [start, end) of the CSR matrix into a new matrix whose
  // row offsets are rebased so that the first sliced row starts at zero.
  const dgl_id_t* indptr = static_cast<dgl_id_t*>(csr.indptr->data);
  const dgl_id_t* indices = static_cast<dgl_id_t*>(csr.indices->data);
  const dgl_id_t* data = static_cast<dgl_id_t*>(csr.data->data);
  const int64_t num_rows = end - start;
  const dgl_id_t nz_begin = indptr[start];
  const dgl_id_t nz_end = indptr[end];
  const int64_t nnz = nz_end - nz_begin;
  CSRMatrix out;
  out.indptr = NewIdArray(num_rows + 1);
  out.indices = NewIdArray(nnz);
  out.data = NewIdArray(nnz);
  dgl_id_t* o_indptr = static_cast<dgl_id_t*>(out.indptr->data);
  for (int64_t i = 0; i <= num_rows; ++i) {
    o_indptr[i] = indptr[start + i] - nz_begin;
  }
  std::copy(indices + nz_begin, indices + nz_end,
            static_cast<dgl_id_t*>(out.indices->data));
  std::copy(data + nz_begin, data + nz_end,
            static_cast<dgl_id_t*>(out.data->data));
  return out;
}
} // namespace dgl
......@@ -13,15 +13,6 @@ using dgl::runtime::NDArray;
namespace dgl {
DLManagedTensor* CreateTmpDLManagedTensor(const DGLArgValue& arg) {
const DLTensor* dl_tensor = arg;
DLManagedTensor* ret = new DLManagedTensor();
ret->deleter = [] (DLManagedTensor* self) { delete self; };
ret->manager_ctx = nullptr;
ret->dl_tensor = *dl_tensor;
return ret;
}
PackedFunc ConvertNDArrayVectorToPackedFunc(const std::vector<NDArray>& vec) {
auto body = [vec](DGLArgs args, DGLRetValue* rv) {
const uint64_t which = args[0];
......
......@@ -19,13 +19,6 @@ typedef void* GraphHandle;
// Communicator handler type
typedef void* CommunicatorHandle;
/*!
* \brief Convert the given DLTensor to DLManagedTensor.
*
* Return a temporary DLManagedTensor that does not own memory.
*/
DLManagedTensor* CreateTmpDLManagedTensor(
const dgl::runtime::DGLArgValue& arg);
/*!
* \brief Convert a vector of NDArray to PackedFunc.
......
......@@ -14,33 +14,28 @@
namespace dgl {
Graph::Graph(IdArray src_ids, IdArray dst_ids, IdArray edge_ids, size_t num_nodes,
Graph::Graph(IdArray src_ids, IdArray dst_ids, size_t num_nodes,
bool multigraph): is_multigraph_(multigraph) {
CHECK(IsValidIdArray(src_ids));
CHECK(IsValidIdArray(dst_ids));
CHECK(IsValidIdArray(edge_ids));
this->AddVertices(num_nodes);
num_edges_ = src_ids->shape[0];
CHECK(static_cast<int64_t>(num_edges_) == dst_ids->shape[0])
<< "vectors in COO must have the same length";
CHECK(static_cast<int64_t>(num_edges_) == edge_ids->shape[0])
<< "vectors in COO must have the same length";
const dgl_id_t *src_data = static_cast<dgl_id_t*>(src_ids->data);
const dgl_id_t *dst_data = static_cast<dgl_id_t*>(dst_ids->data);
const dgl_id_t *edge_data = static_cast<dgl_id_t*>(edge_ids->data);
all_edges_src_.reserve(num_edges_);
all_edges_dst_.reserve(num_edges_);
for (uint64_t i = 0; i < num_edges_; i++) {
auto src = src_data[i];
auto dst = dst_data[i];
auto eid = edge_data[i];
CHECK(HasVertex(src) && HasVertex(dst))
<< "Invalid vertices: src=" << src << " dst=" << dst;
adjlist_[src].succ.push_back(dst);
adjlist_[src].edge_id.push_back(eid);
adjlist_[src].edge_id.push_back(i);
reverse_adjlist_[dst].succ.push_back(src);
reverse_adjlist_[dst].edge_id.push_back(eid);
reverse_adjlist_[dst].edge_id.push_back(i);
all_edges_src_.push_back(src);
all_edges_dst_.push_back(dst);
......@@ -104,7 +99,7 @@ BoolArray Graph::HasVertices(IdArray vids) const {
int64_t* rst_data = static_cast<int64_t*>(rst->data);
const int64_t nverts = NumVertices();
for (int64_t i = 0; i < len; ++i) {
rst_data[i] = (vid_data[i] < nverts)? 1 : 0;
rst_data[i] = (vid_data[i] < nverts && vid_data[i] >= 0)? 1 : 0;
}
return rst;
}
......
......@@ -77,14 +77,13 @@ namespace {
template<typename T>
void DGLDisjointPartitionByNum(const T *gptr, DGLArgs args, DGLRetValue *rv) {
int64_t num = args[1];
const std::vector<T> &&rst = GraphOp::DisjointPartitionByNum(gptr, num);
std::vector<T> &&rst = GraphOp::DisjointPartitionByNum(gptr, num);
// return the pointer array as an integer array
const int64_t len = rst.size();
NDArray ptr_array = NDArray::Empty({len}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
int64_t *ptr_array_data = static_cast<int64_t *>(ptr_array->data);
for (size_t i = 0; i < rst.size(); ++i) {
T *ptr = new T();
*ptr = std::move(rst[i]);
GraphInterface *ptr = rst[i].Reset();
ptr_array_data[i] = reinterpret_cast<std::intptr_t>(ptr);
}
*rv = ptr_array;
......@@ -100,9 +99,7 @@ void DGLDisjointUnion(GraphHandle *inhandles, int list_size, DGLRetValue *rv) {
graphs.push_back(gr);
}
T *gptr = new T();
*gptr = GraphOp::DisjointUnion(std::move(graphs));
GraphHandle ghandle = gptr;
GraphHandle ghandle = GraphOp::DisjointUnion(std::move(graphs)).Reset();
*rv = ghandle;
}
......@@ -114,8 +111,7 @@ void DGLDisjointPartitionBySizes(const T *gptr, const IdArray sizes, DGLRetValue
NDArray ptr_array = NDArray::Empty({len}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
int64_t *ptr_array_data = static_cast<int64_t *>(ptr_array->data);
for (size_t i = 0; i < rst.size(); ++i) {
T *ptr = new T();
*ptr = std::move(rst[i]);
GraphInterface *ptr = rst[i].Reset();
ptr_array_data[i] = reinterpret_cast<std::intptr_t>(ptr);
}
*rv = ptr_array;
......@@ -133,39 +129,48 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphCreateMutable")
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphCreate")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
const IdArray src_ids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[0]));
const IdArray dst_ids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const IdArray edge_ids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[2]));
const bool multigraph = static_cast<bool>(args[3]);
const int64_t num_nodes = static_cast<int64_t>(args[4]);
const bool readonly = static_cast<bool>(args[5]);
const IdArray src_ids = args[0];
const IdArray dst_ids = args[1];
const bool multigraph = static_cast<bool>(args[2]);
const int64_t num_nodes = static_cast<int64_t>(args[3]);
const bool readonly = static_cast<bool>(args[4]);
GraphHandle ghandle;
if (readonly)
ghandle = new ImmutableGraph(src_ids, dst_ids, edge_ids, num_nodes, multigraph);
else
ghandle = new Graph(src_ids, dst_ids, edge_ids, num_nodes, multigraph);
if (readonly) {
// TODO(minjie): The array copy here is unnecessary and adds extra overhead.
// However, with MXNet backend, the memory would be corrupted if we directly
// save the passed-in ndarrays into DGL's graph object. We hope MXNet team
// could help look into this.
COOPtr coo(new COO(num_nodes, Clone(src_ids), Clone(dst_ids), multigraph));
ghandle = new ImmutableGraph(coo);
} else {
ghandle = new Graph(src_ids, dst_ids, num_nodes, multigraph);
}
*rv = ghandle;
});
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphCSRCreate")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
const IdArray indptr = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[0]));
const IdArray indices = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const IdArray edge_ids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[2]));
const IdArray indptr = args[0];
const IdArray indices = args[1];
const IdArray edge_ids = args[2];
const std::string shared_mem_name = args[3];
const bool multigraph = static_cast<bool>(args[4]);
const std::string edge_dir = args[5];
ImmutableGraph::CSR::Ptr csr;
CSRPtr csr;
if (shared_mem_name.empty())
csr.reset(new ImmutableGraph::CSR(indptr, indices, edge_ids));
// TODO(minjie): The array copy here is unnecessary and adds extra overhead.
// However, with MXNet backend, the memory would be corrupted if we directly
// save the passed-in ndarrays into DGL's graph object. We hope MXNet team
// could help look into this.
csr.reset(new CSR(Clone(indptr), Clone(indices), Clone(edge_ids), multigraph));
else
csr.reset(new ImmutableGraph::CSR(indptr, indices, edge_ids, shared_mem_name));
csr.reset(new CSR(indptr, indices, edge_ids, multigraph, shared_mem_name));
GraphHandle ghandle;
if (edge_dir == "in")
ghandle = new ImmutableGraph(csr, nullptr, multigraph);
ghandle = new ImmutableGraph(csr, nullptr);
else
ghandle = new ImmutableGraph(nullptr, csr, multigraph);
ghandle = new ImmutableGraph(nullptr, csr);
*rv = ghandle;
});
......@@ -176,13 +181,13 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphCSRCreateMMap")
const int64_t num_edges = args[2];
const bool multigraph = static_cast<bool>(args[3]);
const std::string edge_dir = args[4];
ImmutableGraph::CSR::Ptr csr(new ImmutableGraph::CSR(shared_mem_name,
num_vertices, num_edges));
// TODO(minjie): how to know multigraph
CSRPtr csr(new CSR(shared_mem_name, num_vertices, num_edges, multigraph));
GraphHandle ghandle;
if (edge_dir == "in")
ghandle = new ImmutableGraph(csr, nullptr, multigraph);
ghandle = new ImmutableGraph(csr, nullptr);
else
ghandle = new ImmutableGraph(nullptr, csr, multigraph);
ghandle = new ImmutableGraph(nullptr, csr);
*rv = ghandle;
});
......@@ -214,8 +219,8 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphAddEdges")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
GraphInterface* gptr = static_cast<GraphInterface*>(ghandle);
const IdArray src = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const IdArray dst = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[2]));
const IdArray src = args[1];
const IdArray dst = args[2];
gptr->AddEdges(src, dst);
});
......@@ -268,14 +273,14 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphHasVertices")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
const GraphInterface* gptr = static_cast<GraphInterface*>(ghandle);
const IdArray vids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const IdArray vids = args[1];
*rv = gptr->HasVertices(vids);
});
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLMapSubgraphNID")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
const IdArray parent_vids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[0]));
const IdArray query = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const IdArray parent_vids = args[0];
const IdArray query = args[1];
*rv = GraphOp::MapParentIdToSubgraphId(parent_vids, query);
});
......@@ -292,8 +297,8 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphHasEdgesBetween")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
const GraphInterface* gptr = static_cast<GraphInterface*>(ghandle);
const IdArray src = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const IdArray dst = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[2]));
const IdArray src = args[1];
const IdArray dst = args[2];
*rv = gptr->HasEdgesBetween(src, dst);
});
......@@ -328,8 +333,8 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphEdgeIds")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
const GraphInterface* gptr = static_cast<GraphInterface*>(ghandle);
const IdArray src = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const IdArray dst = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[2]));
const IdArray src = args[1];
const IdArray dst = args[2];
*rv = ConvertEdgeArrayToPackedFunc(gptr->EdgeIds(src, dst));
});
......@@ -337,7 +342,7 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphFindEdges")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
const GraphInterface* gptr = static_cast<GraphInterface*>(ghandle);
const IdArray eids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const IdArray eids = args[1];
*rv = ConvertEdgeArrayToPackedFunc(gptr->FindEdges(eids));
});
......@@ -353,7 +358,7 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphInEdges_2")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
const GraphInterface* gptr = static_cast<GraphInterface*>(ghandle);
const IdArray vids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const IdArray vids = args[1];
*rv = ConvertEdgeArrayToPackedFunc(gptr->InEdges(vids));
});
......@@ -369,7 +374,7 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphOutEdges_2")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
const GraphInterface* gptr = static_cast<GraphInterface*>(ghandle);
const IdArray vids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const IdArray vids = args[1];
*rv = ConvertEdgeArrayToPackedFunc(gptr->OutEdges(vids));
});
......@@ -393,7 +398,7 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphInDegrees")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
const GraphInterface* gptr = static_cast<GraphInterface*>(ghandle);
const IdArray vids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const IdArray vids = args[1];
*rv = gptr->InDegrees(vids);
});
......@@ -409,7 +414,7 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphOutDegrees")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
const GraphInterface* gptr = static_cast<GraphInterface*>(ghandle);
const IdArray vids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const IdArray vids = args[1];
*rv = gptr->OutDegrees(vids);
});
......@@ -417,7 +422,7 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphVertexSubgraph")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
const GraphInterface* gptr = static_cast<GraphInterface*>(ghandle);
const IdArray vids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const IdArray vids = args[1];
*rv = ConvertSubgraphToPackedFunc(gptr->VertexSubgraph(vids));
});
......@@ -425,7 +430,7 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphEdgeSubgraph")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
const GraphInterface *gptr = static_cast<GraphInterface*>(ghandle);
const IdArray eids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const IdArray eids = args[1];
*rv = ConvertSubgraphToPackedFunc(gptr->EdgeSubgraph(eids));
});
......@@ -462,7 +467,7 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLDisjointPartitionByNum")
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLDisjointPartitionBySizes")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
const IdArray sizes = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const IdArray sizes = args[1];
const GraphInterface *ptr = static_cast<const GraphInterface *>(ghandle);
const Graph* gptr = dynamic_cast<const Graph*>(ptr);
const ImmutableGraph* im_gptr = dynamic_cast<const ImmutableGraph*>(ptr);
......@@ -497,17 +502,4 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphGetAdj")
*rv = ConvertAdjToPackedFunc(res);
});
DGL_REGISTER_GLOBAL("nodeflow._CAPI_NodeFlowGetBlockAdj")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
std::string format = args[1];
int64_t layer0_size = args[2];
int64_t start = args[3];
int64_t end = args[4];
const GraphInterface *ptr = static_cast<const GraphInterface *>(ghandle);
const ImmutableGraph* gptr = dynamic_cast<const ImmutableGraph*>(ptr);
auto res = GetNodeFlowSlice(*gptr, format, layer0_size, start, end, true);
*rv = ConvertAdjToPackedFunc(res);
});
} // namespace dgl
......@@ -108,40 +108,44 @@ std::vector<Graph> GraphOp::DisjointPartitionBySizes(const Graph* graph, IdArray
ImmutableGraph GraphOp::DisjointUnion(std::vector<const ImmutableGraph *> graphs) {
dgl_id_t num_nodes = 0;
dgl_id_t num_edges = 0;
int64_t num_nodes = 0;
int64_t num_edges = 0;
for (const ImmutableGraph *gr : graphs) {
num_nodes += gr->NumVertices();
num_edges += gr->NumEdges();
}
ImmutableGraph::CSR::Ptr batched_csr_ptr = std::make_shared<ImmutableGraph::CSR>(num_nodes,
num_edges);
batched_csr_ptr->indptr[0] = 0;
IdArray indptr_arr = NewIdArray(num_nodes + 1);
IdArray indices_arr = NewIdArray(num_edges);
IdArray edge_ids_arr = NewIdArray(num_edges);
dgl_id_t* indptr = static_cast<dgl_id_t*>(indptr_arr->data);
dgl_id_t* indices = static_cast<dgl_id_t*>(indices_arr->data);
dgl_id_t* edge_ids = static_cast<dgl_id_t*>(edge_ids_arr->data);
indptr[0] = 0;
dgl_id_t cum_num_nodes = 0;
dgl_id_t cum_num_edges = 0;
dgl_id_t indptr_idx = 1;
for (const ImmutableGraph *gr : graphs) {
const ImmutableGraph::CSR::Ptr &g_csrptr = gr->GetInCSR();
dgl_id_t g_num_nodes = g_csrptr->NumVertices();
dgl_id_t g_num_edges = g_csrptr->NumEdges();
ImmutableGraph::CSR::vector<dgl_id_t> &g_indices = g_csrptr->indices;
ImmutableGraph::CSR::vector<int64_t> &g_indptr = g_csrptr->indptr;
ImmutableGraph::CSR::vector<dgl_id_t> &g_edge_ids = g_csrptr->edge_ids;
for (dgl_id_t i = 1; i < g_indptr.size(); ++i) {
batched_csr_ptr->indptr[indptr_idx] = g_indptr[i] + cum_num_edges;
indptr_idx++;
const CSRPtr g_csrptr = gr->GetInCSR();
const int64_t g_num_nodes = g_csrptr->NumVertices();
const int64_t g_num_edges = g_csrptr->NumEdges();
dgl_id_t* g_indptr = static_cast<dgl_id_t*>(g_csrptr->indptr()->data);
dgl_id_t* g_indices = static_cast<dgl_id_t*>(g_csrptr->indices()->data);
dgl_id_t* g_edge_ids = static_cast<dgl_id_t*>(g_csrptr->edge_ids()->data);
for (dgl_id_t i = 1; i < g_num_nodes + 1; ++i) {
indptr[cum_num_nodes + i] = g_indptr[i] + cum_num_edges;
}
for (dgl_id_t i = 0; i < g_indices.size(); ++i) {
batched_csr_ptr->indices.push_back(g_indices[i] + cum_num_nodes);
for (dgl_id_t i = 0; i < g_num_edges; ++i) {
indices[cum_num_edges + i] = g_indices[i] + cum_num_nodes;
}
for (dgl_id_t i = 0; i < g_edge_ids.size(); ++i) {
batched_csr_ptr->edge_ids.push_back(g_edge_ids[i] + cum_num_edges);
for (dgl_id_t i = 0; i < g_num_edges; ++i) {
edge_ids[cum_num_edges + i] = g_edge_ids[i] + cum_num_edges;
}
cum_num_nodes += g_num_nodes;
cum_num_edges += g_num_edges;
}
CSRPtr batched_csr_ptr = CSRPtr(new CSR(indptr_arr, indices_arr, edge_ids_arr));
return ImmutableGraph(batched_csr_ptr, nullptr);
}
......@@ -157,9 +161,11 @@ std::vector<ImmutableGraph> GraphOp::DisjointPartitionByNum(const ImmutableGraph
std::vector<ImmutableGraph> GraphOp::DisjointPartitionBySizes(const ImmutableGraph *batched_graph,
IdArray sizes) {
// TODO(minjie): use array views to speedup this operation
const int64_t len = sizes->shape[0];
const int64_t *sizes_data = static_cast<int64_t *>(sizes->data);
std::vector<int64_t> cumsum;
cumsum.reserve(len + 1);
cumsum.push_back(0);
for (int64_t i = 0; i < len; ++i) {
cumsum.push_back(cumsum[i] + sizes_data[i]);
......@@ -167,35 +173,40 @@ std::vector<ImmutableGraph> GraphOp::DisjointPartitionBySizes(const ImmutableGra
CHECK_EQ(cumsum[len], batched_graph->NumVertices())
<< "Sum of the given sizes must equal to the number of nodes.";
std::vector<ImmutableGraph> rst;
const ImmutableGraph::CSR::Ptr &in_csr_ptr = batched_graph->GetInCSR();
ImmutableGraph::CSR::vector<int64_t> &bg_indptr = in_csr_ptr->indptr;
ImmutableGraph::CSR::vector<dgl_id_t> &bg_indices = in_csr_ptr->indices;
CSRPtr in_csr_ptr = batched_graph->GetInCSR();
const dgl_id_t* indptr = static_cast<dgl_id_t*>(in_csr_ptr->indptr()->data);
const dgl_id_t* indices = static_cast<dgl_id_t*>(in_csr_ptr->indices()->data);
const dgl_id_t* edge_ids = static_cast<dgl_id_t*>(in_csr_ptr->edge_ids()->data);
dgl_id_t cum_sum_edges = 0;
for (int64_t i = 0; i < len; ++i) {
int64_t start_pos = cumsum[i];
int64_t end_pos = cumsum[i + 1];
int64_t g_num_edges = bg_indptr[end_pos] - bg_indptr[start_pos];
ImmutableGraph::CSR::Ptr g_in_csr_ptr = std::make_shared<ImmutableGraph::CSR>(sizes_data[i],
g_num_edges);
ImmutableGraph::CSR::vector<int64_t> &g_indptr = g_in_csr_ptr->indptr;
ImmutableGraph::CSR::vector<dgl_id_t> &g_indices = g_in_csr_ptr->indices;
ImmutableGraph::CSR::vector<dgl_id_t> &g_edge_ids = g_in_csr_ptr->edge_ids;
const int64_t start_pos = cumsum[i];
const int64_t end_pos = cumsum[i + 1];
const int64_t g_num_nodes = sizes_data[i];
const int64_t g_num_edges = indptr[end_pos] - indptr[start_pos];
IdArray indptr_arr = NewIdArray(g_num_nodes + 1);
IdArray indices_arr = NewIdArray(g_num_edges);
IdArray edge_ids_arr = NewIdArray(g_num_edges);
dgl_id_t* g_indptr = static_cast<dgl_id_t*>(indptr_arr->data);
dgl_id_t* g_indices = static_cast<dgl_id_t*>(indices_arr->data);
dgl_id_t* g_edge_ids = static_cast<dgl_id_t*>(edge_ids_arr->data);
const dgl_id_t idoff = indptr[start_pos];
g_indptr[0] = 0;
for (int l = start_pos + 1; l < end_pos + 1; ++l) {
g_indptr[l - start_pos] = bg_indptr[l] - bg_indptr[start_pos];
g_indptr[l - start_pos] = indptr[l] - indptr[start_pos];
}
for (int j = bg_indptr[start_pos]; j < bg_indptr[end_pos]; ++j) {
g_indices.push_back(bg_indices[j] - cumsum[i]);
for (int j = indptr[start_pos]; j < indptr[end_pos]; ++j) {
g_indices[j - idoff] = indices[j] - cumsum[i];
}
for (int k = bg_indptr[start_pos]; k < bg_indptr[end_pos]; ++k) {
g_edge_ids.push_back(in_csr_ptr->edge_ids[k] - cum_sum_edges);
for (int k = indptr[start_pos]; k < indptr[end_pos]; ++k) {
g_edge_ids[k - idoff] = edge_ids[k] - cum_sum_edges;
}
cum_sum_edges += g_num_edges;
ImmutableGraph graph(g_in_csr_ptr, nullptr);
rst.push_back(graph);
CSRPtr g_in_csr_ptr = CSRPtr(new CSR(indptr_arr, indices_arr, edge_ids_arr));
rst.emplace_back(g_in_csr_ptr, nullptr);
}
return rst;
}
......
This diff is collapsed.
......@@ -85,10 +85,10 @@ DGL_REGISTER_GLOBAL("network._CAPI_SenderSendSubgraph")
CommunicatorHandle chandle = args[0];
int recv_id = args[1];
GraphHandle ghandle = args[2];
const IdArray node_mapping = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[3]));
const IdArray edge_mapping = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[4]));
const IdArray layer_offsets = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[5]));
const IdArray flow_offsets = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[6]));
const IdArray node_mapping = args[3];
const IdArray edge_mapping = args[4];
const IdArray layer_offsets = args[5];
const IdArray flow_offsets = args[6];
ImmutableGraph *ptr = static_cast<ImmutableGraph*>(ghandle);
network::Sender* sender = static_cast<network::Sender*>(chandle);
auto csr = ptr->GetInCSR();
......@@ -160,7 +160,7 @@ DGL_REGISTER_GLOBAL("network._CAPI_ReceiverRecvSubgraph")
int control = *buffer;
if (control == CONTROL_NODEFLOW) {
NodeFlow* nf = new NodeFlow();
ImmutableGraph::CSR::Ptr csr;
CSRPtr csr;
// Deserialize nodeflow from recv_data_buffer
network::DeserializeSampledSubgraph(buffer+sizeof(CONTROL_NODEFLOW),
&(csr),
......@@ -168,7 +168,7 @@ DGL_REGISTER_GLOBAL("network._CAPI_ReceiverRecvSubgraph")
&(nf->edge_mapping),
&(nf->layer_offsets),
&(nf->flow_offsets));
nf->graph = GraphPtr(new ImmutableGraph(csr, nullptr, false));
nf->graph = GraphPtr(new ImmutableGraph(csr, nullptr));
std::vector<NodeFlow*> subgs(1);
subgs[0] = nf;
*rv = WrapVectorReturn(subgs);
......
......@@ -18,7 +18,7 @@ namespace network {
const int kNumTensor = 7; // We need to serialize 7 conponents (tensor) here
int64_t SerializeSampledSubgraph(char* data,
const ImmutableGraph::CSR::Ptr csr,
const CSRPtr csr,
const IdArray& node_mapping,
const IdArray& edge_mapping,
const IdArray& layer_offsets,
......@@ -30,9 +30,9 @@ int64_t SerializeSampledSubgraph(char* data,
int64_t edge_mapping_size = edge_mapping->shape[0] * sizeof(dgl_id_t);
int64_t layer_offsets_size = layer_offsets->shape[0] * sizeof(dgl_id_t);
int64_t flow_offsets_size = flow_offsets->shape[0] * sizeof(dgl_id_t);
int64_t indptr_size = csr->indptr.size() * sizeof(int64_t);
int64_t indices_size = csr->indices.size() * sizeof(dgl_id_t);
int64_t edge_ids_size = csr->edge_ids.size() * sizeof(dgl_id_t);
int64_t indptr_size = csr->indptr().GetSize();
int64_t indices_size = csr->indices().GetSize();
int64_t edge_ids_size = csr->edge_ids().GetSize();
total_size += node_mapping_size;
total_size += edge_mapping_size;
total_size += layer_offsets_size;
......@@ -52,9 +52,9 @@ int64_t SerializeSampledSubgraph(char* data,
dgl_id_t* edge_map_data = static_cast<dgl_id_t*>(edge_mapping->data);
dgl_id_t* layer_off_data = static_cast<dgl_id_t*>(layer_offsets->data);
dgl_id_t* flow_off_data = static_cast<dgl_id_t*>(flow_offsets->data);
int64_t* indptr = static_cast<int64_t*>(csr->indptr.data());
dgl_id_t* indices = static_cast<dgl_id_t*>(csr->indices.data());
dgl_id_t* edge_ids = static_cast<dgl_id_t*>(csr->edge_ids.data());
dgl_id_t* indptr = static_cast<dgl_id_t*>(csr->indptr()->data);
dgl_id_t* indices = static_cast<dgl_id_t*>(csr->indices()->data);
dgl_id_t* edge_ids = static_cast<dgl_id_t*>(csr->edge_ids()->data);
// node_mapping
*(reinterpret_cast<int64_t*>(data_ptr)) = node_mapping_size;
data_ptr += sizeof(int64_t);
......@@ -94,7 +94,7 @@ int64_t SerializeSampledSubgraph(char* data,
}
void DeserializeSampledSubgraph(char* data,
ImmutableGraph::CSR::Ptr* csr,
CSRPtr* csr,
IdArray* node_mapping,
IdArray* edge_mapping,
IdArray* layer_offsets,
......@@ -139,25 +139,24 @@ void DeserializeSampledSubgraph(char* data,
memcpy(edge_mapping_data, data_ptr, tensor_size);
data_ptr += tensor_size;
// Construct sub_csr_graph
*csr = std::make_shared<ImmutableGraph::CSR>(num_vertices, num_edges);
(*csr)->indices.resize(num_edges);
(*csr)->edge_ids.resize(num_edges);
// TODO(minjie): multigraph flag
*csr = CSRPtr(new CSR(num_vertices, num_edges, false));
// indices (CSR)
tensor_size = *(reinterpret_cast<int64_t*>(data_ptr));
data_ptr += sizeof(int64_t);
dgl_id_t* col_list_out = (*csr)->indices.data();
dgl_id_t* col_list_out = static_cast<dgl_id_t*>((*csr)->indices()->data);
memcpy(col_list_out, data_ptr, tensor_size);
data_ptr += tensor_size;
// edge_ids (CSR)
tensor_size = *(reinterpret_cast<int64_t*>(data_ptr));
data_ptr += sizeof(int64_t);
dgl_id_t* edge_ids = (*csr)->edge_ids.data();
dgl_id_t* edge_ids = static_cast<dgl_id_t*>((*csr)->edge_ids()->data);
memcpy(edge_ids, data_ptr, tensor_size);
data_ptr += tensor_size;
// indptr (CSR)
tensor_size = *(reinterpret_cast<int64_t*>(data_ptr));
data_ptr += sizeof(int64_t);
int64_t* indptr_out = (*csr)->indptr.data();
dgl_id_t* indptr_out = static_cast<dgl_id_t*>((*csr)->indptr()->data);
memcpy(indptr_out, data_ptr, tensor_size);
data_ptr += tensor_size;
}
......
......@@ -23,7 +23,7 @@ namespace network {
* \return the total size of the serialized binary data
*/
int64_t SerializeSampledSubgraph(char* data,
const ImmutableGraph::CSR::Ptr csr,
const CSRPtr csr,
const IdArray& node_mapping,
const IdArray& edge_mapping,
const IdArray& layer_offsets,
......@@ -39,7 +39,7 @@ int64_t SerializeSampledSubgraph(char* data,
* \param flow_offsets flow offsets in NodeFlowIndex
*/
void DeserializeSampledSubgraph(char* data,
ImmutableGraph::CSR::Ptr* csr,
CSRPtr* csr,
IdArray* node_mapping,
IdArray* edge_mapping,
IdArray* layer_offsets,
......
......@@ -11,38 +11,42 @@
#include "../c_api_common.h"
using dgl::runtime::DGLArgs;
using dgl::runtime::DGLArgValue;
using dgl::runtime::DGLRetValue;
using dgl::runtime::PackedFunc;
namespace dgl {
std::vector<IdArray> GetNodeFlowSlice(const ImmutableGraph &graph, const std::string &fmt,
size_t layer0_size, size_t layer1_start,
size_t layer1_end, bool remap) {
CHECK_GE(layer1_start, layer0_size);
if (fmt == "csr") {
if (fmt == std::string("csr")) {
dgl_id_t first_vid = layer1_start - layer0_size;
ImmutableGraph::CSRArray arrs = graph.GetInCSRArray(layer1_start, layer1_end);
CSRMatrix csr = SliceRows(graph.GetInCSR()->ToCSRMatrix(), layer1_start, layer1_end);
if (remap) {
dgl_id_t *indices_data = static_cast<dgl_id_t*>(arrs.indices->data);
dgl_id_t *eid_data = static_cast<dgl_id_t*>(arrs.id->data);
const size_t len = arrs.indices->shape[0];
dgl_id_t first_eid = eid_data[0];
for (size_t i = 0; i < len; i++) {
CHECK_GE(indices_data[i], first_vid);
indices_data[i] -= first_vid;
CHECK_GE(eid_data[i], first_eid);
eid_data[i] -= first_eid;
}
dgl_id_t *eid_data = static_cast<dgl_id_t*>(csr.data->data);
const dgl_id_t first_eid = eid_data[0];
IdArray new_indices = Sub(csr.indices, first_vid);
IdArray new_data = Sub(csr.data, first_eid);
return {csr.indptr, new_indices, new_data};
} else {
return {csr.indptr, csr.indices, csr.data};
}
return std::vector<IdArray>{arrs.indptr, arrs.indices, arrs.id};
} else if (fmt == "coo") {
ImmutableGraph::CSR::Ptr csr = graph.GetInCSR();
int64_t nnz = csr->indptr[layer1_end] - csr->indptr[layer1_start];
IdArray idx = IdArray::Empty({2 * nnz}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
IdArray eid = IdArray::Empty({nnz}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
} else if (fmt == std::string("coo")) {
CSRMatrix csr = graph.GetInCSR()->ToCSRMatrix();
const dgl_id_t* indptr = static_cast<dgl_id_t*>(csr.indptr->data);
const dgl_id_t* indices = static_cast<dgl_id_t*>(csr.indices->data);
const dgl_id_t* edge_ids = static_cast<dgl_id_t*>(csr.data->data);
int64_t nnz = indptr[layer1_end] - indptr[layer1_start];
IdArray idx = NewIdArray(2 * nnz);
IdArray eid = NewIdArray(nnz);
int64_t *idx_data = static_cast<int64_t*>(idx->data);
dgl_id_t *eid_data = static_cast<dgl_id_t*>(eid->data);
size_t num_edges = 0;
for (size_t i = layer1_start; i < layer1_end; i++) {
for (int64_t j = csr->indptr[i]; j < csr->indptr[i + 1]; j++) {
for (dgl_id_t j = indptr[i]; j < indptr[i + 1]; j++) {
// These nodes are all in a layer. We need to remap them to the node id
// local to the layer.
idx_data[num_edges] = remap ? i - layer1_start : i;
......@@ -51,25 +55,38 @@ std::vector<IdArray> GetNodeFlowSlice(const ImmutableGraph &graph, const std::st
}
CHECK_EQ(num_edges, nnz);
if (remap) {
size_t edge_start = csr->indptr[layer1_start];
dgl_id_t first_eid = csr->edge_ids[edge_start];
size_t edge_start = indptr[layer1_start];
dgl_id_t first_eid = edge_ids[edge_start];
dgl_id_t first_vid = layer1_start - layer0_size;
for (int64_t i = 0; i < nnz; i++) {
CHECK_GE(csr->indices[edge_start + i], first_vid);
idx_data[nnz + i] = csr->indices[edge_start + i] - first_vid;
eid_data[i] = csr->edge_ids[edge_start + i] - first_eid;
CHECK_GE(indices[edge_start + i], first_vid);
idx_data[nnz + i] = indices[edge_start + i] - first_vid;
eid_data[i] = edge_ids[edge_start + i] - first_eid;
}
} else {
std::copy(csr->indices.begin() + csr->indptr[layer1_start],
csr->indices.begin() + csr->indptr[layer1_end], idx_data + nnz);
std::copy(csr->edge_ids.begin() + csr->indptr[layer1_start],
csr->edge_ids.begin() + csr->indptr[layer1_end], eid_data);
std::copy(indices + indptr[layer1_start],
indices + indptr[layer1_end], idx_data + nnz);
std::copy(edge_ids + indptr[layer1_start],
edge_ids + indptr[layer1_end], eid_data);
}
return std::vector<IdArray>{idx, eid};
} else {
LOG(FATAL) << "unsupported adjacency matrix format";
return std::vector<IdArray>();
return {};
}
}
DGL_REGISTER_GLOBAL("nodeflow._CAPI_NodeFlowGetBlockAdj")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
std::string format = args[1];
int64_t layer0_size = args[2];
int64_t start = args[3];
int64_t end = args[4];
const GraphInterface *ptr = static_cast<const GraphInterface *>(ghandle);
const ImmutableGraph* gptr = dynamic_cast<const ImmutableGraph*>(ptr);
auto res = GetNodeFlowSlice(*gptr, format, layer0_size, start, end, true);
*rv = ConvertNDArrayVectorToPackedFunc(res);
});
} // namespace dgl
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment