Unverified Commit 5dd35580 authored by Minjie Wang's avatar Minjie Wang Committed by GitHub
Browse files

[Feature] Improve sampling speed; Better pickle/unpickle; other fixes (#1299)

* improve performance of sample_neighbors

* some more improve

* test script

* benchmarks

* multi process

* update more tests

* WIP

* adding two API for state saving

* add create from state

* upd test

* missing file

* wip: pickle/unpickle

* more c apis

* find the problem of empty data array

* add null array; pickling speed is bad

* still bad perf

* still bad perf

* wip

* fix the pickle speed test; now everything looks good

* minor fix

* bugfix

* some lint fix

* address comments

* more fix

* fix lint

* add utest for random.choice

* add utest for dgl.rand_graph

* fix cpp utests

* try fix ci

* fix bug in TF backend

* upd choice docstring

* address comments

* upd

* try fix compile

* add comment
parent 00ba4094
......@@ -9,13 +9,16 @@
#ifndef DGL_ARRAY_H_
#define DGL_ARRAY_H_
#include <dgl/runtime/ndarray.h>
#include <dmlc/io.h>
#include <dmlc/serializer.h>
#include <algorithm>
#include <vector>
#include <tuple>
#include <utility>
#include <string>
#include "./runtime/ndarray.h"
#include "./runtime/object.h"
namespace dgl {
......@@ -31,12 +34,75 @@ typedef NDArray IntArray;
typedef NDArray FloatArray;
typedef NDArray TypeArray;
/*!
 * \brief Sparse format.
 */
enum class SparseFormat {
  ANY = 0,  // No format preference; let the implementation decide.
  COO = 1,  // Coordinate list.
  CSR = 2,  // Compressed sparse row.
  CSC = 3   // Compressed sparse column.
};
// Parse a sparse format from its lower-case string name.
// Any unrecognized name maps to SparseFormat::ANY.
inline SparseFormat ParseSparseFormat(const std::string& name) {
  if (name == "coo") return SparseFormat::COO;
  if (name == "csr") return SparseFormat::CSR;
  if (name == "csc") return SparseFormat::CSC;
  return SparseFormat::ANY;
}
// Sparse matrix object that is exposed to python API.
struct SparseMatrix : public runtime::Object {
  // Sparse format code (values of the SparseFormat enum, stored as int32_t
  // so it crosses the FFI boundary as a plain integer).
  int32_t format = 0;
  // Shape of this matrix.
  int64_t num_rows = 0, num_cols = 0;
  // Index arrays. For CSR, it is {indptr, indices, data}. For COO, it is {row, col, data}.
  std::vector<IdArray> indices;
  // Boolean flags. For CSR, it is {sorted}. For COO, it is {row_sorted, col_sorted}.
  // TODO(minjie): We might revisit this later to provide a more general solution. Currently,
  // we only consider aten::COOMatrix and aten::CSRMatrix.
  std::vector<bool> flags;

  // Default-construct an empty 0x0 matrix.
  SparseMatrix() {}

  // Construct from all components.
  SparseMatrix(int32_t fmt, int64_t nrows, int64_t ncols,
               const std::vector<IdArray>& idx,
               const std::vector<bool>& flg)
    : format(fmt), num_rows(nrows), num_cols(ncols), indices(idx), flags(flg) {}

  static constexpr const char* _type_key = "aten.SparseMatrix";
  DGL_DECLARE_OBJECT_TYPE_INFO(SparseMatrix, runtime::Object);
};
// Define SparseMatrixRef
DGL_DEFINE_OBJECT_REF(SparseMatrixRef, SparseMatrix);
namespace aten {
//////////////////////////////////////////////////////////////////////
// ID array
//////////////////////////////////////////////////////////////////////
/*!
 * \return A special array to represent null.
 *
 * The sentinel is a zero-length int64 array on CPU; use IsNullArray() to
 * test for it.
 */
inline NDArray NullArray() {
  return NDArray::Empty({0}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
}
/*!
 * \brief Check whether the input array is the null sentinel.
 * \param array The array to test. An undefined (default-constructed)
 *        NDArray is also treated as null.
 * \return Whether the input array is a null array.
 */
inline bool IsNullArray(NDArray array) {
  // Guard the shape dereference: default-constructed IdArray() values still
  // appear in older code paths, and array->shape would crash on them. The
  // pre-refactor checks used data.defined() for exactly this purpose.
  return !array.defined() || array->shape[0] == 0;
}
/*!
* \brief Create a new id array with given length
* \param length The array length
......@@ -232,7 +298,7 @@ struct CSRMatrix {
int64_t num_rows = 0, num_cols = 0;
/*! \brief CSR index arrays */
IdArray indptr, indices;
/*! \brief data index array. When empty, assume it is from 0 to NNZ - 1. */
/*! \brief data index array. When is null, assume it is from 0 to NNZ - 1. */
IdArray data;
/*! \brief whether the column indices per row are sorted */
bool sorted = false;
......@@ -240,10 +306,24 @@ struct CSRMatrix {
CSRMatrix() = default;
/*! \brief constructor */
CSRMatrix(int64_t nrows, int64_t ncols,
IdArray parr, IdArray iarr, IdArray darr = IdArray(),
IdArray parr, IdArray iarr, IdArray darr = NullArray(),
bool sorted_flag = false)
: num_rows(nrows), num_cols(ncols), indptr(parr), indices(iarr),
data(darr), sorted(sorted_flag) {}
/*! \brief constructor from SparseMatrix object */
explicit CSRMatrix(const SparseMatrix& spmat)
: num_rows(spmat.num_rows), num_cols(spmat.num_cols),
indptr(spmat.indices[0]), indices(spmat.indices[1]), data(spmat.indices[2]),
sorted(spmat.flags[0]) {}
// Convert to a SparseMatrix object that can return to python.
SparseMatrix ToSparseMatrix() const {
return SparseMatrix(static_cast<int32_t>(SparseFormat::CSR),
num_rows, num_cols,
{indptr, indices, data},
{sorted});
}
};
/*!
......@@ -262,7 +342,7 @@ struct COOMatrix {
int64_t num_rows = 0, num_cols = 0;
/*! \brief COO index arrays */
IdArray row, col;
/*! \brief data index array. When empty, assume it is from 0 to NNZ - 1. */
/*! \brief data index array. When is null, assume it is from 0 to NNZ - 1. */
IdArray data;
/*! \brief whether the row indices are sorted */
bool row_sorted = false;
......@@ -272,10 +352,24 @@ struct COOMatrix {
COOMatrix() = default;
/*! \brief constructor */
COOMatrix(int64_t nrows, int64_t ncols,
IdArray rarr, IdArray carr, IdArray darr = IdArray(),
IdArray rarr, IdArray carr, IdArray darr = NullArray(),
bool rsorted = false, bool csorted = false)
: num_rows(nrows), num_cols(ncols), row(rarr), col(carr), data(darr),
row_sorted(rsorted), col_sorted(csorted) {}
/*! \brief constructor from SparseMatrix object */
explicit COOMatrix(const SparseMatrix& spmat)
: num_rows(spmat.num_rows), num_cols(spmat.num_cols),
row(spmat.indices[0]), col(spmat.indices[1]), data(spmat.indices[2]),
row_sorted(spmat.flags[0]), col_sorted(spmat.flags[1]) {}
// Convert to a SparseMatrix object that can return to python.
SparseMatrix ToSparseMatrix() const {
return SparseMatrix(static_cast<int32_t>(SparseFormat::COO),
num_rows, num_cols,
{row, col, data},
{row_sorted, col_sorted});
}
};
///////////////////////// CSR routines //////////////////////////
......@@ -300,7 +394,7 @@ runtime::NDArray CSRGetRowData(CSRMatrix , int64_t row);
/*! \brief Whether the CSR matrix contains data */
inline bool CSRHasData(CSRMatrix csr) {
return csr.data.defined();
return !IsNullArray(csr.data);
}
/* \brief Get data. The return type is an ndarray due to possible duplicate entries. */
......@@ -492,7 +586,7 @@ COOGetRowDataAndIndices(COOMatrix , int64_t row);
/*! \brief Whether the COO matrix contains data */
inline bool COOHasData(COOMatrix csr) {
return csr.data.defined();
return !IsNullArray(csr.data);
}
/*! \brief Get data. The return type is an ndarray due to possible duplicate entries. */
......
......@@ -34,16 +34,6 @@ enum class EdgeDir {
kOut // out edge direction
};
/*!
* \brief Sparse graph format.
*/
enum class SparseFormat {
ANY = 0,
COO = 1,
CSR = 2,
CSC = 3
};
/*!
* \brief Base heterogenous graph.
*
......@@ -488,6 +478,7 @@ struct HeteroSubgraph : public runtime::Object {
static constexpr const char* _type_key = "graph.HeteroSubgraph";
DGL_DECLARE_OBJECT_TYPE_INFO(HeteroSubgraph, runtime::Object);
};
// Define HeteroSubgraphRef
DGL_DEFINE_OBJECT_REF(HeteroSubgraphRef, HeteroSubgraph);
......@@ -547,18 +538,7 @@ struct FlattenedHeteroGraph : public runtime::Object {
};
DGL_DEFINE_OBJECT_REF(FlattenedHeteroGraphRef, FlattenedHeteroGraph);
// creators
inline SparseFormat ParseSparseFormat(const std::string& name) {
if (name == "coo")
return SparseFormat::COO;
else if (name == "csr")
return SparseFormat::CSR;
else if (name == "csc")
return SparseFormat::CSC;
else
return SparseFormat::ANY;
}
// Declarations of functions and algorithms
/*! \brief Create a heterograph from meta graph and a list of bipartite graph */
HeteroGraphPtr CreateHeteroGraph(
......@@ -612,6 +592,87 @@ HeteroSubgraph InEdgeGraph(const HeteroGraphPtr graph, const std::vector<IdArray
*/
HeteroSubgraph OutEdgeGraph(const HeteroGraphPtr graph, const std::vector<IdArray>& nodes);
}; // namespace dgl
/*!
* \brief Union multiple graphs into one with each input graph as one disjoint component.
*
* All input graphs should have the same metagraph.
*
* TODO(minjie): remove the meta_graph argument
*
* \param meta_graph Metagraph of the inputs and result.
* \param component_graphs Input graphs
* \return One graph that unions all the components
*/
HeteroGraphPtr DisjointUnionHeteroGraph(
GraphPtr meta_graph, const std::vector<HeteroGraphPtr>& component_graphs);
/*!
* \brief Split a graph into multiple disjoint components.
*
* Edges across different components are ignored. All the result graphs have the same
* metagraph as the input one.
*
* The `vertex_sizes` and `edge_sizes` arrays are the concatenation of arrays of each
* node/edge type. Suppose there are N vertex types, then the array length should
* be B*N, where B is the number of components to split.
*
* TODO(minjie): remove the meta_graph argument; use vector<IdArray> for vertex_sizes
* and edge_sizes.
*
* \param meta_graph Metagraph.
* \param batched_graph Input graph.
* \param vertex_sizes Number of vertices of each component.
* \param edge_sizes Number of edges of each component.
* \return A list of graphs representing each disjoint components.
*/
std::vector<HeteroGraphPtr> DisjointPartitionHeteroBySizes(
GraphPtr meta_graph,
HeteroGraphPtr batched_graph,
IdArray vertex_sizes,
IdArray edge_sizes);
/*!
* \brief Structure for pickle/unpickle.
*
* The design principle is to leverage the NDArray class as much as possible so
* that when they are converted to backend-specific tensors, we could leverage
* the efficient pickle/unpickle solutions from the backend framework.
*
* NOTE(minjie): This is a temporary solution before we support shared memory
* storage ourselves.
*
* This class can be used as arguments and return values of a C API.
*/
struct HeteroPickleStates : public runtime::Object {
/*! \brief Metagraph. */
GraphPtr metagraph;
/*! \brief adjacency matrices of each relation graph */
std::vector<std::shared_ptr<SparseMatrix> > adjs;
static constexpr const char* _type_key = "graph.HeteroPickleStates";
DGL_DECLARE_OBJECT_TYPE_INFO(HeteroPickleStates, runtime::Object);
};
// Define HeteroPickleStatesRef
DGL_DEFINE_OBJECT_REF(HeteroPickleStatesRef, HeteroPickleStates);
/*!
* \brief Create a heterograph from pickling states.
*
* \param states Pickle states
* \return A heterograph pointer
*/
HeteroGraphPtr HeteroUnpickle(const HeteroPickleStates& states);
/*!
* \brief Get the pickling state of the relation graph structure in backend tensors.
*
* \return Adjacency matrices of all relation graphs in a list of arrays.
*/
HeteroPickleStates HeteroPickle(HeteroGraphPtr graph);
} // namespace dgl
#endif // DGL_BASE_HETEROGRAPH_H_
......@@ -118,11 +118,31 @@ class RandomEngine {
* \tparam FloatType Probability value type
* \param num Number of integers to choose
* \param prob Array of N unnormalized probability of each element. Must be non-negative.
* \param out The output buffer to write selected indices.
* \param replace If true, choose with replacement.
* \return Integer array
*/
template <typename IdxType, typename FloatType>
IdArray Choice(int64_t num, FloatArray prob, bool replace = true);
void Choice(IdxType num, FloatArray prob, IdxType* out, bool replace = true);
/*!
* \brief Pick random integers between 0 to N-1 according to given probabilities
*
* If replace is false, the number of picked integers must not larger than N.
*
* \tparam IdxType Id type
* \tparam FloatType Probability value type
* \param num Number of integers to choose
* \param prob Array of N unnormalized probability of each element. Must be non-negative.
* \param replace If true, choose with replacement.
* \return Picked indices
*/
template <typename IdxType, typename FloatType>
IdArray Choice(IdxType num, FloatArray prob, bool replace = true) {
const DLDataType dtype{kDLInt, sizeof(IdxType) * 8, 1};
IdArray ret = IdArray::Empty({num}, dtype, prob->ctx);
Choice<IdxType, FloatType>(num, prob, static_cast<IdxType*>(ret->data), replace);
return ret;
}
/*!
* \brief Pick random integers from population by uniform distribution.
......@@ -132,11 +152,31 @@ class RandomEngine {
* \tparam IdxType Return integer type
* \param num Number of integers to choose
* \param population Total number of elements to choose from.
* \param out The output buffer to write selected indices.
* \param replace If true, choose with replacement.
* \return Integer array
*/
template <typename IdxType>
IdArray UniformChoice(int64_t num, int64_t population, bool replace = true);
void UniformChoice(IdxType num, IdxType population, IdxType* out, bool replace = true);
/*!
* \brief Pick random integers from population by uniform distribution.
*
* If replace is false, num must not be larger than population.
*
* \tparam IdxType Return integer type
* \param num Number of integers to choose
* \param population Total number of elements to choose from.
* \param replace If true, choose with replacement.
* \return Picked indices
*/
template <typename IdxType>
IdArray UniformChoice(IdxType num, IdxType population, bool replace = true) {
const DLDataType dtype{kDLInt, sizeof(IdxType) * 8, 1};
// TODO(minjie): only CPU implementation right now
IdArray ret = IdArray::Empty({num}, dtype, DLContext{kDLCPU, 0});
UniformChoice<IdxType>(num, population, static_cast<IdxType*>(ret->data), replace);
return ret;
}
private:
std::default_random_engine rng_;
......
......@@ -63,7 +63,7 @@ class Object {
*/
virtual void VisitAttrs(AttrVisitor* visitor) {}
/*! \return the type index of the object */
virtual const uint32_t type_index() const = 0;
virtual uint32_t type_index() const = 0;
/*!
* \brief Whether this object derives from object with type_index=tid.
* Implemented by DGL_DECLARE_OBJECT_TYPE_INFO
......@@ -71,7 +71,7 @@ class Object {
* \param tid The type index.
* \return the check result.
*/
virtual const bool _DerivedFrom(uint32_t tid) const;
virtual bool _DerivedFrom(uint32_t tid) const;
/*!
* \brief get a runtime unique type index given a type key
* \param type_key Type key of a type.
......@@ -211,11 +211,11 @@ class ObjectRef {
const char* type_key() const final { \
return TypeName::_type_key; \
} \
const uint32_t type_index() const final { \
uint32_t type_index() const final { \
static uint32_t tidx = TypeKey2Index(TypeName::_type_key); \
return tidx; \
} \
const bool _DerivedFrom(uint32_t tid) const final { \
bool _DerivedFrom(uint32_t tid) const final { \
static uint32_t tidx = TypeKey2Index(TypeName::_type_key); \
if (tidx == tid) return true; \
return Parent::_DerivedFrom(tid); \
......@@ -235,14 +235,14 @@ class ObjectRef {
return CHECK_NOTNULL(std::dynamic_pointer_cast<ObjectName>(obj_)); \
} \
operator bool() const { return this->defined(); } \
using ContainerType = ObjectName;
using ContainerType = ObjectName
/*! \brief Macro to generate object reference class definition */
#define DGL_DEFINE_OBJECT_REF(TypeName, ObjectName) \
class TypeName : public ::dgl::runtime::ObjectRef { \
public: \
DGL_DEFINE_OBJECT_REF_METHODS(TypeName, ::dgl::runtime::ObjectRef, ObjectName); \
};
}
// implementations of inline functions after this
template<typename T>
......
......@@ -23,6 +23,7 @@ from .batched_graph import *
from .batched_heterograph import *
from .convert import *
from .graph import DGLGraph
from .generators import *
from .heterograph import DGLHeteroGraph
from .nodeflow import *
from .traversal import *
......
......@@ -40,9 +40,10 @@ C_TO_PY_ARG_SWITCH[TypeCode.OBJECT_HANDLE] = _wrap_arg_func(
class ObjectBase(object):
"""Object base class"""
__slots__ = ["handle"]
# pylint: disable=no-member
def __del__(self):
if _LIB is not None:
if _LIB is not None and hasattr(self, 'handle'):
check_call(_LIB.DGLObjectFree(self.handle))
def __getattr__(self, name):
......
......@@ -272,8 +272,7 @@ class NDArrayBase(_NDArrayBase):
return self
def __repr__(self):
res = "<dgl.NDArray shape={0}, {1}>\n".format(self.shape, self.context)
res += self.asnumpy().__repr__()
res = "dgl.{0}@{1}".format(self.asnumpy().__repr__(), self.context)
return res
def __str__(self):
......
......@@ -35,7 +35,6 @@ class List(ObjectBase):
def __len__(self):
return _api_internal._ListSize(self)
@register_object
class Map(ObjectBase):
"""Map container of DGL.
......
"""Module for various graph generator functions."""
from . import backend as F
from . import convert
from . import random
__all__ = ['rand_graph']
def rand_graph(num_nodes, num_edges, restrict_format='any'):
    """Generate a random graph with the given numbers of nodes and edges.

    It uniformly chooses ``num_edges`` from all ``num_nodes * num_nodes``
    possible node pairs and forms a graph out of them.

    TODO(minjie): support RNG as one of the arguments.

    Parameters
    ----------
    num_nodes : int
        The number of nodes
    num_edges : int
        The number of edges
    restrict_format : 'any', 'coo', 'csr', 'csc', optional
        Force the storage format. Default: 'any' (i.e. let DGL decide what to use).

    Returns
    -------
    DGLHeteroGraph
        Generated random graph.
    """
    # Draw unique edge ids without replacement from [0, num_nodes^2); each id
    # encodes one (row, col) pair, so the result has no parallel edges.
    eids = random.choice(num_nodes * num_nodes, num_edges, replace=False)
    # Decode row/col via div and mod. True division may promote to float in
    # some backends; the astype back to the id dtype truncates, which equals
    # floor for these non-negative values.
    rows = F.astype(eids / num_nodes, F.dtype(eids))
    cols = F.astype(eids % num_nodes, F.dtype(eids))
    g = convert.graph((rows, cols),
                      card=num_nodes, validate=False,
                      restrict_format=restrict_format)
    return g
......@@ -183,12 +183,6 @@ class DGLHeteroGraph(object):
Edge feature storage. If None, empty frame is created.
Otherwise, ``edge_frames[i]`` stores the edge features
of edge type i. (default: None)
multigraph : bool, optional
Whether the graph would be a multigraph. If none, the flag will be
determined by scanning the whole graph. (default: None)
readonly : bool, optional
Whether the graph structure is read-only. Currently, only readonly
is allowed. (default: True).
"""
# pylint: disable=unused-argument
def __init__(self,
......@@ -196,25 +190,26 @@ class DGLHeteroGraph(object):
ntypes,
etypes,
node_frames=None,
edge_frames=None,
multigraph=None,
readonly=True):
assert readonly, "Only readonly heterogeneous graphs are supported"
edge_frames=None):
self._init(gidx, ntypes, etypes, node_frames, edge_frames)
def _init(self, gidx, ntypes, etypes, node_frames, edge_frames):
"""Init internal states."""
self._graph = gidx
self._nx_metagraph = None
self._ntypes = ntypes
self._etypes = etypes
self._canonical_etypes = make_canonical_etypes(etypes, ntypes, self._graph.metagraph)
self._nx_metagraph = None
self._canonical_etypes = make_canonical_etypes(
self._etypes, self._ntypes, self._graph.metagraph)
# An internal map from etype to canonical etype tuple.
# If two etypes have the same name, an empty tuple is stored instead to indicate ambiguity.
self._etype2canonical = {}
for i, ety in enumerate(etypes):
for i, ety in enumerate(self._etypes):
if ety in self._etype2canonical:
self._etype2canonical[ety] = tuple()
else:
self._etype2canonical[ety] = self._canonical_etypes[i]
self._ntypes_invmap = {t : i for i, t in enumerate(ntypes)}
self._ntypes_invmap = {t : i for i, t in enumerate(self._ntypes)}
self._etypes_invmap = {t : i for i, t in enumerate(self._canonical_etypes)}
# node and edge frame
......@@ -240,9 +235,16 @@ class DGLHeteroGraph(object):
frame.set_initializer(init.zero_initializer)
self._msg_frames.append(frame)
self._is_multigraph = multigraph
self._is_multigraph = None
def __getstate__(self):
return self._graph, self._ntypes, self._etypes, self._node_frames, self._edge_frames
def __setstate__(self, state):
self._init(*state)
def _get_msg_index(self, etid):
"""Internal function for getting the message index array of the given edge type id."""
if self._msg_indices[etid] is None:
self._msg_indices[etid] = utils.zero_index(
size=self._graph.number_of_edges(etid))
......
......@@ -25,27 +25,11 @@ class HeteroGraphIndex(ObjectBase):
return obj
def __getstate__(self):
metagraph = self.metagraph
number_of_nodes = [self.number_of_nodes(i) for i in range(self.number_of_ntypes())]
edges = [self.edges(i, order='eid') for i in range(self.number_of_etypes())]
# multigraph and readonly are not used.
return metagraph, number_of_nodes, edges
return _CAPI_DGLHeteroPickle(self)
def __setstate__(self, state):
metagraph, number_of_nodes, edges = state
self._cache = {}
# loop over etypes and recover unit graphs
rel_graphs = []
for i, edges_per_type in enumerate(edges):
src_ntype, dst_ntype = metagraph.find_edge(i)
num_src = number_of_nodes[src_ntype]
num_dst = number_of_nodes[dst_ntype]
src_id, dst_id, _ = edges_per_type
rel_graphs.append(create_unitgraph_from_coo(
1 if src_ntype == dst_ntype else 2, num_src, num_dst, src_id, dst_id, 'any'))
self.__init_handle_by_constructor__(
_CAPI_DGLHeteroCreateHeteroGraph, metagraph, rel_graphs)
self.__init_handle_by_constructor__(_CAPI_DGLHeteroUnpickle, state)
@property
def metagraph(self):
......@@ -1079,8 +1063,45 @@ def disjoint_partition(graph, bnn_all_types, bne_all_types):
return _CAPI_DGLHeteroDisjointPartitionBySizes(
graph, bnn_all_types.todgltensor(), bne_all_types.todgltensor())
#################################################################
# Data structure used by C APIs
#################################################################
@register_object("graph.FlattenedHeteroGraph")
class FlattenedHeteroGraph(ObjectBase):
"""FlattenedHeteroGraph object class in C++ backend."""
@register_object("graph.HeteroPickleStates")
class HeteroPickleStates(ObjectBase):
"""Pickle states object class in C++ backend."""
@property
def metagraph(self):
"""Metagraph
Returns
-------
GraphIndex
Metagraph structure
"""
return _CAPI_DGLHeteroPickleStatesGetMetagraph(self)
@property
def adjs(self):
"""Adjacency matrices of all the relation graphs
Returns
-------
list of dgl.ndarray.SparseMatrix
Adjacency matrices
"""
return list(_CAPI_DGLHeteroPickleStatesGetAdjs(self))
def __getstate__(self):
return self.metagraph, self.adjs
def __setstate__(self, state):
metagraph, adjs = state
self.__init_handle_by_constructor__(
_CAPI_DGLCreateHeteroPickleStates, metagraph, adjs)
_init_api("dgl.heterograph_index")
......@@ -2,7 +2,7 @@
from __future__ import absolute_import
from ._ffi.function import _init_api
from .ndarray import empty
from .ndarray import null
# pylint: disable=invalid-name
def infer_binary_feature_shape(op, lhs, rhs):
......@@ -136,11 +136,11 @@ def binary_op_reduce(reducer, op, G, A_target, B_target, A, B, out,
The rows to write to output tensor.
"""
if A_rows is None:
A_rows = empty([])
A_rows = null()
if B_rows is None:
B_rows = empty([])
B_rows = null()
if out_rows is None:
out_rows = empty([])
out_rows = null()
_CAPI_DGLKernelBinaryOpReduce(
reducer, op, G,
int(A_target), int(B_target),
......@@ -200,11 +200,11 @@ def backward_lhs_binary_op_reduce(
The rows written to output tensor.
"""
if A_rows is None:
A_rows = empty([])
A_rows = null()
if B_rows is None:
B_rows = empty([])
B_rows = null()
if out_rows is None:
out_rows = empty([])
out_rows = null()
_CAPI_DGLKernelBackwardLhsBinaryOpReduce(
reducer, op, G,
int(A_target), int(B_target),
......@@ -265,11 +265,11 @@ def backward_rhs_binary_op_reduce(
The rows written to output tensor.
"""
if A_rows is None:
A_rows = empty([])
A_rows = null()
if B_rows is None:
B_rows = empty([])
B_rows = null()
if out_rows is None:
out_rows = empty([])
out_rows = null()
_CAPI_DGLKernelBackwardRhsBinaryOpReduce(
reducer, op, G,
int(A_target), int(B_target),
......@@ -364,9 +364,9 @@ def copy_reduce(reducer, G, target,
The rows to write to output tensor.
"""
if X_rows is None:
X_rows = empty([])
X_rows = null()
if out_rows is None:
out_rows = empty([])
out_rows = null()
_CAPI_DGLKernelCopyReduce(
reducer, G, int(target),
X, out, X_rows, out_rows)
......@@ -406,9 +406,9 @@ def backward_copy_reduce(reducer, G, target,
The rows written to output tensor.
"""
if X_rows is None:
X_rows = empty([])
X_rows = null()
if out_rows is None:
out_rows = empty([])
out_rows = null()
_CAPI_DGLKernelBackwardCopyReduce(
reducer, G, int(target),
X, out, grad_out, grad_X,
......
......@@ -11,6 +11,8 @@ import functools
import operator
import numpy as _np
from ._ffi.object import register_object, ObjectBase
from ._ffi.function import _init_api
from ._ffi.ndarray import DGLContext, DGLType, NDArrayBase
from ._ffi.ndarray import context, empty, from_dlpack, numpyasarray
from ._ffi.ndarray import _set_class_ndarray
......@@ -88,4 +90,98 @@ def zerocopy_from_numpy(np_data):
handle = ctypes.pointer(arr)
return NDArray(handle, is_view=True)
def null():
    """Return an ndarray used as the null-value sentinel.

    The sentinel is a zero-length int64 array, so it can be safely converted
    to other backend tensors.

    Returns
    -------
    NDArray
        A null array
    """
    empty_idx = _np.zeros((0,), dtype=_np.int64)
    return array(empty_idx)
class SparseFormat:
    """Integer codes for sparse matrix formats used by the C++ backend."""
    ANY = 0
    COO = 1
    CSR = 2
    CSC = 3

    # Reverse lookup table: format code -> canonical name.
    FORMAT2STR = dict(enumerate(('ANY', 'COO', 'CSR', 'CSC')))
@register_object('aten.SparseMatrix')
class SparseMatrix(ObjectBase):
    """Sparse matrix object class in C++ backend.

    Mirrors the C++ ``aten::SparseMatrix`` object; all properties are
    fetched through C API calls.
    """
    @property
    def format(self):
        """Sparse format enum code (see :class:`SparseFormat`).

        Returns
        -------
        int
        """
        return _CAPI_DGLSparseMatrixGetFormat(self)

    @property
    def num_rows(self):
        """Number of rows.

        Returns
        -------
        int
        """
        return _CAPI_DGLSparseMatrixGetNumRows(self)

    @property
    def num_cols(self):
        """Number of columns.

        Returns
        -------
        int
        """
        return _CAPI_DGLSparseMatrixGetNumCols(self)

    @property
    def indices(self):
        """Index arrays.

        For CSR the three arrays are {indptr, indices, data}; for COO they
        are {row, col, data}.

        Returns
        -------
        list of ndarrays
        """
        # There are always three index arrays on the C++ side.
        ret = [_CAPI_DGLSparseMatrixGetIndices(self, i) for i in range(3)]
        return [F.zerocopy_from_dgl_ndarray(arr) for arr in ret]
        #return [F.zerocopy_from_dgl_ndarray(v.data) for v in ret]

    @property
    def flags(self):
        """Flag arrays.

        For CSR: {sorted}; for COO: {row_sorted, col_sorted}.

        Returns
        -------
        list of boolean
        """
        return [v.data for v in _CAPI_DGLSparseMatrixGetFlags(self)]

    def __getstate__(self):
        # Pickle as plain python/backend values so backend-specific tensor
        # serialization can be leveraged for the index arrays.
        return self.format, self.num_rows, self.num_cols, self.indices, self.flags

    def __setstate__(self, state):
        fmt, nrows, ncols, indices, flags = state
        # Convert backend tensors back to DGL ndarrays before handing them to
        # the C++ constructor.
        indices = [F.zerocopy_to_dgl_ndarray(idx) for idx in indices]
        self.__init_handle_by_constructor__(
            _CAPI_DGLCreateSparseMatrix, fmt, nrows, ncols, indices, flags)

    def __repr__(self):
        return 'SparseMatrix(fmt="{}", shape=({},{}))'.format(
            SparseFormat.FORMAT2STR[self.format], self.num_rows, self.num_cols)
_set_class_ndarray(NDArray)
_init_api("dgl.ndarray")
"""Pyhton interfaces to DGL random number generators."""
"""Python interfaces to DGL random number generators."""
import numpy as np
from ._ffi.function import _init_api
from . import backend as F
from . import ndarray as nd
def seed(val):
"""Set the seed of randomized methods in DGL.
......@@ -13,4 +17,75 @@ def seed(val):
"""
_CAPI_SetSeed(val)
def choice(a, size, replace=True, prob=None):  # pylint: disable=invalid-name
    """An equivalent to :func:`numpy.random.choice`.

    Use this function if you:

    * Perform a non-uniform sampling (probability tensor is given).
    * Sample a small set from a very large population (ratio <5%) uniformly
      *without* replacement.
    * Have a backend tensor on hand and do not want to convert it to numpy
      back and forth.

    Compared to :func:`numpy.random.choice`, it is slower when replace is True
    and is comparable when replace is False. It wins when the population is
    very large and the number of draws is quite small (e.g., draw <5%). The
    reasons are twofold:

    * When ``a`` is a large integer, it avoids creating a large range array as
      numpy does.
    * When the draw ratio is small, it switches to a hashmap based implementation.

    It out-performs numpy for non-uniform sampling in general cases.

    TODO(minjie): support RNG as one of the arguments.

    Parameters
    ----------
    a : 1-D tensor or int
        If an ndarray, a random sample is generated from its elements. If an int,
        the random sample is generated as if ``a`` were ``F.arange(a)``.
    size : int or tuple of ints
        Output shape. E.g., for size ``(m, n, k)``, then ``m * n * k`` samples are drawn.
    replace : bool, optional
        If true, sample with replacement.
    prob : 1-D tensor, optional
        The probabilities associated with each entry in a.
        If not given the sample assumes a uniform distribution over all entries in a.

    Returns
    -------
    samples : 1-D tensor
        The generated random samples
    """
    # Total number of draws; a tuple size is drawn flat and reshaped at the end.
    if isinstance(size, tuple):
        num = np.prod(size)
    else:
        num = size
    # Population size: length of the candidate tensor, or the int range itself.
    if F.is_tensor(a):
        population = F.shape(a)[0]
    else:
        population = a
    # A null ndarray signals uniform sampling to the C API.
    if prob is None:
        prob = nd.null()
    else:
        prob = F.zerocopy_to_dgl_ndarray(prob)
    bits = 64  # index array is in 64-bit
    chosen_idx = _CAPI_Choice(int(num), int(population), prob, bool(replace), bits)
    chosen_idx = F.zerocopy_from_dgl_ndarray(chosen_idx)
    # When candidates were given as a tensor, map indices back to its entries.
    if F.is_tensor(a):
        chosen = F.gather_row(a, chosen_idx)
    else:
        chosen = chosen_idx
    if isinstance(size, tuple):
        return F.reshape(chosen, size)
    else:
        return chosen
_init_api('dgl.rng', __name__)
......@@ -11,7 +11,7 @@ __all__ = [
'sample_neighbors',
'select_topk']
def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=True):
def sample_neighbors(g, nodes, fanout, edge_dir='in', prob=None, replace=False):
"""Sample from the neighbors of the given nodes and return the induced subgraph.
When sampling with replacement, the sampled subgraph could have parallel edges.
......
......@@ -4,14 +4,15 @@
* \brief DGL array utilities implementation
*/
#include <dgl/array.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/container.h>
#include "../c_api_common.h"
#include "./array_op.h"
#include "./arith.h"
namespace dgl {
using runtime::NDArray;
using namespace dgl::runtime;
namespace dgl {
namespace aten {
IdArray NewIdArray(int64_t length, DLContext ctx, uint8_t nbits) {
......@@ -437,7 +438,7 @@ COOMatrix CSRRowWiseSampling(
CSRMatrix mat, IdArray rows, int64_t num_samples, FloatArray prob, bool replace) {
COOMatrix ret;
ATEN_CSR_SWITCH(mat, XPU, IdType, {
if (!prob.defined() || prob->shape[0] == 0) {
if (IsNullArray(prob)) {
ret = impl::CSRRowWiseSamplingUniform<XPU, IdType>(mat, rows, num_samples, replace);
} else {
ATEN_FLOAT_TYPE_SWITCH(prob->dtype, FloatType, "probability", {
......@@ -580,7 +581,7 @@ COOMatrix COORowWiseSampling(
COOMatrix mat, IdArray rows, int64_t num_samples, FloatArray prob, bool replace) {
COOMatrix ret;
ATEN_COO_SWITCH(mat, XPU, IdType, {
if (!prob.defined() || prob->shape[0] == 0) {
if (IsNullArray(prob)) {
ret = impl::COORowWiseSamplingUniform<XPU, IdType>(mat, rows, num_samples, replace);
} else {
ATEN_FLOAT_TYPE_SWITCH(prob->dtype, FloatType, "probability", {
......@@ -612,5 +613,55 @@ std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo) {
return ret;
}
///////////////////////// C APIs /////////////////////////

// Getter: format code of a SparseMatrix.
DGL_REGISTER_GLOBAL("ndarray._CAPI_DGLSparseMatrixGetFormat")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    SparseMatrixRef spmat = args[0];
    *rv = spmat->format;
  });

// Getter: number of rows.
DGL_REGISTER_GLOBAL("ndarray._CAPI_DGLSparseMatrixGetNumRows")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    SparseMatrixRef spmat = args[0];
    *rv = spmat->num_rows;
  });

// Getter: number of columns.
DGL_REGISTER_GLOBAL("ndarray._CAPI_DGLSparseMatrixGetNumCols")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    SparseMatrixRef spmat = args[0];
    *rv = spmat->num_cols;
  });

// Getter: the i-th index array (0 <= i < 3).
DGL_REGISTER_GLOBAL("ndarray._CAPI_DGLSparseMatrixGetIndices")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    SparseMatrixRef spmat = args[0];
    const int64_t i = args[1];
    *rv = spmat->indices[i];
  });

// Getter: boolean flags, wrapped into a List<Value> for the FFI.
DGL_REGISTER_GLOBAL("ndarray._CAPI_DGLSparseMatrixGetFlags")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    SparseMatrixRef spmat = args[0];
    List<Value> flags;
    for (bool flg : spmat->flags) {
      flags.push_back(Value(MakeValue(flg)));
    }
    *rv = flags;
  });

// Constructor: build a SparseMatrix from format code, shape, index arrays and
// flags coming from the python side (used by SparseMatrix.__setstate__).
DGL_REGISTER_GLOBAL("ndarray._CAPI_DGLCreateSparseMatrix")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    const int32_t format = args[0];
    const int64_t nrows = args[1];
    const int64_t ncols = args[2];
    const List<Value> indices = args[3];
    const List<Value> flags = args[4];
    std::shared_ptr<SparseMatrix> spmat(new SparseMatrix(
        format, nrows, ncols,
        ListValueToVector<IdArray>(indices),
        ListValueToVector<bool>(flags)));
    *rv = SparseMatrixRef(spmat);
  });
} // namespace aten
} // namespace dgl
......@@ -39,15 +39,14 @@ class IdHashMap {
void Update(IdArray ids) {
const IdType* ids_data = static_cast<IdType*>(ids->data);
const int64_t len = ids->shape[0];
IdType newid = oldv2newv_.size();
for (int64_t i = 0; i < len; ++i) {
const IdType id = ids_data[i];
if (!Contains(id)) {
oldv2newv_[id] = newid++;
// std::unorderd_map::insert assures that an insertion will not happen if the
// key already exists.
oldv2newv_.insert({id, oldv2newv_.size()});
filter_[id & kFilterMask] = true;
}
}
}
// Return true if the given id is contained in this hashmap.
bool Contains(IdType id) const {
......
......@@ -40,7 +40,7 @@ std::pair<COOMatrix, IdArray> COOCoalesce(COOMatrix coo) {
COOMatrix coo_result = COOMatrix{
coo.num_rows, coo.num_cols, NDArray::FromVector(new_row), NDArray::FromVector(new_col),
NDArray(), true};
NullArray(), true};
return std::make_pair(coo_result, NDArray::FromVector(count));
}
......
......@@ -62,6 +62,9 @@ COOMatrix CSRRowWisePick(CSRMatrix mat, IdArray rows,
// array. The implementation consumes a little extra memory than the actual requirement.
//
// Otherwise, directly use the row and col arrays to construct the result COO matrix.
//
// [02/29/2020 update]: OMP is disabled for now since batch-wise parallelism is more
// significant. (minjie)
IdArray picked_row = Full(-1, num_rows * num_picks, sizeof(IdxType) * 8, ctx);
IdArray picked_col = Full(-1, num_rows * num_picks, sizeof(IdxType) * 8, ctx);
IdArray picked_idx = Full(-1, num_rows * num_picks, sizeof(IdxType) * 8, ctx);
......@@ -73,7 +76,7 @@ COOMatrix CSRRowWisePick(CSRMatrix mat, IdArray rows,
if (replace) {
all_has_fanout = true;
} else {
#pragma omp parallel for reduction(&&:all_has_fanout)
// #pragma omp parallel for reduction(&&:all_has_fanout)
for (int64_t i = 0; i < num_rows; ++i) {
const IdxType rid = rows_data[i];
const IdxType len = indptr[rid + 1] - indptr[rid];
......@@ -81,7 +84,7 @@ COOMatrix CSRRowWisePick(CSRMatrix mat, IdArray rows,
}
}
#pragma omp parallel for
// #pragma omp parallel for
for (int64_t i = 0; i < num_rows; ++i) {
const IdxType rid = rows_data[i];
CHECK_LT(rid, mat.num_rows);
......
......@@ -34,14 +34,11 @@ inline PickFn<IdxType> GetSamplingPickFn(
(IdxType rowid, IdxType off, IdxType len,
const IdxType* col, const IdxType* data,
IdxType* out_idx) {
// TODO(minjie): If efficiency is a problem, consider avoid creating
// explicit NDArrays by directly manipulating buffers.
FloatArray prob_selected = DoubleSlice<IdxType, FloatType>(prob, data, off, len);
IdArray sampled = RandomEngine::ThreadLocal()->Choice<IdxType, FloatType>(
num_samples, prob_selected, replace);
const IdxType* sampled_data = static_cast<IdxType*>(sampled->data);
RandomEngine::ThreadLocal()->Choice<IdxType, FloatType>(
num_samples, prob_selected, out_idx, replace);
for (int64_t j = 0; j < num_samples; ++j) {
out_idx[j] = off + sampled_data[j];
out_idx[j] += off;
}
};
return pick_fn;
......@@ -54,13 +51,10 @@ inline PickFn<IdxType> GetSamplingUniformPickFn(
(IdxType rowid, IdxType off, IdxType len,
const IdxType* col, const IdxType* data,
IdxType* out_idx) {
// TODO(minjie): If efficiency is a problem, consider avoid creating
// explicit NDArrays by directly manipulating buffers.
IdArray sampled = RandomEngine::ThreadLocal()->UniformChoice<IdxType>(
num_samples, len, replace);
const IdxType* sampled_data = static_cast<IdxType*>(sampled->data);
RandomEngine::ThreadLocal()->UniformChoice<IdxType>(
num_samples, len, out_idx, replace);
for (int64_t j = 0; j < num_samples; ++j) {
out_idx[j] = off + sampled_data[j];
out_idx[j] += off;
}
};
return pick_fn;
......@@ -72,6 +66,7 @@ inline PickFn<IdxType> GetSamplingUniformPickFn(
template <DLDeviceType XPU, typename IdxType, typename FloatType>
COOMatrix CSRRowWiseSampling(CSRMatrix mat, IdArray rows, int64_t num_samples,
FloatArray prob, bool replace) {
CHECK(prob.defined());
auto pick_fn = GetSamplingPickFn<IdxType, FloatType>(num_samples, prob, replace);
return CSRRowWisePick(mat, rows, num_samples, replace, pick_fn);
}
......@@ -102,6 +97,7 @@ template COOMatrix CSRRowWiseSamplingUniform<kDLCPU, int64_t>(
template <DLDeviceType XPU, typename IdxType, typename FloatType>
COOMatrix COORowWiseSampling(COOMatrix mat, IdArray rows, int64_t num_samples,
FloatArray prob, bool replace) {
CHECK(prob.defined());
auto pick_fn = GetSamplingPickFn<IdxType, FloatType>(num_samples, prob, replace);
return COORowWisePick(mat, rows, num_samples, replace, pick_fn);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment