Unverified commit ca2a7e1c authored by Minjie Wang, committed by GitHub

[Refactor] Nodeflow, sampling, CAPI (#430)

* enable cython

* add helper function and data structure for void_p vector return

* move sampler from graph index to contrib.sampling

* WIP

* WIP

* refactor layer sampling

* pass tests

* fix lint

* fix graphsage

* remove comments

* pickle test

* fix comments

* update dev guide for cython build
parent 27e0e547
......@@ -112,6 +112,12 @@ To achieve this, export following environment variables:
export DGL_LIBRARY_PATH=$DGL_HOME/build
export PYTHONPATH=$PYTHONPATH:$DGL_HOME/python
If you are working on a performance-critical part, you may want to turn on the Cython build:
.. code-block:: bash
cd python
python setup.py build_ext --inplace
You can test the build by running the following command and verifying that it prints the path of your local clone.
.. code-block:: bash
......
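For reference, a minimal way to perform this check (an assumption on my part; the exact command is collapsed above and may differ):

.. code-block:: python

   # Hypothetical check: prints a path under $DGL_HOME/python/dgl when the
   # local build is picked up, rather than an installed copy.
   import dgl
   print(dgl.__file__)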
......@@ -10,8 +10,6 @@ import dgl
import dgl.function as fn
from dgl import DGLGraph
from dgl.data import register_data_args, load_data
from dgl.graph_index import map_to_nodeflow_nid
class GraphSAGELayer(gluon.Block):
def __init__(self,
......@@ -116,7 +114,7 @@ class GraphSAGETrain(gluon.Block):
for i, layer in enumerate(self.layers):
parent_nid = dgl.utils.toindex(nf.layer_parent_nid(i+1))
layer_nid = map_to_nodeflow_nid(nf._graph, i, parent_nid).tousertensor()
layer_nid = nf.map_from_parent_nid(i, parent_nid)
self_h = h[layer_nid]
# activation from previous layer of myself, used in graphSAGE
nf.layers[i+1].data['self_h'] = self_h
......@@ -170,7 +168,7 @@ class GraphSAGEInfer(gluon.Block):
for i, layer in enumerate(self.layers):
nf.layers[i].data['h'] = h
parent_nid = dgl.utils.toindex(nf.layer_parent_nid(i+1))
layer_nid = map_to_nodeflow_nid(nf._graph, i, parent_nid).tousertensor()
layer_nid = nf.map_from_parent_nid(i, parent_nid)
# activation from previous layer of the nodes in (i+1)-th layer, used in graphSAGE
self_h = h[layer_nid]
nf.layers[i+1].data['self_h'] = self_h
......
......@@ -57,7 +57,8 @@ class SamplerOp {
* \param add_self_loop whether to add self loop to the sampled subgraph
* \return a NodeFlow graph.
*/
static NodeFlow NeighborUniformSample(const ImmutableGraph *graph, IdArray seeds,
static NodeFlow NeighborUniformSample(const ImmutableGraph *graph,
const std::vector<dgl_id_t>& seeds,
const std::string &edge_type,
int num_hops, int expand_factor,
const bool add_self_loop);
......@@ -72,9 +73,10 @@ class SamplerOp {
* \param layer_sizes The size of layers.
* \return a NodeFlow graph.
*/
static NodeFlow LayerUniformSample(const ImmutableGraph *graph, IdArray seed_array,
static NodeFlow LayerUniformSample(const ImmutableGraph *graph,
const std::vector<dgl_id_t>& seeds,
const std::string &neigh_type,
const std::vector<size_t> &layer_sizes);
IdArray layer_sizes);
/*!
* \brief Batch-generate random walk traces
......
......@@ -11,7 +11,8 @@ from .base import ALL
from .backend import load_backend
from .batched_graph import *
from .graph import DGLGraph
from .nodeflow import *
from .traversal import *
from .transform import *
from .propagate import *
from .udf import NodeBatch, EdgeBatch
from .transform import *
......@@ -104,17 +104,18 @@ cdef extern from "dgl/runtime/c_runtime_api.h":
DLManagedTensor** out)
void DGLDLManagedTensorCallDeleter(DLManagedTensor* dltensor)
cdef extern from "dgl/c_dsl_api.h":
int DGLNodeFree(NodeHandle handle)
int DGLNodeTypeKey2Index(const char* type_key,
int* out_index)
int DGLNodeGetTypeIndex(NodeHandle handle,
int* out_index)
int DGLNodeGetAttr(NodeHandle handle,
const char* key,
DGLValue* out_value,
int* out_type_code,
int* out_success)
# (minjie): Node and class module are not used in DGL.
#cdef extern from "dgl/c_dsl_api.h":
# int DGLNodeFree(NodeHandle handle)
# int DGLNodeTypeKey2Index(const char* type_key,
# int* out_index)
# int DGLNodeGetTypeIndex(NodeHandle handle,
# int* out_index)
# int DGLNodeGetAttr(NodeHandle handle,
# const char* key,
# DGLValue* out_value,
# int* out_type_code,
# int* out_success)
cdef inline py_str(const char* x):
if PY_MAJOR_VERSION < 3:
......
include "./base.pxi"
include "./node.pxi"
# (minjie): Node and class module are not used in DGL.
#include "./node.pxi"
include "./function.pxi"
include "./ndarray.pxi"
......@@ -3,7 +3,8 @@ import traceback
from cpython cimport Py_INCREF, Py_DECREF
from numbers import Number, Integral
from ..base import string_types
from ..node_generic import convert_to_node, NodeGeneric
# (minjie): Node and class module are not used in DGL.
# from ..node_generic import convert_to_node, NodeGeneric
from ..runtime_ctypes import DGLType, DGLContext, DGLByteArray
......@@ -24,8 +25,9 @@ cdef int dgl_callback(DGLValue* args,
for i in range(num_args):
value = args[i]
tcode = type_codes[i]
if (tcode == kNodeHandle or
tcode == kFuncHandle or
# (minjie): Node and class module are not used in DGL.
#if (tcode == kNodeHandle or
if (tcode == kFuncHandle or
tcode == kModuleHandle or
tcode > kExtBegin):
CALL(DGLCbArgToReturn(&value, tcode))
......@@ -79,10 +81,11 @@ cdef inline int make_arg(object arg,
list temp_args) except -1:
"""Pack arguments into c args dgl call accept"""
cdef unsigned long long ptr
if isinstance(arg, NodeBase):
value[0].v_handle = (<NodeBase>arg).chandle
tcode[0] = kNodeHandle
elif isinstance(arg, NDArrayBase):
# (minjie): Node and class module are not used in DGL.
#if isinstance(arg, NodeBase):
# value[0].v_handle = (<NodeBase>arg).chandle
# tcode[0] = kNodeHandle
if isinstance(arg, NDArrayBase):
value[0].v_handle = (<NDArrayBase>arg).chandle
tcode[0] = (kNDArrayContainer if
not (<NDArrayBase>arg).c_is_view else kArrayHandle)
......@@ -131,14 +134,15 @@ cdef inline int make_arg(object arg,
value[0].v_str = tstr
tcode[0] = kStr
temp_args.append(tstr)
elif isinstance(arg, (list, tuple, dict, NodeGeneric)):
arg = convert_to_node(arg)
value[0].v_handle = (<NodeBase>arg).chandle
tcode[0] = kNodeHandle
temp_args.append(arg)
elif isinstance(arg, _CLASS_MODULE):
value[0].v_handle = c_handle(arg.handle)
tcode[0] = kModuleHandle
# (minjie): Node and class module are not used in DGL.
#elif isinstance(arg, (list, tuple, dict, NodeGeneric)):
# arg = convert_to_node(arg)
# value[0].v_handle = (<NodeBase>arg).chandle
# tcode[0] = kNodeHandle
# temp_args.append(arg)
#elif isinstance(arg, _CLASS_MODULE):
# value[0].v_handle = c_handle(arg.handle)
# tcode[0] = kModuleHandle
elif isinstance(arg, FunctionBase):
value[0].v_handle = (<FunctionBase>arg).chandle
tcode[0] = kFuncHandle
......@@ -166,9 +170,10 @@ cdef inline bytearray make_ret_bytes(void* chandle):
cdef inline object make_ret(DGLValue value, int tcode):
"""convert result to return value."""
if tcode == kNodeHandle:
return make_ret_node(value.v_handle)
elif tcode == kNull:
# (minjie): Node and class module are not used in DGL.
#if tcode == kNodeHandle:
# return make_ret_node(value.v_handle)
if tcode == kNull:
return None
elif tcode == kInt:
return value.v_int64
......@@ -184,8 +189,9 @@ cdef inline object make_ret(DGLValue value, int tcode):
return ctypes_handle(value.v_handle)
elif tcode == kDGLContext:
return DGLContext(value.v_ctx.device_type, value.v_ctx.device_id)
elif tcode == kModuleHandle:
return _CLASS_MODULE(ctypes_handle(value.v_handle))
# (minjie): Node and class module are not used in DGL.
#elif tcode == kModuleHandle:
# return _CLASS_MODULE(ctypes_handle(value.v_handle))
elif tcode == kFuncHandle:
fobj = _CLASS_FUNCTION(None, False)
(<FunctionBase>fobj).chandle = value.v_handle
......
......@@ -258,7 +258,6 @@ def extract_ext_funcs(finit):
raise RuntimeError("cannot initialize with %s" % finit)
return fdict
def _get_api(f):
flocal = f
flocal.is_global = True
......@@ -285,19 +284,30 @@ def _init_api_prefix(module_name, prefix):
module = sys.modules[module_name]
for name in list_global_func_names():
if prefix == "api":
fname = name
if name.startswith("_"):
target_module = sys.modules["dgl._api_internal"]
else:
target_module = module
else:
if not name.startswith(prefix):
continue
fname = name[len(prefix)+1:]
target_module = module
if name.startswith("_"):
continue
if not name.startswith(prefix):
continue
fname = name[len(prefix)+1:]
target_module = module
if fname.find(".") != -1:
print('Warning: invalid API name "%s".' % fname)
continue
f = get_global_func(name)
ff = _get_api(f)
ff.__name__ = fname
ff.__doc__ = ("DGL PackedFunc %s. " % fname)
setattr(target_module, ff.__name__, ff)
def _init_internal_api():
for name in list_global_func_names():
if not name.startswith("_"):
continue
target_module = sys.modules["dgl._api_internal"]
fname = name
if fname.find(".") != -1:
print('Warning: invalid API name "%s".' % fname)
continue
f = get_global_func(name)
ff = _get_api(f)
......
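In other words, after this refactor the routing rule is: registered names beginning with an underscore are attached to `dgl._api_internal` by `_init_internal_api`, while `_init_api_prefix` only exposes non-underscore names under the given prefix. A small sketch of that rule (illustrative only, not the actual implementation):

```python
# Sketch of the dispatch convention for registered C API names.
def route(name, prefix):
    if name.startswith("_"):
        return None                    # handled by _init_internal_api instead
    if not name.startswith(prefix):
        return None                    # not exposed under this prefix
    return name[len(prefix) + 1:]      # strip "prefix." from the exposed name

assert route("graph_index._CAPI_DGLGraphEdgeId", "graph_index") == "_CAPI_DGLGraphEdgeId"
assert route("_GetVectorWrapperSize", "graph_index") is None
```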
......@@ -4,6 +4,7 @@ from __future__ import absolute_import
import warnings
from ._ffi.base import DGLError # pylint: disable=unused-import
from ._ffi.function import _init_internal_api
# A special symbol for selecting all nodes or edges.
ALL = "__ALL__"
......@@ -15,3 +16,5 @@ def is_all(arg):
def dgl_warning(msg):
"""Print out warning messages."""
warnings.warn(msg)
_init_internal_api()
# This file contains NodeFlow samplers.
"""This file contains NodeFlow samplers."""
import sys
import numpy as np
......@@ -6,9 +6,12 @@ import threading
import random
import traceback
from ..._ffi.function import _init_api
from ... import utils
from ...node_flow import NodeFlow
from ...nodeflow import NodeFlow
from ... import backend as F
from ...utils import unwrap_to_ptr_list
try:
import Queue as queue
except ImportError:
......@@ -30,10 +33,12 @@ class SampledSubgraphLoader(object):
self._expand_factor = expand_factor
self._num_hops = num_hops
elif sampler == 'layer':
self._layer_sizes = layer_sizes
self._layer_sizes = utils.toindex(layer_sizes)
else:
raise NotImplementedError()
raise NotImplementedError('Invalid sampler option: "%s"' % sampler)
self._node_prob = node_prob
if node_prob is not None:
raise NotImplementedError('Non-uniform sampling is currently not supported.')
self._add_self_loop = add_self_loop
if self._node_prob is not None:
assert self._node_prob.shape[0] == g.number_of_nodes(), \
......@@ -44,6 +49,7 @@ class SampledSubgraphLoader(object):
self._seed_nodes = seed_nodes
if shuffle:
self._seed_nodes = F.rand_shuffle(self._seed_nodes)
self._seed_nodes = utils.toindex(self._seed_nodes)
self._num_workers = num_workers
self._neighbor_type = neighbor_type
self._nflows = []
......@@ -51,25 +57,31 @@ class SampledSubgraphLoader(object):
self._nflow_idx = 0
def _prefetch(self):
seed_ids = []
num_nodes = len(self._seed_nodes)
for i in range(self._num_workers):
start = self._nflow_idx * self._batch_size
# if we have visited all nodes, don't do anything.
if start >= num_nodes:
break
end = min((self._nflow_idx + 1) * self._batch_size, num_nodes)
seed_ids.append(utils.toindex(self._seed_nodes[start:end]))
self._nflow_idx += 1
if self._sampler == 'neighbor':
sgi = self._g._graph.neighbor_sampling(seed_ids, self._expand_factor,
self._num_hops, self._neighbor_type,
self._node_prob, self._add_self_loop)
handles = unwrap_to_ptr_list(_CAPI_UniformSampling(
self._g._graph._handle,
self._seed_nodes.todgltensor(),
int(self._nflow_idx), # start batch id
int(self._batch_size), # batch size
int(self._num_workers), # num batches
int(self._expand_factor),
int(self._num_hops),
self._neighbor_type,
self._add_self_loop))
elif self._sampler == 'layer':
sgi = self._g._graph.layer_sampling(seed_ids, self._layer_sizes,
self._neighbor_type, self._node_prob)
nflows = [NodeFlow(self._g, i) for i in sgi]
handles = unwrap_to_ptr_list(_CAPI_LayerSampling(
self._g._graph._handle,
self._seed_nodes.todgltensor(),
int(self._nflow_idx), # start batch id
int(self._batch_size), # batch size
int(self._num_workers), # num batches
self._layer_sizes.todgltensor(),
self._neighbor_type))
else:
raise NotImplementedError('Invalid sampler option: "%s"' % self._sampler)
nflows = [NodeFlow(self._g, hdl) for hdl in handles]
self._nflows.extend(nflows)
self._nflow_idx += len(nflows)
def __iter__(self):
return self
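For clarity, the batching contract behind the new single C API call, restated as a small Python sketch (names are illustrative; the C side performs the equivalent slicing, as the sampler code later in this commit shows):

```python
# Sketch: how per-worker seed slices follow from
# (start batch id, batch size, number of batches).
def worker_slices(num_seeds, batch_size, start_batch, num_workers):
    slices = []
    for i in range(num_workers):
        start = (start_batch + i) * batch_size
        if start >= num_seeds:
            break
        slices.append((start, min(start + batch_size, num_seeds)))
    return slices

# e.g. worker_slices(10, 4, 0, 3) == [(0, 4), (4, 8), (8, 10)]
```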
......@@ -240,6 +252,8 @@ def NeighborSampler(g, batch_size, expand_factor, num_hops=1,
* str: indicates some common ways of calculating the number of sampled neighbors,
e.g., ``sqrt(deg)``.
Note that no matter how large expand_factor is, the number of sampled neighbors
never exceeds the neighborhood size.
num_hops : int, optional
The number of hops to sample (i.e., the number of layers in the NodeFlow).
Default: 1
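A minimal usage sketch of the interface documented here (the import path follows this commit's move of samplers into contrib.sampling; the graph setup and the readonly requirement are assumptions):

```python
# Hypothetical usage: iterate 2-hop uniform neighbor-sampled NodeFlows.
import dgl
from dgl.contrib.sampling import NeighborSampler

edges = [(0, 1), (1, 2), (2, 0)]
g = dgl.DGLGraph(edges, readonly=True)  # sampling needs an immutable graph
for nf in NeighborSampler(g, batch_size=256, expand_factor=10, num_hops=2):
    print(nf.num_layers)  # one more node layer than num_hops (assumption)
```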
......@@ -279,7 +293,8 @@ def NeighborSampler(g, batch_size, expand_factor, num_hops=1,
expand_factor=expand_factor, num_hops=num_hops,
neighbor_type=neighbor_type, node_prob=node_prob,
seed_nodes=seed_nodes, shuffle=shuffle,
num_workers=num_workers)
num_workers=num_workers,
add_self_loop=add_self_loop)
if not prefetch:
return loader
else:
......@@ -324,3 +339,29 @@ def LayerSampler(g, batch_size, layer_sizes,
return loader
else:
return _PrefetchingLoader(loader, num_prefetch=num_workers*2)
def create_full_nodeflow(g, num_layers, add_self_loop=False):
"""Convert a full graph to NodeFlow to run a L-layer GNN model.
Parameters
----------
g : DGLGraph
a DGL graph
num_layers : int
The number of layers
add_self_loop : bool, default False
Whether to add self loop to the sampled NodeFlow.
If True, the edge IDs of the self loop edges are -1.
Returns
-------
NodeFlow
a NodeFlow with a specified number of layers.
"""
batch_size = g.number_of_nodes()
expand_factor = g.number_of_nodes()
sampler = NeighborSampler(g, batch_size, expand_factor,
num_layers, add_self_loop=add_self_loop)
return next(sampler)
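A usage sketch under the semantics above (graph construction is illustrative; a readonly graph is assumed since sampling requires an immutable graph):

```python
# Hypothetical: build a full NodeFlow for a 2-layer GNN over the whole graph.
import dgl
from dgl.contrib.sampling import create_full_nodeflow

g = dgl.DGLGraph([(0, 1), (1, 2), (2, 0)], readonly=True)
nf = create_full_nodeflow(g, num_layers=2)
print(nf.num_layers)  # expected: num_layers + 1 node layers (assumption)
```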
_init_api('dgl.sampling', __name__)
......@@ -10,7 +10,6 @@ from ._ffi.base import c_array
from ._ffi.function import _init_api
from .base import DGLError
from . import backend as F
from . import ndarray as nd
from . import utils
GraphIndexHandle = ctypes.c_void_p
......@@ -63,9 +62,13 @@ class GraphIndex(object):
"""The actual init function"""
assert len(src_ids) == len(dst_ids)
assert len(src_ids) == len(edge_ids)
self._handle = _CAPI_DGLGraphCreate(src_ids.todgltensor(), dst_ids.todgltensor(),
edge_ids.todgltensor(), self._multigraph, num_nodes,
self._readonly)
self._handle = _CAPI_DGLGraphCreate(
src_ids.todgltensor(),
dst_ids.todgltensor(),
edge_ids.todgltensor(),
self._multigraph,
int(num_nodes),
self._readonly)
def add_nodes(self, num):
"""Add nodes.
......@@ -218,7 +221,7 @@ class GraphIndex(object):
bool
True if the edge exists, False otherwise
"""
return bool(_CAPI_DGLGraphHasEdgeBetween(self._handle, u, v))
return bool(_CAPI_DGLGraphHasEdgeBetween(self._handle, int(u), int(v)))
def has_edges_between(self, u, v):
"""Return true if the edge exists.
......@@ -288,7 +291,7 @@ class GraphIndex(object):
utils.Index
The edge id array.
"""
return utils.toindex(_CAPI_DGLGraphEdgeId(self._handle, u, v))
return utils.toindex(_CAPI_DGLGraphEdgeId(self._handle, int(u), int(v)))
def edge_ids(self, u, v):
"""Return a triplet of arrays that contains the edge IDs.
......@@ -445,7 +448,7 @@ class GraphIndex(object):
int
The in degree.
"""
return _CAPI_DGLGraphInDegree(self._handle, v)
return _CAPI_DGLGraphInDegree(self._handle, int(v))
def in_degrees(self, v):
"""Return the in degrees of the nodes.
......@@ -476,7 +479,7 @@ class GraphIndex(object):
int
The out degree.
"""
return _CAPI_DGLGraphOutDegree(self._handle, v)
return _CAPI_DGLGraphOutDegree(self._handle, int(v))
def out_degrees(self, v):
"""Return the out degrees of the nodes.
......@@ -675,45 +678,6 @@ class GraphIndex(object):
shuffle_idx = utils.toindex(shuffle_idx) if shuffle_idx is not None else None
return inc, shuffle_idx
def neighbor_sampling(self, seed_ids, expand_factor, num_hops, neighbor_type,
node_prob, add_self_loop=False):
"""Neighborhood sampling"""
if len(seed_ids) == 0:
return []
seed_ids = [v.todgltensor() for v in seed_ids]
num_subgs = len(seed_ids)
if node_prob is None:
rst = _uniform_sampling(self, seed_ids, neighbor_type, num_hops,
expand_factor, add_self_loop)
else:
rst = _nonuniform_sampling(self, node_prob, seed_ids, neighbor_type, num_hops,
expand_factor)
return [NodeFlowIndex(rst(i), self, utils.toindex(rst(num_subgs + i)),
utils.toindex(rst(num_subgs * 2 + i)),
utils.toindex(rst(num_subgs * 3 + i)),
utils.toindex(rst(num_subgs * 4 + i))) for i in range(num_subgs)]
def layer_sampling(self, seed_ids, layer_sizes, neighbor_type, node_prob=None):
"""Layer sampling"""
if len(seed_ids) == 0:
return []
seed_ids = [v.todgltensor() for v in seed_ids]
layer_sizes = nd.from_dlpack(F.zerocopy_to_dlpack(F.tensor(layer_sizes)))
if node_prob is None:
rst = _layer_uniform_sampling(self, seed_ids, neighbor_type, layer_sizes)
else:
raise NotImplementedError()
num_subgs = len(seed_ids)
return [NodeFlowIndex(rst(i), self, utils.toindex(rst(num_subgs + i)),
utils.toindex(rst(num_subgs * 2 + i)),
utils.toindex(rst(num_subgs * 3 + i)),
utils.toindex(rst(num_subgs * 4 + i))) for i in range(num_subgs)]
def random_walk(self, seeds, num_traces, num_hops):
"""Random walk sampling.
......@@ -918,75 +882,6 @@ class SubgraphIndex(GraphIndex):
raise NotImplementedError(
"SubgraphIndex unpickling is not supported yet.")
class NodeFlowIndex(GraphIndex):
"""Graph index for a NodeFlow graph.
Parameters
----------
handle : GraphIndexHandle
The capi handle.
parent : GraphIndex
The parent graph index.
node_mapping : utils.Index
This maps nodes to the parent graph.
edge_mapping : utils.Index
This maps edges to the parent graph.
layers: utils.Index
The offsets of the layers.
flows: utils.Index
The offsets of the flows.
"""
def __init__(self, handle, parent, node_mapping, edge_mapping, layers, flows):
super(NodeFlowIndex, self).__init__(handle, parent.is_multigraph(), parent.is_readonly())
self._parent = parent
self._node_mapping = node_mapping
self._edge_mapping = edge_mapping
self._layers = layers
self._flows = flows
@property
def node_mapping(self):
"""Return the node mapping to the parent graph.
Returns
-------
utils.Index
The node mapping.
"""
return self._node_mapping
@property
def edge_mapping(self):
"""Return the edge mapping to the parent graph.
Returns
-------
utils.Index
The edge mapping.
"""
return self._edge_mapping
@property
def layers(self):
"""Return layer offsets.
"""
return self._layers
@property
def flows(self):
"""Return flow offsets.
"""
return self._flows
def __getstate__(self):
raise NotImplementedError(
"SubgraphIndex pickling is not supported yet.")
def __setstate__(self, state):
raise NotImplementedError(
"SubgraphIndex unpickling is not supported yet.")
def map_to_subgraph_nid(subgraph, parent_nids):
"""Map parent node Ids to the subgraph node Ids.
......@@ -1006,33 +901,23 @@ def map_to_subgraph_nid(subgraph, parent_nids):
return utils.toindex(_CAPI_DGLMapSubgraphNID(subgraph.induced_nodes.todgltensor(),
parent_nids.todgltensor()))
def map_to_nodeflow_nid(nflow, layer_id, parent_nids):
"""Map parent node Ids to NodeFlow node Ids in a certain layer.
def transform_ids(mapping, ids):
"""Transform ids by the given mapping.
Parameters
----------
nflow : NodeFlowIndex
The graph index of a NodeFlow.
layer_id : int
The layer Id.
parent_nids: utils.Index
Node Ids in the parent graph.
mapping : utils.Index
The id mapping, where mapping[new_id] = old_id. Each given old id is replaced by its position in the mapping.
ids : utils.Index
The old ids.
Returns
-------
utils.Index
Node Ids in the NodeFlow.
The new ids.
"""
mapping = nflow.node_mapping.tousertensor()
layers = nflow.layers.tonumpy()
start = int(layers[layer_id])
end = int(layers[layer_id + 1])
mapping = mapping[start:end]
mapping = utils.toindex(mapping)
return utils.toindex(_CAPI_DGLMapSubgraphNID(mapping.todgltensor(),
parent_nids.todgltensor()))
return utils.toindex(_CAPI_DGLMapSubgraphNID(
mapping.todgltensor(), ids.todgltensor()))
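A small sketch of the contract (toindex wrapping as above; the values and the expected output are illustrative assumptions):

```python
# Hypothetical illustration: mapping[new_id] == old_id, so each old id is
# replaced by its position in the mapping.
from dgl import utils
from dgl.graph_index import transform_ids

mapping = utils.toindex([10, 20, 30])  # positions 0, 1, 2 hold the old ids
ids = utils.toindex([30, 10])
print(transform_ids(mapping, ids).tonumpy())  # expected: [2, 0]
```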
def disjoint_union(graphs):
"""Return a disjoint union of the input graphs.
......@@ -1145,51 +1030,3 @@ def create_graph_index(graph_data=None, multigraph=False, readonly=False):
_init_api("dgl.graph_index")
# TODO(zhengda): we'll support variable-length inputs.
_NEIGHBOR_SAMPLING_APIS = {
1: _CAPI_DGLGraphUniformSampling,
2: _CAPI_DGLGraphUniformSampling2,
4: _CAPI_DGLGraphUniformSampling4,
8: _CAPI_DGLGraphUniformSampling8,
16: _CAPI_DGLGraphUniformSampling16,
32: _CAPI_DGLGraphUniformSampling32,
64: _CAPI_DGLGraphUniformSampling64,
128: _CAPI_DGLGraphUniformSampling128,
}
_EMPTY_ARRAYS = [utils.toindex(F.ones(shape=(0), dtype=F.int64, ctx=F.cpu()))]
def _uniform_sampling(gidx, seed_ids, neigh_type, num_hops, expand_factor, add_self_loop):
num_seeds = len(seed_ids)
empty_ids = []
if len(seed_ids) > 1 and len(seed_ids) not in _NEIGHBOR_SAMPLING_APIS.keys():
remain = 2**int(math.ceil(math.log2(len(dgl_ids)))) - len(dgl_ids)
empty_ids = _EMPTY_ARRAYS[0:remain]
seed_ids.extend([empty.todgltensor() for empty in empty_ids])
assert len(seed_ids) in _NEIGHBOR_SAMPLING_APIS.keys()
return _NEIGHBOR_SAMPLING_APIS[len(seed_ids)](gidx._handle, *seed_ids, neigh_type,
num_hops, expand_factor, num_seeds,
add_self_loop)
_LAYER_SAMPLING_APIS = {
1: _CAPI_DGLGraphLayerUniformSampling,
2: _CAPI_DGLGraphLayerUniformSampling2,
4: _CAPI_DGLGraphLayerUniformSampling4,
8: _CAPI_DGLGraphLayerUniformSampling8,
16: _CAPI_DGLGraphLayerUniformSampling16,
32: _CAPI_DGLGraphLayerUniformSampling32,
64: _CAPI_DGLGraphLayerUniformSampling64,
128: _CAPI_DGLGraphLayerUniformSampling128,
}
def _layer_uniform_sampling(gidx, seed_ids, neigh_type, layer_sizes):
num_seeds = len(seed_ids)
empty_ids = []
if len(seed_ids) > 1 and len(seed_ids) not in _LAYER_SAMPLING_APIS.keys():
remain = 2**int(math.ceil(math.log2(len(dgl_ids)))) - len(dgl_ids)
empty_ids = _EMPTY_ARRAYS[0:remain]
seed_ids.extend([empty.todgltensor() for empty in empty_ids])
assert len(seed_ids) in _LAYER_SAMPLING_APIS.keys()
return _LAYER_SAMPLING_APIS[len(seed_ids)](gidx._handle, *seed_ids, neigh_type,
layer_sizes, num_seeds)
"""Class for NodeFlow data structure."""
from __future__ import absolute_import
import ctypes
from ._ffi.function import _init_api
from .base import ALL, is_all, DGLError
from . import backend as F
from .frame import Frame, FrameRef
from .graph import DGLBaseGraph
from .graph_index import GraphIndex, transform_ids
from .runtime import ir, scheduler, Runtime
from . import utils
from .view import LayerView, BlockView
def _copy_to_like(arr1, arr2):
return F.copy_to(arr1, F.context(arr2))
def _get_frame(frame, names, ids):
col_dict = {name: frame[name][_copy_to_like(ids, frame[name])] for name in names}
if len(col_dict) == 0:
return FrameRef(Frame(num_rows=len(ids)))
else:
return FrameRef(Frame(col_dict))
def _update_frame(frame, names, ids, new_frame):
col_dict = {name: new_frame[name] for name in names}
if len(col_dict) > 0:
frame.update_rows(ids, FrameRef(Frame(col_dict)), inplace=True)
__all__ = ['NodeFlow']
NodeFlowHandle = ctypes.c_void_p
class NodeFlow(DGLBaseGraph):
"""The NodeFlow class stores the sampling results of Neighbor sampling and Layer-wise sampling.
"""The NodeFlow class stores the sampling results of Neighbor
sampling and Layer-wise sampling.
These sampling algorithms generate graphs with multiple layers. Edges
connect nodes in adjacent layers, while no edges exist between nodes
within the same layer.
These sampling algorithms generate graphs with multiple layers. The edges connect the nodes
between two layers while there don't exist edges between the nodes in the same layer.
We store multiple layers of the sampling results in a single graph.
We store extra information, such as the node and edge mapping from
the NodeFlow graph to the parent graph.
We store multiple layers of the sampling results in a single graph. We store extra information,
such as the node and edge mapping from the NodeFlow graph to the parent graph.
DO NOT create a NodeFlow object directly. Use a sampling method to
generate a NodeFlow instead.
Parameters
----------
parent : DGLGraph
The parent graph
graph_index : NodeFlowIndex
The graph index of the NodeFlow graph.
The parent graph.
handle : NodeFlowHandle
The handle to the underlying C structure.
"""
def __init__(self, parent, graph_idx):
super(NodeFlow, self).__init__(graph_idx)
def __init__(self, parent, handle):
# NOTE(minjie): handle is a pointer to the underlying C++ structure
# defined in include/dgl/sampler.h. The constructor saves
# all its members on the Python side and destroys the handle
# afterwards. One can view the given handle object as a transient
# argument pack used to construct this python class.
# TODO(minjie): We should use TVM's Node system as a cleaner solution later.
super(NodeFlow, self).__init__(GraphIndex(_CAPI_NodeFlowGetGraph(handle)))
self._parent = parent
self._node_mapping = graph_idx.node_mapping
self._edge_mapping = graph_idx.edge_mapping
self._layer_offsets = graph_idx.layers.tonumpy()
self._block_offsets = graph_idx.flows.tonumpy()
self._node_mapping = utils.toindex(_CAPI_NodeFlowGetNodeMapping(handle))
self._edge_mapping = utils.toindex(_CAPI_NodeFlowGetEdgeMapping(handle))
self._layer_offsets = utils.toindex(
_CAPI_NodeFlowGetLayerOffsets(handle)).tonumpy()
self._block_offsets = utils.toindex(
_CAPI_NodeFlowGetBlockOffsets(handle)).tonumpy()
_CAPI_NodeFlowFree(handle)
# node/edge frames
self._node_frames = [FrameRef(Frame(num_rows=self.layer_size(i))) \
for i in range(self.num_layers)]
self._edge_frames = [FrameRef(Frame(num_rows=self.block_size(i))) \
......@@ -252,6 +259,32 @@ class NodeFlow(DGLBaseGraph):
"""
return self._edge_mapping.tousertensor()[eid]
def map_from_parent_nid(self, layer_id, parent_nids):
"""Map parent node Ids to NodeFlow node Ids in a certain layer.
Parameters
----------
layer_id : int
The layer Id.
parent_nids: list or Tensor
Node Ids in the parent graph.
Returns
-------
Tensor
Node Ids in the NodeFlow.
"""
parent_nids = utils.toindex(parent_nids)
layers = self._layer_offsets
start = int(layers[layer_id])
end = int(layers[layer_id + 1])
# TODO(minjie): should not directly use []
mapping = self._node_mapping.tousertensor()
mapping = mapping[start:end]
mapping = utils.toindex(mapping)
nflow_ids = transform_ids(mapping, parent_nids)
return nflow_ids.tousertensor()
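For instance, the GraphSAGE example earlier in this commit uses this method to gather, from layer i's activation tensor, the rows for the nodes that also appear in layer i+1 (nf, h, and i as in that example):

```python
# Rows of h (layer i activations) corresponding to layer i+1's nodes.
# Assumes: nf is a NodeFlow, h holds layer i features, i is the layer index.
parent_nid = dgl.utils.toindex(nf.layer_parent_nid(i + 1))
self_h = h[nf.map_from_parent_nid(i, parent_nid)]
```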
def layer_in_degree(self, layer_id):
"""Return the in-degree of the nodes in the specified layer.
......@@ -677,7 +710,7 @@ class NodeFlow(DGLBaseGraph):
if is_all(flow_range):
flow_range = range(0, self.num_blocks)
elif isinstance(flow_range, slice):
if slice.step is not 1:
if flow_range.step != 1:
raise DGLError("We can't propagate flows and skip some of them")
flow_range = range(flow_range.start, flow_range.stop)
else:
......@@ -708,26 +741,20 @@ class NodeFlow(DGLBaseGraph):
self.block_compute(i, message_func, reduce_func, apply_node_func,
inplace=inplace)
def _copy_to_like(arr1, arr2):
return F.copy_to(arr1, F.context(arr2))
def _get_frame(frame, names, ids):
col_dict = {name: frame[name][_copy_to_like(ids, frame[name])] for name in names}
if len(col_dict) == 0:
return FrameRef(Frame(num_rows=len(ids)))
else:
return FrameRef(Frame(col_dict))
def create_full_node_flow(g, num_layers, add_self_loop=False):
"""Convert a full graph to NodeFlow to run a L-layer GNN model.
def _update_frame(frame, names, ids, new_frame):
col_dict = {name: new_frame[name] for name in names}
if len(col_dict) > 0:
frame.update_rows(ids, FrameRef(Frame(col_dict)), inplace=True)
Parameters
----------
g : DGLGraph
a DGL graph
num_layers : int
The number of layers
add_self_loop : bool, default False
Whether to add self loop to the sampled NodeFlow.
If True, the edge IDs of the self loop edges are -1.
Returns
-------
NodeFlow
a NodeFlow with a specified number of layers.
"""
seeds = [utils.toindex(F.arange(0, g.number_of_nodes()))]
nfi = g._graph.neighbor_sampling(seeds, g.number_of_nodes(), num_layers,
'in', None, add_self_loop)
return NodeFlow(g, nfi[0])
_init_api("dgl.nodeflow", __name__)
"""Utility module."""
from __future__ import absolute_import, division
import ctypes
from collections.abc import Mapping, Iterable
from functools import wraps
import numpy as np
from . import _api_internal
from .base import DGLError
from . import backend as F
from . import ndarray as nd
......@@ -483,3 +485,27 @@ def get_ndata_name(g, name):
while name in g.ndata:
name += '_'
return name
def unwrap_to_ptr_list(wrapper):
"""Convert the internal vector wrapper to a python list of ctypes.c_void_p.
The wrapper is destroyed by this function.
Parameters
----------
wrapper : ctypes.c_void_p
The handle to the wrapper.
Returns
-------
list of ctypes.c_void_p
A python list of void pointers.
"""
size = _api_internal._GetVectorWrapperSize(wrapper)
if size == 0:
return []
data = _api_internal._GetVectorWrapperData(wrapper)
data = ctypes.cast(data, ctypes.POINTER(ctypes.c_void_p * size))
rst = [ctypes.c_void_p(x) for x in data.contents]
_api_internal._FreeVectorWrapper(wrapper)
return rst
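The round trip this helper completes, sketched against the sampler code in this commit (the call producing the wrapper and the graph `g` are placeholders):

```python
# Hypothetical sketch: a C API call returns a CAPIVectorWrapper handle;
# unwrap_to_ptr_list expands and frees it, yielding one handle per NodeFlow.
wrapper = capi_call_returning_wrapper()          # e.g. _CAPI_UniformSampling(...)
handles = unwrap_to_ptr_list(wrapper)            # wrapper is freed here
nflows = [NodeFlow(g, hdl) for hdl in handles]   # each handle consumed once
```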
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, os, platform
import sys, os, platform, sysconfig
import shutil
import glob
from setuptools import find_packages
from setuptools.dist import Distribution
from setuptools import setup
# need to use distutils.core for correct placement of cython dll
if '--inplace' in sys.argv:
from distutils.core import setup
from distutils.extension import Extension
else:
from setuptools import setup
from setuptools.extension import Extension
class BinaryDistribution(Distribution):
def has_ext_modules(self):
......@@ -30,6 +37,49 @@ def get_lib_path():
LIBS, VERSION = get_lib_path()
def config_cython():
"""Try to configure cython and return cython configuration"""
if os.name == 'nt':
print("WARNING: Cython is not supported on Windows, will compile without cython module")
return []
sys_cflags = sysconfig.get_config_var("CFLAGS")
if "i386" in sys_cflags and "x86_64" in sys_cflags:
print("WARNING: Cython library may not be compiled correctly with both i386 and x64")
return []
try:
from Cython.Build import cythonize
# from setuptools.extension import Extension
if sys.version_info >= (3, 0):
subdir = "_cy3"
else:
subdir = "_cy2"
ret = []
path = "dgl/_ffi/_cython"
if os.name == 'nt':
library_dirs = ['dgl', '../build/Release', '../build']
libraries = ['libtvm']
else:
library_dirs = None
libraries = None
for fn in os.listdir(path):
if not fn.endswith(".pyx"):
continue
ret.append(Extension(
"dgl._ffi.%s.%s" % (subdir, fn[:-4]),
["dgl/_ffi/_cython/%s" % fn],
include_dirs=["../include/",
"../third_party/dmlc-core/include",
"../third_party/dlpack/include",
],
library_dirs=library_dirs,
libraries=libraries,
language="c++"))
return cythonize(ret)
except ImportError:
print("WARNING: Cython is not installed, will compile without cython module")
return []
include_libs = False
wheel_include_libs = False
if "bdist_wheel" in sys.argv or os.getenv('CONDA_BUILD'):
......@@ -74,6 +124,7 @@ setup(
],
url='https://github.com/dmlc/dgl',
distclass=BinaryDistribution,
ext_modules=config_cython(),
classifiers=[
'Development Status :: 3 - Alpha',
'Programming Language :: Python :: 3',
......
......@@ -34,5 +34,25 @@ PackedFunc ConvertNDArrayVectorToPackedFunc(const std::vector<NDArray>& vec) {
return PackedFunc(body);
}
} // namespace dgl
DGL_REGISTER_GLOBAL("_GetVectorWrapperSize")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
void* ptr = args[0];
const CAPIVectorWrapper* wrapper = static_cast<const CAPIVectorWrapper*>(ptr);
*rv = static_cast<int64_t>(wrapper->pointers.size());
});
DGL_REGISTER_GLOBAL("_GetVectorWrapperData")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
void* ptr = args[0];
CAPIVectorWrapper* wrapper = static_cast<CAPIVectorWrapper*>(ptr);
*rv = static_cast<void*>(wrapper->pointers.data());
});
DGL_REGISTER_GLOBAL("_FreeVectorWrapper")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
void* ptr = args[0];
CAPIVectorWrapper* wrapper = static_cast<CAPIVectorWrapper*>(ptr);
delete wrapper;
});
} // namespace dgl
......@@ -52,6 +52,29 @@ dgl::runtime::NDArray CopyVectorToNDArray(
return a;
}
/* A structure used to return a vector of void* pointers. */
struct CAPIVectorWrapper {
// The pointer vector.
std::vector<void*> pointers;
};
/*!
* \brief A helper function used to return a vector of pointers from C++ to the frontend.
*
* Note that the function will move the given vector memory into the returned
* wrapper object.
*
* \param vec The given pointer vector.
* \return A wrapper object containing the given data.
*/
template<typename PType>
CAPIVectorWrapper* WrapVectorReturn(std::vector<PType*> vec) {
CAPIVectorWrapper* wrapper = new CAPIVectorWrapper;
wrapper->pointers.reserve(vec.size());
wrapper->pointers.insert(wrapper->pointers.end(), vec.begin(), vec.end());
return wrapper;
}
} // namespace dgl
#endif // DGL_C_API_COMMON_H_
......@@ -68,30 +68,6 @@ PackedFunc ConvertSubgraphToPackedFunc(const Subgraph& sg) {
return PackedFunc(body);
}
// Convert Sampled Subgraph structures to PackedFunc.
PackedFunc ConvertSubgraphToPackedFunc(const std::vector<NodeFlow>& sg) {
auto body = [sg] (DGLArgs args, DGLRetValue* rv) {
const uint64_t which = args[0];
if (which < sg.size()) {
GraphInterface* gptr = sg[which].graph->Reset();
GraphHandle ghandle = gptr;
*rv = ghandle;
} else if (which >= sg.size() && which < sg.size() * 2) {
*rv = std::move(sg[which - sg.size()].node_mapping);
} else if (which >= sg.size() * 2 && which < sg.size() * 3) {
*rv = std::move(sg[which - sg.size() * 2].edge_mapping);
} else if (which >= sg.size() * 3 && which < sg.size() * 4) {
*rv = std::move(sg[which - sg.size() * 3].layer_offsets);
} else if (which >= sg.size() * 4 && which < sg.size() * 5) {
*rv = std::move(sg[which - sg.size() * 4].flow_offsets);
} else {
LOG(FATAL) << "invalid choice";
}
};
// TODO(minjie): figure out a better way of returning a complex results.
return PackedFunc(body);
}
} // namespace
///////////////////////////// Graph API ///////////////////////////////////
......@@ -433,89 +409,6 @@ DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphLineGraph")
*rv = lghandle;
});
template<int num_seeds>
void CAPI_NeighborUniformSample(DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
std::vector<IdArray> seeds(num_seeds);
for (size_t i = 0; i < seeds.size(); i++)
seeds[i] = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[i + 1]));
std::string neigh_type = args[num_seeds + 1];
const int num_hops = args[num_seeds + 2];
const int num_neighbors = args[num_seeds + 3];
const int num_valid_seeds = args[num_seeds + 4];
const bool add_self_loop = args[num_seeds + 5];
const GraphInterface *ptr = static_cast<const GraphInterface *>(ghandle);
const ImmutableGraph *gptr = dynamic_cast<const ImmutableGraph*>(ptr);
CHECK(gptr) << "sampling isn't implemented in mutable graph";
CHECK(num_valid_seeds <= num_seeds);
std::vector<NodeFlow> subgs(seeds.size());
#pragma omp parallel for
for (int i = 0; i < num_valid_seeds; i++) {
subgs[i] = SamplerOp::NeighborUniformSample(gptr, seeds[i], neigh_type, num_hops,
num_neighbors, add_self_loop);
}
*rv = ConvertSubgraphToPackedFunc(subgs);
}
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphUniformSampling")
.set_body(CAPI_NeighborUniformSample<1>);
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphUniformSampling2")
.set_body(CAPI_NeighborUniformSample<2>);
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphUniformSampling4")
.set_body(CAPI_NeighborUniformSample<4>);
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphUniformSampling8")
.set_body(CAPI_NeighborUniformSample<8>);
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphUniformSampling16")
.set_body(CAPI_NeighborUniformSample<16>);
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphUniformSampling32")
.set_body(CAPI_NeighborUniformSample<32>);
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphUniformSampling64")
.set_body(CAPI_NeighborUniformSample<64>);
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphUniformSampling128")
.set_body(CAPI_NeighborUniformSample<128>);
template<int num_seeds>
void CAPI_LayerUniformSample(DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
std::vector<IdArray> seeds(num_seeds);
for (size_t i = 0; i < seeds.size(); i++)
seeds[i] = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[i + 1]));
std::string neigh_type = args[num_seeds + 1];
auto ls_array = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[num_seeds + 2]));
size_t *ls_data = static_cast<size_t*>(ls_array->data);
size_t ls_len = ls_array->shape[0];
std::vector<size_t> layer_sizes;
std::copy(ls_data, ls_data + ls_len, std::back_inserter(layer_sizes));
const int num_valid_seeds = args[num_seeds + 3];
const GraphInterface *ptr = static_cast<const GraphInterface *>(ghandle);
const ImmutableGraph *gptr = dynamic_cast<const ImmutableGraph*>(ptr);
CHECK(gptr) << "sampling isn't implemented in mutable graph";
CHECK(num_valid_seeds <= num_seeds);
std::vector<NodeFlow> subgs(seeds.size());
#pragma omp parallel for
for (int i = 0; i < num_valid_seeds; i++) {
subgs[i] = SamplerOp::LayerUniformSample(gptr, seeds[i], neigh_type, layer_sizes);
}
*rv = ConvertSubgraphToPackedFunc(subgs);
}
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphLayerUniformSampling")
.set_body(CAPI_LayerUniformSample<1>);
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphLayerUniformSampling2")
.set_body(CAPI_LayerUniformSample<2>);
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphLayerUniformSampling4")
.set_body(CAPI_LayerUniformSample<4>);
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphLayerUniformSampling8")
.set_body(CAPI_LayerUniformSample<8>);
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphLayerUniformSampling16")
.set_body(CAPI_LayerUniformSample<16>);
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphLayerUniformSampling32")
.set_body(CAPI_LayerUniformSample<32>);
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphLayerUniformSampling64")
.set_body(CAPI_LayerUniformSample<64>);
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphLayerUniformSampling128")
.set_body(CAPI_LayerUniformSample<128>);
DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphGetAdj")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphHandle ghandle = args[0];
......
......@@ -11,6 +11,7 @@
#include <cstdlib>
#include <cmath>
#include <numeric>
#include "../c_api_common.h"
#ifdef _MSC_VER
// rand in MS compiler works well in multi-threading.
......@@ -19,10 +20,15 @@ int rand_r(unsigned *seed) {
}
#endif
using dgl::runtime::DGLArgs;
using dgl::runtime::DGLArgValue;
using dgl::runtime::DGLRetValue;
using dgl::runtime::PackedFunc;
using dgl::runtime::NDArray;
namespace dgl {
namespace {
/*
* ArrayHeap is used to sample elements from vector
*/
......@@ -373,29 +379,28 @@ NodeFlow ConstructNodeFlow(std::vector<dgl_id_t> neighbor_list,
}
NodeFlow SampleSubgraph(const ImmutableGraph *graph,
IdArray seed_arr,
const std::vector<dgl_id_t>& seeds,
const float* probability,
const std::string &edge_type,
int num_hops,
size_t num_neighbor,
const bool add_self_loop) {
unsigned int time_seed = time(nullptr);
size_t num_seeds = seed_arr->shape[0];
const size_t num_seeds = seeds.size();
auto orig_csr = edge_type == "in" ? graph->GetInCSR() : graph->GetOutCSR();
const dgl_id_t* val_list = orig_csr->edge_ids.data();
const dgl_id_t* col_list = orig_csr->indices.data();
const int64_t* indptr = orig_csr->indptr.data();
const dgl_id_t* seed = static_cast<dgl_id_t*>(seed_arr->data);
std::unordered_set<dgl_id_t> sub_ver_map; // The vertex Ids in a layer.
std::vector<std::pair<dgl_id_t, int> > sub_vers;
sub_vers.reserve(num_seeds * 10);
// add seed vertices
for (size_t i = 0; i < num_seeds; ++i) {
auto ret = sub_ver_map.insert(seed[i]);
auto ret = sub_ver_map.insert(seeds[i]);
// If the vertex is inserted successfully.
if (ret.second) {
sub_vers.emplace_back(seed[i], 0);
sub_vers.emplace_back(seeds[i], 0);
}
}
std::vector<dgl_id_t> tmp_sampled_src_list;
......@@ -478,7 +483,51 @@ NodeFlow SampleSubgraph(const ImmutableGraph *graph,
} // namespace
NodeFlow SamplerOp::NeighborUniformSample(const ImmutableGraph *graph, IdArray seeds,
DGL_REGISTER_GLOBAL("nodeflow._CAPI_NodeFlowGetGraph")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
void* ptr = args[0];
const NodeFlow* nflow = static_cast<NodeFlow*>(ptr);
GraphInterface* gptr = nflow->graph->Reset();
*rv = gptr;
});
DGL_REGISTER_GLOBAL("nodeflow._CAPI_NodeFlowGetNodeMapping")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
void* ptr = args[0];
const NodeFlow* nflow = static_cast<NodeFlow*>(ptr);
*rv = nflow->node_mapping;
});
DGL_REGISTER_GLOBAL("nodeflow._CAPI_NodeFlowGetEdgeMapping")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
void* ptr = args[0];
const NodeFlow* nflow = static_cast<NodeFlow*>(ptr);
*rv = nflow->edge_mapping;
});
DGL_REGISTER_GLOBAL("nodeflow._CAPI_NodeFlowGetLayerOffsets")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
void* ptr = args[0];
const NodeFlow* nflow = static_cast<NodeFlow*>(ptr);
*rv = nflow->layer_offsets;
});
DGL_REGISTER_GLOBAL("nodeflow._CAPI_NodeFlowGetBlockOffsets")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
void* ptr = args[0];
const NodeFlow* nflow = static_cast<NodeFlow*>(ptr);
*rv = nflow->flow_offsets;
});
DGL_REGISTER_GLOBAL("nodeflow._CAPI_NodeFlowFree")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
void* ptr = args[0];
NodeFlow* nflow = static_cast<NodeFlow*>(ptr);
delete nflow;
});
NodeFlow SamplerOp::NeighborUniformSample(const ImmutableGraph *graph,
const std::vector<dgl_id_t>& seeds,
const std::string &edge_type,
int num_hops, int expand_factor,
const bool add_self_loop) {
......@@ -535,8 +584,8 @@ IdArray SamplerOp::RandomWalk(
namespace {
void ConstructLayers(const int64_t *indptr,
const dgl_id_t *indices,
const IdArray seed_array,
const std::vector<size_t> &layer_sizes,
const std::vector<dgl_id_t>& seed_array,
IdArray layer_sizes,
std::vector<dgl_id_t> *layer_offsets,
std::vector<dgl_id_t> *node_mapping,
std::vector<int64_t> *actl_layer_sizes,
......@@ -546,17 +595,17 @@ namespace {
* layers via uniform layer-wise sampling, and return the resultant layers and their
* corresponding probabilities.
*/
const dgl_id_t* seed_data = static_cast<dgl_id_t*>(seed_array->data);
size_t seed_len = seed_array->shape[0];
std::copy(seed_data, seed_data + seed_len, std::back_inserter(*node_mapping));
std::copy(seed_array.begin(), seed_array.end(), std::back_inserter(*node_mapping));
actl_layer_sizes->push_back(node_mapping->size());
probabilities->insert(probabilities->end(), node_mapping->size(), 1);
const int64_t* layer_sizes_data = static_cast<int64_t*>(layer_sizes->data);
const int64_t num_layers = layer_sizes->shape[0];
size_t curr = 0;
size_t next = node_mapping->size();
unsigned int rand_seed = time(nullptr);
for (auto i = layer_sizes.rbegin(); i != layer_sizes.rend(); ++i) {
auto layer_size = *i;
for (int64_t i = num_layers - 1; i >= 0; --i) {
const int64_t layer_size = layer_sizes_data[i];
std::unordered_set<dgl_id_t> candidate_set;
for (auto j = curr; j != next; ++j) {
auto src = (*node_mapping)[j];
......@@ -569,7 +618,7 @@ namespace {
std::unordered_map<dgl_id_t, size_t> n_occurrences;
auto n_candidates = candidate_vector.size();
for (size_t j = 0; j != layer_size; ++j) {
for (int64_t j = 0; j != layer_size; ++j) {
auto dst = candidate_vector[rand_r(&rand_seed) % n_candidates];
if (!n_occurrences.insert(std::make_pair(dst, 1)).second) {
++n_occurrences[dst];
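Restated as a short Python sketch, the layer-wise step above draws layer_size candidates uniformly with replacement and records multiplicities (names are illustrative):

```python
import random

def sample_layer(candidate_vector, layer_size):
    """Uniform sampling with replacement; returns node -> occurrence count."""
    n_occurrences = {}
    for _ in range(layer_size):
        dst = random.choice(candidate_vector)
        n_occurrences[dst] = n_occurrences.get(dst, 0) + 1
    return n_occurrences
```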
......@@ -647,9 +696,9 @@ namespace {
} // namespace
NodeFlow SamplerOp::LayerUniformSample(const ImmutableGraph *graph,
const IdArray seed_array,
const std::vector<dgl_id_t>& seeds,
const std::string &neighbor_type,
const std::vector<size_t> &layer_sizes) {
IdArray layer_sizes) {
const auto g_csr = neighbor_type == "in" ? graph->GetInCSR() : graph->GetOutCSR();
const int64_t *indptr = g_csr->indptr.data();
const dgl_id_t *indices = g_csr->indices.data();
......@@ -661,7 +710,7 @@ NodeFlow SamplerOp::LayerUniformSample(const ImmutableGraph *graph,
std::vector<float> probabilities;
ConstructLayers(indptr,
indices,
seed_array,
seeds,
layer_sizes,
&layer_offsets,
&node_mapping,
......@@ -715,4 +764,82 @@ NodeFlow SamplerOp::LayerUniformSample(const ImmutableGraph *graph,
return nf;
}
DGL_REGISTER_GLOBAL("sampling._CAPI_UniformSampling")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
// arguments
const GraphHandle ghdl = args[0];
const IdArray seed_nodes = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const int64_t batch_start_id = args[2];
const int64_t batch_size = args[3];
const int64_t max_num_workers = args[4];
const int64_t expand_factor = args[5];
const int64_t num_hops = args[6];
const std::string neigh_type = args[7];
const bool add_self_loop = args[8];
// process args
const GraphInterface *ptr = static_cast<const GraphInterface *>(ghdl);
const ImmutableGraph *gptr = dynamic_cast<const ImmutableGraph*>(ptr);
CHECK(gptr) << "sampling isn't implemented in mutable graph";
CHECK(IsValidIdArray(seed_nodes));
const dgl_id_t* seed_nodes_data = static_cast<dgl_id_t*>(seed_nodes->data);
const int64_t num_seeds = seed_nodes->shape[0];
const int64_t num_workers = std::min(max_num_workers,
(num_seeds + batch_size - 1) / batch_size - batch_start_id);
// generate node flows
std::vector<NodeFlow*> nflows(num_workers);
#pragma omp parallel for
for (int i = 0; i < num_workers; i++) {
// create per-worker seed nodes.
const int64_t start = (batch_start_id + i) * batch_size;
const int64_t end = std::min(start + batch_size, num_seeds);
// TODO(minjie): the vector allocation/copy is unnecessary
std::vector<dgl_id_t> worker_seeds(end - start);
std::copy(seed_nodes_data + start, seed_nodes_data + end,
worker_seeds.begin());
nflows[i] = new NodeFlow();
*nflows[i] = SamplerOp::NeighborUniformSample(
gptr, worker_seeds, neigh_type, num_hops, expand_factor, add_self_loop);
}
*rv = WrapVectorReturn(nflows);
});
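The num_workers clamp above, with worked numbers (a sketch; the same formula appears in the layer-sampling registration below):

```python
# Ceil-divide the seeds into batches, then drop batches already consumed.
num_seeds, batch_size = 10, 4
total_batches = (num_seeds + batch_size - 1) // batch_size            # 3
batch_start_id, max_num_workers = 1, 4
num_workers = min(max_num_workers, total_batches - batch_start_id)    # 2
```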
DGL_REGISTER_GLOBAL("sampling._CAPI_LayerSampling")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
// arguments
const GraphHandle ghdl = args[0];
const IdArray seed_nodes = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
const int64_t batch_start_id = args[2];
const int64_t batch_size = args[3];
const int64_t max_num_workers = args[4];
const IdArray layer_sizes = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[5]));
const std::string neigh_type = args[6];
// process args
const GraphInterface *ptr = static_cast<const GraphInterface *>(ghdl);
const ImmutableGraph *gptr = dynamic_cast<const ImmutableGraph*>(ptr);
CHECK(gptr) << "sampling isn't implemented in mutable graph";
CHECK(IsValidIdArray(seed_nodes));
const dgl_id_t* seed_nodes_data = static_cast<dgl_id_t*>(seed_nodes->data);
const int64_t num_seeds = seed_nodes->shape[0];
const int64_t num_workers = std::min(max_num_workers,
(num_seeds + batch_size - 1) / batch_size - batch_start_id);
// generate node flows
std::vector<NodeFlow*> nflows(num_workers);
#pragma omp parallel for
for (int i = 0; i < num_workers; i++) {
// create per-worker seed nodes.
const int64_t start = (batch_start_id + i) * batch_size;
const int64_t end = std::min(start + batch_size, num_seeds);
// TODO(minjie): the vector allocation/copy is unnecessary
std::vector<dgl_id_t> worker_seeds(end - start);
std::copy(seed_nodes_data + start, seed_nodes_data + end,
worker_seeds.begin());
nflows[i] = new NodeFlow();
*nflows[i] = SamplerOp::LayerUniformSample(
gptr, worker_seeds, neigh_type, layer_sizes);
}
*rv = WrapVectorReturn(nflows);
});
} // namespace dgl
......@@ -14,6 +14,7 @@
#define DGL_RUNTIME_PACK_ARGS_H_
#include <dgl/runtime/c_runtime_api.h>
#include <dgl/runtime/packed_func.h>
#include <vector>
#include <cstring>
......