Merge pull request #3 from jermainewang/cpp

Sync with latest commit

Merge pull request #3 from jermainewang/cpp
Sync with latest commit
9d3f299d · VoVAllen · GitHub · 2cc6079a · bc3f852d · 9d3f299d
Unverified Commit 9d3f299d authored Oct 18, 2018 by VoVAllen Committed by GitHub Oct 18, 2018
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -86,9 +86,7 @@ endif(MSVC)
 #assign_source_group("Include" ${GROUP_INCLUDE})

 # Source file lists
-file(GLOB CORE_SRCS
-    src/graph/*.cc
-    )
+file(GLOB CORE_SRCS src/graph/*.cc src/*.cc src/scheduler/*.cc)

 file(GLOB RUNTIME_SRCS src/runtime/*.cc)


--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -33,9 +33,9 @@ pipeline {
                        stage('TEST') {
                            steps {
                                withEnv(["DGL_LIBRARY_PATH=${env.WORKSPACE}/build"]) {
-                                    sh 'echo $DGL_LIBRARY_PATH'
                                    sh 'nosetests tests -v --with-xunit'
                                    sh 'nosetests tests/pytorch -v --with-xunit'
+                                    sh 'nosetests tests/graph_index -v --with-xunit'
                                }
                            }
                        }
@@ -76,9 +76,9 @@ pipeline {
                        stage('TEST') {
                            steps {
                                withEnv(["DGL_LIBRARY_PATH=${env.WORKSPACE}/build"]) {
-                                    sh 'echo $DGL_LIBRARY_PATH'
                                    sh 'nosetests tests -v --with-xunit'
                                    sh 'nosetests tests/pytorch -v --with-xunit'
+                                    sh 'nosetests tests/graph_index -v --with-xunit'
                                }
                            }
                        }

--- a/include/dgl/graph.h
+++ b/include/dgl/graph.h
@@ -41,7 +41,7 @@ class Graph {
  } EdgeArray;

  /*! \brief default constructor */
-  Graph() {}
+  Graph(bool multigraph = false) : is_multigraph_(multigraph) {}

  /*! \brief default copy constructor */
  Graph(const Graph& other) = default;
@@ -56,8 +56,9 @@ class Graph {
    all_edges_src_ = other.all_edges_src_;
    all_edges_dst_ = other.all_edges_dst_;
    read_only_ = other.read_only_;
+    is_multigraph_ = other.is_multigraph_;
    num_edges_ = other.num_edges_;
-    other.clear();
+    other.Clear();
  }
 #endif  // _MSC_VER

@@ -101,6 +102,14 @@ class Graph {
    num_edges_ = 0;
  }

+  /*!
+   * \note not const since we have caches
+   * \return whether the graph is a multigraph
+   */
+  bool IsMultigraph() const {
+    return is_multigraph_;
+  }
+
  /*! \return the number of vertices in the graph.*/
  uint64_t NumVertices() const {
    return adjlist_.size();
@@ -120,10 +129,10 @@ class Graph {
  BoolArray HasVertices(IdArray vids) const;

  /*! \return true if the given edge is in the graph.*/
-  bool HasEdge(dgl_id_t src, dgl_id_t dst) const;
+  bool HasEdgeBetween(dgl_id_t src, dgl_id_t dst) const;

  /*! \return a 0-1 array indicating whether the given edges are in the graph.*/
-  BoolArray HasEdges(IdArray src_ids, IdArray dst_ids) const;
+  BoolArray HasEdgesBetween(IdArray src_ids, IdArray dst_ids) const;

  /*!
   * \brief Find the predecessors of a vertex.
@@ -142,22 +151,32 @@ class Graph {
  IdArray Successors(dgl_id_t vid, uint64_t radius = 1) const;

  /*!
-   * \brief Get the edge id using the two endpoints
+   * \brief Get all edge ids between the two given endpoints
   * \note Edges are associated with an integer id start from zero.
   *       The id is assigned when the edge is being added to the graph.
   * \param src The source vertex.
   * \param dst The destination vertex.
-   * \return the edge id.
+   * \return the edge id array.
   */
-  dgl_id_t EdgeId(dgl_id_t src, dgl_id_t dst) const;
+  IdArray EdgeId(dgl_id_t src, dgl_id_t dst) const;

  /*!
-   * \brief Get the edge id using the two endpoints
+   * \brief Get all edge ids between the given endpoint pairs.
   * \note Edges are associated with an integer id start from zero.
   *       The id is assigned when the edge is being added to the graph.
-   * \return the edge id array.
+   *       If duplicate pairs exist, the returned edge IDs will also duplicate.
+   *       The order of returned edge IDs will follow the order of src-dst pairs
+   *       first, and ties are broken by the order of edge ID.
+   * \return EdgeArray containing all edges between all pairs.
   */
-  IdArray EdgeIds(IdArray src, IdArray dst) const;
+  EdgeArray EdgeIds(IdArray src, IdArray dst) const;
+
+  /*!
+   * \brief Find the edge IDs and return their source and target node IDs.
+   * \param eids The edge ID array.
+   * \return EdgeArray containing all edges with id in eid.  The order is preserved.
+   */
+  EdgeArray FindEdges(IdArray eids) const;

  /*!
   * \brief Get the in edges of the vertex.
@@ -263,10 +282,10 @@ class Graph {
   *
   * The result subgraph is read-only.
   *
-   * \param vids The edges in the subgraph.
+   * \param eids The edges in the subgraph.
   * \return the induced edge subgraph
   */
-  Subgraph EdgeSubgraph(IdArray src, IdArray dst) const;
+  Subgraph EdgeSubgraph(IdArray eids) const;

  /*!
   * \brief Return a new graph with all the edges reversed.
@@ -300,6 +319,12 @@ class Graph {

  /*! \brief read only flag */
  bool read_only_ = false;
+  /*!
+   * \brief Whether if this is a multigraph.
+   *
+   * When a multiedge is added, this flag switches to true.
+   */
+  bool is_multigraph_ = false;
  /*! \brief number of edges */
  uint64_t num_edges_ = 0;
 };

--- a/include/dgl/scheduler.h
+++ b/include/dgl/scheduler.h
+// DGL Scheduler interface
+#ifndef DGL_SCHEDULER_H_
+#define DGL_SCHEDULER_H_
+
+#include "runtime/ndarray.h"
+#include <vector>
+
+namespace dgl {
+
+typedef tvm::runtime::NDArray IdArray;
+
+namespace sched {
+
+/*!
+ * \brief Generate degree bucketing schedule
+ * \param vids The destination vertex for messages
+ * \note If there are multiple messages going into the same destination vertex, then
+ *       there will be multiple copies of the destination vertex in vids
+ * \return a vector of 5 IdArrays for degree bucketing. The 5 arrays are:
+ *         degrees: of degrees for each bucket
+ *         nids: destination node ids
+ *         nid_section: number of nodes in each bucket (used to split nids)
+ *         mids: message ids
+ *         mid_section: number of messages in each bucket (used to split mids)
+ */
+std::vector<IdArray> DegreeBucketing(const IdArray& vids);
+
+} // namespace sched
+
+} // namespace dgl
+
+#endif // DGL_SCHEDULER_H_
--- a/python/dgl/graph.py
+++ b/python/dgl/graph.py
--- a/python/dgl/graph_index.py
+++ b/python/dgl/graph_index.py
@@ -72,6 +72,16 @@ class GraphIndex(object):
        _CAPI_DGLGraphClear(self._handle)
        self._cache.clear()

+    def is_multigraph(self):
+        """Return whether the graph is a multigraph
+
+        Returns
+        -------
+        bool
+            True if it is a multigraph, False otherwise.
+        """
+        return bool(_CAPI_DGLGraphIsMultigraph(self._handle))
+
    def number_of_nodes(self):
        """Return the number of nodes.

@@ -103,9 +113,9 @@ class GraphIndex(object):
        Returns
        -------
        bool
-            True if the node exists
+            True if the node exists, False otherwise.
        """
-        return _CAPI_DGLGraphHasVertex(self._handle, vid)
+        return bool(_CAPI_DGLGraphHasVertex(self._handle, vid))

    def has_nodes(self, vids):
        """Return true if the nodes exist.
@@ -123,7 +133,7 @@ class GraphIndex(object):
        vid_array = vids.todgltensor()
        return utils.toindex(_CAPI_DGLGraphHasVertices(self._handle, vid_array))

-    def has_edge(self, u, v):
+    def has_edge_between(self, u, v):
        """Return true if the edge exists.

        Parameters
@@ -136,11 +146,11 @@ class GraphIndex(object):
        Returns
        -------
        bool
-            True if the edge exists
+            True if the edge exists, False otherwise
        """
-        return _CAPI_DGLGraphHasEdge(self._handle, u, v)
+        return bool(_CAPI_DGLGraphHasEdgeBetween(self._handle, u, v))

-    def has_edges(self, u, v):
+    def has_edges_between(self, u, v):
        """Return true if the edge exists.

        Parameters
@@ -157,7 +167,7 @@ class GraphIndex(object):
        """
        u_array = u.todgltensor()
        v_array = v.todgltensor()
-        return utils.toindex(_CAPI_DGLGraphHasEdges(self._handle, u_array, v_array))
+        return utils.toindex(_CAPI_DGLGraphHasEdgesBetween(self._handle, u_array, v_array))

    def predecessors(self, v, radius=1):
        """Return the predecessors of the node.
@@ -194,7 +204,7 @@ class GraphIndex(object):
        return utils.toindex(_CAPI_DGLGraphSuccessors(self._handle, v, radius))

    def edge_id(self, u, v):
-        """Return the id of the edge.
+        """Return the id array of all edges between u and v.

        Parameters
        ----------
@@ -205,13 +215,13 @@ class GraphIndex(object):

        Returns
        -------
-        int
-            The edge id.
+        utils.Index
+            The edge id array.
        """
-        return _CAPI_DGLGraphEdgeId(self._handle, u, v)
+        return utils.toindex(_CAPI_DGLGraphEdgeId(self._handle, u, v))

    def edge_ids(self, u, v):
-        """Return the edge ids.
+        """Return a triplet of arrays that contains the edge IDs.

        Parameters
        ----------
@@ -223,11 +233,47 @@ class GraphIndex(object):
        Returns
        -------
        utils.Index
-            Teh edge id array.
+            The src nodes.
+        utils.Index
+            The dst nodes.
+        utils.Index
+            The edge ids.
        """
        u_array = u.todgltensor()
        v_array = v.todgltensor()
-        return utils.toindex(_CAPI_DGLGraphEdgeIds(self._handle, u_array, v_array))
+        edge_array = _CAPI_DGLGraphEdgeIds(self._handle, u_array, v_array)
+
+        src = utils.toindex(edge_array(0))
+        dst = utils.toindex(edge_array(1))
+        eid = utils.toindex(edge_array(2))
+
+        return src, dst, eid
+
+    def find_edges(self, eid):
+        """Return a triplet of arrays that contains the edge IDs.
+
+        Parameters
+        ----------
+        eid : utils.Index
+            The edge ids.
+
+        Returns
+        -------
+        utils.Index
+            The src nodes.
+        utils.Index
+            The dst nodes.
+        utils.Index
+            The edge ids.
+        """
+        eid_array = eid.todgltensor()
+        edge_array = _CAPI_DGLGraphFindEdges(self._handle, eid_array)
+
+        src = utils.toindex(edge_array(0))
+        dst = utils.toindex(edge_array(1))
+        eid = utils.toindex(edge_array(2))
+
+        return src, dst, eid

    def in_edges(self, v):
        """Return the in edges of the node(s).
@@ -378,16 +424,32 @@ class GraphIndex(object):

        Returns
        -------
-        GraphIndex
+        SubgraphIndex
            The subgraph index.
-        utils.Index
-            The induced edge ids. This is also a map from new edge id to parent edge id.
        """
        v_array = v.todgltensor()
        rst = _CAPI_DGLGraphVertexSubgraph(self._handle, v_array)
-        gi = GraphIndex(rst(0))
        induced_edges = utils.toindex(rst(2))
-        return gi, induced_edges
+        return SubgraphIndex(rst(0), self, v, induced_edges)
+
+    def edge_subgraph(self, e):
+        """Return the induced edge subgraph.
+
+        Parameters
+        ----------
+        e : utils.Index
+            The edges.
+
+        Returns
+        -------
+        SubgraphIndex
+            The subgraph index.
+        """
+        e_array = e.todgltensor()
+        rst = _CAPI_DGLGraphEdgeSubgraph(self._handle, e_array)
+        gi = GraphIndex(rst(0))
+        induced_nodes = utils.toindex(rst(1))
+        return SubgraphIndex(rst(0), self, induced_nodes, e)

    def adjacency_matrix(self):
        """Return the adjacency matrix representation of this graph.
@@ -460,7 +522,7 @@ class GraphIndex(object):
            The nx graph
        """
        src, dst, eid = self.edges()
-        ret = nx.DiGraph()
+        ret = nx.MultiDiGraph() if self.is_multigraph() else nx.DiGraph()
        for u, v, id in zip(src, dst, eid):
            ret.add_edge(u, v, id=id)
        return ret
@@ -477,8 +539,13 @@ class GraphIndex(object):
            The nx graph
        """
        self.clear()
-        if not isinstance(nx_graph, nx.DiGraph):
-            nx_graph = nx.DiGraph(nx_graph)
+
+        if not isinstance(nx_graph, nx.Graph):
+            nx_graph = (nx.MultiDiGraph(nx_graph) if self.is_multigraph()
+                    else nx.DiGraph(nx_graph))
+        else:
+            nx_graph = nx_graph.to_directed()
+
        num_nodes = nx_graph.number_of_nodes()
        self.add_nodes(num_nodes)
        has_edge_id = 'id' in next(iter(nx_graph.edges))
@@ -487,16 +554,16 @@ class GraphIndex(object):
            src = np.zeros((num_edges,), dtype=np.int64)
            dst = np.zeros((num_edges,), dtype=np.int64)
            for e, attr in nx_graph.edges.items:
-                u, v = e
+                # MultiDiGraph returns a triplet in e while DiGraph returns a pair
                eid = attr['id']
-                src[eid] = u
-                dst[eid] = v
+                src[eid] = e[0]
+                dst[eid] = e[1]
        else:
            src = []
            dst = []
-            for u, v in nx_graph.edges:
-                src.append(u)
-                dst.append(v)
+            for e in nx_graph.edges:
+                src.append(e[0])
+                dst.append(e[1])
        src = utils.toindex(src)
        dst = utils.toindex(dst)
        self.add_edges(src, dst)
@@ -531,7 +598,32 @@ class GraphIndex(object):
        """
        handle = _CAPI_DGLGraphLineGraph(self._handle, backtracking)
        return GraphIndex(handle)
-        
+
+class SubgraphIndex(GraphIndex):
+    def __init__(self, handle, parent, induced_nodes, induced_edges):
+        super().__init__(handle)
+
+        self._parent = parent
+        self._induced_nodes = induced_nodes
+        self._induced_edges = induced_edges
+
+    def add_nodes(self, num):
+        raise RuntimeError('Readonly graph. Mutation is not allowed.')
+
+    def add_edge(self, u, v):
+        raise RuntimeError('Readonly graph. Mutation is not allowed.')
+
+    def add_edges(self, u, v):
+        raise RuntimeError('Readonly graph. Mutation is not allowed.')
+
+    @property
+    def induced_edges(self):
+        return self._induced_edges
+
+    @property
+    def induced_nodes(self):
+        return self._induced_nodes
+
 def disjoint_union(graphs):
    """Return a disjoint union of the input graphs.

@@ -590,17 +682,20 @@ def disjoint_partition(graph, num_or_size_splits):
        graphs.append(GraphIndex(handle))
    return graphs

-def create_graph_index(graph_data=None):
+def create_graph_index(graph_data=None, multigraph=False):
    """Create a graph index object.

    Parameters
    ----------
    graph_data : graph data, optional
        Data to initialize graph. Same as networkx's semantics.
+    multigraph : bool, optional
+        Whether the graph is multigraph (default is False)
    """
    if isinstance(graph_data, GraphIndex):
        return graph_data
-    handle = _CAPI_DGLGraphCreate()
+
+    handle = _CAPI_DGLGraphCreate(multigraph)
    gi = GraphIndex(handle)
    if graph_data is not None:
        gi.from_networkx(graph_data)

--- a/python/dgl/scheduler.py
+++ b/python/dgl/scheduler.py
@@ -3,13 +3,16 @@ from __future__ import absolute_import

 import numpy as np

-from .base import ALL
+from .base import ALL, __MSG__, __REPR__
 from . import backend as F
 from .function import message as fmsg
 from .function import reducer as fred
 from . import utils
+from collections import defaultdict as ddict

-__all__ = ["degree_bucketing", "get_executor"]
+from ._ffi.function import _init_api
+
+__all__ = ["degree_bucketing", "get_recv_executor", "get_executor"]

 def degree_bucketing(graph, v):
    """Create degree bucketing scheduling policy.
@@ -39,6 +42,73 @@ def degree_bucketing(graph, v):
    #print('degree-bucketing:', unique_degrees, [len(b) for b in v_bkt])
    return unique_degrees, v_bkt

+def _process_buckets(buckets):
+    """read bucketing auxiliary data"""
+    # get back results
+    degs = utils.toindex(buckets(0))
+    v = utils.toindex(buckets(1))
+    # TODO: convert directly from ndarary to python list?
+    v_section = buckets(2).asnumpy().tolist()
+    msg_ids = utils.toindex(buckets(3))
+    msg_section = buckets(4).asnumpy().tolist()
+
+    # split buckets
+    unique_v = v.tousertensor()
+    msg_ids = msg_ids.tousertensor()
+    dsts = F.unpack(unique_v, v_section)
+    msg_ids = F.unpack(msg_ids, msg_section)
+
+    # convert to utils.Index
+    unique_v = utils.toindex(unique_v)
+    dsts = [utils.toindex(dst) for dst in dsts]
+    msg_ids = [utils.toindex(msg_id) for msg_id in msg_ids]
+
+    return unique_v, degs, dsts, msg_ids
+
+def light_degree_bucketing(v):
+    """Return the bucketing by degree scheduling for destination nodes of messages
+
+    Parameters
+    ----------
+    v: utils.Index
+        destionation node for each message
+
+    Returns
+    -------
+    unique_v: utils.Index
+        unqiue destination nodes
+    degrees: utils.Index
+        A list of degree for each bucket
+    v_bkt: list of utils.Index
+        A list of node id buckets, nodes in each bucket have the same degree
+    msg_ids: list of utils.Index
+        A list of message id buckets, each node in the ith node id bucket has
+        degree[i] messages in the ith message id bucket
+    """
+    buckets = _CAPI_DGLDegreeBucketing(v.todgltensor())
+    return _process_buckets(buckets)
+
+def light_degree_bucketing_for_graph(graph):
+    """Return the bucketing by degree scheduling for the entire graph
+
+    Parameters:
+        graph: GraphIndex
+
+    Returns
+    -------
+    unique_v: utils.Index
+        unqiue destination nodes
+    degrees: utils.Index
+        A list of degree for each bucket
+    v_bkt: list of utils.Index
+        A list of node id buckets, nodes in each bucket have the same degree
+    msg_ids: list of utils.Index
+        A list of message id buckets, each node in the ith node id bucket has
+        degree[i] messages in the ith message id bucket
+    """
+    buckets = _CAPI_DGLDegreeBucketingFromGraph(self._handle)
+    return _process_buckets(buckets)
+

 class Executor(object):
    def run(self):
@@ -78,6 +148,55 @@ class SPMVOperator(Executor):
            return {self.dst_field : dstcol}


+# FIXME: refactorize in scheduler/executor redesign
+class DegreeBucketingExecutor(Executor):
+    def __init__(self, g, rfunc, message_frame, edges=None):
+        self.g = g
+        self.rfunc = rfunc
+        self.msg_frame = message_frame
+
+        # calc degree bucketing schedule
+        if edges is not None:
+            unique_v, degs, dsts, msg_ids = light_degree_bucketing(edges[1])
+        else:
+            unique_v, degs, dsts, msg_ids = light_degree_bucketing_for_graph(g._graph)
+        self._recv_nodes = unique_v
+        self.degrees = degs
+        self.dsts = dsts
+        self.msg_ids = msg_ids
+
+    @property
+    def recv_nodes(self):
+        return self._recv_nodes
+
+    def run(self):
+        new_reprs = []
+        # loop over each bucket
+        # FIXME (lingfan): handle zero-degree case
+        for deg, vv, msg_id in zip(self.degrees, self.dsts, self.msg_ids):
+            dst_reprs = self.g.get_n_repr(vv)
+            in_msgs = self.msg_frame.select_rows(msg_id)
+            def _reshape_fn(msg):
+                msg_shape = F.shape(msg)
+                new_shape = (len(vv), deg) + msg_shape[1:]
+                return F.reshape(msg, new_shape)
+            if len(in_msgs) == 1 and __MSG__ in in_msgs:
+                reshaped_in_msgs = _reshape_fn(in_msgs[__MSG__])
+            else:
+                reshaped_in_msgs = utils.LazyDict(
+                        lambda key: _reshape_fn(in_msgs[key]), self.msg_frame.schemes)
+            new_reprs.append(self.rfunc(dst_reprs, reshaped_in_msgs))
+
+        # Pack all reducer results together
+        if utils.is_dict_like(new_reprs[0]):
+            keys = new_reprs[0].keys()
+            new_reprs = {key : F.pack([repr[key] for repr in new_reprs])
+                         for key in keys}
+        else:
+            new_reprs = {__REPR__ : F.pack(new_reprs)}
+        return new_reprs
+
+
 class BasicExecutor(Executor):
    def __init__(self, graph, mfunc, rfunc):
        self.g = graph
@@ -92,7 +211,7 @@ class BasicExecutor(Executor):
        raise NotImplementedError

    @property
-    def graph_mapping(self):
+    def recv_nodes(self):
        raise NotImplementedError

    def _build_exec(self, mfunc, rfunc):
@@ -115,8 +234,7 @@ class BasicExecutor(Executor):
        return exe

    def run(self):
-        attr = self.exe.run()
-        self.g.set_n_repr(attr, self.graph_mapping)
+        return self.exe.run()


 class UpdateAllExecutor(BasicExecutor):
@@ -129,7 +247,7 @@ class UpdateAllExecutor(BasicExecutor):
        self._edge_repr = None
        self._graph_idx = None
        self._graph_shape = None
-        self._graph_mapping = None
+        self._recv_nodes = None

    @property
    def graph_idx(self):
@@ -145,7 +263,7 @@ class UpdateAllExecutor(BasicExecutor):
        return self._graph_shape

    @property
-    def graph_mapping(self):
+    def recv_nodes(self):
        return ALL

    @property
@@ -186,7 +304,7 @@ class SendRecvExecutor(BasicExecutor):
        self._edge_repr = None
        self._graph_idx = None
        self._graph_shape = None
-        self._graph_mapping = None
+        self._recv_nodes = None

    @property
    def graph_idx(self):
@@ -201,10 +319,10 @@ class SendRecvExecutor(BasicExecutor):
        return self._graph_shape

    @property
-    def graph_mapping(self):
-        if self._graph_mapping is None:
+    def recv_nodes(self):
+        if self._recv_nodes is None:
            self._build_adjmat()
-        return self._graph_mapping
+        return self._recv_nodes

    @property
    def node_repr(self):
@@ -229,7 +347,7 @@ class SendRecvExecutor(BasicExecutor):
        m = len(new2old)
        self._graph_idx = F.pack([F.unsqueeze(new_v, 0), F.unsqueeze(u, 0)])
        self._graph_shape = [m, n]
-        self._graph_mapping = new2old
+        self._recv_nodes = new2old

    def _adj_build_fn(self, edge_field, ctx, use_edge_feat):
        if use_edge_feat:
@@ -283,7 +401,7 @@ class BundledExecutor(BasicExecutor):
            else:
                # attr and res must be dict
                attr.update(res)
-        self.g.set_n_repr(attr, self.graph_mapping)
+        return attr


 class BundledUpdateAllExecutor(BundledExecutor, UpdateAllExecutor):
@@ -298,6 +416,14 @@ class BundledSendRecvExecutor(BundledExecutor, SendRecvExecutor):
        BundledExecutor.__init__(self, graph, mfunc, rfunc)

 def _is_spmv_supported(fn, graph=None):
+    # FIXME: also take into account
+    # (1) which backend DGL is under.
+    # (2) whether the graph is a multigraph.
+    #
+    # Current SPMV optimizer assumes that duplicate entries are summed up
+    # in sparse matrices, which is the case for PyTorch but not MXNet.
+    # The result is that on multigraphs, SPMV can still work for reducer=sum
+    # and message=copy_src/src_mul_edge *only in PyTorch*.
    if isinstance(fn, fmsg.MessageFunction):
        return fn.is_spmv_supported(graph)
    elif isinstance(fn, fred.ReduceFunction):
@@ -342,3 +468,24 @@ def get_executor(call_type, graph, **kwargs):
        return _create_send_and_recv_exec(graph, **kwargs)
    else:
        return None
+
+def get_recv_executor(graph, reduce_func, message_frame, edges=None):
+    """Create executor for recv phase
+
+    Parameters
+    ----------
+    graph: DGLGraph
+        DGLGraph on which to perform recv
+    reduce_func: callable
+        The reduce function
+    message_frame: FrameRef
+        Message frame
+    edges: tuple/list of utils.Index
+        src and dst Index representing edges along which messages are sent
+        If not specified, all edges of graph are used instead
+    """
+
+    # FIXME: handle builtin spmv executor case
+    return DegreeBucketingExecutor(graph, reduce_func, message_frame, edges)
+
+_init_api("dgl.scheduler")
--- a/python/dgl/utils.py
+++ b/python/dgl/utils.py
@@ -23,7 +23,7 @@ class Index(object):
            if not (F.dtype(data) == F.int64 and len(F.shape(data)) == 1):
                raise ValueError('Index data must be 1D int64 vector, but got: %s' % str(data))
            self._user_tensor_data[F.get_context(data)] = data
-        elif isinstance(data, nd.NDArray): 
+        elif isinstance(data, nd.NDArray):
            if not (data.dtype == 'int64' and len(data.shape) == 1):
                raise ValueError('Index data must be 1D int64 vector, but got: %s' % str(data))
            self._dgl_tensor_data = data
@@ -168,6 +168,34 @@ class LazyDict(Mapping):
    def __len__(self):
        return len(self._keys)

+class HybridDict(Mapping):
+    """A readonly dictonary that merges several dict-like (python dict, LazyDict).
+       If there are duplicate keys, early keys have priority over latter ones
+    """
+    def __init__(self, *dict_like_list):
+        self._dict_like_list = dict_like_list
+        self._keys = None
+
+    def keys(self):
+        if self._keys is None:
+            self._keys = sum([set(d.keys()) for d in self._dict_like_list], set())
+            self._keys = list(self._keys)
+        return self._keys
+
+    def __getitem__(self, key):
+        for d in self._dict_like_list:
+            if key in d:
+                return d[key]
+
+    def __contains__(self, key):
+        return key in self.keys()
+
+    def __iter__(self):
+        return iter(self.keys())
+
+    def __len__(self):
+        return len(self.keys())
+
 class ReadOnlyDict(Mapping):
    """A readonly dictionary wrapper."""
    def __init__(self, dict_like):

--- a/src/c_api_common.cc
+++ b/src/c_api_common.cc
+#include "c_api_common.h"
+
+using tvm::runtime::TVMArgs;
+using tvm::runtime::TVMArgValue;
+using tvm::runtime::TVMRetValue;
+using tvm::runtime::PackedFunc;
+using tvm::runtime::NDArray;
+
+namespace dgl {
+
+DLManagedTensor* CreateTmpDLManagedTensor(const TVMArgValue& arg) {
+  const DLTensor* dl_tensor = arg;
+  DLManagedTensor* ret = new DLManagedTensor();
+  ret->deleter = [] (DLManagedTensor* self) { delete self; };
+  ret->manager_ctx = nullptr;
+  ret->dl_tensor = *dl_tensor;
+  return ret;
+}
+
+PackedFunc ConvertNDArrayVectorToPackedFunc(const std::vector<NDArray>& vec) {
+    auto body = [vec](TVMArgs args, TVMRetValue* rv) {
+        size_t which = args[0];
+        if (which >= vec.size()) {
+            LOG(FATAL) << "invalid choice";
+        } else {
+            *rv = std::move(vec[which]);
+        }
+    };
+    return PackedFunc(body);
+}
+
+} // namespace dgl
+
--- a/src/c_api_common.h
+++ b/src/c_api_common.h
+// DGL C API common util functions
+#ifndef DGL_C_API_COMMON_H_
+#define DGL_C_API_COMMON_H_
+
+#include <dgl/runtime/ndarray.h>
+#include <dgl/runtime/packed_func.h>
+#include <dgl/runtime/registry.h>
+#include <vector>
+
+namespace dgl {
+
+// Graph handler type
+typedef void* GraphHandle;
+
+// Convert the given DLTensor to a temporary DLManagedTensor that does not own memory.
+DLManagedTensor* CreateTmpDLManagedTensor(const tvm::runtime::TVMArgValue& arg);
+
+// Convert a vector of NDArray to PackedFunc
+tvm::runtime::PackedFunc ConvertNDArrayVectorToPackedFunc(const std::vector<tvm::runtime::NDArray>& vec);
+
+} // namespace dgl
+
+#endif // DGL_C_API_COMMON_H_
--- a/src/graph/graph.cc
+++ b/src/graph/graph.cc
 // Graph class implementation
 #include <algorithm>
 #include <unordered_map>
+#include <set>
+#include <functional>
 #include <dgl/graph.h>

 namespace dgl {
@@ -21,11 +23,14 @@ void Graph::AddEdge(dgl_id_t src, dgl_id_t dst) {
  CHECK(!read_only_) << "Graph is read-only. Mutations are not allowed.";
  CHECK(HasVertex(src) && HasVertex(dst))
    << "Invalid vertices: src=" << src << " dst=" << dst;
+
  dgl_id_t eid = num_edges_++;
+
  adjlist_[src].succ.push_back(dst);
  adjlist_[src].edge_id.push_back(eid);
  reverse_adjlist_[dst].succ.push_back(src);
  reverse_adjlist_[dst].edge_id.push_back(eid);
+
  all_edges_src_.push_back(src);
  all_edges_dst_.push_back(dst);
 }
@@ -71,14 +76,14 @@ BoolArray Graph::HasVertices(IdArray vids) const {
 }

 // O(E)
-bool Graph::HasEdge(dgl_id_t src, dgl_id_t dst) const {
+bool Graph::HasEdgeBetween(dgl_id_t src, dgl_id_t dst) const {
  if (!HasVertex(src) || !HasVertex(dst)) return false;
  const auto& succ = adjlist_[src].succ;
  return std::find(succ.begin(), succ.end(), dst) != succ.end();
 }

-// O(E*K) pretty slow
-BoolArray Graph::HasEdges(IdArray src_ids, IdArray dst_ids) const {
+// O(E*k) pretty slow
+BoolArray Graph::HasEdgesBetween(IdArray src_ids, IdArray dst_ids) const {
  CHECK(IsValidIdArray(src_ids)) << "Invalid src id array.";
  CHECK(IsValidIdArray(dst_ids)) << "Invalid dst id array.";
  const auto srclen = src_ids->shape[0];
@@ -91,18 +96,18 @@ BoolArray Graph::HasEdges(IdArray src_ids, IdArray dst_ids) const {
  if (srclen == 1) {
    // one-many
    for (int64_t i = 0; i < dstlen; ++i) {
-      rst_data[i] = HasEdge(src_data[0], dst_data[i])? 1 : 0;
+      rst_data[i] = HasEdgeBetween(src_data[0], dst_data[i])? 1 : 0;
    }
  } else if (dstlen == 1) {
    // many-one
    for (int64_t i = 0; i < srclen; ++i) {
-      rst_data[i] = HasEdge(src_data[i], dst_data[0])? 1 : 0;
+      rst_data[i] = HasEdgeBetween(src_data[i], dst_data[0])? 1 : 0;
    }
  } else {
    // many-many
    CHECK(srclen == dstlen) << "Invalid src and dst id array.";
    for (int64_t i = 0; i < srclen; ++i) {
-      rst_data[i] = HasEdge(src_data[i], dst_data[i])? 1 : 0;
+      rst_data[i] = HasEdgeBetween(src_data[i], dst_data[i])? 1 : 0;
    }
  }
  return rst;
@@ -112,13 +117,16 @@ BoolArray Graph::HasEdges(IdArray src_ids, IdArray dst_ids) const {
 IdArray Graph::Predecessors(dgl_id_t vid, uint64_t radius) const {
  CHECK(HasVertex(vid)) << "invalid vertex: " << vid;
  CHECK(radius >= 1) << "invalid radius: " << radius;
-  const auto& pred = reverse_adjlist_[vid].succ;
-  const int64_t len = pred.size();
+  std::set<dgl_id_t> vset;
+
+  for (auto& it : reverse_adjlist_[vid].succ)
+    vset.insert(it);
+
+  const int64_t len = vset.size();
  IdArray rst = IdArray::Empty({len}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
  int64_t* rst_data = static_cast<int64_t*>(rst->data);
-  for (int64_t i = 0; i < len; ++i) {
-    rst_data[i] = pred[i];
-  }
+
+  std::copy(vset.begin(), vset.end(), rst_data);
  return rst;
 }

@@ -126,58 +134,109 @@ IdArray Graph::Predecessors(dgl_id_t vid, uint64_t radius) const {
 IdArray Graph::Successors(dgl_id_t vid, uint64_t radius) const {
  CHECK(HasVertex(vid)) << "invalid vertex: " << vid;
  CHECK(radius >= 1) << "invalid radius: " << radius;
-  const auto& succ = adjlist_[vid].succ;
-  const int64_t len = succ.size();
+  std::set<dgl_id_t> vset;
+
+  for (auto& it : adjlist_[vid].succ)
+    vset.insert(it);
+
+  const int64_t len = vset.size();
  IdArray rst = IdArray::Empty({len}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
  int64_t* rst_data = static_cast<int64_t*>(rst->data);
-  for (int64_t i = 0; i < len; ++i) {
-    rst_data[i] = succ[i];
-  }
+
+  std::copy(vset.begin(), vset.end(), rst_data);
  return rst;
 }

 // O(E)
-dgl_id_t Graph::EdgeId(dgl_id_t src, dgl_id_t dst) const {
-  CHECK(HasVertex(src)) << "invalid edge: " << src << " -> " << dst;
+IdArray Graph::EdgeId(dgl_id_t src, dgl_id_t dst) const {
+  CHECK(HasVertex(src) && HasVertex(dst)) << "invalid edge: " << src << " -> " << dst;
+
  const auto& succ = adjlist_[src].succ;
+  std::vector<dgl_id_t> edgelist;
+
  for (size_t i = 0; i < succ.size(); ++i) {
-    if (succ[i] == dst) {
-      return adjlist_[src].edge_id[i];
-    }
+    if (succ[i] == dst)
+      edgelist.push_back(adjlist_[src].edge_id[i]);
  }
-  LOG(FATAL) << "invalid edge: " << src << " -> " << dst;
-  return 0;
+
+  // FIXME: signed?  Also it seems that we are using int64_t everywhere...
+  const int64_t len = edgelist.size();
+  IdArray rst = IdArray::Empty({len}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
+  // FIXME: signed?
+  int64_t* rst_data = static_cast<int64_t*>(rst->data);
+
+  std::copy(edgelist.begin(), edgelist.end(), rst_data);
+
+  return rst;
 }

 // O(E*k) pretty slow
-IdArray Graph::EdgeIds(IdArray src_ids, IdArray dst_ids) const {
+Graph::EdgeArray Graph::EdgeIds(IdArray src_ids, IdArray dst_ids) const {
  CHECK(IsValidIdArray(src_ids)) << "Invalid src id array.";
  CHECK(IsValidIdArray(dst_ids)) << "Invalid dst id array.";
  const auto srclen = src_ids->shape[0];
  const auto dstlen = dst_ids->shape[0];
-  const auto rstlen = std::max(srclen, dstlen);
-  IdArray rst = IdArray::Empty({rstlen}, src_ids->dtype, src_ids->ctx);
-  int64_t* rst_data = static_cast<int64_t*>(rst->data);
+  int64_t i, j;
+
+  CHECK((srclen == dstlen) || (srclen == 1) || (dstlen == 1))
+    << "Invalid src and dst id array.";
+
+  const int64_t src_stride = (srclen == 1 && dstlen != 1) ? 0 : 1;
+  const int64_t dst_stride = (dstlen == 1 && srclen != 1) ? 0 : 1;
  const int64_t* src_data = static_cast<int64_t*>(src_ids->data);
  const int64_t* dst_data = static_cast<int64_t*>(dst_ids->data);
-  if (srclen == 1) {
-    // one-many
-    for (int64_t i = 0; i < dstlen; ++i) {
-      rst_data[i] = EdgeId(src_data[0], dst_data[i]);
-    }
-  } else if (dstlen == 1) {
-    // many-one
-    for (int64_t i = 0; i < srclen; ++i) {
-      rst_data[i] = EdgeId(src_data[i], dst_data[0]);
-    }
-  } else {
-    // many-many
-    CHECK(srclen == dstlen) << "Invalid src and dst id array.";
-    for (int64_t i = 0; i < srclen; ++i) {
-      rst_data[i] = EdgeId(src_data[i], dst_data[i]);
+
+  std::vector<dgl_id_t> src, dst, eid;
+
+  for (i = 0, j = 0; i < srclen && j < dstlen; i += src_stride, j += dst_stride) {
+    const dgl_id_t src_id = src_data[i], dst_id = dst_data[j];
+    const auto& succ = adjlist_[src_id].succ;
+    for (size_t k = 0; k < succ.size(); ++k) {
+      if (succ[k] == dst_id) {
+	src.push_back(src_id);
+	dst.push_back(dst_id);
+	eid.push_back(adjlist_[src_id].edge_id[k]);
+      }
    }
  }
-  return rst;
+
+  int64_t rstlen = src.size();
+  IdArray rst_src = IdArray::Empty({rstlen}, src_ids->dtype, src_ids->ctx);
+  IdArray rst_dst = IdArray::Empty({rstlen}, src_ids->dtype, src_ids->ctx);
+  IdArray rst_eid = IdArray::Empty({rstlen}, src_ids->dtype, src_ids->ctx);
+  int64_t* rst_src_data = static_cast<int64_t*>(rst_src->data);
+  int64_t* rst_dst_data = static_cast<int64_t*>(rst_dst->data);
+  int64_t* rst_eid_data = static_cast<int64_t*>(rst_eid->data);
+
+  std::copy(src.begin(), src.end(), rst_src_data);
+  std::copy(dst.begin(), dst.end(), rst_dst_data);
+  std::copy(eid.begin(), eid.end(), rst_eid_data);
+
+  return EdgeArray{rst_src, rst_dst, rst_eid};
+}
+
+Graph::EdgeArray Graph::FindEdges(IdArray eids) const {
+  int64_t len = eids->shape[0];
+
+  IdArray rst_src = IdArray::Empty({len}, eids->dtype, eids->ctx);
+  IdArray rst_dst = IdArray::Empty({len}, eids->dtype, eids->ctx);
+  IdArray rst_eid = IdArray::Empty({len}, eids->dtype, eids->ctx);
+  int64_t* eid_data = static_cast<int64_t*>(eids->data);
+  int64_t* rst_src_data = static_cast<int64_t*>(rst_src->data);
+  int64_t* rst_dst_data = static_cast<int64_t*>(rst_dst->data);
+  int64_t* rst_eid_data = static_cast<int64_t*>(rst_eid->data);
+
+  for (uint64_t i = 0; i < (uint64_t)len; ++i) {
+    dgl_id_t eid = eid_data[i];
+    if (eid >= num_edges_)
+      LOG(FATAL) << "invalid edge id:" << eid;
+
+    rst_src_data[i] = all_edges_src_[eid];
+    rst_dst_data[i] = all_edges_dst_[eid];
+    rst_eid_data[i] = eid;
+  }
+
+  return EdgeArray{rst_src, rst_dst, rst_eid};
 }

 // O(E)
@@ -375,9 +434,37 @@ Subgraph Graph::VertexSubgraph(IdArray vids) const {
  return rst;
 }

-Subgraph Graph::EdgeSubgraph(IdArray src, IdArray dst) const {
-  LOG(FATAL) << "not implemented";
-  return Subgraph();
+Subgraph Graph::EdgeSubgraph(IdArray eids) const {
+  CHECK(IsValidIdArray(eids)) << "Invalid vertex id array.";
+
+  const auto len = eids->shape[0];
+  std::unordered_map<dgl_id_t, dgl_id_t> oldv2newv;
+  std::vector<dgl_id_t> nodes;
+  const int64_t* eid_data = static_cast<int64_t*>(eids->data);
+
+  for (int64_t i = 0; i < len; ++i) {
+    dgl_id_t src_id = all_edges_src_[eid_data[i]];
+    dgl_id_t dst_id = all_edges_dst_[eid_data[i]];
+    if (oldv2newv.insert(std::make_pair(src_id, oldv2newv.size())).second)
+      nodes.push_back(src_id);
+    if (oldv2newv.insert(std::make_pair(dst_id, oldv2newv.size())).second)
+      nodes.push_back(dst_id);
+  }
+
+  Subgraph rst;
+  rst.induced_edges = eids;
+  rst.graph.AddVertices(nodes.size());
+
+  for (int64_t i = 0; i < len; ++i) {
+    dgl_id_t src_id = all_edges_src_[eid_data[i]];
+    dgl_id_t dst_id = all_edges_dst_[eid_data[i]];
+    rst.graph.AddEdge(oldv2newv[src_id], oldv2newv[dst_id]);
+  }
+
+  rst.induced_vertices = IdArray::Empty({static_cast<int64_t>(nodes.size())}, eids->dtype, eids->ctx);
+  std::copy(nodes.begin(), nodes.end(), static_cast<int64_t*>(rst.induced_vertices->data));
+
+  return rst;
 }

 Graph Graph::Reverse() const {

--- a/src/graph/graph_apis.cc
+++ b/src/graph/graph_apis.cc
-#include <dgl/runtime/packed_func.h>
-#include <dgl/runtime/registry.h>
 #include <dgl/graph.h>
 #include <dgl/graph_op.h>
+#include "../c_api_common.h"

 using tvm::runtime::TVMArgs;
 using tvm::runtime::TVMArgValue;
@@ -11,9 +10,6 @@ using tvm::runtime::NDArray;

 namespace dgl {

-// Graph handler type
-typedef void* GraphHandle;
-
 namespace {
 // Convert EdgeArray structure to PackedFunc.
 PackedFunc ConvertEdgeArrayToPackedFunc(const Graph::EdgeArray& ea) {
@@ -52,21 +48,12 @@ PackedFunc ConvertSubgraphToPackedFunc(const Subgraph& sg) {
  return PackedFunc(body);
 }

-// Convert the given DLTensor to a temporary DLManagedTensor that does not own memory.
-DLManagedTensor* CreateTmpDLManagedTensor(const TVMArgValue& arg) {
-  const DLTensor* dl_tensor = arg;
-  DLManagedTensor* ret = new DLManagedTensor();
-  ret->deleter = [] (DLManagedTensor* self) { delete self; };
-  ret->manager_ctx = nullptr;
-  ret->dl_tensor = *dl_tensor;
-  return ret;
-}
-
 }  // namespace

 TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphCreate")
 .set_body([] (TVMArgs args, TVMRetValue* rv) {
-    GraphHandle ghandle = new Graph();
+    bool multigraph = static_cast<bool>(args[0]);
+    GraphHandle ghandle = new Graph(multigraph);
    *rv = ghandle;
  });

@@ -110,6 +97,14 @@ TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphClear")
    gptr->Clear();
  });

+TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphIsMultigraph")
+.set_body([] (TVMArgs args, TVMRetValue *rv) {
+    GraphHandle ghandle = args[0];
+    // NOTE: not const since we have caches
+    const Graph* gptr = static_cast<Graph*>(ghandle);
+    *rv = gptr->IsMultigraph();
+  });
+
 TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphNumVertices")
 .set_body([] (TVMArgs args, TVMRetValue* rv) {
    GraphHandle ghandle = args[0];
@@ -140,22 +135,22 @@ TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphHasVertices")
    *rv = gptr->HasVertices(vids);
  });

-TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphHasEdge")
+TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphHasEdgeBetween")
 .set_body([] (TVMArgs args, TVMRetValue* rv) {
    GraphHandle ghandle = args[0];
    const Graph* gptr = static_cast<Graph*>(ghandle);
    const dgl_id_t src = args[1];
    const dgl_id_t dst = args[2];
-    *rv = gptr->HasEdge(src, dst);
+    *rv = gptr->HasEdgeBetween(src, dst);
  });

-TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphHasEdges")
+TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphHasEdgesBetween")
 .set_body([] (TVMArgs args, TVMRetValue* rv) {
    GraphHandle ghandle = args[0];
    const Graph* gptr = static_cast<Graph*>(ghandle);
    const IdArray src = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
    const IdArray dst = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[2]));
-    *rv = gptr->HasEdges(src, dst);
+    *rv = gptr->HasEdgesBetween(src, dst);
  });

 TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphPredecessors")
@@ -182,7 +177,7 @@ TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphEdgeId")
    const Graph* gptr = static_cast<Graph*>(ghandle);
    const dgl_id_t src = args[1];
    const dgl_id_t dst = args[2];
-    *rv = static_cast<int64_t>(gptr->EdgeId(src, dst));
+    *rv = gptr->EdgeId(src, dst);
  });

 TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphEdgeIds")
@@ -191,7 +186,15 @@ TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphEdgeIds")
    const Graph* gptr = static_cast<Graph*>(ghandle);
    const IdArray src = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
    const IdArray dst = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[2]));
-    *rv = gptr->EdgeIds(src, dst);
+    *rv = ConvertEdgeArrayToPackedFunc(gptr->EdgeIds(src, dst));
+  });
+
+TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphFindEdges")
+.set_body([] (TVMArgs args, TVMRetValue* rv) {
+    GraphHandle ghandle = args[0];
+    const Graph* gptr = static_cast<Graph*>(ghandle);
+    const IdArray eids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
+    *rv = ConvertEdgeArrayToPackedFunc(gptr->FindEdges(eids));
  });

 TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphInEdges_1")
@@ -274,6 +277,14 @@ TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphVertexSubgraph")
    *rv = ConvertSubgraphToPackedFunc(gptr->VertexSubgraph(vids));
  });

+TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphEdgeSubgraph")
+.set_body([] (TVMArgs args, TVMRetValue* rv) {
+    GraphHandle ghandle = args[0];
+    const Graph *gptr = static_cast<Graph*>(ghandle);
+    const IdArray eids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[1]));
+    *rv = ConvertSubgraphToPackedFunc(gptr->EdgeSubgraph(eids));
+  });
+
 TVM_REGISTER_GLOBAL("graph_index._CAPI_DGLDisjointUnion")
 .set_body([] (TVMArgs args, TVMRetValue* rv) {
    void* list = args[0];

--- a/src/runtime/cpu_device_api.cc
+++ b/src/runtime/cpu_device_api.cc
@@ -25,7 +25,7 @@ class CPUDeviceAPI final : public DeviceAPI {
                       size_t alignment,
                       TVMType type_hint) final {
    void* ptr;
-#if _MSC_VER
+#if _MSC_VER || defined(__MINGW32__)
    ptr = _aligned_malloc(nbytes, alignment);
    if (ptr == nullptr) throw std::bad_alloc();
 #elif defined(_LIBCPP_SGX_CONFIG)
@@ -39,7 +39,7 @@ class CPUDeviceAPI final : public DeviceAPI {
  }

  void FreeDataSpace(TVMContext ctx, void* ptr) final {
-#if _MSC_VER
+#if _MSC_VER || defined(__MINGW32__)
    _aligned_free(ptr);
 #else
    free(ptr);

--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -136,8 +136,8 @@ DLManagedTensor* NDArray::ToDLPack() const {
 }

 NDArray NDArray::Empty(std::vector<int64_t> shape,
-                        DLDataType dtype,
-                        DLContext ctx) {
+                       DLDataType dtype,
+                       DLContext ctx) {
  NDArray ret = Internal::Create(shape, dtype, ctx);
  // setup memory content
  size_t size = GetDataSize(ret.data_->dl_tensor);

--- a/src/scheduler/scheduler.cc
+++ b/src/scheduler/scheduler.cc
+// DGL Scheduler implementation
+
+#include <unordered_map>
+#include <vector>
+#include <dgl/scheduler.h>
+
+namespace dgl {
+namespace sched {
+
+std::vector<IdArray> DegreeBucketing(const IdArray& vids) {
+    const auto n_msgs = vids->shape[0];
+    const int64_t* vid_data = static_cast<int64_t*>(vids->data);
+
+    // inedge: dst->msgs
+    std::unordered_map<int64_t, std::vector<int64_t>> in_edges;
+    for (int64_t mid = 0; mid < n_msgs; ++mid) {
+        in_edges[vid_data[mid]].push_back(mid);
+    }
+
+    // bkt: deg->dsts
+    std::unordered_map<int64_t, std::vector<int64_t>> bkt;
+    for (auto& it: in_edges) {
+        bkt[it.second.size()].push_back(it.first);
+    }
+
+    // initialize output
+    int64_t n_deg = bkt.size();
+    int64_t n_dst = in_edges.size();
+    IdArray degs = IdArray::Empty({n_deg}, vids->dtype, vids->ctx);
+    IdArray nids = IdArray::Empty({n_dst}, vids->dtype, vids->ctx);
+    IdArray nid_section = IdArray::Empty({n_deg}, vids->dtype, vids->ctx);
+    IdArray mids = IdArray::Empty({n_msgs}, vids->dtype, vids->ctx);
+    IdArray mid_section = IdArray::Empty({n_deg}, vids->dtype, vids->ctx);
+    int64_t* deg_ptr = static_cast<int64_t*>(degs->data);
+    int64_t* nid_ptr = static_cast<int64_t*>(nids->data);
+    int64_t* nsec_ptr = static_cast<int64_t*>(nid_section->data);
+    int64_t* mid_ptr = static_cast<int64_t*>(mids->data);
+    int64_t* msec_ptr = static_cast<int64_t*>(mid_section->data);
+
+    // fill in bucketing ordering
+    for (auto& it: bkt) { // for each bucket
+        int64_t deg = it.first;
+        int64_t n_dst = it.second.size();
+        *deg_ptr++ = deg;
+        *nsec_ptr++ = n_dst;
+        *msec_ptr++ = deg * n_dst;
+        for (auto dst: it.second) { // for each dst in this bucket
+            *nid_ptr++ = dst;
+            for (auto mid: in_edges[dst]) { // for each in edge of dst
+                *mid_ptr++ = mid;
+            }
+        }
+    }
+
+    std::vector<IdArray> ret;
+    ret.push_back(std::move(degs));
+    ret.push_back(std::move(nids));
+    ret.push_back(std::move(nid_section));
+    ret.push_back(std::move(mids));
+    ret.push_back(std::move(mid_section));
+
+    return std::move(ret);
+}
+
+} // namespace sched
+
+} // namespace dgl
--- a/src/scheduler/scheduler_apis.cc
+++ b/src/scheduler/scheduler_apis.cc
+#include "../c_api_common.h"
+#include <dgl/graph.h>
+#include <dgl/scheduler.h>
+
+using tvm::runtime::TVMArgs;
+using tvm::runtime::TVMRetValue;
+using tvm::runtime::NDArray;
+
+namespace dgl {
+
+TVM_REGISTER_GLOBAL("scheduler._CAPI_DGLDegreeBucketing")
+.set_body([] (TVMArgs args, TVMRetValue* rv) {
+    const IdArray vids = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[0]));
+    *rv = ConvertNDArrayVectorToPackedFunc(sched::DegreeBucketing(vids));
+  });
+
+TVM_REGISTER_GLOBAL("scheduler._CAPI_DGLDegreeBucketingFromGraph")
+.set_body([] (TVMArgs args, TVMRetValue* rv) {
+    GraphHandle ghandle = args[0];
+    const Graph* gptr = static_cast<Graph*>(ghandle);
+    auto edges = gptr->Edges(false);
+    *rv = ConvertNDArrayVectorToPackedFunc(sched::DegreeBucketing(edges.dst));
+  });
+
+} // namespace dgl
--- a/tests/graph_index/test_basics.py
+++ b/tests/graph_index/test_basics.py
+from dgl import DGLError
+from dgl.utils import toindex
+from dgl.graph_index import create_graph_index
+import networkx as nx
+
+def test_edge_id():
+    gi = create_graph_index(multigraph=False)
+    assert not gi.is_multigraph()
+
+    gi = create_graph_index(multigraph=True)
+
+    gi.add_nodes(4)
+    gi.add_edge(0, 1)
+    eid = gi.edge_id(0, 1).tolist()
+    assert len(eid) == 1
+    assert eid[0] == 0
+    assert gi.is_multigraph()
+
+    # multiedges
+    gi.add_edge(0, 1)
+    eid = gi.edge_id(0, 1).tolist()
+    assert len(eid) == 2
+    assert eid[0] == 0
+    assert eid[1] == 1
+
+    gi.add_edges(toindex([0, 1, 1, 2]), toindex([2, 2, 2, 3]))
+    src, dst, eid = gi.edge_ids(toindex([0, 0, 2, 1]), toindex([2, 1, 3, 2]))
+    eid_answer = [2, 0, 1, 5, 3, 4]
+    assert len(eid) == 6
+    assert all(e == ea for e, ea in zip(eid, eid_answer))
+
+    # find edges
+    src, dst, eid = gi.find_edges(toindex([1, 3, 5]))
+    assert len(src) == len(dst) == len(eid) == 3
+    assert src[0] == 0 and src[1] == 1 and src[2] == 2
+    assert dst[0] == 1 and dst[1] == 2 and dst[2] == 3
+    assert eid[0] == 1 and eid[1] == 3 and eid[2] == 5
+
+    # source broadcasting
+    src, dst, eid = gi.edge_ids(toindex([0]), toindex([1, 2]))
+    eid_answer = [0, 1, 2]
+    assert len(eid) == 3
+    assert all(e == ea for e, ea in zip(eid, eid_answer))
+
+    # destination broadcasting
+    src, dst, eid = gi.edge_ids(toindex([1, 0]), toindex([2]))
+    eid_answer = [3, 4, 2]
+    assert len(eid) == 3
+    assert all(e == ea for e, ea in zip(eid, eid_answer))
+
+    gi.clear()
+    # the following assumes that grabbing nonexistent edge will throw an error
+    try:
+        gi.edge_id(0, 1)
+        fail = True
+    except DGLError:
+        fail = False
+    finally:
+        assert not fail
+
+    gi.add_nodes(4)
+    gi.add_edge(0, 1)
+    eid = gi.edge_id(0, 1).tolist()
+    assert len(eid) == 1
+    assert eid[0] == 0
+
+def test_nx():
+    gi = create_graph_index(multigraph=True)
+
+    gi.add_nodes(2)
+    gi.add_edge(0, 1)
+    nxg = gi.to_networkx()
+    assert len(nxg.nodes) == 2
+    assert len(nxg.edges(0, 1)) == 1
+    gi.add_edge(0, 1)
+    nxg = gi.to_networkx()
+    assert len(nxg.edges(0, 1)) == 2
+
+    nxg = nx.DiGraph()
+    nxg.add_edge(0, 1)
+    gi = create_graph_index(nxg)
+    assert not gi.is_multigraph()
+    assert gi.number_of_nodes() == 2
+    assert gi.number_of_edges() == 1
+    assert gi.edge_id(0, 1)[0] == 0
+
+    nxg = nx.MultiDiGraph()
+    nxg.add_edge(0, 1)
+    nxg.add_edge(0, 1)
+    gi = create_graph_index(nxg, True)
+    assert gi.is_multigraph()
+    assert gi.number_of_nodes() == 2
+    assert gi.number_of_edges() == 2
+    assert 0 in gi.edge_id(0, 1)
+    assert 1 in gi.edge_id(0, 1)
+
+def test_predsucc():
+    gi = create_graph_index(multigraph=True)
+
+    gi.add_nodes(4)
+    gi.add_edge(0, 1)
+    gi.add_edge(0, 1)
+    gi.add_edge(0, 2)
+    gi.add_edge(2, 0)
+    gi.add_edge(3, 0)
+    gi.add_edge(0, 0)
+    gi.add_edge(0, 0)
+
+    pred = gi.predecessors(0)
+    assert len(pred) == 3
+    assert 2 in pred
+    assert 3 in pred
+    assert 0 in pred
+
+    succ = gi.successors(0)
+    assert len(succ) == 3
+    assert 1 in succ
+    assert 2 in succ
+    assert 0 in succ
+
+
+if __name__ == '__main__':
+    test_edge_id()
+    test_nx()
+    test_predsucc()
--- a/tests/graph_index/test_subgraph.py
+++ b/tests/graph_index/test_subgraph.py
+from dgl import DGLError
+from dgl.utils import toindex
+from dgl.graph_index import create_graph_index
+
+def test_node_subgraph():
+    gi = create_graph_index()
+    gi.add_nodes(4)
+    gi.add_edge(0, 1)
+    gi.add_edge(0, 2)
+    gi.add_edge(0, 2)
+    gi.add_edge(0, 3)
+
+    sub2par_nodemap = [2, 0, 3]
+    sgi = gi.node_subgraph(toindex(sub2par_nodemap))
+
+    for s, d, e in zip(*sgi.edges()):
+        assert sgi.induced_edges[e] in gi.edge_id(
+                sgi.induced_nodes[s], sgi.induced_nodes[d])
+
+def test_edge_subgraph():
+    gi = create_graph_index()
+    gi.add_nodes(4)
+    gi.add_edge(0, 1)
+    gi.add_edge(0, 1)
+    gi.add_edge(0, 2)
+    gi.add_edge(2, 3)
+
+    sub2par_edgemap = [3, 2]
+    sgi = gi.edge_subgraph(toindex(sub2par_edgemap))
+
+    for s, d, e in zip(*sgi.edges()):
+        assert sgi.induced_edges[e] in gi.edge_id(
+                sgi.induced_nodes[s], sgi.induced_nodes[d])
+
+
+if __name__ == '__main__':
+    test_node_subgraph()
+    test_edge_subgraph()
--- a/tests/pytorch/test_basics.py
+++ b/tests/pytorch/test_basics.py
@@ -242,6 +242,104 @@ def test_pull_0deg():
    assert th.allclose(new_repr[0], old_repr[0])
    assert th.allclose(new_repr[1], old_repr[0])

+def test_send_twice():
+    g = DGLGraph()
+    g.add_nodes(3)
+    g.add_edge(0, 1)
+    g.add_edge(2, 1)
+    def _message_a(src, edge):
+        return {'a': src['a']}
+    def _message_b(src, edge):
+        return {'a': src['a'] * 3}
+    def _reduce(node, msgs):
+        assert msgs is not None
+        return {'a': msgs['a'].max(1)[0]}
+
+    old_repr = th.randn(3, 5)
+    g.set_n_repr({'a': old_repr})
+    g.send(0, 1, _message_a)
+    g.send(0, 1, _message_b)
+    g.recv([1], _reduce)
+    new_repr = g.get_n_repr()['a']
+    assert th.allclose(new_repr[1], old_repr[0] * 3)
+
+    g.set_n_repr({'a': old_repr})
+    g.send(0, 1, _message_a)
+    g.send(2, 1, _message_b)
+    g.recv([1], _reduce)
+    new_repr = g.get_n_repr()['a']
+    assert th.allclose(new_repr[1], th.stack([old_repr[0], old_repr[2] * 3], 0).max(0)[0])
+
+def test_send_multigraph():
+    g = DGLGraph(multigraph=True)
+    g.add_nodes(3)
+    g.add_edge(0, 1)
+    g.add_edge(0, 1)
+    g.add_edge(0, 1)
+    g.add_edge(2, 1)
+
+    def _message_a(src, edge):
+        return {'a': edge['a']}
+    def _message_b(src, edge):
+        return {'a': edge['a'] * 3}
+    def _reduce(node, msgs):
+        assert msgs is not None
+        return {'a': msgs['a'].max(1)[0]}
+
+    def answer(*args):
+        return th.stack(args, 0).max(0)[0]
+
+    # send by eid
+    old_repr = th.randn(4, 5)
+    g.set_n_repr({'a': th.zeros(3, 5)})
+    g.set_e_repr({'a': old_repr})
+    g.send(eid=[0, 2], message_func=_message_a)
+    g.recv([1], _reduce)
+    new_repr = g.get_n_repr()['a']
+    assert th.allclose(new_repr[1], answer(old_repr[0], old_repr[2]))
+
+    g.set_n_repr({'a': th.zeros(3, 5)})
+    g.set_e_repr({'a': old_repr})
+    g.send(eid=[0, 2, 3], message_func=_message_a)
+    g.recv([1], _reduce)
+    new_repr = g.get_n_repr()['a']
+    assert th.allclose(new_repr[1], answer(old_repr[0], old_repr[2], old_repr[3]))
+
+    # send on multigraph
+    g.set_n_repr({'a': th.zeros(3, 5)})
+    g.set_e_repr({'a': old_repr})
+    g.send([0, 2], [1, 1], _message_a)
+    g.recv([1], _reduce)
+    new_repr = g.get_n_repr()['a']
+    assert th.allclose(new_repr[1], old_repr.max(0)[0])
+
+    # consecutive send and send_on
+    g.set_n_repr({'a': th.zeros(3, 5)})
+    g.set_e_repr({'a': old_repr})
+    g.send(2, 1, _message_a)
+    g.send(eid=[0, 1], message_func=_message_b)
+    g.recv([1], _reduce)
+    new_repr = g.get_n_repr()['a']
+    assert th.allclose(new_repr[1], answer(old_repr[0] * 3, old_repr[1] * 3, old_repr[3]))
+
+    # consecutive send_on
+    g.set_n_repr({'a': th.zeros(3, 5)})
+    g.set_e_repr({'a': old_repr})
+    g.send(eid=0, message_func=_message_a)
+    g.send(eid=1, message_func=_message_b)
+    g.recv([1], _reduce)
+    new_repr = g.get_n_repr()['a']
+    assert th.allclose(new_repr[1], answer(old_repr[0], old_repr[1] * 3))
+
+    # send_and_recv_on
+    g.set_n_repr({'a': th.zeros(3, 5)})
+    g.set_e_repr({'a': old_repr})
+    g.send_and_recv(eid=[0, 2, 3], message_func=_message_a, reduce_func=_reduce)
+    new_repr = g.get_n_repr()['a']
+    assert th.allclose(new_repr[1], answer(old_repr[0], old_repr[2], old_repr[3]))
+    assert th.allclose(new_repr[[0, 2]], th.zeros(2, 5))
+
+
 if __name__ == '__main__':
    test_batch_setter_getter()
    test_batch_setter_autograd()
@@ -250,3 +348,5 @@ if __name__ == '__main__':
    test_update_routines()
    test_reduce_0deg()
    test_pull_0deg()
+    test_send_twice()
+    test_send_multigraph()
--- a/tests/pytorch/test_line_graph.py
+++ b/tests/pytorch/test_line_graph.py
@@ -39,8 +39,8 @@ def test_no_backtracking():
    for i in range(1, N):
        e1 = G.edge_id(0, i)
        e2 = G.edge_id(i, 0)
-        assert not L.has_edge(e1, e2)
-        assert not L.has_edge(e2, e1)
+        assert not L.has_edge_between(e1, e2)
+        assert not L.has_edge_between(e2, e1)

 if __name__ == '__main__':
    test_line_graph()