Unverified commit 56ffb650 authored by peizhou001, committed by GitHub

[API Deprecation] Deprecate contrib module (#5114)

parent 436de3d1
"""Package for DGL scheduler and runtime."""
from __future__ import absolute_import
from . import scheduler
from .runtime import Runtime
from .adapter import GraphAdapter
"""Temporary adapter to unify DGLGraph and HeteroGraph for scheduler.
NOTE(minjie): remove once all scheduler codes are migrated to heterograph
"""
from __future__ import absolute_import
from abc import ABC, abstractmethod
class GraphAdapter(ABC):
"""Temporary adapter class to unify DGLGraph and DGLHeteroGraph for schedulers."""
@property
@abstractmethod
def gidx(self):
"""Get graph index object."""
@abstractmethod
def num_src(self):
"""Number of source nodes."""
@abstractmethod
def num_dst(self):
"""Number of destination nodes."""
@abstractmethod
def num_edges(self):
"""Number of edges."""
@property
@abstractmethod
def srcframe(self):
"""Frame to store source node features."""
@property
@abstractmethod
def dstframe(self):
"""Frame to store source node features."""
@property
@abstractmethod
def edgeframe(self):
"""Frame to store edge features."""
@property
@abstractmethod
def msgframe(self):
"""Frame to store messages."""
@property
@abstractmethod
def msgindicator(self):
"""Message indicator tensor."""
@msgindicator.setter
@abstractmethod
def msgindicator(self, val):
"""Set new message indicator tensor."""
@abstractmethod
def in_edges(self, nodes):
"""Get in edges
Parameters
----------
nodes : utils.Index
Nodes
Returns
-------
tuple of utils.Index
(src, dst, eid)
"""
@abstractmethod
def out_edges(self, nodes):
"""Get out edges
Parameters
----------
nodes : utils.Index
Nodes
Returns
-------
tuple of utils.Index
(src, dst, eid)
"""
@abstractmethod
def edges(self, form):
"""Get all edges
Parameters
----------
form : str
"eid", "uv", etc.
Returns
-------
tuple of utils.Index
(src, dst, eid)
"""
@abstractmethod
def get_immutable_gidx(self, ctx):
"""Get immutable graph index for kernel computation.
Parameters
----------
ctx : DGLContext
The context of the returned graph.
Returns
-------
GraphIndex
"""
@abstractmethod
def bits_needed(self):
"""Return the number of integer bits needed to represent the graph
Returns
-------
int
The number of bits needed
"""
"""Module for degree bucketing schedulers."""
from __future__ import absolute_import
from functools import partial
from ..._ffi.function import _init_api
from ... import backend as F
from ...base import DGLError
from ..udf import NodeBatch, EdgeBatch
from ... import utils
from . import ir
from .ir import var
def gen_degree_bucketing_schedule(
reduce_udf,
message_ids,
dst_nodes,
recv_nodes,
var_nf,
var_mf,
var_out,
ntype=None):
"""Create degree bucketing schedule.
The messages will be divided by their receivers into buckets. Each bucket
contains nodes that have the same in-degree. The reduce UDF will be applied
on each bucket. The per-bucket result will be merged according to the
    *unique-ascending order* of the recv node ids. The order is important for
    compatibility with other reduce schedulers such as v2v_spmv.
Parameters
----------
reduce_udf : callable
The UDF to reduce messages.
message_ids : utils.Index
The variable for message ids.
Invariant: len(message_ids) == len(dst_nodes)
dst_nodes : utils.Index
The variable for dst node of each message.
Invariant: len(message_ids) == len(dst_nodes)
recv_nodes : utils.Index
The unique nodes that perform recv.
Invariant: recv_nodes = sort(unique(dst_nodes))
var_nf : var.FEAT_DICT
The variable for node feature frame.
var_mf : var.FEAT_DICT
The variable for message frame.
var_out : var.FEAT_DICT
The variable for output feature dicts.
ntype : str, optional
The node type, if running on a heterograph.
If None, assuming it's running on a homogeneous graph.
"""
buckets = _degree_bucketing_schedule(message_ids, dst_nodes, recv_nodes)
# generate schedule
_, degs, buckets, msg_ids, zero_deg_nodes = buckets
# loop over each bucket
idx_list = []
fd_list = []
for deg, vbkt, mid in zip(degs, buckets, msg_ids):
# create per-bkt rfunc
rfunc = _create_per_bkt_rfunc(reduce_udf, deg, vbkt, ntype=ntype)
# vars
vbkt = var.IDX(vbkt)
mid = var.IDX(mid)
rfunc = var.FUNC(rfunc)
# recv on each bucket
fdvb = ir.READ_ROW(var_nf, vbkt)
fdmail = ir.READ_ROW(var_mf, mid)
fdvb = ir.NODE_UDF(rfunc, fdvb, fdmail, ret=fdvb) # reuse var
# save for merge
idx_list.append(vbkt)
fd_list.append(fdvb)
if zero_deg_nodes is not None:
# NOTE: there must be at least one non-zero-deg node; otherwise,
# degree bucketing should not be called.
var_0deg = var.IDX(zero_deg_nodes)
zero_feat = ir.NEW_DICT(var_out, var_0deg, fd_list[0])
idx_list.append(var_0deg)
fd_list.append(zero_feat)
# merge buckets according to the ascending order of the node ids.
all_idx = F.cat([idx.data.tousertensor() for idx in idx_list], dim=0)
_, order = F.sort_1d(all_idx)
var_order = var.IDX(utils.toindex(order))
reduced_feat = ir.MERGE_ROW(var_order, fd_list)
ir.WRITE_DICT_(var_out, reduced_feat)
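# A self-contained toy (plain Python, no DGL tensors) illustrating the policy
# documented above: messages are grouped by the in-degree of their receiver,
# each bucket is reduced on its own, and the per-bucket results are merged
# back following the unique-ascending order of the receiver ids. The function
# name and the sum-reducer are made up for this sketch.
def _toy_degree_bucketing(dst_nodes, messages):
    recv_nodes = sorted(set(dst_nodes))            # unique-ascending receivers
    per_node = {u: [m for d, m in zip(dst_nodes, messages) if d == u]
                for u in recv_nodes}
    buckets = {}                                   # in-degree -> [(node, msgs)]
    for u, msgs in per_node.items():
        buckets.setdefault(len(msgs), []).append((u, msgs))
    reduced = {}                                   # per-bucket "reduce" (sum here)
    for _deg, items in buckets.items():
        for u, msgs in items:
            reduced[u] = sum(msgs)
    return [reduced[u] for u in recv_nodes]        # merged in ascending id order
# e.g. _toy_degree_bucketing([2, 0, 2, 5], [1, 10, 2, 7]) buckets nodes 0 and 5
# (degree 1) together and node 2 (degree 2) alone, and returns [10, 3, 7].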
def _degree_bucketing_schedule(mids, dsts, v):
"""Return the bucketing by degree scheduling for destination nodes of
messages
Parameters
----------
mids: utils.Index
edge id for each message
dsts: utils.Index
destination node for each message
v: utils.Index
all receiving nodes (for checking zero degree nodes)
"""
buckets = _CAPI_DGLDegreeBucketing(mids.todgltensor(), dsts.todgltensor(),
v.todgltensor())
return _process_node_buckets(buckets)
def _process_node_buckets(buckets):
"""read bucketing auxiliary data
Returns
-------
unique_v: utils.Index
        unique destination nodes
degrees: numpy.ndarray
        A list of degrees, one per bucket
    v_bkt: list of utils.Index
        A list of node id buckets; nodes in each bucket have the same degree
msg_ids: list of utils.Index
        A list of message id buckets; each node in the i-th node id bucket has
        degree[i] messages in the i-th message id bucket
zero_deg_nodes : utils.Index
The zero-degree nodes
"""
# get back results
dtype = buckets(0).dtype
degs = utils.toindex(buckets(0), dtype)
v = utils.toindex(buckets(1), dtype)
    # XXX: convert directly from ndarray to python list?
v_section = buckets(2).asnumpy().tolist()
msg_ids = utils.toindex(buckets(3), dtype)
msg_section = buckets(4).asnumpy().tolist()
# split buckets
msg_ids = msg_ids.tousertensor()
dsts = F.split(v.tousertensor(), v_section, 0)
msg_ids = F.split(msg_ids, msg_section, 0)
# convert to utils.Index
dsts = [utils.toindex(dst, dtype) for dst in dsts]
msg_ids = [utils.toindex(msg_id, dtype) for msg_id in msg_ids]
# handle zero deg
degs = degs.tonumpy()
if degs[-1] == 0:
degs = degs[:-1]
zero_deg_nodes = dsts[-1]
dsts = dsts[:-1]
else:
zero_deg_nodes = None
return v, degs, dsts, msg_ids, zero_deg_nodes
def _create_per_bkt_rfunc(reduce_udf, deg, vbkt, ntype=None):
"""Internal function to generate the per degree bucket node UDF."""
def _rfunc_wrapper(node_data, mail_data):
def _reshaped_getter(key):
msg = mail_data[key]
new_shape = (len(vbkt), deg) + F.shape(msg)[1:]
return F.reshape(msg, new_shape)
reshaped_mail_data = utils.LazyDict(_reshaped_getter, mail_data.keys())
nbatch = NodeBatch(vbkt, node_data, reshaped_mail_data, ntype=ntype)
return reduce_udf(nbatch)
return _rfunc_wrapper
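# A toy sketch (plain Python lists instead of tensors) of the reshape performed
# in _create_per_bkt_rfunc above: for a bucket of B nodes that all have degree
# D, the B*D messages are viewed as B rows of D messages each, so the reduce
# UDF can reduce along the "degree" dimension row by row. The helper name is
# made up for this sketch.
def _toy_reshape_bucket(flat_msgs, deg):
    assert len(flat_msgs) % deg == 0
    return [flat_msgs[i:i + deg] for i in range(0, len(flat_msgs), deg)]
# _toy_reshape_bucket([1, 2, 3, 4, 5, 6], deg=3) -> [[1, 2, 3], [4, 5, 6]];
# a per-row sum then yields one reduced value per node in the bucket.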
def gen_group_apply_edge_schedule(
apply_func,
u, v, eid,
group_by,
var_src_nf,
var_dst_nf,
var_ef,
var_out,
canonical_etype=(None, None, None)):
"""Create degree bucketing schedule for group_apply_edge
Edges will be grouped by either its source node or destination node
specified by 'group_by', and will be divided into buckets in which
'group_by' nodes have the same degree. The apply_func UDF will be applied
to each bucket. The per-bucket result will be merged according to the
*unique-ascending order* of the edge ids.
Parameters
----------
apply_func: callable
The edge_apply_func UDF
u: utils.Index
Source nodes of edges to apply
v: utils.Index
Destination nodes of edges to apply
eid: utils.Index
Edges to apply
group_by: str
If "src", group by u. If "dst", group by v
var_src_nf : var.FEAT_DICT
The variable for source feature frame.
var_dst_nf : var.FEAT_DICT
The variable for destination feature frame.
var_ef : var.FEAT_DICT
The variable for edge frame.
var_out : var.FEAT_DICT
The variable for output feature dicts.
canonical_etype : tuple[str, str, str], optional
Canonical edge type if running on a heterograph.
Default: (None, None, None), if running on a homogeneous graph.
"""
if group_by == "src":
buckets = _degree_bucketing_for_edge_grouping(u, v, eid)
degs, uids, vids, eids = buckets
elif group_by == "dst":
buckets = _degree_bucketing_for_edge_grouping(v, u, eid)
degs, vids, uids, eids = buckets
else:
raise DGLError("group_apply_edge must be grouped by either src or dst")
idx_list = []
fd_list = []
for deg, u_bkt, v_bkt, eid_bkt in zip(degs, uids, vids, eids):
# create per-bkt efunc
_efunc = var.FUNC(_create_per_bkt_efunc(apply_func, deg,
u_bkt, v_bkt, eid_bkt,
canonical_etype=canonical_etype))
# vars
var_u = var.IDX(u_bkt)
var_v = var.IDX(v_bkt)
var_eid = var.IDX(eid_bkt)
# apply edge UDF on each bucket
fdsrc = ir.READ_ROW(var_src_nf, var_u)
fddst = ir.READ_ROW(var_dst_nf, var_v)
fdedge = ir.READ_ROW(var_ef, var_eid)
fdedge = ir.EDGE_UDF(_efunc, fdsrc, fdedge, fddst, ret=fdedge) # reuse var
# save for merge
idx_list.append(var_eid)
fd_list.append(fdedge)
# merge buckets according to the ascending order of the edge ids.
all_idx = F.cat([idx.data.tousertensor() for idx in idx_list], dim=0)
_, order = F.sort_1d(all_idx)
var_order = var.IDX(utils.toindex(order))
ir.MERGE_ROW(var_order, fd_list, ret=var_out)
def _degree_bucketing_for_edge_grouping(uids, vids, eids):
"""Return the edge buckets by degree and grouped nodes for group_apply_edge
Parameters
----------
uids: utils.Index
node id of one end of eids, based on which edges are grouped
vids: utils.Index
node id of the other end of eids
eids: utils.Index
edge id for each edge
"""
buckets = _CAPI_DGLGroupEdgeByNodeDegree(uids.todgltensor(),
vids.todgltensor(),
eids.todgltensor())
return _process_edge_buckets(buckets)
def _process_edge_buckets(buckets):
"""read bucketing auxiliary data for group_apply_edge buckets
Returns
-------
degrees: numpy.ndarray
        A list of degrees, one per bucket
    uids: list of utils.Index
        A list of node id buckets; nodes in each bucket have the same degree
vids: list of utils.Index
A list of node id buckets
eids: list of utils.Index
A list of edge id buckets
"""
# get back results
dtype = buckets(0).dtype
degs = buckets(0).asnumpy()
uids = utils.toindex(buckets(1), dtype)
vids = utils.toindex(buckets(2), dtype)
eids = utils.toindex(buckets(3), dtype)
    # XXX: convert directly from ndarray to python list?
sections = buckets(4).asnumpy().tolist()
# split buckets and convert to index
def split(to_split):
res = F.split(to_split.tousertensor(), sections, 0)
return map(partial(utils.toindex, dtype=dtype), res)
uids = split(uids)
vids = split(vids)
eids = split(eids)
return degs, uids, vids, eids
def _create_per_bkt_efunc(apply_func, deg, u, v, eid, canonical_etype=(None, None, None)):
"""Internal function to generate the per degree bucket edge UDF."""
batch_size = len(u) // deg
def _efunc_wrapper(src_data, edge_data, dst_data):
def _reshape_func(data):
def _reshaped_getter(key):
feat = data[key]
new_shape = (batch_size, deg) + F.shape(feat)[1:]
return F.reshape(feat, new_shape)
return _reshaped_getter
def _reshape_back(data):
shape = F.shape(data)[2:]
new_shape = (batch_size * deg,) + shape
return F.reshape(data, new_shape)
reshaped_src_data = utils.LazyDict(_reshape_func(src_data),
src_data.keys())
reshaped_edge_data = utils.LazyDict(_reshape_func(edge_data),
edge_data.keys())
reshaped_dst_data = utils.LazyDict(_reshape_func(dst_data),
dst_data.keys())
ebatch = EdgeBatch((u, v, eid), reshaped_src_data,
reshaped_edge_data, reshaped_dst_data,
canonical_etype=canonical_etype)
return {k: _reshape_back(v) for k, v in apply_func(ebatch).items()}
return _efunc_wrapper
_init_api("dgl._deprecate.runtime.degree_bucketing")
"""Package for DGL's internal IR."""
from .executor import *
from .program import get_current_prog, prog
"""Module for executors."""
# pylint: disable=invalid-name
from __future__ import absolute_import
from abc import abstractmethod
from .... import backend as F
from ...frame import FrameRef, Frame
from .... import utils
from .program import get_current_prog
from . import var
from .var import VarType
from .registry import IR_REGISTRY
__all__ = [
'OpCode', 'Executor',
'NodeUDFExecutor', 'NODE_UDF',
'EdgeUDFExecutor', 'EDGE_UDF',
'ReadExecutor', 'READ',
'ReadColExecutor', 'READ_COL',
'ReadRowExecutor', 'READ_ROW',
'MergeRowExecutor', 'MERGE_ROW',
'UpdateDictExecutor', 'UPDATE_DICT',
'NewDictExecutor', 'NEW_DICT',
'Write_Executor', 'WRITE_',
'WriteCol_Executor', 'WRITE_COL_',
'WriteRow_Executor', 'WRITE_ROW_',
'WriteDict_Executor', 'WRITE_DICT_',
'AppendRow_Executor', 'APPEND_ROW_',
'WriteRowInplace_Executor', 'WRITE_ROW_INPLACE_',
'ClearFrame_Executor', 'CLEAR_FRAME_',
'BinaryReduceExecutor', 'BINARY_REDUCE',
'CopyReduceExecutor', 'COPY_REDUCE',
]
class OpCode(object):
"""Opcode for all the executor types."""
# immutable op
NODE_UDF = 0
EDGE_UDF = 1
READ = 4
READ_COL = 5
READ_ROW = 6
MERGE_ROW = 7
UPDATE_DICT = 8
NEW_DICT = 9
# mutable op (no return)
# remember the name is suffixed with "_"
WRITE_ = 21
WRITE_COL_ = 22
WRITE_ROW_ = 23
WRITE_DICT_ = 24
APPEND_ROW_ = 25
WRITE_ROW_INPLACE_ = 26
CLEAR_FRAME_ = 27
# DGL kernels
BINARY_REDUCE = 50
COPY_REDUCE = 51
class Executor(object):
"""Base executor class.
    An executor is similar to a basic operator in a dataflow-based framework.
The executor can be evaluated by the ``run`` function.
"""
@abstractmethod
def opcode(self):
"""Return the opcode of this executor."""
raise NotImplementedError
@abstractmethod
def arg_vars(self):
"""Return the argument variable list of this executor."""
raise NotImplementedError
@abstractmethod
def ret_var(self):
"""Return the result variable of this executor."""
raise NotImplementedError
@abstractmethod
def run(self):
"""Evaluate this executor.
        The function takes no arguments and returns nothing, which means all the
argument and result variables must be pre-bound.
"""
raise NotImplementedError
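# A minimal toy executor (not registered in IR_REGISTRY and unused elsewhere)
# sketching the contract above: every argument and the result are pre-bound
# variable holders, and ``run`` takes no arguments and returns nothing. The
# class name and the add operation are made up for this sketch.
class _ToyAddExecutor(Executor):
    """Toy executor that adds the data of two pre-bound variables."""
    def __init__(self, lhs, rhs, ret):
        self.lhs = lhs
        self.rhs = rhs
        self.ret = ret
    def opcode(self):
        return -1  # toy opcode; real executors use OpCode values
    def arg_vars(self):
        return [self.lhs, self.rhs]
    def ret_var(self):
        return self.ret
    def run(self):
        # All data is read from and written to the bound variables.
        self.ret.data = self.lhs.data + self.rhs.data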
class NodeUDFExecutor(Executor):
"""Executor for Node UDF call.
Parameters
----------
fn : var.Var
The UDF.
fdnode : var.Var
The node feature dict.
fdmail : var.Var
The mailbox data dict.
ret : var.Var
        The return variable for the new node feature dict.
"""
def __init__(self, fn, fdnode, fdmail, ret):
self.fn = fn
self.fdnode = fdnode
self.fdmail = fdmail
self.ret = ret
def opcode(self):
return OpCode.NODE_UDF
def arg_vars(self):
if self.fdmail is None:
return [self.fn, self.fdnode]
else:
return [self.fn, self.fdnode, self.fdmail]
def ret_var(self):
return self.ret
def run(self):
fn_data = self.fn.data
node_data = self.fdnode.data
if self.fdmail is None:
udf_ret = fn_data(node_data)
else:
mail_data = self.fdmail.data
udf_ret = fn_data(node_data, mail_data)
self.ret.data = FrameRef(Frame(udf_ret))
IR_REGISTRY[OpCode.NODE_UDF] = {
'name' : 'NODE_UDF',
'args_type' : [VarType.FUNC, VarType.FEAT_DICT, VarType.FEAT_DICT],
'ret_type' : VarType.FEAT_DICT,
'executor_cls' : NodeUDFExecutor,
}
def NODE_UDF(fn, fdnode, fdmail=None, ret=None):
"""Apply the node UDF and get the new node feature symbolically.
Parameters
----------
fn : var.Var
The UDF.
fdnode : var.Var
The node feature dict.
fdmail : var.Var
The mailbox data dict.
ret : var.Var, optional
        The return variable for the new node feature dict. If not given,
        a new variable will be created.
Returns
-------
var.Var
Variable for the result.
"""
reg = IR_REGISTRY[OpCode.NODE_UDF]
ret = var.new(reg['ret_type']) if ret is None else ret
get_current_prog().issue(reg['executor_cls'](fn, fdnode, fdmail, ret))
return ret
class EdgeUDFExecutor(Executor):
"""Executor for edge UDF call.
Parameters
----------
fn : var.Var
The UDF.
fdsrc : var.Var
The src node feature dict.
fdedge : var.Var
The edge feature dict.
fddst : var.Var
The dst node feature dict.
ret : var.Var
        The return variable for the new edge feature dict.
"""
def __init__(self, fn, fdsrc, fdedge, fddst, ret):
self.fn = fn
self.fdsrc = fdsrc
self.fdedge = fdedge
self.fddst = fddst
self.ret = ret
def opcode(self):
return OpCode.EDGE_UDF
def arg_vars(self):
return [self.fn, self.fdsrc, self.fdedge, self.fddst]
def ret_var(self):
return self.ret
def run(self):
fn_data = self.fn.data
src_data = self.fdsrc.data
edge_data = self.fdedge.data
dst_data = self.fddst.data
udf_ret = fn_data(src_data, edge_data, dst_data)
self.ret.data = FrameRef(Frame(udf_ret))
IR_REGISTRY[OpCode.EDGE_UDF] = {
'name' : 'EDGE_UDF',
    'args_type' : [VarType.FUNC, VarType.FEAT_DICT, VarType.FEAT_DICT, VarType.FEAT_DICT],
'ret_type' : VarType.FEAT_DICT,
'executor_cls' : EdgeUDFExecutor,
}
def EDGE_UDF(fn, fdsrc, fdedge, fddst, ret=None):
"""Apply the edge UDF and get the new edge feature symbolically.
Parameters
----------
fn : var.Var
The UDF.
fdsrc : var.Var
The src node feature dict.
fdedge : var.Var
The edge feature dict.
fddst : var.Var
The dst node feature dict.
ret : var.Var, optional
        The return variable for the new edge feature dict. If not given,
        a new variable will be created.
Returns
-------
var.Var
Variable for the result.
"""
reg = IR_REGISTRY[OpCode.EDGE_UDF]
ret = var.new(reg['ret_type']) if ret is None else ret
get_current_prog().issue(reg['executor_cls'](fn, fdsrc, fdedge, fddst, ret))
return ret
class ReadExecutor(Executor):
"""Executor for read data from feature dict.
Parameters
----------
fd : var.Var
The feature dict.
row : var.Var
The row index.
col : var.Var
The column name.
ret : var.Var
The return feature tensor.
"""
def __init__(self, fd, row, col, ret):
self.fd = fd
self.row = row
self.col = col
self.ret = ret
def opcode(self):
return OpCode.READ
def arg_vars(self):
return [self.fd, self.row, self.col]
def ret_var(self):
return self.ret
def run(self):
fd_data = self.fd.data # feature dict
row_data = self.row.data # idx
col_data = self.col.data # key str
self.ret.data = fd_data[row_data][col_data]
IR_REGISTRY[OpCode.READ] = {
'name' : 'READ',
'args_type' : [VarType.FEAT_DICT, VarType.IDX, VarType.STR],
'ret_type' : VarType.FEAT,
'executor_cls' : ReadExecutor,
}
def READ(fd, row, col, ret=None):
"""Read the feature data from the dictionary specified by the row and column symbolically.
Parameters
----------
fd : var.Var
The feature dict.
row : var.Var
The row index.
col : var.Var
The column name.
ret : var.Var, optional
        The return feature tensor. If not given, a new variable will be created.
Returns
-------
var.Var
Variable for the result.
"""
reg = IR_REGISTRY[OpCode.READ]
ret = var.new(reg['ret_type']) if ret is None else ret
get_current_prog().issue(reg['executor_cls'](fd, row, col, ret))
return ret
class ReadColExecutor(Executor):
"""Executor for read column data from feature dict.
Parameters
----------
fd : var.Var
The feature dict.
col : var.Var
The column name.
ret : var.Var
The return feature tensor.
"""
def __init__(self, fd, col, ret):
self.fd = fd
self.col = col
self.ret = ret
def opcode(self):
return OpCode.READ_COL
def arg_vars(self):
return [self.fd, self.col]
def ret_var(self):
return self.ret
def run(self):
fd_data = self.fd.data
col_data = self.col.data
self.ret.data = fd_data[col_data]
IR_REGISTRY[OpCode.READ_COL] = {
'name' : 'READ_COL',
'args_type' : [VarType.FEAT_DICT, VarType.STR],
'ret_type' : VarType.FEAT,
'executor_cls' : ReadColExecutor,
}
def READ_COL(fd, col, ret=None):
"""Read the column data from the dictionary.
Parameters
----------
fd : var.Var
The feature dict.
col : var.Var
The column name.
ret : var.Var, optional
        The return feature tensor. If not given, a new variable will be created.
Returns
-------
var.Var
Variable for the result.
"""
reg = IR_REGISTRY[OpCode.READ_COL]
ret = var.new(reg['ret_type']) if ret is None else ret
get_current_prog().issue(reg['executor_cls'](fd, col, ret))
return ret
class ReadRowExecutor(Executor):
"""Executor for read row data from feature dict.
Parameters
----------
fd : var.Var
The feature dict.
row : var.Var
The row index.
ret : var.Var
The return feature tensor.
"""
def __init__(self, fd, row, ret):
self.fd = fd
self.row = row
self.ret = ret
def opcode(self):
return OpCode.READ_ROW
def arg_vars(self):
return [self.fd, self.row]
def ret_var(self):
return self.ret
def run(self):
fd_data = self.fd.data
row_data = self.row.data # idx
self.ret.data = fd_data[row_data]
IR_REGISTRY[OpCode.READ_ROW] = {
'name' : 'READ_ROW',
'args_type' : [VarType.FEAT_DICT, VarType.IDX],
'ret_type' : VarType.FEAT_DICT,
'executor_cls' : ReadRowExecutor,
}
def READ_ROW(fd, row, ret=None):
"""Read the row data from the dictionary.
Parameters
----------
fd : var.Var
The feature dict.
row : var.Var
The row index.
ret : var.Var, optional
        The return feature tensor. If not given, a new variable will be created.
Returns
-------
var.Var
Variable for the result.
"""
reg = IR_REGISTRY[OpCode.READ_ROW]
ret = var.new(reg['ret_type']) if ret is None else ret
get_current_prog().issue(reg['executor_cls'](fd, row, ret))
return ret
class MergeRowExecutor(Executor):
"""Executor for merge row data according to the given order.
Parameters
----------
order : var.Var
The order index.
fd_list : list of var.Var
The list of row data variables. Each represents a feature dict.
ret : var.Var
Variable for the result.
"""
def __init__(self, order, fd_list, ret):
self.order = order
self.fd_list = fd_list
self.ret = ret
def opcode(self):
return OpCode.MERGE_ROW
def arg_vars(self):
return [self.order] + self.fd_list
def ret_var(self):
return self.ret
def run(self):
# merge buckets according to the ascending order of the node ids.
order_data = self.order.data
fd_data = [fd.data for fd in self.fd_list]
keys = fd_data[0].keys()
all_fd = {key : F.cat([fd[key] for fd in fd_data], dim=0)
for key in keys}
ret_fd = utils.reorder(all_fd, order_data)
self.ret.data = ret_fd
IR_REGISTRY[OpCode.MERGE_ROW] = {
'name' : 'MERGE_ROW',
'args_type' : [VarType.IDX, VarType.IDX, '*', VarType.FEAT_DICT, '*'],
'ret_type' : VarType.FEAT_DICT,
'executor_cls' : MergeRowExecutor,
}
def MERGE_ROW(idx_list, fd_list, ret=None):
"""Merge row data according to the given order symbolically.
Parameters
----------
    idx_list : var.Var
The order index.
fd_list : list of var.Var
The list of row data variables. Each represents a feature dict.
ret : var.Var, optional
        Variable for the result. If not given, a new variable will be created.
Returns
-------
var.Var
Variable for the result.
"""
reg = IR_REGISTRY[OpCode.MERGE_ROW]
ret = var.new(reg['ret_type']) if ret is None else ret
get_current_prog().issue(reg['executor_cls'](idx_list, fd_list, ret))
return ret
class UpdateDictExecutor(Executor):
"""Executor for update feature dictionary with another one.
Similar to python dict's update but return a new dictionary.
Parameters
----------
fd1 : var.Var
Variable for the feature dict to be updated.
fd2 : var.Var
Variable for the provided feature dict.
ret : var.Var
Variable for the result.
"""
def __init__(self, fd1, fd2, ret):
self.fd1 = fd1
self.fd2 = fd2
self.ret = ret
def opcode(self):
return OpCode.UPDATE_DICT
def arg_vars(self):
return [self.fd1, self.fd2]
def ret_var(self):
return self.ret
def run(self):
fd1_data = self.fd1.data
fd2_data = self.fd2.data
if (isinstance(fd1_data, utils.LazyDict)
or isinstance(fd2_data, utils.LazyDict)):
# NOTE: fd2 has higher priority
ret_data = utils.HybridDict(fd2_data, fd1_data)
else:
ret_data = {k : v for k, v in fd1_data.items()}
ret_data.update(fd2_data)
self.ret.data = ret_data
IR_REGISTRY[OpCode.UPDATE_DICT] = {
'name' : 'UPDATE_DICT',
'args_type' : [VarType.FEAT_DICT, VarType.FEAT_DICT],
'ret_type' : VarType.FEAT_DICT,
'executor_cls' : UpdateDictExecutor,
}
def UPDATE_DICT(fd1, fd2, ret=None):
"""Executor for update feature dictionary with another one.
Similar to python dict's update but return a new dictionary.
Parameters
----------
fd1 : var.Var
Variable for the feature dict to be updated.
fd2 : var.Var
Variable for the provided feature dict.
ret : var.Var, optional
        Variable for the result. If not given, a new variable will be created.
Returns
-------
var.Var
Variable for the result.
"""
reg = IR_REGISTRY[OpCode.UPDATE_DICT]
ret = var.new(reg['ret_type']) if ret is None else ret
get_current_prog().issue(reg['executor_cls'](fd1, fd2, ret))
return ret
class NewDictExecutor(Executor):
"""Executor for creating new feature dictionary.
Parameters
----------
fd_init : var.Var
The feat dict to borrow initializer.
idx : var.Var
        The index used to determine the number of rows.
fd_scheme : var.Var
The feat dict to look for column scheme.
ret : var.Var
Variable for the result.
"""
def __init__(self, fd_init, idx, fd_scheme, ret):
self.fd_init = fd_init # the feat dict to borrow initializer
        self.idx = idx # the index used to determine the number of rows
self.fd_scheme = fd_scheme # the feat dict to look for column scheme
self.ret = ret # the result
def opcode(self):
return OpCode.NEW_DICT
def arg_vars(self):
return [self.fd_init, self.idx, self.fd_scheme]
def ret_var(self):
return self.ret
def run(self):
fd_init_data = self.fd_init.data
idx_data = self.idx.data
fd_scheme_data = self.fd_scheme.data
schemes = fd_scheme_data.schemes
ret_dict = {}
for key, sch in schemes.items():
initializer = fd_init_data.get_initializer(key)
ctx = F.context(fd_scheme_data[key])
shape = (len(idx_data),) + sch.shape
# FIXME: the last argument here can only be idx; range
# is meaningless. Need to rethink the signature.
ret_dict[key] = initializer(shape, sch.dtype, ctx, idx_data)
self.ret.data = FrameRef(Frame(ret_dict))
IR_REGISTRY[OpCode.NEW_DICT] = {
'name' : 'NEW_DICT',
'args_type' : [VarType.FEAT_DICT, VarType.IDX, VarType.FEAT_DICT],
'ret_type' : VarType.FEAT_DICT,
'executor_cls' : NewDictExecutor,
}
def NEW_DICT(fd_init, idx, fd_scheme, ret=None):
"""Create a new dictionary symbolically.
Parameters
----------
fd_init : var.Var
The feat dict to borrow initializer.
idx : var.Var
        The index used to determine the number of rows.
fd_scheme : var.Var
The feat dict to look for column scheme.
    ret : var.Var, optional
        Variable for the result. If not given, a new variable will be created.
Returns
-------
var.Var
Variable for the result.
"""
reg = IR_REGISTRY[OpCode.NEW_DICT]
ret = var.new(reg['ret_type']) if ret is None else ret
get_current_prog().issue(reg['executor_cls'](fd_init, idx, fd_scheme, ret))
return ret
class Write_Executor(Executor):
"""Executor for writing the given data to the feature dict.
Parameters
----------
fd : var.Var
The feature dict.
row : var.Var
The row index.
col : var.Var
The column name.
val : var.Var
The given feature data.
"""
def __init__(self, fd, row, col, val):
self.fd = fd
self.row = row
self.col = col
self.val = val
def opcode(self):
return OpCode.WRITE_
def arg_vars(self):
return [self.fd, self.row, self.col, self.val]
def ret_var(self):
return None
def run(self):
fd_data = self.fd.data # feature dict
row_data = self.row.data # idx
col_data = self.col.data # key str
val_data = self.val.data
fd_data[col_data][row_data] = val_data
IR_REGISTRY[OpCode.WRITE_] = {
'name' : 'WRITE_',
'args_type' : [VarType.FEAT_DICT, VarType.IDX, VarType.STR, VarType.FEAT],
'ret_type' : None,
'executor_cls' : Write_Executor,
}
def WRITE_(fd, row, col, val):
"""Write the given data to the feature dict symbolically.
Parameters
----------
fd : var.Var
The feature dict.
row : var.Var
The row index.
col : var.Var
The column name.
val : var.Var
The given feature data.
"""
reg = IR_REGISTRY[OpCode.WRITE_]
get_current_prog().issue(reg['executor_cls'](fd, row, col, val))
class WriteCol_Executor(Executor):
"""Executor for writing the given column data to the feature dict.
Parameters
----------
fd : var.Var
The feature dict.
col : var.Var
The column name.
val : var.Var
The given feature data.
"""
def __init__(self, fd, col, val):
self.fd = fd
self.col = col
self.val = val
def opcode(self):
return OpCode.WRITE_COL_
def arg_vars(self):
return [self.fd, self.col, self.val]
def ret_var(self):
return None
def run(self):
fd_data = self.fd.data # feature dict
col_data = self.col.data # key str
val_data = self.val.data
fd_data[col_data] = val_data
IR_REGISTRY[OpCode.WRITE_COL_] = {
'name' : 'WRITE_COL_',
'args_type' : [VarType.FEAT_DICT, VarType.STR, VarType.FEAT],
'ret_type' : None,
'executor_cls' : WriteCol_Executor,
}
def WRITE_COL_(fd, col, val):
"""Writing the given column data to the feature dict symbolically.
Parameters
----------
fd : var.Var
The feature dict.
col : var.Var
The column name.
val : var.Var
The given feature data.
"""
reg = IR_REGISTRY[OpCode.WRITE_COL_]
get_current_prog().issue(reg['executor_cls'](fd, col, val))
class WriteRow_Executor(Executor):
"""Executor for writing the given row data to the feature dict.
Parameters
----------
fd : var.Var
The feature dict.
row : var.Var
The row index.
val : var.Var
The given feature data.
"""
def __init__(self, fd, row, val):
self.fd = fd
self.row = row
self.val = val
def opcode(self):
return OpCode.WRITE_ROW_
def arg_vars(self):
return [self.fd, self.row, self.val]
def ret_var(self):
return None
def run(self):
fd_data = self.fd.data # feature dict
row_data = self.row.data # idx
val_data = self.val.data
fd_data[row_data] = val_data
IR_REGISTRY[OpCode.WRITE_ROW_] = {
'name' : 'WRITE_ROW_',
'args_type' : [VarType.FEAT_DICT, VarType.IDX, VarType.FEAT_DICT],
'ret_type' : None,
'executor_cls' : WriteRow_Executor,
}
def WRITE_ROW_(fd, row, val):
"""Write the given row data to the feature dict symbolically.
Parameters
----------
fd : var.Var
The feature dict.
row : var.Var
The row index.
val : var.Var
The given feature data.
"""
reg = IR_REGISTRY[OpCode.WRITE_ROW_]
get_current_prog().issue(reg['executor_cls'](fd, row, val))
class WriteRowInplace_Executor(Executor):
"""Executor for writing the given row data to the feature dict in-place.
Parameters
----------
fd : var.Var
The feature dict.
row : var.Var
The row index.
val : var.Var
The given feature data.
"""
def __init__(self, fd, row, val):
self.fd = fd
self.row = row
self.val = val
def opcode(self):
return OpCode.WRITE_ROW_INPLACE_
def arg_vars(self):
return [self.fd, self.row, self.val]
def ret_var(self):
return None
def run(self):
fd_data = self.fd.data # feature dict
row_data = self.row.data # idx
val_data = self.val.data
fd_data.update_data(row_data, val_data, inplace=True)
IR_REGISTRY[OpCode.WRITE_ROW_INPLACE_] = {
'name' : 'WRITE_ROW_INPLACE_',
'args_type' : [VarType.FEAT_DICT, VarType.IDX, VarType.FEAT_DICT],
'ret_type' : None,
'executor_cls' : WriteRowInplace_Executor,
}
def WRITE_ROW_INPLACE_(fd, row, val):
"""Write the given row data to the feature dict in-place symbolically.
Parameters
----------
fd : var.Var
The feature dict.
row : var.Var
The row index.
val : var.Var
The given feature data.
"""
reg = IR_REGISTRY[OpCode.WRITE_ROW_INPLACE_]
get_current_prog().issue(reg['executor_cls'](fd, row, val))
class WriteDict_Executor(Executor):
"""Executor for writing the given feature dict data into the another one.
Parameters
----------
fd1 : var.Var
The feature dict to be mutated.
fd2 : var.Var
The feature dict data.
"""
def __init__(self, fd1, fd2):
self.fd1 = fd1
self.fd2 = fd2
def opcode(self):
return OpCode.WRITE_DICT_
def arg_vars(self):
return [self.fd1, self.fd2]
def ret_var(self):
return None
def run(self):
fd1_data = self.fd1.data
fd2_data = self.fd2.data
for k, v in fd2_data.items():
fd1_data[k] = v
IR_REGISTRY[OpCode.WRITE_DICT_] = {
'name' : 'WRITE_DICT_',
'args_type' : [VarType.FEAT_DICT, VarType.FEAT_DICT],
'ret_type' : None,
'executor_cls' : WriteDict_Executor,
}
def WRITE_DICT_(fd1, fd2):
"""Writing the given feature dict data into the another one symbolically.
Parameters
----------
fd1 : var.Var
The feature dict to be mutated.
fd2 : var.Var
The feature dict data.
"""
reg = IR_REGISTRY[OpCode.WRITE_DICT_]
get_current_prog().issue(reg['executor_cls'](fd1, fd2))
class AppendRow_Executor(Executor):
"""Executor for appending one feature dict to another.
Parameters
----------
fd1 : var.Var
The feature dict in the front.
fd2 : var.Var
The feature dict in the back.
"""
def __init__(self, fd1, fd2):
self.fd1 = fd1
self.fd2 = fd2
def opcode(self):
return OpCode.APPEND_ROW_
def arg_vars(self):
return [self.fd1, self.fd2]
def ret_var(self):
return None
def run(self):
fd1_data = self.fd1.data
fd2_data = self.fd2.data
fd1_data.append(fd2_data)
IR_REGISTRY[OpCode.APPEND_ROW_] = {
'name' : 'APPEND_ROW_',
'args_type' : [VarType.FEAT_DICT, VarType.FEAT_DICT],
'ret_type' : None,
'executor_cls' : AppendRow_Executor,
}
def APPEND_ROW_(fd1, fd2):
"""Append one feature dict to another symbolically.
Parameters
----------
fd1 : var.Var
The feature dict in the front.
fd2 : var.Var
The feature dict in the back.
"""
reg = IR_REGISTRY[OpCode.APPEND_ROW_]
get_current_prog().issue(reg['executor_cls'](fd1, fd2))
class ClearFrame_Executor(Executor):
"""Executor for clear the feature dict.
Parameters
----------
fd : var.Var
The feature dict to be cleared.
"""
def __init__(self, fd):
self.fd = fd
def opcode(self):
return OpCode.CLEAR_FRAME_
def arg_vars(self):
return [self.fd]
def ret_var(self):
return None
def run(self):
frame = self.fd.data
num_rows = frame.num_rows
frame.clear()
frame.add_rows(num_rows)
IR_REGISTRY[OpCode.CLEAR_FRAME_] = {
'name': 'CLEAR_FRAME_',
'args_type': [VarType.FEAT_DICT],
'ret_type': None,
'executor_cls': ClearFrame_Executor,
}
def CLEAR_FRAME_(fd):
"""Clear the feature dict symbolically.
Parameters
----------
fd : var.Var
The feature dict to be cleared.
"""
reg = IR_REGISTRY[OpCode.CLEAR_FRAME_]
get_current_prog().issue(reg['executor_cls'](fd))
class BinaryReduceExecutor(Executor):
"""Executor for BINARY_REDUCE
Parameters
----------
reducer : str
String representing reduction to perform, can be "sum", "max", "min",
"mean", "prod", "none" (no reduction)
binary_op : str
String representing binary operation to perform, can be "add", "mul",
"sub", "div", "dot"
graph : var.Var
Variable for graph index lambda. The lambda returns the immutable graph
index given a context object.
lhs: int
The lhs target (src, dst, edge)
rhs: int
The rhs target (src, dst, edge)
lhs_data : var.Var
Variable for the lhs data
rhs_data : var.Var
Variable for the rhs data
out_size : int
Output size
lhs_map : var.Var
Variable for mapping lambda. The lambda returns the lhs id mapping
array on given context
rhs_map : var.Var
Variable for mapping lambda. The lambda returns the rhs id mapping
array on given context
out_map : var.Var
Variable for mapping lambda. The lambda returns the output id mapping
array on given context
ret : var.Var
Variable for the result.
"""
def __init__(self, reducer, binary_op, graph, lhs, rhs, lhs_data,
rhs_data, out_size, lhs_map, rhs_map, out_map, ret):
self.reducer = reducer
self.binary_op = binary_op
self.graph = graph
self.lhs = lhs
self.rhs = rhs
self.lhs_data = lhs_data
self.rhs_data = rhs_data
self.out_size = out_size
self.lhs_map = lhs_map
self.rhs_map = rhs_map
self.out_map = out_map
self.ret = ret
def opcode(self):
return OpCode.BINARY_REDUCE
def arg_vars(self):
return [self.reducer, self.binary_op, self.graph, self.lhs, self.rhs,
self.lhs_data, self.rhs_data, self.out_size, self.lhs_map,
self.rhs_map, self.out_map]
def ret_var(self):
return self.ret
def run(self):
lhs_data = self.lhs_data.data
rhs_data = self.rhs_data.data
ctx = utils.to_dgl_context(F.context(lhs_data))
graph = self.graph.data(ctx)
lhs_map = self.lhs_map.data(ctx) if self.lhs_map.data else None
rhs_map = self.rhs_map.data(ctx) if self.rhs_map.data else None
out_map = self.out_map.data(ctx) if self.out_map.data else None
if not isinstance(lhs_map, tuple):
lhs_map = (lhs_map, lhs_map)
if not isinstance(rhs_map, tuple):
rhs_map = (rhs_map, rhs_map)
if not isinstance(out_map, tuple):
out_map = (out_map, out_map)
self.ret.data = F.binary_reduce(
self.reducer, self.binary_op, graph, self.lhs, self.rhs,
lhs_data, rhs_data, self.out_size, lhs_map, rhs_map, out_map)
IR_REGISTRY[OpCode.BINARY_REDUCE] = {
'name': 'BINARY_REDUCE',
'args_type': [VarType.STR, VarType.STR, VarType.GRAPH, VarType.INT,
VarType.INT, VarType.FEAT, VarType.FEAT, VarType.INT,
VarType.MAP, VarType.MAP, VarType.MAP],
'ret_type': VarType.FEAT,
'executor_cls': BinaryReduceExecutor,
}
def BINARY_REDUCE(reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data,
out_size, lhs_map, rhs_map, out_map, ret=None):
"""Perform BINARY_REDUCE symbolically.
Parameters
----------
reducer : str
String representing reduction to perform, can be "sum", "max", "min",
"mean", "prod", "none" (no reduction)
binary_op : str
String representing binary operation to perform, can be "add", "mul",
"sub", "div", "dot"
graph : var.Var
Variable for graph index lambda. The lambda returns the immutable graph
index given a context object.
lhs: int
The lhs target (src, dst, edge)
rhs: int
The rhs target (src, dst, edge)
lhs_data : var.Var
Variable for the lhs data
rhs_data : var.Var
Variable for the rhs data
out_size : int
Output size
lhs_map : var.Var
Variable for mapping lambda. The lambda returns the lhs id mapping
array on given context
rhs_map : var.Var
Variable for mapping lambda. The lambda returns the rhs id mapping
array on given context
out_map : var.Var
Variable for mapping lambda. The lambda returns the output id mapping
array on given context
ret : var.Var, optional
        Variable for the result. If not given, a new variable will be created.
Returns
-------
var.Var
Variable for the result.
"""
reg = IR_REGISTRY[OpCode.BINARY_REDUCE]
ret = var.new(reg['ret_type']) if ret is None else ret
get_current_prog().issue(reg['executor_cls'](
reducer, binary_op, graph, lhs, rhs, lhs_data, rhs_data, out_size,
lhs_map, rhs_map, out_map, ret))
return ret
class CopyReduceExecutor(Executor):
"""Executor for COPY_REDUCE
Parameters
----------
reducer : str
String representing reduction to perform, can be "sum", "max", "min",
"mean", "prod", "none" (no reduction)
graph : var.Var
Variable for graph index lambda. The lambda returns the immutable graph
index given a context object.
target: int
The input target (src, dst, edge)
in_data : var.Var
Variable for the input data
out_size : int
Output size
in_map : var.Var
Variable for mapping lambda. The lambda returns the input id mapping
array on given context
out_map : var.Var
Variable for mapping lambda. The lambda returns the output id mapping
array on given context
ret : var.Var
Variable for the result.
"""
def __init__(self, reducer, graph, target, in_data, out_size, in_map,
out_map, ret):
self.reducer = reducer
self.graph = graph
self.target = target
self.in_data = in_data
self.out_size = out_size
self.in_map = in_map
self.out_map = out_map
self.ret = ret
def opcode(self):
return OpCode.COPY_REDUCE
def arg_vars(self):
return [self.reducer, self.graph, self.target, self.in_data,
self.out_size, self.in_map, self.out_map]
def ret_var(self):
return self.ret
def run(self):
in_data = self.in_data.data
ctx = utils.to_dgl_context(F.context(in_data))
graph = self.graph.data(ctx)
in_map = self.in_map.data(ctx) if self.in_map.data else None
out_map = self.out_map.data(ctx) if self.out_map.data else None
if not isinstance(in_map, tuple):
in_map = (in_map, in_map)
if not isinstance(out_map, tuple):
out_map = (out_map, out_map)
self.ret.data = F.copy_reduce(
self.reducer, graph, self.target, in_data, self.out_size, in_map,
out_map)
IR_REGISTRY[OpCode.COPY_REDUCE] = {
'name': 'COPY_REDUCE',
'args_type': [VarType.STR, VarType.GRAPH, VarType.INT, VarType.FEAT, VarType.INT,
VarType.MAP, VarType.MAP],
'ret_type': VarType.FEAT,
'executor_cls': CopyReduceExecutor,
}
def COPY_REDUCE(reducer, graph, target, in_data, out_size, in_map, out_map,
ret=None):
"""Perform COPY_REDUCE symbolically.
Parameters
----------
reducer : str
String representing reduction to perform, can be "sum", "max", "min",
"mean", "prod", "none" (no reduction)
graph : var.Var
Variable for graph index lambda. The lambda returns the immutable graph
index given a context object.
target: int
The input target (src, dst, edge)
in_data : var.Var
Variable for the input data
out_size : int
Output size
in_map : var.Var
Variable for mapping lambda. The lambda returns the input id mapping
array on given context
out_map : var.Var
Variable for mapping lambda. The lambda returns the output id mapping
array on given context
ret : var.Var, optional
        Variable for the result. If not given, a new variable will be created.
Returns
-------
var.Var
Variable for the result.
"""
reg = IR_REGISTRY[OpCode.COPY_REDUCE]
ret = var.new(reg['ret_type']) if ret is None else ret
get_current_prog().issue(reg['executor_cls'](
reducer, graph, target, in_data, out_size, in_map, out_map, ret))
return ret
"""Module for program."""
from __future__ import absolute_import
from contextlib import contextmanager
import threading
from .registry import IR_REGISTRY
class Prog(object):
"""The program.
A program is simply a list of executors.
"""
def __init__(self):
self.execs = []
self.varcount = 0
def issue(self, exe):
"""Issue an executor to this program.
Parameters
----------
exe : Executor
The executor.
"""
self.execs.append(exe)
def pprint_exe(self, exe):
"""Internal function to pretty-print the executor."""
argstr = ', '.join([str(av) for av in exe.arg_vars()])
if exe.ret_var() is None:
# stmt
print("%s(%s)" % (
IR_REGISTRY[exe.opcode()]['name'],
argstr))
else:
print("%s %s = %s(%s)" % (
exe.ret_var().typestr(),
exe.ret.name,
IR_REGISTRY[exe.opcode()]['name'],
argstr))
def pprint(self):
"""Pretty-print the program."""
for exe in self.execs:
self.pprint_exe(exe)
class CurrentProgram(threading.local):
"""Thread local storage to keep the reference of current thread's program"""
def __init__(self):
super(CurrentProgram, self).__init__()
self.prog = None
def get_prog(self):
"""Get program"""
return self.prog
def set_prog(self, program):
"""Set program"""
self.prog = program
# current program
CURRENT_PROG = CurrentProgram()
def get_current_prog():
"""Get the current program."""
return CURRENT_PROG.get_prog()
def set_current_prog(program):
"""Set the current program."""
CURRENT_PROG.set_prog(program)
@contextmanager
def prog():
"""A context manager to create a new program."""
set_current_prog(Prog())
yield get_current_prog()
set_current_prog(None)
"""Module for ir registry."""
from __future__ import absolute_import
IR_REGISTRY = {}
"""Module for variables."""
# pylint: disable=invalid-name
from __future__ import absolute_import
from .program import get_current_prog
class VarType(object):
"""Variable types."""
    # Types for symbolic objects (i.e., they might not be
    # concretized before evaluation).
FEAT = 0
FEAT_DICT = 1
    # Types for concrete objects (i.e., they must have values).
GRAPH = 2
IDX = 3
STR = 4
FUNC = 5
MAP = 6
INT = 7
VAR_TYPE_NAME_MAP = [
'Feat',
'FeatDict',
    'Graph',
'Idx',
'Str',
'Func',
'Map',
'Int',
]
class Var(object):
"""Class for variables in IR.
Variables represent data in the IR. A variable can contain concrete values.
    Otherwise, it can act as a "symbol" whose value is not materialized at
    the moment but is filled in later.
Parameters
----------
name : str
The variable name.
    typecode : int
The type code.
data : any, default=None (not concretized)
The data.
"""
__slots__ = ['name', 'typecode', 'data']
def __init__(self, name, typecode, data):
self.name = name
self.typecode = typecode
self.data = data
def __str__(self):
if self.typecode == VarType.STR:
return '"%s"' % self.data
else:
return self.name
def typestr(self):
"""Return the type string of this variable."""
return VAR_TYPE_NAME_MAP[self.typecode]
def new(typecode, data=None, name=None):
"""Create a new variable."""
if name is None:
cur_prog = get_current_prog()
name = '_z%d' % cur_prog.varcount
cur_prog.varcount += 1
return Var(name, typecode, data)
def FEAT(data=None, name=None):
"""Create a variable for feature tensor."""
return new(VarType.FEAT, data, name)
def FEAT_DICT(data=None, name=None):
"""Create a variable for feature dict."""
return new(VarType.FEAT_DICT, data, name)
def GRAPH(data=None, name=None):
"""Create a variable for graph index lambda."""
return new(VarType.GRAPH, data, name)
def IDX(data=None, name=None):
"""Create a variable for index."""
return new(VarType.IDX, data, name)
def STR(data=None, name=None):
"""Create a variable for string value."""
return new(VarType.STR, data, name)
def FUNC(data=None, name=None):
"""Create a variable for function."""
return new(VarType.FUNC, data, name)
def MAP(data=None, name=None):
"""Create a variable for mapping lambda"""
return new(VarType.MAP, data, name)
def INT(data=None, name=None):
"""Create a variable for int value"""
return new(VarType.INT, data, name)
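# Hedged usage sketch (defined here but never called by DGL itself): explicitly
# named variables can be created anywhere, while auto-generated names such as
# "_z0", "_z1" come from the current program's counter and therefore require an
# active ``prog()`` context. The function name is made up for this sketch.
def _example_variable_naming():
    """Return a small tuple demonstrating variable naming."""
    from .program import prog        # local import; program.py does not import var
    named = STR("feat", name="col")  # explicit name, no program required
    with prog():
        a = FEAT()                   # auto-named "_z0"
        b = FEAT_DICT({"h": [1.0]})  # auto-named "_z1"
    return str(named), a.name, b.typestr()   # ('"feat"', '_z0', 'FeatDict')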
"""DGL mini-runtime."""
class Runtime(object):
"""The mini runtime class."""
@staticmethod
def run(prog):
"""Run the given program."""
for exe in prog.execs:
# prog.pprint_exe(exe)
exe.run()
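# Hedged end-to-end sketch (defined here but never called by DGL itself): build
# a tiny symbolic program with the IR helpers defined earlier and evaluate it
# with Runtime.run. A plain dict stands in for a FrameRef only because
# READ_COL/WRITE_COL_ do nothing beyond key lookup and assignment; the function
# name and the "h"/"h2" keys are made up for this sketch.
def _example_run_program():
    """Return the column written back by a two-instruction program."""
    from .ir import READ_COL, WRITE_COL_, prog, var
    feat_dict = {"h": [1.0, 2.0, 3.0]}
    with prog() as p:
        var_fd = var.FEAT_DICT(feat_dict, name="fd")
        col = READ_COL(var_fd, var.STR("h"))    # symbolic read; nothing runs yet
        WRITE_COL_(var_fd, var.STR("h2"), col)  # symbolic write-back
    Runtime.run(p)                              # executors evaluate here
    return feat_dict["h2"]                      # [1.0, 2.0, 3.0]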
"""For different schedulers"""
from __future__ import absolute_import
from ... import utils
from ..._ffi.function import _init_api
from ...base import DGLError
from ... import backend as F
from ..frame import frame_like, FrameRef
from ...function.base import BuiltinFunction
from ..udf import EdgeBatch, NodeBatch
from ... import ndarray as nd
from . import ir
from .ir import var
from . import degree_bucketing as db
from . import spmv
__all__ = [
"schedule_send",
"schedule_recv",
"schedule_update_all",
"schedule_snr",
"schedule_apply_nodes",
"schedule_apply_edges",
"schedule_group_apply_edge",
"schedule_push",
"schedule_pull"
]
def schedule_send(graph,
u, v, eid,
message_func,
msgframe=None):
"""Schedule send
Parameters
----------
    graph: GraphAdapter
Graph
u : utils.Index
Source nodes
v : utils.Index
Destination nodes
eid : utils.Index
Ids of sending edges
message_func: callable or list of callable
The message function
msgframe : FrameRef, optional
The storage to write messages to. If None, use graph.msgframe.
"""
var_mf = var.FEAT_DICT(msgframe if msgframe is not None else graph.msgframe)
var_src_nf = var.FEAT_DICT(graph.srcframe)
var_dst_nf = var.FEAT_DICT(graph.dstframe)
var_ef = var.FEAT_DICT(graph.edgeframe)
var_eid = var.IDX(eid)
var_msg = _gen_send(graph=graph,
u=u,
v=v,
eid=eid,
mfunc=message_func,
var_src_nf=var_src_nf,
var_dst_nf=var_dst_nf,
var_ef=var_ef)
# write tmp msg back
ir.WRITE_ROW_(var_mf, var_eid, var_msg)
# set message indicator to 1
graph.msgindicator = graph.msgindicator.set_items(eid, 1)
def schedule_recv(graph,
recv_nodes,
reduce_func,
apply_func,
inplace,
outframe=None):
"""Schedule recv.
Parameters
----------
    graph: GraphAdapter
Graph
recv_nodes: utils.Index
Nodes to recv.
reduce_func: callable or list of callable
The reduce function
apply_func: callable
The apply node function
inplace: bool
If True, the update will be done in place
outframe : FrameRef, optional
The storage to write output data. If None, use graph.dstframe.
"""
src, dst, eid = graph.in_edges(recv_nodes)
if len(eid) > 0:
nonzero_idx = graph.msgindicator.get_items(eid).nonzero()
eid = eid.get_items(nonzero_idx)
src = src.get_items(nonzero_idx)
dst = dst.get_items(nonzero_idx)
if len(eid) == 0:
# Downgrade to apply nodes if
# 1) all recv nodes are 0-degree nodes
# 2) no send has been called
if apply_func is not None:
schedule_apply_nodes(recv_nodes, apply_func, graph.dstframe,
inplace, outframe, ntype=graph.canonical_etype[-1])
else:
var_dst_nf = var.FEAT_DICT(graph.dstframe, 'dst_nf')
var_out_nf = var_dst_nf if outframe is None else var.FEAT_DICT(outframe, name='out_nf')
# sort and unique the argument
recv_nodes, _ = F.sort_1d(F.unique(recv_nodes.tousertensor()))
recv_nodes = utils.toindex(recv_nodes, graph.gidx.dtype)
var_recv_nodes = var.IDX(recv_nodes, name='recv_nodes')
# reduce
reduced_feat = _gen_reduce(graph, reduce_func, (src, dst, eid),
recv_nodes)
# apply
final_feat = _apply_with_accum(var_recv_nodes, var_dst_nf,
reduced_feat, apply_func,
ntype=graph.canonical_etype[-1])
if inplace:
ir.WRITE_ROW_INPLACE_(var_out_nf, var_recv_nodes, final_feat)
else:
ir.WRITE_ROW_(var_out_nf, var_recv_nodes, final_feat)
# set message indicator to 0
graph.msgindicator = graph.msgindicator.set_items(eid, 0)
if not graph.msgindicator.has_nonzero():
ir.CLEAR_FRAME_(var.FEAT_DICT(graph.msgframe, name='mf'))
def schedule_snr(graph,
edge_tuples,
message_func,
reduce_func,
apply_func,
inplace,
outframe=None):
"""Schedule send_and_recv.
Currently it builds a subgraph from edge_tuples with the same number of
nodes as the original graph, so that routines for whole-graph updates
(e.g. fused kernels) could be reused.
Parameters
----------
    graph: GraphAdapter
Graph
edge_tuples: tuple
A tuple of (src ids, dst ids, edge ids) representing edges to perform
send_and_recv
message_func: callable or list of callable
The message function
reduce_func: callable or list of callable
The reduce function
apply_func: callable
The apply node function
inplace: bool
If True, the update will be done in place
outframe : FrameRef, optional
The storage to write output data. If None, use graph.dstframe.
"""
u, v, eid = edge_tuples
recv_nodes, _ = F.sort_1d(F.unique(v.tousertensor()))
recv_nodes = utils.toindex(recv_nodes, graph.gidx.dtype)
# create vars
var_dst_nf = var.FEAT_DICT(graph.dstframe, 'dst_nf')
var_out_nf = var_dst_nf if outframe is None else var.FEAT_DICT(outframe, name='out_nf')
var_u = var.IDX(u)
var_v = var.IDX(v)
var_eid = var.IDX(eid)
var_recv_nodes = var.IDX(recv_nodes, name='recv_nodes')
# generate send and reduce schedule
uv_getter = lambda: (var_u, var_v)
adj_creator = lambda: spmv.build_gidx_and_mapping_uv(
edge_tuples, graph.num_src(), graph.num_dst())
out_map_creator = lambda nbits: _build_idx_map(recv_nodes, nbits)
reduced_feat = _gen_send_reduce(src_node_frame=graph.srcframe,
dst_node_frame=graph.dstframe,
edge_frame=graph.edgeframe,
message_func=message_func,
reduce_func=reduce_func,
var_send_edges=var_eid,
var_reduce_nodes=var_recv_nodes,
uv_getter=uv_getter,
adj_creator=adj_creator,
out_map_creator=out_map_creator,
canonical_etype=graph.canonical_etype)
# generate apply schedule
final_feat = _apply_with_accum(var_recv_nodes, var_dst_nf, reduced_feat,
apply_func, ntype=graph.canonical_etype[-1])
if inplace:
ir.WRITE_ROW_INPLACE_(var_out_nf, var_recv_nodes, final_feat)
else:
ir.WRITE_ROW_(var_out_nf, var_recv_nodes, final_feat)
def schedule_update_all(graph,
message_func,
reduce_func,
apply_func,
outframe=None):
"""Get send and recv schedule
Parameters
----------
    graph: GraphAdapter
Graph
message_func: callable or list of callable
The message function
reduce_func: callable or list of callable
The reduce function
apply_func: callable
The apply node function
outframe : FrameRef, optional
The storage to write output data. If None, use graph.dstframe.
"""
if graph.num_edges() == 0:
# All the nodes are zero degree; downgrade to apply nodes
if apply_func is not None:
nodes = utils.toindex(slice(0, graph.num_dst()), graph.gidx.dtype)
schedule_apply_nodes(nodes, apply_func, graph.dstframe,
inplace=False, outframe=outframe,
ntype=graph.canonical_etype[-1])
else:
eid = utils.toindex(slice(0, graph.num_edges()), graph.gidx.dtype) # ALL
recv_nodes = utils.toindex(slice(0, graph.num_dst()), graph.gidx.dtype) # ALL
# create vars
var_dst_nf = var.FEAT_DICT(graph.dstframe, name='dst_nf')
var_out_nf = var_dst_nf if outframe is None else var.FEAT_DICT(outframe, name='out_nf')
var_recv_nodes = var.IDX(recv_nodes, name='recv_nodes')
var_eid = var.IDX(eid)
# generate send + reduce
def uv_getter():
src, dst, _ = graph.edges('eid')
return var.IDX(src), var.IDX(dst)
adj_creator = lambda: spmv.build_gidx_and_mapping_graph(graph)
out_map_creator = lambda nbits: None
reduced_feat = _gen_send_reduce(src_node_frame=graph.srcframe,
dst_node_frame=graph.dstframe,
edge_frame=graph.edgeframe,
message_func=message_func,
reduce_func=reduce_func,
var_send_edges=var_eid,
var_reduce_nodes=var_recv_nodes,
uv_getter=uv_getter,
adj_creator=adj_creator,
out_map_creator=out_map_creator,
canonical_etype=graph.canonical_etype)
# generate optional apply
final_feat = _apply_with_accum(var_recv_nodes, var_dst_nf,
reduced_feat, apply_func,
ntype=graph.canonical_etype[-1])
ir.WRITE_DICT_(var_out_nf, final_feat)
def schedule_apply_nodes(v,
apply_func,
node_frame,
inplace,
outframe=None,
ntype=None):
"""Get apply nodes schedule
Parameters
----------
v : utils.Index
Nodes to apply
apply_func : callable
The apply node function
node_frame : FrameRef
Node feature frame.
inplace: bool
If True, the update will be done in place
outframe : FrameRef, optional
The storage to write output data. If None, use the given node_frame.
ntype : str, optional
The node type, if running on a heterograph.
If None, assuming it's running on a homogeneous graph.
Returns
-------
A list of executors for DGL Runtime
"""
var_v = var.IDX(v)
var_nf = var.FEAT_DICT(node_frame, name='nf')
var_out_nf = var_nf if outframe is None else var.FEAT_DICT(outframe, name='out_nf')
v_nf = ir.READ_ROW(var_nf, var_v)
def _afunc_wrapper(node_data):
nbatch = NodeBatch(v, node_data, ntype=ntype)
return apply_func(nbatch)
afunc = var.FUNC(_afunc_wrapper)
applied_feat = ir.NODE_UDF(afunc, v_nf)
if inplace:
ir.WRITE_ROW_INPLACE_(var_out_nf, var_v, applied_feat)
else:
ir.WRITE_ROW_(var_out_nf, var_v, applied_feat)
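# Hedged sketch of the shape of an apply-node UDF consumed by _afunc_wrapper
# above: it receives a NodeBatch and returns a dict of updated features. The
# ``data`` accessor is the standard NodeBatch attribute; the feature name "h"
# and the doubling rule are made up for this sketch.
def _example_apply_func(nodes):
    return {"h": nodes.data["h"] * 2}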
def schedule_nodeflow_apply_nodes(graph,
layer_id,
v,
apply_func,
inplace):
"""Get apply nodes schedule in NodeFlow.
Parameters
----------
graph: NodeFlow
The NodeFlow to use
layer_id : int
        The layer where we apply the node update function.
v : utils.Index
Nodes to apply
apply_func: callable
The apply node function
inplace: bool
If True, the update will be done in place
Returns
-------
A list of executors for DGL Runtime
"""
var_nf = var.FEAT_DICT(graph._get_node_frame(layer_id), name='nf')
var_v = var.IDX(v)
v_nf = ir.READ_ROW(var_nf, var_v)
def _afunc_wrapper(node_data):
nbatch = NodeBatch(v, node_data)
return apply_func(nbatch)
afunc = var.FUNC(_afunc_wrapper)
applied_feat = ir.NODE_UDF(afunc, v_nf)
# TODO we need to avoid index_copy here.
if inplace:
ir.WRITE_ROW_INPLACE_(var_nf, var_v, applied_feat)
else:
ir.WRITE_ROW_(var_nf, var_v, applied_feat)
def schedule_apply_edges(graph,
u, v, eid,
apply_func,
inplace,
outframe=None):
"""Get apply edges schedule
Parameters
----------
    graph: GraphAdapter
Graph
u : utils.Index
Source nodes of edges to apply
v : utils.Index
Destination nodes of edges to apply
eid : utils.Index
Ids of sending edges
apply_func: callable
The apply edge function
inplace: bool
If True, the update will be done in place
outframe : FrameRef, optional
        The storage to write output data. If None, use graph.edgeframe.
Returns
-------
A list of executors for DGL Runtime
"""
# vars
var_src_nf = var.FEAT_DICT(graph.srcframe, 'uframe')
var_dst_nf = var.FEAT_DICT(graph.dstframe, 'vframe')
var_ef = var.FEAT_DICT(graph.edgeframe, 'eframe')
var_out_ef = var_ef if outframe is None else var.FEAT_DICT(outframe, 'out_ef')
var_out = _gen_send(graph=graph, u=u, v=v, eid=eid, mfunc=apply_func,
var_src_nf=var_src_nf, var_dst_nf=var_dst_nf,
var_ef=var_ef)
var_eid = var.IDX(eid)
# schedule apply edges
if inplace:
ir.WRITE_ROW_INPLACE_(var_out_ef, var_eid, var_out)
else:
        ir.WRITE_ROW_(var_out_ef, var_eid, var_out)
def schedule_nodeflow_apply_edges(graph, block_id,
u, v, eid,
apply_func,
inplace):
"""Get apply edges schedule in NodeFlow.
Parameters
----------
graph: NodeFlow
The NodeFlow to use
block_id : int
        The block whose edges we apply the edge update function to.
u : utils.Index
Source nodes of edges to apply
v : utils.Index
Destination nodes of edges to apply
eid : utils.Index
Ids of sending edges
apply_func: callable
The apply edge function
inplace: bool
If True, the update will be done in place
Returns
-------
A list of executors for DGL Runtime
"""
# vars
in_var_nf = var.FEAT_DICT(graph._get_node_frame(block_id), name='in_nf')
out_var_nf = var.FEAT_DICT(graph._get_node_frame(block_id + 1),
name='out_nf')
var_ef = var.FEAT_DICT(graph._get_edge_frame(block_id), name='ef')
var_out = _gen_send(graph, u, v, eid, apply_func, in_var_nf, out_var_nf,
var_ef, block_id=block_id)
var_eid = var.IDX(eid)
if inplace:
ir.WRITE_ROW_INPLACE_(var_ef, var_eid, var_out)
else:
ir.WRITE_ROW_(var_ef, var_eid, var_out)
def schedule_push(graph,
u,
message_func,
reduce_func,
apply_func,
inplace,
outframe=None):
"""Get push schedule
Parameters
----------
    graph: GraphAdapter
Graph
u : utils.Index
Source nodes for push
message_func: callable or list of callable
The message function
reduce_func: callable or list of callable
The reduce function
apply_func: callable
The apply node function
inplace: bool
If True, the update will be done in place
outframe : FrameRef, optional
The storage to write output data. If None, use graph.dstframe.
"""
u, v, eid = graph.out_edges(u)
if len(eid) == 0:
# All the pushing nodes have no out edges. No computation is scheduled.
return
schedule_snr(graph, (u, v, eid),
message_func, reduce_func, apply_func,
inplace, outframe)
def schedule_pull(graph,
pull_nodes,
message_func,
reduce_func,
apply_func,
inplace,
outframe=None):
"""Get pull schedule
Parameters
----------
    graph: GraphAdapter
Graph
pull_nodes : utils.Index
Destination nodes for pull
message_func: callable or list of callable
The message function
reduce_func: callable or list of callable
The reduce function
apply_func: callable
The apply node function
inplace: bool
If True, the update will be done in place
outframe : FrameRef, optional
The storage to write output data. If None, use graph.dstframe.
"""
# TODO(minjie): `in_edges` can be omitted if message and reduce func pairs
# can be specialized to SPMV. This needs support for creating adjmat
# directly from pull node frontier.
u, v, eid = graph.in_edges(pull_nodes)
if len(eid) == 0:
# All the nodes are 0deg; downgrades to apply.
if apply_func is not None:
schedule_apply_nodes(pull_nodes, apply_func, graph.dstframe, inplace,
outframe, ntype=graph.canonical_etype[-1])
else:
# TODO(Allen): Change operation to dgl operation
pull_nodes, _ = F.sort_1d(F.unique(pull_nodes.tousertensor()))
pull_nodes = utils.toindex(pull_nodes, graph.gidx.dtype)
# create vars
var_dst_nf = var.FEAT_DICT(graph.dstframe, name='dst_nf')
var_out_nf = var_dst_nf if outframe is None else var.FEAT_DICT(outframe, name='out_nf')
var_pull_nodes = var.IDX(pull_nodes, name='pull_nodes')
var_u = var.IDX(u)
var_v = var.IDX(v)
var_eid = var.IDX(eid)
# generate send and reduce schedule
uv_getter = lambda: (var_u, var_v)
adj_creator = lambda: spmv.build_gidx_and_mapping_uv(
(u, v, eid), graph.num_src(), graph.num_dst())
out_map_creator = lambda nbits: _build_idx_map(pull_nodes, nbits)
reduced_feat = _gen_send_reduce(graph.srcframe,
graph.dstframe, graph.edgeframe,
message_func, reduce_func, var_eid,
var_pull_nodes, uv_getter, adj_creator,
out_map_creator,
canonical_etype=graph.canonical_etype)
# generate optional apply
final_feat = _apply_with_accum(var_pull_nodes, var_dst_nf,
reduced_feat, apply_func,
ntype=graph.canonical_etype[-1])
if inplace:
ir.WRITE_ROW_INPLACE_(var_out_nf, var_pull_nodes, final_feat)
else:
ir.WRITE_ROW_(var_out_nf, var_pull_nodes, final_feat)
def schedule_group_apply_edge(graph,
u, v, eid,
apply_func,
group_by,
inplace,
outframe=None):
"""Group apply edges schedule
Parameters
----------
graph: GraphAdapter
Graph
u : utils.Index
Source nodes of edges to apply
v : utils.Index
Destination nodes of edges to apply
eid : utils.Index
Ids of sending edges
apply_func: callable
The apply edge function
group_by : str
Specify how to group edges. Expected to be either 'src' or 'dst'
inplace: bool
If True, the update will be done in place
outframe : FrameRef, optional
The storage to write output data. If None, use graph.edgeframe.
"""
# vars
var_src_nf = var.FEAT_DICT(graph.srcframe, name='src_nf')
var_dst_nf = var.FEAT_DICT(graph.dstframe, name='dst_nf')
var_ef = var.FEAT_DICT(graph.edgeframe, name='ef')
var_out_ef = var_ef if outframe is None else var.FEAT_DICT(outframe, name='out_ef')
var_out = var.FEAT_DICT(name='new_ef')
db.gen_group_apply_edge_schedule(apply_func, u, v, eid, group_by,
var_src_nf, var_dst_nf, var_ef, var_out,
canonical_etype=graph.canonical_etype)
var_eid = var.IDX(eid)
if inplace:
ir.WRITE_ROW_INPLACE_(var_out_ef, var_eid, var_out)
else:
ir.WRITE_ROW_(var_out_ef, var_eid, var_out)
def schedule_nodeflow_update_all(graph,
block_id,
message_func,
reduce_func,
apply_func):
"""Get update_all schedule in a block.
Parameters
----------
graph: NodeFlow
The NodeFlow to use
block_id : int
The block where we perform computation.
message_func: callable or list of callable
The message function
reduce_func: callable or list of callable
The reduce function
apply_func: callable
The apply node function
"""
# A NodeFlow shouldn't have 0 edges.
assert graph.block_size(block_id) > 0
eid = utils.toindex(slice(0, graph.block_size(block_id))) # ALL
dest_nodes = utils.toindex(slice(0, graph.layer_size(block_id + 1))) # ALL
# create vars
var_nf = var.FEAT_DICT(graph._get_node_frame(block_id + 1), name='out_nf')
var_dest_nodes = var.IDX(dest_nodes, name='dest_nodes')
var_eid = var.IDX(eid)
# generate send + reduce
def uv_getter():
src, dst, _ = graph.block_edges(block_id, remap_local=True)
return var.IDX(utils.toindex(src)), var.IDX(utils.toindex(dst))
adj_creator = lambda: spmv.build_gidx_and_mapping_block(graph, block_id)
out_map_creator = lambda nbits: None
reduced_feat = _gen_send_reduce(src_node_frame=graph._get_node_frame(block_id),
dst_node_frame=graph._get_node_frame(block_id + 1),
edge_frame=graph._get_edge_frame(block_id),
message_func=message_func,
reduce_func=reduce_func,
var_send_edges=var_eid,
var_reduce_nodes=var_dest_nodes,
uv_getter=uv_getter,
adj_creator=adj_creator,
out_map_creator=out_map_creator)
# generate optional apply
final_feat = _apply_with_accum(var_dest_nodes, var_nf, reduced_feat, apply_func)
ir.WRITE_DICT_(var_nf, final_feat)
def schedule_nodeflow_compute(graph,
block_id,
u, v, eid,
dest_nodes,
message_func,
reduce_func,
apply_func,
inplace):
"""Get flow compute schedule in NodeFlow
Parameters
----------
graph: NodeFlow
The NodeFlow to use
block_id : int
The block where we perform computation.
u : utils.Index
Source nodes of edges to apply
v : utils.Index
Destination nodes of edges to apply
eid : utils.Index
Ids of sending edges
dest_nodes : utils.Index
Destination nodes ids
message_func: callable or list of callable
The message function
reduce_func: callable or list of callable
The reduce function
apply_func: callable
The apply node function
inplace: bool
If True, the update will be done in place
"""
# TODO(minjie): `in_edges` can be omitted if message and reduce func pairs
# can be specialized to SPMV. This needs support for creating adjmat
# directly from pull node frontier.
if len(eid) == 0:
# All the nodes are 0deg; downgrades to apply.
if apply_func is not None:
schedule_nodeflow_apply_nodes(graph, block_id + 1, dest_nodes,
apply_func, inplace)
else:
# create vars
var_nf = var.FEAT_DICT(graph._get_node_frame(block_id + 1),
name='out_nf')
var_u = var.IDX(u)
var_v = var.IDX(v)
var_eid = var.IDX(eid)
var_dest_nodes = var.IDX(dest_nodes, name='dest_nodes')
# generate send and reduce schedule
uv_getter = lambda: (var_u, var_v)
adj_creator = lambda: spmv.build_gidx_and_mapping_block(
graph, block_id, (u, v, eid))
out_map_creator = lambda nbits: _build_idx_map(utils.toindex(dest_nodes), nbits)
reduced_feat = _gen_send_reduce(src_node_frame=graph._get_node_frame(block_id),
dst_node_frame=graph._get_node_frame(block_id + 1),
edge_frame=graph._get_edge_frame(block_id),
message_func=message_func,
reduce_func=reduce_func,
var_send_edges=var_eid,
var_reduce_nodes=var_dest_nodes,
uv_getter=uv_getter,
adj_creator=adj_creator,
out_map_creator=out_map_creator)
# generate optional apply
final_feat = _apply_with_accum(var_dest_nodes, var_nf,
reduced_feat, apply_func)
if inplace:
ir.WRITE_ROW_INPLACE_(var_nf, var_dest_nodes, final_feat)
else:
ir.WRITE_ROW_(var_nf, var_dest_nodes, final_feat)
def _check_builtin_func_list(func_list):
"""Check whether func_list only contains builtin functions."""
for fn in func_list:
if not isinstance(fn, BuiltinFunction):
raise DGLError("If specify multiple message/reduce functions, \
all of them must be builtin")
def _standardize_func_usage(func, func_name):
"""Standardize usages of message and reduce functions
Message or reduce function can be:
1. a UDF
2. a dgl builtin function
3. a list of dgl builtin functions
This function checks whether func meets the requirement, and merges the
last two cases by wrapping a single builtin function into a list.
Returns:
A single UDF function or a list of builtin functions
"""
if utils.is_iterable(func):
# func is a list of builtin
_check_builtin_func_list(func)
return func
elif isinstance(func, BuiltinFunction):
# func is a single builtin function
return [func]
else:
# func is one UDF
if not callable(func):
raise DGLError('User-defined %s function must be callable.'
' Got: %s' % (func_name, str(func)))
return func
def _apply_with_accum(var_nodes, var_nf, var_accum, apply_func, ntype=None):
"""Apply with accumulated features.
Parameters
----------
var_nodes : var.IDX
The nodes.
var_nf : var.FEAT_DICT
The node features.
var_accum : var.FEAT_DICT
The accumulated features.
apply_func : callable, None
The apply function.
ntype : str, optional
The node type, if running on a heterograph.
If None, a homogeneous graph is assumed.
"""
if apply_func:
# To avoid writing reduced features back to the node frame and reading
# them again for the apply phase, we first read the node
# features and "merge" them with the reduced features.
v_nf = ir.READ_ROW(var_nf, var_nodes)
v_nf = ir.UPDATE_DICT(v_nf, var_accum)
def _afunc_wrapper(node_data):
nbatch = NodeBatch(var_nodes.data, node_data, ntype=ntype)
return apply_func(nbatch)
afunc = var.FUNC(_afunc_wrapper)
applied_feat = ir.NODE_UDF(afunc, v_nf)
final_feat = ir.UPDATE_DICT(var_accum, applied_feat)
else:
final_feat = var_accum
return final_feat
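# Illustrative sketch (not part of the original module, names are hypothetical):
# the eager-mode analogue of the READ_ROW / UPDATE_DICT / NODE_UDF pattern that
# _apply_with_accum builds symbolically. Plain dicts of numpy arrays stand in for
# feature frames, so the reduced features never round-trip through the node frame.
import numpy as np

def _example_apply_with_accum(node_feats, reduced_feats, rows, apply_udf=None):
    """Merge reduced features into the selected node rows, then optionally apply a UDF."""
    # READ_ROW: slice out the rows that received messages.
    merged = {k: v[rows] for k, v in node_feats.items()}
    # UPDATE_DICT: reduced features override/extend the node features.
    merged.update(reduced_feats)
    if apply_udf is None:
        return reduced_feats
    # NODE_UDF followed by UPDATE_DICT with the UDF output.
    out = dict(reduced_feats)
    out.update(apply_udf(merged))
    return out

def _toy_apply(feats):
    # A hypothetical apply UDF: rescale the accumulated field 'h'.
    return {'h': 0.5 * feats['h']}

# Example: two of four nodes received messages whose reduced value is 3.0.
# _example_apply_with_accum({'h': np.ones((4, 2))}, {'h': np.full((2, 2), 3.0)},
#                           rows=np.array([1, 3]), apply_udf=_toy_apply)['h']
# -> array([[1.5, 1.5], [1.5, 1.5]])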
def _gen_reduce(graph, reduce_func, edge_tuples, recv_nodes):
"""Generate reduce schedule
Parameters
----------
graph : GraphAdapter
reduce_func : callable
edge_tuples : tuple of utils.Index
recv_nodes : utils.Index
Returns
-------
var.FEAT_DICT
The reduced feature dict.
"""
src, dst, eid = edge_tuples
rfunc = _standardize_func_usage(reduce_func, 'reduce')
rfunc_is_list = utils.is_iterable(rfunc)
# Create a tmp frame to hold the feature data.
# The frame has the same size and schemes of the
# node frame.
# TODO(minjie): should replace this with an IR call to make the program
# stateless.
tmpframe = FrameRef(frame_like(graph.dstframe._frame, len(recv_nodes)))
# vars
var_msg = var.FEAT_DICT(graph.msgframe, 'msg')
var_dst_nf = var.FEAT_DICT(graph.dstframe, 'nf')
var_out = var.FEAT_DICT(data=tmpframe)
if rfunc_is_list:
adj, edge_map, nbits = spmv.build_gidx_and_mapping_uv(
(src, dst, eid), graph.num_src(), graph.num_dst())
# using edge map instead of message map because messages are in global
# message frame
var_out_map = _build_idx_map(recv_nodes, nbits)
spmv.gen_e2v_spmv_schedule(graph=adj,
rfunc=rfunc,
message_frame=var_msg,
out=var_out,
out_size=len(recv_nodes),
edge_map=edge_map,
out_map=var_out_map)
return var_out
else:
# gen degree bucketing schedule for UDF recv
db.gen_degree_bucketing_schedule(rfunc, eid, dst, recv_nodes,
var_dst_nf, var_msg, var_out,
ntype=graph.canonical_etype[-1])
return var_out
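# Illustrative sketch (not part of the original module, names are hypothetical):
# the degree-bucketing idea behind db.gen_degree_bucketing_schedule, written
# eagerly with numpy. Receivers with the same in-degree are grouped so that one
# reduce-UDF call sees a dense (num_receivers, degree, feat_dim) mailbox.
import numpy as np

def _example_degree_bucketing(dst, messages, recv_nodes, reduce_udf):
    """dst[i] is the receiver of messages[i]; recv_nodes is sorted and unique."""
    out = np.zeros((len(recv_nodes), messages.shape[1]), dtype=messages.dtype)
    pos = {int(n): i for i, n in enumerate(recv_nodes)}
    degree = {int(n): int((dst == n).sum()) for n in recv_nodes}
    for deg in sorted(set(degree.values())):
        if deg == 0:
            continue  # zero-degree receivers keep their zero-initialized rows
        bucket = [n for n in recv_nodes if degree[int(n)] == deg]
        mailbox = np.stack([messages[dst == n] for n in bucket])  # (bucket, deg, feat)
        out[[pos[int(n)] for n in bucket]] = reduce_udf(mailbox)  # one call per bucket
    return out

# Example with a sum UDF:
# dst = np.array([0, 2, 2, 2]); msg = np.arange(8.0).reshape(4, 2)
# _example_degree_bucketing(dst, msg, np.array([0, 1, 2]), lambda mb: mb.sum(axis=1))
# -> node 0 gets msg[0], node 1 (0-deg) stays zero, node 2 sums msg[1:4].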
def _gen_send_reduce(
src_node_frame,
dst_node_frame,
edge_frame,
message_func,
reduce_func,
var_send_edges,
var_reduce_nodes,
uv_getter,
adj_creator,
out_map_creator,
canonical_etype=(None, None, None)):
"""Generate send and reduce schedule.
The function generates symbolic program for computing
(1) message function on the given edges (var_send_edges).
(2) reduce function on the given nodes (var_reduce_nodes).
If both message_func and reduce_func are DGL builtin functions, the schedule
will invoke fused message passing kernels (e.g. dgl.backend.binary_reduce) to
avoid generating explicit edge messages.
If message_func is UDF while reduce_func is DGL builtin function, the schedule
first invokes UDF to generate explicit edge messages, and then invokes
dgl.backend.copy_reduce to reduce messages on the destination nodes.
If both message_func and reduce_func are UDFs, the schedule first invokes message
UDF to generate explicit edge messages and then use degree-bucketing to invoke
reduce UDF.
Parameters
----------
src_node_frame : NodeFrame
The node frame of the source nodes.
dst_node_frame : NodeFrame
The node frame of the destination nodes.
edge_frame : FrameRef
The frame for the edges between the source and destination nodes.
message_func : callable, list of builtins
The message func(s).
reduce_func : callable, list of builtins
The reduce func(s).
var_send_edges : var.IDX
The edges (ids) to perform send.
var_reduce_nodes : var.IDX
Unique and sorted nodes to perform reduce. This should include
unique(v) + 0deg nodes.
uv_getter : callable
Function that returns a pair of var.IDX (u, v) for the triggered edges.
adj_creator : callable
Function that returns the adjmat, edge order of csr matrix, and
bit-width.
out_map_creator : callable
A function that returns a mapping from reduce_nodes to relabeled
consecutive ids
canonical_etype : tuple[str, str, str], optional
Canonical edge type if running on a heterograph.
Default: (None, None, None), if running on a homogeneous graph.
Returns
-------
var.FEAT_DICT
The reduced feature dict.
Notes
-----
Reduce_nodes are assumed to be in the *unique-ascending* order of the edge
destination node ids. The returned reduced features will be batched
following the order of reduce_nodes.
"""
# NOTE: currently, this function requires all var.IDX to contain concrete
# data.
reduce_nodes = var_reduce_nodes.data
# arg vars
var_src_nf = var.FEAT_DICT(src_node_frame, name='src_frame')
var_dst_nf = var.FEAT_DICT(dst_node_frame, name='dst_frame')
var_ef = var.FEAT_DICT(edge_frame, name='edge_frame')
var_eid = var_send_edges
# format the input functions
mfunc = _standardize_func_usage(message_func, 'message')
rfunc = _standardize_func_usage(reduce_func, 'reduce')
mfunc_is_list = utils.is_iterable(mfunc)
rfunc_is_list = utils.is_iterable(rfunc)
# Create a tmp frame to hold the feature data. The frame has the same size
# and schemes of the node frame.
# TODO(minjie): should replace this with an IR call to make the program
# stateless.
tmpframe = FrameRef(frame_like(dst_node_frame._frame, len(reduce_nodes)))
var_out = var.FEAT_DICT(data=tmpframe)
# 1. If either mfunc or rfunc is builtin, generate adjmat, edge mapping and
# message mapping
if mfunc_is_list or rfunc_is_list:
adj, edge_map, nbits = adj_creator()
# 2. If rfunc is builtin, generate a mapping from recv nodes to consecutive
# output id
if rfunc_is_list:
out_map = out_map_creator(nbits)
# 3. First try fused message and reduce function
if mfunc_is_list and rfunc_is_list:
# builtin message + builtin reducer
spmv.gen_v2v_spmv_schedule(graph=adj,
mfunc=mfunc,
rfunc=rfunc,
src_frame=var_src_nf,
dst_frame=var_dst_nf,
edge_frame=var_ef,
out=var_out,
out_size=len(reduce_nodes),
edge_map=edge_map,
out_map=out_map)
return var_out
var_u, var_v = uv_getter()
# 4. Unable to fuse, then generate message
if mfunc_is_list:
# messages are builtin but reduce is UDF
# Create a tmp frame to hold the message.
# TODO: should replace this with an IR call to make the program
# stateless.
n_message = len(var_eid.data)
tmp_msg_frame = FrameRef(frame_like(edge_frame._frame, n_message))
var_mf = var.FEAT_DICT(data=tmp_msg_frame)
spmv.gen_v2e_spmv_schedule(graph=adj,
mfunc=mfunc,
src_frame=var_src_nf,
dst_frame=var_dst_nf,
edge_frame=var_ef,
out=var_mf,
out_size=n_message,
edge_map=edge_map)
else:
# generate UDF send schedule
var_mf = _gen_udf_send(var_src_nf, var_dst_nf, var_ef, var_u,
var_v, var_eid, mfunc, canonical_etype=canonical_etype)
# 5. Generate reduce
if rfunc_is_list:
# UDF message + builtin reducer
spmv.gen_e2v_spmv_schedule(graph=adj,
rfunc=rfunc,
message_frame=var_mf,
out=var_out,
out_size=len(reduce_nodes),
edge_map=None, # messages are stored compactly
out_map=out_map)
return var_out
else:
# gen degree bucketing schedule for UDF recv
mid = utils.toindex(slice(0, len(var_v.data)), var_v.data.dtype)
db.gen_degree_bucketing_schedule(rfunc, mid, var_v.data,
reduce_nodes, var_dst_nf, var_mf,
var_out, ntype=canonical_etype[-1])
return var_out
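# Illustrative sketch (not part of the original module, names are hypothetical):
# the dispatch performed by _gen_send_reduce, reduced to a pure function of
# "is the standardized message/reduce function a list of builtins or a UDF?".
# The returned labels are informal names for the code paths above.
def _example_send_reduce_path(mfunc_is_list, rfunc_is_list):
    if mfunc_is_list and rfunc_is_list:
        return 'fused-kernel'                # builtin msg + builtin reduce (v2v spmv)
    if mfunc_is_list:
        return 'builtin-msg+udf-reduce'      # v2e spmv, then degree bucketing
    if rfunc_is_list:
        return 'udf-msg+builtin-reduce'      # explicit messages, then e2v spmv
    return 'udf-msg+udf-reduce'              # explicit messages + degree bucketing

assert _example_send_reduce_path(True, True) == 'fused-kernel'
assert _example_send_reduce_path(False, False) == 'udf-msg+udf-reduce'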
def _gen_udf_send(var_src_nf, var_dst_nf, var_ef, u, v, eid, mfunc,
canonical_etype=(None, None, None)):
"""Internal function to generate send schedule for UDF message function."""
fdsrc = ir.READ_ROW(var_src_nf, u)
fddst = ir.READ_ROW(var_dst_nf, v)
fdedge = ir.READ_ROW(var_ef, eid)
def _mfunc_wrapper(src_data, edge_data, dst_data):
ebatch = EdgeBatch((u.data, v.data, eid.data),
src_data, edge_data, dst_data,
canonical_etype=canonical_etype)
return mfunc(ebatch)
_mfunc_wrapper = var.FUNC(_mfunc_wrapper)
msg = ir.EDGE_UDF(_mfunc_wrapper, fdsrc, fdedge, fddst)
return msg
def _gen_send(graph, u, v, eid, mfunc, var_src_nf, var_dst_nf, var_ef, block_id=None):
"""Internal function to generate send schedule"""
mfunc = _standardize_func_usage(mfunc, 'message')
mfunc_is_list = utils.is_iterable(mfunc)
# vars
var_u = var.IDX(u)
var_v = var.IDX(v)
var_eid = var.IDX(eid)
if mfunc_is_list:
if not hasattr(graph, 'num_edges'):
# XXX(minjie): a temporary hack to detect Nodeflow object
res = spmv.build_gidx_and_mapping_block(graph, block_id)
elif eid.is_slice(0, graph.num_edges()):
# full graph case
res = spmv.build_gidx_and_mapping_graph(graph)
else:
res = spmv.build_gidx_and_mapping_uv(
(u, v, eid), graph.num_src(), graph.num_dst())
adj, edge_map, _ = res
# create a tmp message frame
tmp_mfr = FrameRef(frame_like(var_ef.data._frame, len(eid)))
var_out = var.FEAT_DICT(data=tmp_mfr)
spmv.gen_v2e_spmv_schedule(graph=adj,
mfunc=mfunc,
src_frame=var_src_nf,
dst_frame=var_dst_nf,
edge_frame=var_ef,
out=var_out,
out_size=len(eid),
edge_map=edge_map)
else:
# UDF send
var_out = _gen_udf_send(var_src_nf, var_dst_nf, var_ef, var_u,
var_v, var_eid, mfunc,
canonical_etype=graph.canonical_etype)
return var_out
def _build_idx_map(idx, nbits):
"""Build a map from the input ids to continuous ids that starts from zero.
And the number of bits data type of each integer in the mapping uses will
be nbits
Examples
--------
>>> x = [1, 5, 3, 6]
>>> o2n = map_to_continuous(x)
>>> o2n
[n/a, 0, n/a, 2, n/a, 1, 3]
"n/a" will be filled with 0
Parameters
----------
x : Index
The input ids, assumed to be unique.
nbits: int
Number of bits each integer in the mapping should use, can be 32 or 64
Returns
-------
old_to_new : CtxCachedObject
The mapping from old id to new id. It is a vector of length MAX(x).
One can use advanced indexing to convert an old id tensor to a
new id tensor: new_id = old_to_new[old_id]
"""
x = idx.tousertensor()
map_len = int(F.asnumpy(F.max(x, dim=0))) + 1
old_to_new = F.full_1d(map_len, -1, dtype=F.int64, ctx=F.cpu())
# Use out-of-place update for TensorFlow compatibility
old_to_new = F.scatter_row(old_to_new, x, F.arange(0, len(x)))
old_to_new = utils.to_nbits_int(old_to_new, nbits)
old_to_new = F.zerocopy_to_dgl_ndarray(old_to_new)
return utils.CtxCachedObject(lambda ctx: nd.array(old_to_new, ctx=ctx))
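# Illustrative sketch (not part of the original module, names are hypothetical):
# the old-to-new id vector materialized by _build_idx_map, written with plain
# numpy and ignoring the nbits/context caching details.
import numpy as np

def _example_build_idx_map(x):
    x = np.asarray(x)
    old_to_new = np.full(int(x.max()) + 1, -1, dtype=np.int64)
    old_to_new[x] = np.arange(len(x))        # positions absent from x stay -1
    return old_to_new

assert _example_build_idx_map([1, 5, 3, 6]).tolist() == [-1, 0, -1, 2, -1, 1, 3]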
_init_api("dgl._deprecate.runtime.scheduler")
"""Module for SPMV rules."""
from __future__ import absolute_import
from functools import partial
from ...base import DGLError
from ... import backend as F
from ... import utils
from ... import ndarray as nd
from ...heterograph_index import create_unitgraph_from_coo
from . import ir
from .ir import var
def gen_v2v_spmv_schedule(graph, mfunc, rfunc, src_frame, dst_frame,
edge_frame, out, out_size, src_map=None,
dst_map=None, edge_map=None, out_map=None):
"""Generate v2v spmv schedule.
Parameters
----------
graph : utils.CtxCachedObject
Function that generates immutable graph index on given context
mfunc : list of builtin message func
Builtin message function list
rfunc : list of builtin reduce func
Builtin reduce function list
src_frame : var.Var
Input source node features
dst_frame : var.Var
Input destination node features
edge_frame : var.Var
Input edge features
out : var.Var
Output node features
out_size : int
Number of output nodes
src_map : utils.CtxCachedObject
Function that generates source node id mapping array on given context
dst_map : utils.CtxCachedObject
Function that generates destination node id mapping array on given
context
edge_map : utils.CtxCachedObject
Function that generates edge id mapping array on given context
out_map : utils.CtxCachedObject
Function that generates output id mapping array on given context
"""
fld2mfunc = {fn.out_field: fn for fn in mfunc}
for rfn in rfunc:
mfld = rfn.msg_field
if mfld not in fld2mfunc:
raise DGLError('Reduce function requires message field "%s",'
' but no message function generates it.' % mfld)
mfn = fld2mfunc[mfld]
ftdst = mfn._invoke(graph, src_frame, dst_frame, edge_frame, out_size,
src_map, dst_map, edge_map, out_map,
reducer=rfn.name)
ir.WRITE_COL_(out, var.STR(rfn.out_field), ftdst)
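# Illustrative sketch (not part of the original module): how the reducer list is
# matched against the message list by field name, mirroring the fld2mfunc lookup
# above. The tuples are hypothetical stand-ins for builtin function objects that
# expose ``out_field`` / ``msg_field``.
def _example_pair_builtins(mfuncs, rfuncs):
    """mfuncs: [(name, out_field)]; rfuncs: [(name, msg_field, out_field)]."""
    fld2mfunc = {out_field: name for name, out_field in mfuncs}
    pairs = []
    for rname, msg_field, out_field in rfuncs:
        if msg_field not in fld2mfunc:
            raise KeyError('no message function generates field %r' % msg_field)
        pairs.append((fld2mfunc[msg_field], rname, out_field))
    return pairs

# Example: a copy-style message writing field 'm', consumed by two reducers.
_pairs = _example_pair_builtins([('copy_u', 'm')],
                                [('sum', 'm', 'h_new'), ('max', 'm', 'h_max')])
assert _pairs == [('copy_u', 'sum', 'h_new'), ('copy_u', 'max', 'h_max')]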
def gen_v2e_spmv_schedule(graph, mfunc, src_frame, dst_frame, edge_frame, out,
out_size, src_map=None, dst_map=None, edge_map=None,
out_map=None):
"""Generate v2e SPMV schedule
Parameters
----------
graph : utils.CtxCachedObject
Function that generates immutable graph index on given context
mfunc : list of builtin message func
Builtin message function list
src_frame : var.Var
Input source node features
dst_frame : var.Var
Input destination node features
edge_frame : var.Var
Input edge features
out : var.Var
Output message features
out_size : int
Number of output messages (one per edge)
src_map : utils.CtxCachedObject
Function that generates source node id mapping array on given context
dst_map : utils.CtxCachedObject
Function that generates destination node id mapping array on given
context
edge_map : utils.CtxCachedObject
Function that generates edge id mapping array on given context
out_map : utils.CtxCachedObject
Function that generates output id mapping array on given context
"""
for mfn in mfunc:
fmsg = mfn._invoke(graph, src_frame, dst_frame, edge_frame, out_size,
src_map, dst_map, edge_map, out_map=out_map,
reducer="none")
ir.WRITE_COL_(out, var.STR(mfn.out_field), fmsg)
def gen_e2v_spmv_schedule(graph, rfunc, message_frame, out, out_size,
edge_map=None, out_map=None):
"""Generate e2v SPMV schedule.
Parameters
----------
graph : utils.CtxCachedObject
Function that generates immutable graph index on given context
rfunc : list of builtin reduce func
Builtin reduce function list
message_frame : var.Var
Message features
out : var.Var
Output node features
out_size : int
Number of output nodes
edge_map : utils.CtxCachedObject
Function that generates edge id mapping array on given context
out_map : utils.CtxCachedObject
Function that generates output id mapping array on given context
"""
for rfn in rfunc:
ftdst = rfn._invoke(graph, message_frame, out_size, edge_map=edge_map,
out_map=out_map)
ir.WRITE_COL_(out, var.STR(rfn.out_field), ftdst)
def build_gidx_and_mapping_graph(graph):
"""Build immutable graph index of the whole graph.
Parameters
----------
graph : GraphAdapter
Graph
Returns
-------
graph : utils.CtxCachedObject
Function that generates an immutable graph index on a given context
edge_map : utils.CtxCachedObject
Function that generates forward and backward edge mapping on a given
context
nbits : int
Number of bits needed to represent the graph
"""
return graph.get_immutable_gidx, None, graph.bits_needed()
def build_gidx_and_mapping_uv(edge_tuples, num_src, num_dst):
"""Build immutable graph index and mapping using the given (u, v) edges
The matrix is of shape (num_src, num_dst).
Parameters
---------
edge_tuples : tuple of three utils.Index
A tuple of (u, v, eid)
num_src : int
Number of source nodes.
num_dst : int
Number of destination nodes.
Returns
-------
graph : utils.CtxCachedObject
Function that generates an immutable graph index on a given context
edge_map : utils.CtxCachedObject
Function that generates forward and backward edge mapping on a given
context
nbits : int
Number of bits needed to represent the graph
"""
u, v, eid = edge_tuples
gidx = create_unitgraph_from_coo(2, num_src, num_dst,
u.tousertensor(), v.tousertensor(), ['coo', 'csr', 'csc'])
forward, backward = gidx.get_csr_shuffle_order(0)
eid = eid.tousertensor()
nbits = gidx.bits_needed(0)
forward_map = utils.to_nbits_int(F.gather_row(eid, forward.tousertensor()), nbits)
backward_map = utils.to_nbits_int(F.gather_row(eid, backward.tousertensor()), nbits)
forward_map = F.zerocopy_to_dgl_ndarray(forward_map)
backward_map = F.zerocopy_to_dgl_ndarray(backward_map)
edge_map = utils.CtxCachedObject(
lambda ctx: (nd.array(forward_map, ctx=ctx),
nd.array(backward_map, ctx=ctx)))
return partial(gidx.get_unitgraph, 0), edge_map, nbits
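# Illustrative sketch (not part of the original module, names are hypothetical):
# why an edge mapping is needed at all. Converting COO edges to a CSR-like layout
# permutes the edges, so kernels need, for each position in the new layout, the
# original edge id in order to gather per-edge data. Sorting by destination
# (then source) stands in for the real shuffle order from get_csr_shuffle_order.
import numpy as np

def _example_csr_edge_map(u, v, eid):
    u, v, eid = map(np.asarray, (u, v, eid))
    perm = np.lexsort((u, v))      # group edges by destination
    forward_map = eid[perm]        # new position -> original edge id
    return perm, forward_map

# Example: edges (0->1, 2->1, 1->0) with ids [10, 11, 12]; the two edges entering
# node 1 end up adjacent and the map records their original ids.
assert _example_csr_edge_map([0, 2, 1], [1, 1, 0], [10, 11, 12])[1].tolist() == [12, 10, 11]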
def build_gidx_and_mapping_block(graph, block_id, edge_tuples=None):
"""Build immutable graph index and mapping for node flow
Parameters
----------
graph : NodeFlow
The NodeFlow
block_id : int
The block id
edge_tuples : tuple of three utils.Index
A tuple of (u, v, eid)
Returns
-------
graph : utils.CtxCachedObject
Function that generates an immutable graph index on a given context
edge_map : utils.CtxCachedObject
Function that generates forward and backward edge mapping on a given
context
nbits : int
Number of bits needed to represent the graph
"""
if edge_tuples is None:
u, v, eid = graph.block_edges(block_id, remap_local=True)
u = utils.toindex(u)
v = utils.toindex(v)
eid = utils.toindex(eid)
else:
u, v, eid = edge_tuples
num_src, num_dst = graph.layer_size(block_id), graph.layer_size(block_id + 1)
gidx, edge_map, nbits = build_gidx_and_mapping_uv((u, v, eid), num_src, num_dst)
return gidx, edge_map, nbits
"""User-defined function related data structures."""
from __future__ import absolute_import
class EdgeBatch(object):
"""The class that can represent a batch of edges.
Parameters
----------
edges : tuple of utils.Index
The edge tuple (u, v, eid). eid can be ALL
src_data : dict
The src node features, in the form of ``dict``
with ``str`` keys and ``tensor`` values
edge_data : dict
The edge features, in the form of ``dict`` with
``str`` keys and ``tensor`` values
dst_data : dict of tensors
The dst node features, in the form of ``dict``
with ``str`` keys and ``tensor`` values
canonical_etype : tuple of (str, str, str), optional
Canonical edge type of the edge batch, if UDF is
running on a heterograph.
"""
def __init__(self, edges, src_data, edge_data, dst_data,
canonical_etype=(None, None, None)):
self._edges = edges
self._src_data = src_data
self._edge_data = edge_data
self._dst_data = dst_data
self._canonical_etype = canonical_etype
@property
def src(self):
"""Return the feature data of the source nodes.
Returns
-------
dict with str keys and tensor values
Features of the source nodes.
"""
return self._src_data
@property
def dst(self):
"""Return the feature data of the destination nodes.
Returns
-------
dict with str keys and tensor values
Features of the destination nodes.
"""
return self._dst_data
@property
def data(self):
"""Return the edge feature data.
Returns
-------
dict with str keys and tensor values
Features of the edges.
"""
return self._edge_data
def edges(self):
"""Return the edges contained in this batch.
Returns
-------
tuple of three tensors
The edge tuple :math:`(src, dst, eid)`. :math:`src[i],
dst[i], eid[i]` separately specifies the source node,
destination node and the edge id for the ith edge
in the batch.
"""
u, v, eid = self._edges
return (u.tousertensor(), v.tousertensor(), eid.tousertensor())
def batch_size(self):
"""Return the number of edges in this edge batch.
Returns
-------
int
"""
return len(self._edges[0])
def __len__(self):
"""Return the number of edges in this edge batch.
Returns
-------
int
"""
return self.batch_size()
@property
def canonical_etype(self):
"""Return the canonical edge type (i.e. triplet of source, edge, and
destination node type) for this edge batch, if available."""
return self._canonical_etype
class NodeBatch(object):
"""The class that can represent a batch of nodes.
Parameters
----------
nodes : utils.Index
The node ids.
data : dict
The node features, in the form of ``dict``
with ``str`` keys and ``tensor`` values
msgs : dict, optional
The messages, in the form of ``dict``
with ``str`` keys and ``tensor`` values
ntype : str, optional
The node type of this node batch, if running
on a heterograph.
"""
def __init__(self, nodes, data, msgs=None, ntype=None):
self._nodes = nodes
self._data = data
self._msgs = msgs
self._ntype = ntype
@property
def data(self):
"""Return the node feature data.
Returns
-------
dict with str keys and tensor values
Features of the nodes.
"""
return self._data
@property
def mailbox(self):
"""Return the received messages.
If no messages received, a ``None`` will be returned.
Returns
-------
dict or None
The messages nodes received. If dict, the keys are
``str`` and the values are ``tensor``.
"""
return self._msgs
def nodes(self):
"""Return the nodes contained in this batch.
Returns
-------
tensor
The nodes.
"""
return self._nodes.tousertensor()
def batch_size(self):
"""Return the number of nodes in this batch.
Returns
-------
int
"""
return len(self._nodes)
def __len__(self):
"""Return the number of nodes in this node batch.
Returns
-------
int
"""
return self.batch_size()
@property
def ntype(self):
"""Return the node type of this node batch, if available."""
return self._ntype
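# Illustrative sketch (not part of the original module, names are hypothetical):
# what message and reduce UDFs written against EdgeBatch/NodeBatch typically look
# like. Lightweight stand-in batches built from plain dicts of numpy arrays are
# used so the snippet runs without a graph; real UDFs receive the classes above.
import numpy as np

class _FakeEdgeBatch(object):
    def __init__(self, src, data, dst):
        self.src, self.data, self.dst = src, data, dst

class _FakeNodeBatch(object):
    def __init__(self, data, mailbox):
        self.data, self.mailbox = data, mailbox

def _example_message_udf(edges):
    # message = source feature scaled by a per-edge weight
    return {'m': edges.src['h'] * edges.data['w'][:, None]}

def _example_reduce_udf(nodes):
    # reduce = sum over the mailbox's per-node message dimension
    return {'h_new': nodes.mailbox['m'].sum(axis=1)}

_eb = _FakeEdgeBatch({'h': np.ones((3, 2))}, {'w': np.array([1.0, 2.0, 3.0])}, {})
_msg = _example_message_udf(_eb)['m']                   # shape (3, 2)
_nb = _FakeNodeBatch({}, {'m': _msg.reshape(1, 3, 2)})  # one receiver of degree 3
assert _example_reduce_udf(_nb)['h_new'].tolist() == [[6.0, 6.0]]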
"""Views of DGLGraph."""
from __future__ import absolute_import
from collections import namedtuple
from collections.abc import MutableMapping
import numpy as np
from ..base import ALL, is_all, DGLError
from .. import backend as F
NodeSpace = namedtuple('NodeSpace', ['data'])
EdgeSpace = namedtuple('EdgeSpace', ['data'])
class NodeView(object):
"""A NodeView class to act as G.nodes for a DGLGraph.
Can be used to get a list of current nodes and get and set node data.
See Also
--------
dgl.DGLGraph.nodes
"""
__slots__ = ['_graph']
def __init__(self, graph):
self._graph = graph
def __len__(self):
return self._graph.number_of_nodes()
def __getitem__(self, nodes):
if isinstance(nodes, slice):
# slice
if not (nodes.start is None and nodes.stop is None
and nodes.step is None):
raise DGLError('Currently only full slice ":" is supported')
return NodeSpace(data=NodeDataView(self._graph, ALL))
else:
return NodeSpace(data=NodeDataView(self._graph, nodes))
def __call__(self):
"""Return the nodes."""
return F.copy_to(F.arange(0, len(self)), F.cpu())
class NodeDataView(MutableMapping):
"""The data view class when G.nodes[...].data is called.
See Also
--------
dgl.DGLGraph.nodes
"""
__slots__ = ['_graph', '_nodes']
def __init__(self, graph, nodes):
self._graph = graph
self._nodes = nodes
def __getitem__(self, key):
return self._graph.get_n_repr(self._nodes)[key]
def __setitem__(self, key, val):
if isinstance(val, np.ndarray):
val = F.zerocopy_from_numpy(val)
self._graph.set_n_repr({key : val}, self._nodes)
def __delitem__(self, key):
if not is_all(self._nodes):
raise DGLError('Deleting feature data is not supported on a subset'
' of nodes. Please use `del G.ndata[key]` instead.')
self._graph.pop_n_repr(key)
def __len__(self):
return len(self._graph._node_frame)
def __iter__(self):
return iter(self._graph._node_frame)
def __repr__(self):
data = self._graph.get_n_repr(self._nodes)
return repr({key : data[key] for key in self._graph._node_frame})
class EdgeView(object):
"""A EdgeView class to act as G.edges for a DGLGraph.
Can be used to get a list of current edges and get and set edge data.
See Also
--------
dgl.DGLGraph.edges
"""
__slots__ = ['_graph']
def __init__(self, graph):
self._graph = graph
def __len__(self):
return self._graph.number_of_edges()
def __getitem__(self, edges):
if isinstance(edges, slice):
# slice
if not (edges.start is None and edges.stop is None
and edges.step is None):
raise DGLError('Currently only full slice ":" is supported')
return EdgeSpace(data=EdgeDataView(self._graph, ALL))
else:
return EdgeSpace(data=EdgeDataView(self._graph, edges))
def __call__(self, *args, **kwargs):
"""Return all the edges."""
return self._graph.all_edges(*args, **kwargs)
class EdgeDataView(MutableMapping):
"""The data view class when G.edges[...].data is called.
See Also
--------
dgl.DGLGraph.edges
"""
__slots__ = ['_graph', '_edges']
def __init__(self, graph, edges):
self._graph = graph
self._edges = edges
def __getitem__(self, key):
return self._graph.get_e_repr(self._edges)[key]
def __setitem__(self, key, val):
if isinstance(val, np.ndarray):
val = F.zerocopy_from_numpy(val)
self._graph.set_e_repr({key : val}, self._edges)
def __delitem__(self, key):
if not is_all(self._edges):
raise DGLError('Deleting feature data is not supported on a subset'
' of edges. Please use `del G.edata[key]` instead.')
self._graph.pop_e_repr(key)
def __len__(self):
return len(self._graph._edge_frame)
def __iter__(self):
return iter(self._graph._edge_frame)
def __repr__(self):
data = self._graph.get_e_repr(self._edges)
return repr({key : data[key] for key in self._graph._edge_frame})
class LayerView(object):
"""A LayerView class to act as nflow.layers for a NodeFlow.
Can be used to get a list of current nodes and get and set node data.
"""
__slots__ = ['_graph']
def __init__(self, graph):
self._graph = graph
def __len__(self):
return self._graph.num_layers()
def __getitem__(self, layer):
if not isinstance(layer, int):
raise DGLError('Currently we only support the view of one layer')
return NodeSpace(data=LayerDataView(self._graph, layer))
def __call__(self):
"""Return the nodes."""
return F.arange(0, len(self))
class LayerDataView(MutableMapping):
"""The data view class when G.layers[...].data is called.
"""
__slots__ = ['_graph', '_layer']
def __init__(self, graph, layer):
self._graph = graph
self._layer = layer
def __getitem__(self, key):
return self._graph._node_frames[self._layer][key]
def __setitem__(self, key, val):
self._graph._node_frames[self._layer][key] = val
def __delitem__(self, key):
del self._graph._node_frames[self._layer][key]
def __len__(self):
return len(self._graph._node_frames[self._layer])
def __iter__(self):
return iter(self._graph._node_frames[self._layer])
def __repr__(self):
data = self._graph._node_frames[self._layer]
return repr({key : data[key] for key in data})
class BlockView(object):
"""A BlockView class to act as nflow.blocks for a NodeFlow.
Can be used to get a list of current edges and get and set edge data.
"""
__slots__ = ['_graph']
def __init__(self, graph):
self._graph = graph
def __len__(self):
return self._graph.num_blocks
def __getitem__(self, flow):
if not isinstance(flow, int):
raise DGLError('Currently we only support the view of one flow')
return EdgeSpace(data=BlockDataView(self._graph, flow))
def __call__(self, *args, **kwargs):
"""Return all the edges."""
return self._graph.all_edges(*args, **kwargs)
class BlockDataView(MutableMapping):
"""The data view class when G.blocks[...].data is called.
"""
__slots__ = ['_graph', '_flow']
def __init__(self, graph, flow):
self._graph = graph
self._flow = flow
def __getitem__(self, key):
return self._graph._edge_frames[self._flow][key]
def __setitem__(self, key, val):
self._graph._edge_frames[self._flow][key] = val
def __delitem__(self, key):
del self._graph._edge_frames[self._flow][key]
def __len__(self):
return len(self._graph._edge_frames[self._flow])
def __iter__(self):
return iter(self._graph._edge_frames[self._flow])
def __repr__(self):
data = self._graph._edge_frames[self._flow]
return repr({key : data[key] for key in data})
......@@ -9,7 +9,6 @@ import mxnet.ndarray as nd
import numpy as np
from ... import ndarray as dglnd
from ..._deprecate import kernel as K
from ...function.base import TargetCode
from ...utils import version
......@@ -525,300 +524,6 @@ def zerocopy_from_dgl_ndarray(arr):
return nd.from_dlpack(arr.to_dlpack())
class BinaryReduce(mx.autograd.Function):
def __init__(
self,
reducer,
binary_op,
graph,
lhs,
rhs,
out_size,
lhs_map,
rhs_map,
out_map,
):
super(BinaryReduce, self).__init__()
self.reducer = reducer
self.binary_op = binary_op
self.graph = graph
self.lhs = lhs
self.rhs = rhs
self.out_size = out_size
self.lhs_map = lhs_map
self.rhs_map = rhs_map
self.out_map = out_map
def forward(self, lhs_data, rhs_data):
lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data)
rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data)
feat_shape = K.infer_binary_feature_shape(
self.binary_op, lhs_data_nd, rhs_data_nd
)
out_shape = feat_shape
if self.binary_op == "dot":
out_shape = feat_shape[:-1]
out_data = nd.empty(
(self.out_size,) + out_shape,
ctx=lhs_data.context,
dtype=lhs_data.dtype,
)
out_data_nd = zerocopy_to_dgl_ndarray_for_write(out_data)
K.binary_op_reduce(
self.reducer if self.reducer != "mean" else "sum",
self.binary_op,
self.graph,
self.lhs,
self.rhs,
lhs_data_nd,
rhs_data_nd,
out_data_nd,
self.lhs_map[0],
self.rhs_map[0],
self.out_map[0],
)
# normalize if mean reducer
# NOTE(zihao): this is a temporary hack and we should have a better solution in the future.
if self.reducer == "mean":
degs = nd.empty(
(out_data.shape[0],), ctx=out_data.context, dtype=out_data.dtype
)
degs_nd = zerocopy_to_dgl_ndarray(degs)
if self.lhs != TargetCode.DST:
target = self.lhs
n = lhs_data.shape[0]
in_map = self.lhs_map[0]
else:
target = self.rhs
n = rhs_data.shape[0]
in_map = self.rhs_map[0]
in_ones = nd.ones((n,), ctx=lhs_data.context, dtype=lhs_data.dtype)
in_ones_nd = zerocopy_to_dgl_ndarray(in_ones)
K.copy_reduce(
"sum",
self.graph,
target,
in_ones_nd,
degs_nd,
in_map,
self.out_map[0],
)
# reshape
degs = degs.reshape(
(out_data.shape[0],) + (1,) * (out_data.ndim - 1)
).clip(1, float("inf"))
out_data = out_data / degs
else:
degs = None
self.save_for_backward(
lhs_data_nd, rhs_data_nd, out_data_nd, feat_shape, degs
)
return out_data
def backward(self, grad_out):
(
lhs_data_nd,
rhs_data_nd,
out_data_nd,
feat_shape,
degs,
) = self.saved_tensors
if self.reducer == "mean":
grad_out = grad_out / degs
grad_out_nd = zerocopy_to_dgl_ndarray(grad_out)
grad_lhs = nd.empty(
(lhs_data_nd.shape[0],) + feat_shape,
ctx=grad_out.context,
dtype=grad_out.dtype,
)
K.backward_lhs_binary_op_reduce(
self.reducer if self.reducer != "mean" else "sum",
self.binary_op,
self.graph,
self.lhs,
self.rhs,
lhs_data_nd,
rhs_data_nd,
out_data_nd,
grad_out_nd,
zerocopy_to_dgl_ndarray_for_write(grad_lhs),
self.lhs_map[1],
self.rhs_map[1],
self.out_map[1],
)
grad_lhs = _reduce_grad(grad_lhs, lhs_data_nd.shape)
grad_rhs = nd.empty(
(rhs_data_nd.shape[0],) + feat_shape,
ctx=grad_out.context,
dtype=grad_out.dtype,
)
K.backward_rhs_binary_op_reduce(
self.reducer if self.reducer != "mean" else "sum",
self.binary_op,
self.graph,
self.lhs,
self.rhs,
lhs_data_nd,
rhs_data_nd,
out_data_nd,
grad_out_nd,
zerocopy_to_dgl_ndarray_for_write(grad_rhs),
self.lhs_map[1],
self.rhs_map[1],
self.out_map[1],
)
grad_rhs = _reduce_grad(grad_rhs, rhs_data_nd.shape)
# clear saved tensors explicitly
self.saved_tensors = None
return grad_lhs, grad_rhs
def binary_reduce(
reducer,
binary_op,
graph,
lhs,
rhs,
lhs_data,
rhs_data,
out_size,
lhs_map=(None, None),
rhs_map=(None, None),
out_map=(None, None),
):
func = BinaryReduce(
reducer, binary_op, graph, lhs, rhs, out_size, lhs_map, rhs_map, out_map
)
return func(lhs_data, rhs_data)
class CopyReduce(mx.autograd.Function):
def __init__(self, reducer, graph, target, out_size, in_map, out_map):
super(CopyReduce, self).__init__()
self.reducer = reducer
self.graph = graph
self.target = target
self.out_size = out_size
self.in_map = in_map
self.out_map = out_map
def forward(self, in_data):
feat_shape = in_data.shape[1:]
out_data = nd.empty(
(self.out_size,) + feat_shape,
ctx=in_data.context,
dtype=in_data.dtype,
)
in_data_nd = zerocopy_to_dgl_ndarray(in_data)
out_data_nd = zerocopy_to_dgl_ndarray_for_write(out_data)
K.copy_reduce(
self.reducer if self.reducer != "mean" else "sum",
self.graph,
self.target,
in_data_nd,
out_data_nd,
self.in_map[0],
self.out_map[0],
)
# normalize if mean reducer
# NOTE(zihao): this is a temporary hack and we should have a better solution in the future.
if self.reducer == "mean":
in_ones = nd.ones(
(in_data.shape[0],), ctx=in_data.context, dtype=in_data.dtype
)
degs = nd.empty(
(out_data.shape[0],), ctx=out_data.context, dtype=out_data.dtype
)
in_ones_nd = zerocopy_to_dgl_ndarray(in_ones)
degs_nd = zerocopy_to_dgl_ndarray(degs)
K.copy_reduce(
"sum",
self.graph,
self.target,
in_ones_nd,
degs_nd,
self.in_map[0],
self.out_map[0],
)
# reshape
degs = degs.reshape(
(out_data.shape[0],) + (1,) * (out_data.ndim - 1)
).clip(1, float("inf"))
out_data = out_data / degs
else:
degs = None
self.save_for_backward(in_data_nd, out_data_nd, degs)
return out_data
def backward(self, grad_out):
in_data_nd, out_data_nd, degs = self.saved_tensors
grad_in = nd.empty(
in_data_nd.shape, ctx=grad_out.context, dtype=grad_out.dtype
)
if self.reducer == "mean":
grad_out = grad_out / degs
grad_out_nd = zerocopy_to_dgl_ndarray(grad_out)
K.backward_copy_reduce(
self.reducer if self.reducer != "mean" else "sum",
self.graph,
self.target,
in_data_nd,
out_data_nd,
grad_out_nd,
zerocopy_to_dgl_ndarray_for_write(grad_in),
self.in_map[1],
self.out_map[1],
)
# clear saved tensors explicitly
self.saved_tensors = None
return grad_in
def copy_reduce(
reducer,
graph,
target,
in_data,
out_size,
in_map=(None, None),
out_map=(None, None),
):
func = CopyReduce(reducer, graph, target, out_size, in_map, out_map)
return func(in_data)
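# Illustrative sketch (not part of the original module, names are hypothetical):
# the "mean = sum / degree" normalization used above for the mean reducer,
# written with plain numpy. A second copy-reduce over an all-ones input computes
# the degrees, which are clipped to at least 1 before dividing.
import numpy as np

def _example_mean_reduce(dst, messages, num_nodes):
    dst = np.asarray(dst)
    out = np.zeros((num_nodes,) + messages.shape[1:], dtype=messages.dtype)
    np.add.at(out, dst, messages)                          # sum-reduce by receiver
    degs = np.bincount(dst, minlength=num_nodes).astype(messages.dtype)
    degs = np.clip(degs, 1, None).reshape((num_nodes,) + (1,) * (messages.ndim - 1))
    return out / degs                                      # mean = sum / max(deg, 1)

# Example: two messages land on node 0, none on node 1.
# _example_mean_reduce([0, 0], np.array([[2.0], [4.0]]), 2) -> [[3.0], [0.0]]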
def _reduce_grad(grad, shape):
"""Reduce gradient on the broadcast dimension
If there is broadcasting in the forward pass, gradients need to be reduced on
the broadcast dimension. This function checks the input tensor shape and
gradient shape and performs the reduction.
Parameters
----------
grad: Tensor
Gradient tensor
shape: tuple
Shape of input tensor
Returns
-------
Tensor
"""
grad_shape = grad.shape[1:]
in_shape = shape[1:]
if in_shape == grad_shape:
# no need to reduce
return grad
num_to_squeeze = len(grad_shape) - len(in_shape)
# pad in_shape
in_shape = (1,) * num_to_squeeze + in_shape
reduce_idx = np.nonzero(np.asarray(grad_shape) - np.asarray(in_shape))[0]
reduce_idx += 1 # skip batch dim
grad = grad.sum(axis=tuple(reduce_idx), keepdims=True)
return grad.reshape(shape)
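# Illustrative sketch (not part of the original module, names are hypothetical):
# the broadcast-gradient rule implemented by _reduce_grad, checked eagerly with
# numpy. If an input of shape (n, 1, d) was broadcast against (n, k, d) in the
# forward pass, its gradient of shape (n, k, d) must be summed back over the
# broadcast axis to shape (n, 1, d).
import numpy as np

def _example_reduce_grad(grad, in_shape):
    grad_shape = grad.shape[1:]
    feat_shape = tuple(in_shape[1:])
    if feat_shape == grad_shape:
        return grad
    # pad the input feature shape on the left so both shapes have the same rank
    padded = (1,) * (len(grad_shape) - len(feat_shape)) + feat_shape
    axes = tuple(i + 1 for i, (g, s) in enumerate(zip(grad_shape, padded)) if g != s)
    return grad.sum(axis=axes, keepdims=True).reshape(in_shape)

_g = np.ones((2, 3, 4))
assert _example_reduce_grad(_g, (2, 1, 4)).shape == (2, 1, 4)
assert _example_reduce_grad(_g, (2, 1, 4))[0, 0, 0] == 3.0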
def sync():
"""Synchronize computation.
......
......@@ -9,7 +9,6 @@ import torch as th
from torch.utils import dlpack
from ... import ndarray as nd
from ..._deprecate import kernel as K
from ...function.base import TargetCode
from ...utils import version
......@@ -471,323 +470,6 @@ def zerocopy_from_dgl_ndarray(data):
return dlpack.from_dlpack(data.to_dlpack())
class BinaryReduce(th.autograd.Function):
@staticmethod
def forward(
ctx,
reducer,
binary_op,
graph,
lhs,
rhs,
lhs_data,
rhs_data,
out_data,
out_size,
lhs_map,
rhs_map,
out_map,
):
lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data)
rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data)
feat_shape = K.infer_binary_feature_shape(
binary_op, lhs_data_nd, rhs_data_nd
)
out_shape = feat_shape
if binary_op == "dot":
out_shape = feat_shape[:-1]
out_data_nd = zerocopy_to_dgl_ndarray(out_data)
K.binary_op_reduce(
reducer if reducer != "mean" else "sum",
binary_op,
graph,
lhs,
rhs,
lhs_data_nd,
rhs_data_nd,
out_data_nd,
lhs_map[0],
rhs_map[0],
out_map[0],
)
# normalize if mean reducer
# NOTE(zihao): this is a temporary hack and we should have a better solution in the future.
if reducer == "mean":
degs = lhs_data.new_empty((out_data.shape[0],))
degs_nd = zerocopy_to_dgl_ndarray(degs)
if lhs != TargetCode.DST: # src or edge
target = lhs
n = lhs_data.shape[0]
in_map = lhs_map[0]
else: # rhs != TargetCode.DST
target = rhs
n = rhs_data.shape[0]
in_map = rhs_map[0]
in_ones = lhs_data.new_ones((n,))
in_ones_nd = zerocopy_to_dgl_ndarray(in_ones)
K.copy_reduce(
"sum", graph, target, in_ones_nd, degs_nd, in_map, out_map[0]
)
# reshape
degs = degs.reshape(
(out_data.shape[0],) + (1,) * (out_data.dim() - 1)
).clamp(min=1)
out_data = out_data / degs
else:
degs = None
# save_for_backward can only save variables
ctx.backward_cache = (
reducer,
binary_op,
graph,
lhs,
rhs,
lhs_map,
rhs_map,
out_map,
feat_shape,
degs,
)
ctx.save_for_backward(lhs_data, rhs_data, out_data)
return out_data
@staticmethod
def backward(ctx, grad_out):
(
reducer,
binary_op,
graph,
lhs,
rhs,
lhs_map,
rhs_map,
out_map,
feat_shape,
degs,
) = ctx.backward_cache
lhs_data, rhs_data, out_data = ctx.saved_tensors
lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data)
rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data)
out_data_nd = zerocopy_to_dgl_ndarray(out_data)
grad_lhs = None
grad_rhs = None
if reducer == "mean":
grad_out = grad_out / degs
grad_out_nd = zerocopy_to_dgl_ndarray(grad_out)
if ctx.needs_input_grad[5]:
grad_lhs = grad_out.new_empty((lhs_data_nd.shape[0],) + feat_shape)
K.backward_lhs_binary_op_reduce(
reducer if reducer != "mean" else "sum",
binary_op,
graph,
lhs,
rhs,
lhs_data_nd,
rhs_data_nd,
out_data_nd,
grad_out_nd,
zerocopy_to_dgl_ndarray(grad_lhs),
lhs_map[1],
rhs_map[1],
out_map[1],
)
grad_lhs = _reduce_grad(grad_lhs, lhs_data_nd.shape)
if ctx.needs_input_grad[6]:
grad_rhs = grad_out.new_empty((rhs_data_nd.shape[0],) + feat_shape)
K.backward_rhs_binary_op_reduce(
reducer if reducer != "mean" else "sum",
binary_op,
graph,
lhs,
rhs,
lhs_data_nd,
rhs_data_nd,
out_data_nd,
grad_out_nd,
zerocopy_to_dgl_ndarray(grad_rhs),
lhs_map[1],
rhs_map[1],
out_map[1],
)
grad_rhs = _reduce_grad(grad_rhs, rhs_data_nd.shape)
return (
None,
None,
None,
None,
None,
grad_lhs,
grad_rhs,
None,
None,
None,
None,
None,
)
def binary_reduce(
reducer,
binary_op,
graph,
lhs,
rhs,
lhs_data,
rhs_data,
out_size,
lhs_map=(None, None),
rhs_map=(None, None),
out_map=(None, None),
):
lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data)
rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data)
feat_shape = K.infer_binary_feature_shape(
binary_op, lhs_data_nd, rhs_data_nd
)
out_shape = feat_shape
if binary_op == "dot":
out_shape = feat_shape[:-1]
out_data = lhs_data.new_empty((out_size,) + out_shape)
return BinaryReduce.apply(
reducer,
binary_op,
graph,
lhs,
rhs,
lhs_data,
rhs_data,
out_data,
out_size,
lhs_map,
rhs_map,
out_map,
)
class CopyReduce(th.autograd.Function):
@staticmethod
def forward(
ctx,
reducer,
graph,
target,
in_data,
out_data,
out_size,
in_map,
out_map,
):
in_data_nd = zerocopy_to_dgl_ndarray(in_data)
out_data_nd = zerocopy_to_dgl_ndarray(out_data)
K.copy_reduce(
reducer if reducer != "mean" else "sum",
graph,
target,
in_data_nd,
out_data_nd,
in_map[0],
out_map[0],
)
# normalize if mean reducer
# NOTE(zihao): this is a temporary hack and we should have a better solution in the future.
if reducer == "mean":
in_ones = in_data.new_ones((in_data.shape[0],))
degs = in_data.new_empty((out_data.shape[0],))
in_ones_nd = zerocopy_to_dgl_ndarray(in_ones)
degs_nd = zerocopy_to_dgl_ndarray(degs)
K.copy_reduce(
"sum", graph, target, in_ones_nd, degs_nd, in_map[0], out_map[0]
)
# reshape
degs = degs.reshape(
(out_data.shape[0],) + (1,) * (out_data.dim() - 1)
).clamp(min=1)
out_data = out_data / degs
else:
degs = None
# save_for_backward can only save variables
ctx.backward_cache = (reducer, graph, target, in_map, out_map, degs)
ctx.save_for_backward(in_data, out_data)
return out_data
@staticmethod
def backward(ctx, grad_out):
reducer, graph, target, in_map, out_map, degs = ctx.backward_cache
in_data, out_data = ctx.saved_tensors
in_data_nd = zerocopy_to_dgl_ndarray(in_data)
out_data_nd = zerocopy_to_dgl_ndarray(out_data)
grad_in = None
if reducer == "mean":
grad_out = grad_out / degs
grad_out_nd = zerocopy_to_dgl_ndarray(grad_out)
if ctx.needs_input_grad[3]:
grad_in = grad_out.new_empty(in_data_nd.shape)
K.backward_copy_reduce(
reducer if reducer != "mean" else "sum",
graph,
target,
in_data_nd,
out_data_nd,
grad_out_nd,
zerocopy_to_dgl_ndarray(grad_in),
in_map[1],
out_map[1],
)
return None, None, None, grad_in, None, None, None, None
def copy_reduce(
reducer,
graph,
target,
in_data,
out_size,
in_map=(None, None),
out_map=(None, None),
):
out_data = in_data.new_empty((out_size,) + in_data.shape[1:])
return CopyReduce.apply(
reducer, graph, target, in_data, out_data, out_size, in_map, out_map
)
def _reduce_grad(grad, shape):
"""Reduce gradient on the broadcast dimension
If there is broadcasting in the forward pass, gradients need to be reduced on
the broadcast dimension. This function checks the input tensor shape and
gradient shape and performs the reduction.
Parameters
----------
grad: Tensor
Gradient tensor
shape: tuple
Shape of input tensor
Returns
-------
Tensor
"""
grad_shape = grad.shape[1:]
in_shape = shape[1:]
if in_shape == grad_shape:
# no need to reduce
return grad
num_to_squeeze = len(grad_shape) - len(in_shape)
# pad in_shape
in_shape = (1,) * num_to_squeeze + in_shape
reduce_idx = th.nonzero(
th.tensor(grad_shape) - th.tensor(in_shape), as_tuple=False
)
reduce_idx += 1 # skip batch dim
grad = grad.sum(dim=tuple(reduce_idx), keepdim=True)
return grad.view(shape)
def sync():
# Pytorch performs computation synchronously, so no need for synchronization.
pass
......
......@@ -8,7 +8,6 @@ import numpy as np
import tensorflow as tf
from ... import ndarray as nd
from ..._deprecate import kernel as K
from ...function.base import TargetCode
from ...utils import version
......@@ -515,269 +514,6 @@ def zerocopy_from_dgl_ndarray(input):
return zerocopy_from_dlpack(input.to_dlpack())
def binary_reduce(
reducer,
binary_op,
graph,
lhs,
rhs,
lhs_data,
rhs_data,
out_size,
lhs_map=(None, None),
rhs_map=(None, None),
out_map=(None, None),
):
@tf.custom_gradient
def _lambda(lhs_data, rhs_data):
return binary_reduce_real(
reducer,
binary_op,
graph,
lhs,
rhs,
lhs_data,
rhs_data,
out_size,
lhs_map,
rhs_map,
out_map,
)
return _lambda(lhs_data, rhs_data)
def binary_reduce_real(
reducer,
binary_op,
graph,
lhs,
rhs,
lhs_data,
rhs_data,
out_size,
lhs_map,
rhs_map,
out_map,
):
with tf.device(lhs_data.device):
lhs_data_nd = zerocopy_to_dgl_ndarray(lhs_data)
rhs_data_nd = zerocopy_to_dgl_ndarray(rhs_data)
feat_shape = K.infer_binary_feature_shape(
binary_op, lhs_data_nd, rhs_data_nd
)
out_shape = feat_shape
if binary_op == "dot":
out_shape = feat_shape[:-1]
out_data = tf.zeros((out_size,) + out_shape, dtype=lhs_data.dtype)
out_data_nd = zerocopy_to_dgl_ndarray(out_data)
K.binary_op_reduce(
reducer if reducer != "mean" else "sum",
binary_op,
graph,
lhs,
rhs,
lhs_data_nd,
rhs_data_nd,
out_data_nd,
lhs_map[0],
rhs_map[0],
out_map[0],
)
# normalize if mean reducer
# NOTE(zihao): this is a temporary hack and we should have a better solution in the future.
if reducer == "mean":
degs = tf.zeros((out_data.shape[0],), dtype=lhs_data.dtype)
degs_nd = zerocopy_to_dgl_ndarray(degs)
if lhs != TargetCode.DST: # src or edge
target = lhs
n = lhs_data.shape[0]
in_map = lhs_map[0]
else: # rhs != TargetCode.DST
target = rhs
n = rhs_data.shape[0]
in_map = rhs_map[0]
in_ones = tf.ones((n,), dtype=lhs_data.dtype)
in_ones_nd = zerocopy_to_dgl_ndarray(in_ones)
K.copy_reduce(
"sum", graph, target, in_ones_nd, degs_nd, in_map, out_map[0]
)
# reshape
degs = tf.reshape(
degs, (out_data.shape[0],) + (1,) * (out_data.ndim - 1)
)
degs = tf.clip_by_value(
degs, clip_value_min=1, clip_value_max=np.inf
) # ???
out_data = out_data / degs
else:
degs = None
def grad(grad_out):
with tf.device(grad_out.device):
grad_lhs = None
grad_rhs = None
if reducer == "mean":
grad_out = grad_out / degs
grad_out_nd = zerocopy_to_dgl_ndarray(grad_out)
# compute gradient for lhs
grad_lhs = tf.zeros((lhs_data_nd.shape[0],) + feat_shape)
K.backward_lhs_binary_op_reduce(
reducer if reducer != "mean" else "sum",
binary_op,
graph,
lhs,
rhs,
lhs_data_nd,
rhs_data_nd,
out_data_nd,
grad_out_nd,
zerocopy_to_dgl_ndarray(grad_lhs),
lhs_map[1],
rhs_map[1],
out_map[1],
)
grad_lhs = _reduce_grad(grad_lhs, lhs_data_nd.shape)
# compute gradient for rhs
grad_rhs = tf.zeros((rhs_data_nd.shape[0],) + feat_shape)
K.backward_rhs_binary_op_reduce(
reducer if reducer != "mean" else "sum",
binary_op,
graph,
lhs,
rhs,
lhs_data_nd,
rhs_data_nd,
out_data_nd,
grad_out_nd,
zerocopy_to_dgl_ndarray(grad_rhs),
lhs_map[1],
rhs_map[1],
out_map[1],
)
grad_rhs = _reduce_grad(grad_rhs, rhs_data_nd.shape)
return grad_lhs, grad_rhs
return out_data, grad
def copy_reduce(
reducer,
graph,
target,
in_data,
out_size,
in_map=(None, None),
out_map=(None, None),
):
@tf.custom_gradient
def _lambda(in_data):
return copy_reduce_real(
reducer, graph, target, in_data, out_size, in_map, out_map
)
return _lambda(in_data)
def copy_reduce_real(
reducer, graph, target, in_data, out_size, in_map, out_map
):
with tf.device(in_data.device):
out_data = tf.zeros(
(out_size,) + tuple(in_data.shape[1:]), dtype=in_data.dtype
)
in_data_nd = zerocopy_to_dgl_ndarray(in_data)
out_data_nd = zerocopy_to_dgl_ndarray(out_data)
K.copy_reduce(
reducer if reducer != "mean" else "sum",
graph,
target,
in_data_nd,
out_data_nd,
in_map[0],
out_map[0],
)
# normalize if mean reducer
# NOTE(zihao): this is a temporary hack and we should have a better solution in the future.
if reducer == "mean":
in_ones = tf.ones(in_data.shape[0], dtype=in_data.dtype)
degs = tf.zeros(out_data.shape[0], dtype=in_data.dtype)
in_ones_nd = zerocopy_to_dgl_ndarray(in_ones)
degs_nd = zerocopy_to_dgl_ndarray(degs)
K.copy_reduce(
"sum", graph, target, in_ones_nd, degs_nd, in_map[0], out_map[0]
)
# reshape
degs = tf.reshape(
degs, (out_data.shape[0],) + (1,) * (out_data.ndim - 1)
)
degs = tf.clip_by_value(
degs, clip_value_min=1, clip_value_max=np.inf
) # TODO: ???
out_data = out_data / degs
else:
degs = None
def grad(grad_out):
with tf.device(grad_out.device):
if reducer == "mean":
grad_out = grad_out / degs
grad_out_nd = zerocopy_to_dgl_ndarray(grad_out)
grad_in = tf.zeros(in_data_nd.shape)
K.backward_copy_reduce(
reducer if reducer != "mean" else "sum",
graph,
target,
in_data_nd,
out_data_nd,
grad_out_nd,
zerocopy_to_dgl_ndarray(grad_in),
in_map[1],
out_map[1],
)
return grad_in
return out_data, grad
def _reduce_grad(grad, shape):
"""Reduce gradient on the broadcast dimension
If there is broadcasting in the forward pass, gradients need to be reduced on
the broadcast dimension. This function checks the input tensor shape and
gradient shape and performs the reduction.
Parameters
----------
grad: Tensor
Gradient tensor
shape: tuple
Shape of input tensor
Returns
-------
Tensor
"""
grad_shape = grad.shape[1:]
in_shape = shape[1:]
if in_shape == grad_shape:
# no need to reduce
return grad
num_to_squeeze = len(grad_shape) - len(in_shape)
# pad in_shape
in_shape = (1,) * num_to_squeeze + in_shape
reduce_idx = np.asarray(
np.nonzero(np.asarray(grad_shape) - np.asarray(in_shape))
)
reduce_idx += 1 # skip batch dim
reduce_idx_tensor = tf.constant(tuple(reduce_idx.flatten().tolist()))
grad = tf.reduce_sum(grad, axis=reduce_idx_tensor, keepdims=True)
return tf.reshape(grad, shape)
def sync():
# Assumes ``context`` here is the ``tensorflow.python.eager.context`` module
# imported in the truncated header of this file.
context.async_wait()
......
from . import sampling
from . import graph_store
from .dis_kvstore import KVClient, KVServer
from .dis_kvstore import read_ip_config
from .unified_tensor import UnifiedTensor
from __future__ import absolute_import
from . import knowledge_graph as knwlgrh
def load_data(dataset, bfs_level=3, relabel=False):
if dataset in ['aifb', 'mutag', 'bgs', 'am']:
return knwlgrh.load_entity(dataset, bfs_level, relabel)
elif dataset in ['FB15k', 'wn18', 'FB15k-237']:
return knwlgrh.load_link(dataset)
else:
raise ValueError('Unknown dataset: {}'.format(dataset))
""" Knowledge graph dataset for Relational-GCN
Code adapted from authors' implementation of Relational-GCN
https://github.com/tkipf/relational-gcn
https://github.com/MichSchli/RelationPrediction
"""
from __future__ import print_function
from __future__ import absolute_import
import numpy as np
import scipy.sparse as sp
import os, gzip
import rdflib as rdf
import pandas as pd
from collections import Counter
from dgl.data.utils import download, extract_archive, get_download_dir, _get_dgl_url
np.random.seed(123)
_downlaod_prefix = _get_dgl_url('dataset/')
class RGCNEntityDataset(object):
"""RGCN Entity Classification dataset
The dataset contains a graph depicting the connectivity of a knowledge
base. Currently, four knowledge bases from the
`RGCN paper <https://arxiv.org/pdf/1703.06103.pdf>`_ are supported: aifb,
mutag, bgs, and am.
The original knowledge base is stored as an RDF file, and this class will
download and parse the RDF file, and perform preprocessing.
An object of this class has 11 member attributes needed for entity
classification:
num_nodes: int
number of entities of knowledge base
num_rels: int
number of relations (including reverse relation) of knowledge base
num_classes: int
number of classes/labels of entities in the knowledge base
edge_src: numpy.array
source node ids of all edges
edge_dst: numpy.array
destination node ids of all edges
edge_type: numpy.array
type of all edges
edge_norm: numpy.array
normalization factor of all edges
labels: numpy.array
labels of node entities
train_idx: numpy.array
ids of entities used for training
valid_idx: numpy.array
ids of entities used for validation
test_idx: numpy.array
ids of entities used for testing
Usually, users don't need to directly use this class. Instead, DGL provides a
wrapper function to load data (see example below).
When loading data, besides specifying the dataset name, users can provide two
optional arguments:
Parameters
----------
bfs_level: int
prune out nodes that are more than ``bfs_level`` hops away from
labeled nodes, i.e., nodes that won't be touched during propagation. If set
to a number less than or equal to 0, all nodes will be retained.
relabel: bool
After pruning, whether or not to relabel all nodes with consecutive
node ids
Examples
--------
Load aifb dataset, prune out nodes that are more than 3 hops away from
labeled nodes, and relabel the remaining nodes with consecutive ids
>>> from dgl.contrib.data import load_data
>>> data = load_data(dataset='aifb', bfs_level=3, relabel=True)
"""
def __init__(self, name):
self.name = name
self.dir = get_download_dir()
tgz_path = os.path.join(self.dir, '{}.tgz'.format(self.name))
download(_downlaod_prefix + '{}.tgz'.format(self.name), tgz_path)
self.dir = os.path.join(self.dir, self.name)
extract_archive(tgz_path, self.dir)
def load(self, bfs_level=2, relabel=False):
self.num_nodes, edges, self.num_rels, self.labels, labeled_nodes_idx, self.train_idx, self.test_idx = _load_data(self.name, self.dir)
# bfs to reduce edges
if bfs_level > 0:
print("removing nodes that are more than {} hops away".format(bfs_level))
row, col, edge_type = edges.transpose()
A = sp.csr_matrix((np.ones(len(row)), (row, col)), shape=(self.num_nodes, self.num_nodes))
bfs_generator = _bfs_relational(A, labeled_nodes_idx)
lvls = list()
lvls.append(set(labeled_nodes_idx))
for _ in range(bfs_level):
lvls.append(next(bfs_generator))
to_delete = list(set(range(self.num_nodes)) - set.union(*lvls))
eid_to_delete = np.isin(row, to_delete) + np.isin(col, to_delete)
eid_to_keep = np.logical_not(eid_to_delete)
self.edge_src = row[eid_to_keep]
self.edge_dst = col[eid_to_keep]
self.edge_type = edge_type[eid_to_keep]
if relabel:
uniq_nodes, edges = np.unique((self.edge_src, self.edge_dst), return_inverse=True)
self.edge_src, self.edge_dst = np.reshape(edges, (2, -1))
node_map = np.zeros(self.num_nodes, dtype=int)
self.num_nodes = len(uniq_nodes)
node_map[uniq_nodes] = np.arange(self.num_nodes)
self.labels = self.labels[uniq_nodes]
self.train_idx = node_map[self.train_idx]
self.test_idx = node_map[self.test_idx]
print("{} nodes left".format(self.num_nodes))
else:
self.edge_src, self.edge_dst, self.edge_type = edges.transpose()
# normalize by dst degree
_, inverse_index, count = np.unique((self.edge_dst, self.edge_type), axis=1, return_inverse=True, return_counts=True)
degrees = count[inverse_index]
self.edge_norm = np.ones(len(self.edge_dst), dtype=np.float32) / degrees.astype(np.float32)
# convert to pytorch label format
self.num_classes = self.labels.shape[1]
self.labels = np.argmax(self.labels, axis=1)
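# A minimal usage sketch (not part of the original module), showing how the
# attributes produced by load() are typically consumed for entity
# classification; building the actual DGLGraph from edge_src/edge_dst/edge_type
# depends on the DGL release in use and is omitted here:
#
# >>> data = load_data(dataset='aifb', bfs_level=3, relabel=True)
# >>> src, dst, etype = data.edge_src, data.edge_dst, data.edge_type
# >>> print(data.num_nodes, data.num_rels, data.num_classes)
# >>> train_labels = data.labels[data.train_idx]   # per-node class ids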
class RGCNLinkDataset(object):
"""RGCN link prediction dataset
The dataset contains a graph depicting the connectivity of a knowledge
base. Currently, three knowledge bases from the
`RGCN paper <https://arxiv.org/pdf/1703.06103.pdf>`_ are supported:
FB15k-237, FB15k, and wn18.
The original knowledge base is stored as an RDF file; this class downloads
and parses the RDF file and performs preprocessing.
An object of this class has 5 member attributes needed for link
prediction:
num_nodes: int
number of entities of knowledge base
num_rels: int
number of relations (including reverse relation) of knowledge base
train: numpy.array
all relation triplets (src, rel, dst) for training
valid: numpy.array
all relation triplets (src, rel, dst) for validation
test: numpy.array
all relation triplets (src, rel, dst) for testing
Usually, users don't need to use this class directly. Instead, DGL provides a
wrapper function to load the data (see the example below).
Examples
--------
Load FB15k-237 dataset
>>> from dgl.contrib.data import load_data
>>> data = load_data(dataset='FB15k-237')
"""
def __init__(self, name):
self.name = name
self.dir = get_download_dir()
tgz_path = os.path.join(self.dir, '{}.tar.gz'.format(self.name))
download(_downlaod_prefix + '{}.tgz'.format(self.name), tgz_path)
self.dir = os.path.join(self.dir, self.name)
extract_archive(tgz_path, self.dir)
def load(self):
entity_path = os.path.join(self.dir, 'entities.dict')
relation_path = os.path.join(self.dir, 'relations.dict')
train_path = os.path.join(self.dir, 'train.txt')
valid_path = os.path.join(self.dir, 'valid.txt')
test_path = os.path.join(self.dir, 'test.txt')
entity_dict = _read_dictionary(entity_path)
relation_dict = _read_dictionary(relation_path)
self.train = np.asarray(_read_triplets_as_list(train_path, entity_dict, relation_dict))
self.valid = np.asarray(_read_triplets_as_list(valid_path, entity_dict, relation_dict))
self.test = np.asarray(_read_triplets_as_list(test_path, entity_dict, relation_dict))
self.num_nodes = len(entity_dict)
print("# entities: {}".format(self.num_nodes))
self.num_rels = len(relation_dict)
print("# relations: {}".format(self.num_rels))
print("# edges: {}".format(len(self.train)))
def load_entity(dataset, bfs_level, relabel):
data = RGCNEntityDataset(dataset)
data.load(bfs_level, relabel)
return data
def load_link(dataset):
data = RGCNLinkDataset(dataset)
data.load()
return data
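# A minimal usage sketch (not part of the original module) for the link
# prediction datasets; `data.train` holds (src, rel, dst) triplets encoded as
# integer IDs via entities.dict and relations.dict:
#
# >>> data = load_link('FB15k-237')
# >>> src, rel, dst = data.train[:, 0], data.train[:, 1], data.train[:, 2]
# >>> print(data.num_nodes, data.num_rels, len(data.train))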
def _sp_row_vec_from_idx_list(idx_list, dim):
"""Create sparse vector of dimensionality dim from a list of indices."""
shape = (1, dim)
data = np.ones(len(idx_list))
row_ind = np.zeros(len(idx_list))
col_ind = list(idx_list)
return sp.csr_matrix((data, (row_ind, col_ind)), shape=shape)
def _get_neighbors(adj, nodes):
"""Takes a set of nodes and a graph adjacency matrix and returns a set of neighbors."""
sp_nodes = _sp_row_vec_from_idx_list(list(nodes), adj.shape[1])
sp_neighbors = sp_nodes.dot(adj)
neighbors = set(sp.find(sp_neighbors)[1]) # convert to set of indices
return neighbors
def _bfs_relational(adj, roots):
"""
BFS for graphs with multiple edge types. Yields one set of newly
visited nodes per BFS level, starting from the given roots.
"""
visited = set()
current_lvl = set(roots)
next_lvl = set()
while current_lvl:
for v in current_lvl:
visited.add(v)
next_lvl = _get_neighbors(adj, current_lvl)
next_lvl -= visited # set difference
yield next_lvl
current_lvl = set.union(next_lvl)
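# Illustrative only: _bfs_relational yields one frontier set per call, so a
# caller can stop after a fixed number of hops, as RGCNEntityDataset.load()
# does above (A, labeled_nodes_idx and bfs_level are the names used there):
#
# >>> gen = _bfs_relational(A, labeled_nodes_idx)
# >>> lvls = [set(labeled_nodes_idx)] + [next(gen) for _ in range(bfs_level)]
# >>> keep = set.union(*lvls)   # nodes within bfs_level hops of a labeled node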
class RDFReader(object):
__graph = None
__freq = {}
def __init__(self, file):
self.__graph = rdf.Graph()
if file.endswith('nt.gz'):
with gzip.open(file, 'rb') as f:
self.__graph.parse(file=f, format='nt')
else:
self.__graph.parse(file, format=rdf.util.guess_format(file))
# See http://rdflib.readthedocs.io for the rdflib documentation
self.__freq = Counter(self.__graph.predicates())
print("Graph loaded, frequencies counted.")
def triples(self, relation=None):
for s, p, o in self.__graph.triples((None, relation, None)):
yield s, p, o
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
self.__graph.destroy("store")
self.__graph.close(True)
def subjectSet(self):
return set(self.__graph.subjects())
def objectSet(self):
return set(self.__graph.objects())
def relationList(self):
"""
Return a list of relations, ordered by descending frequency.
:return: list of relations
"""
res = list(set(self.__graph.predicates()))
res.sort(key=lambda rel: - self.freq(rel))
return res
def __len__(self):
return len(self.__graph)
def freq(self, rel):
if rel not in self.__freq:
return 0
return self.__freq[rel]
def _load_sparse_csr(filename):
loader = np.load(filename)
return sp.csr_matrix((loader['data'], loader['indices'], loader['indptr']),
shape=loader['shape'], dtype=np.float32)
def _save_sparse_csr(filename, array):
np.savez(filename, data=array.data, indices=array.indices,
indptr=array.indptr, shape=array.shape)
def _load_data(dataset_str='aifb', dataset_path=None):
"""
:param dataset_str: name of the dataset ('aifb', 'mutag', 'bgs' or 'am')
:param dataset_path: directory that contains the downloaded dataset files
Edges, labels and train/test indices are preprocessed once and cached to
disk; subsequent calls restore them from the cached files.
:return: (num_node, edge_list, num_rel, labels, labeled_nodes_idx, train_idx, test_idx)
"""
print('Loading dataset', dataset_str)
graph_file = os.path.join(dataset_path, '{}_stripped.nt.gz'.format(dataset_str))
task_file = os.path.join(dataset_path, 'completeDataset.tsv')
train_file = os.path.join(dataset_path, 'trainingSet.tsv')
test_file = os.path.join(dataset_path, 'testSet.tsv')
if dataset_str == 'am':
label_header = 'label_cateogory'
nodes_header = 'proxy'
elif dataset_str == 'aifb':
label_header = 'label_affiliation'
nodes_header = 'person'
elif dataset_str == 'mutag':
label_header = 'label_mutagenic'
nodes_header = 'bond'
elif dataset_str == 'bgs':
label_header = 'label_lithogenesis'
nodes_header = 'rock'
else:
raise NameError('Dataset name not recognized: ' + dataset_str)
edge_file = os.path.join(dataset_path, 'edges.npz')
labels_file = os.path.join(dataset_path, 'labels.npz')
train_idx_file = os.path.join(dataset_path, 'train_idx.npy')
test_idx_file = os.path.join(dataset_path, 'test_idx.npy')
# train_names_file = os.path.join(dataset_path, 'train_names.npy')
# test_names_file = os.path.join(dataset_path, 'test_names.npy')
# rel_dict_file = os.path.join(dataset_path, 'rel_dict.pkl')
# nodes_file = os.path.join(dataset_path, 'nodes.pkl')
if os.path.isfile(edge_file) and os.path.isfile(labels_file) and \
os.path.isfile(train_idx_file) and os.path.isfile(test_idx_file):
# load precomputed adjacency matrix and labels
all_edges = np.load(edge_file)
num_node = all_edges['n'].item()
edge_list = all_edges['edges']
num_rel = all_edges['nrel'].item()
print('Number of nodes: ', num_node)
print('Number of edges: ', len(edge_list))
print('Number of relations: ', num_rel)
labels = _load_sparse_csr(labels_file)
labeled_nodes_idx = list(labels.nonzero()[0])
print('Number of classes: ', labels.shape[1])
train_idx = np.load(train_idx_file)
test_idx = np.load(test_idx_file)
# train_names = np.load(train_names_file)
# test_names = np.load(test_names_file)
# relations_dict = pkl.load(open(rel_dict_file, 'rb'))
else:
# loading labels of nodes
labels_df = pd.read_csv(task_file, sep='\t', encoding='utf-8')
labels_train_df = pd.read_csv(train_file, sep='\t', encoding='utf8')
labels_test_df = pd.read_csv(test_file, sep='\t', encoding='utf8')
with RDFReader(graph_file) as reader:
relations = reader.relationList()
subjects = reader.subjectSet()
objects = reader.objectSet()
nodes = list(subjects.union(objects))
num_node = len(nodes)
num_rel = len(relations)
num_rel = 2 * num_rel + 1 # +1 is for self-relation
assert num_node < np.iinfo(np.int32).max
print('Number of nodes: ', num_node)
print('Number of relations: ', num_rel)
relations_dict = {rel: i for i, rel in enumerate(list(relations))}
nodes_dict = {node: i for i, node in enumerate(nodes)}
edge_list = []
# self relation
for i in range(num_node):
edge_list.append((i, i, 0))
for i, (s, p, o) in enumerate(reader.triples()):
src = nodes_dict[s]
dst = nodes_dict[o]
assert src < num_node and dst < num_node
rel = relations_dict[p]
# relation id 0 is self-relation, so others should start with 1
edge_list.append((src, dst, 2 * rel + 1))
# reverse relation
edge_list.append((dst, src, 2 * rel + 2))
# sort indices by destination
edge_list = sorted(edge_list, key=lambda x: (x[1], x[0], x[2]))
edge_list = np.asarray(edge_list, dtype=np.int)
print('Number of edges: ', len(edge_list))
np.savez(edge_file, edges=edge_list, n=np.asarray(num_node), nrel=np.asarray(num_rel))
nodes_u_dict = {np.unicode(to_unicode(key)): val for key, val in
nodes_dict.items()}
labels_set = set(labels_df[label_header].values.tolist())
labels_dict = {lab: i for i, lab in enumerate(list(labels_set))}
print('{} classes: {}'.format(len(labels_set), labels_set))
labels = sp.lil_matrix((num_node, len(labels_set)))
labeled_nodes_idx = []
print('Loading training set')
train_idx = []
train_names = []
for nod, lab in zip(labels_train_df[nodes_header].values,
labels_train_df[label_header].values):
nod = np.unicode(to_unicode(nod)) # type: unicode
if nod in nodes_u_dict:
labeled_nodes_idx.append(nodes_u_dict[nod])
label_idx = labels_dict[lab]
labels[labeled_nodes_idx[-1], label_idx] = 1
train_idx.append(nodes_u_dict[nod])
train_names.append(nod)
else:
print(u'Node not in dictionary, skipped: ',
nod.encode('utf-8', errors='replace'))
print('Loading test set')
test_idx = []
test_names = []
for nod, lab in zip(labels_test_df[nodes_header].values,
labels_test_df[label_header].values):
nod = np.unicode(to_unicode(nod))
if nod in nodes_u_dict:
labeled_nodes_idx.append(nodes_u_dict[nod])
label_idx = labels_dict[lab]
labels[labeled_nodes_idx[-1], label_idx] = 1
test_idx.append(nodes_u_dict[nod])
test_names.append(nod)
else:
print(u'Node not in dictionary, skipped: ',
nod.encode('utf-8', errors='replace'))
labeled_nodes_idx = sorted(labeled_nodes_idx)
labels = labels.tocsr()
print('Number of classes: ', labels.shape[1])
_save_sparse_csr(labels_file, labels)
np.save(train_idx_file, train_idx)
np.save(test_idx_file, test_idx)
# np.save(train_names_file, train_names)
# np.save(test_names_file, test_names)
# pkl.dump(relations_dict, open(rel_dict_file, 'wb'))
# end if
return num_node, edge_list, num_rel, labels, labeled_nodes_idx, train_idx, test_idx
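# Worked example of the relation-id encoding used above (illustrative only):
# for raw relations r = 0 .. num_rel-1, the stored edge types are
#   0         -> self-loop relation
#   2*r + 1   -> forward direction of relation r
#   2*r + 2   -> reverse direction of relation r
# so two raw relations produce edge types {0, 1, 2, 3, 4}, which matches the
# earlier bookkeeping num_rel = 2 * num_rel + 1.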
def to_unicode(input):
# FIXME (lingfan): not sure about python 2 and 3 str compatibility
return str(input)
""" lingfan: comment out for now
if isinstance(input, unicode):
return input
elif isinstance(input, str):
return input.decode('utf-8', errors='replace')
return str(input).decode('utf-8', errors='replace')
"""
def _read_dictionary(filename):
d = {}
with open(filename, 'r+') as f:
for line in f:
line = line.strip().split('\t')
d[line[1]] = int(line[0])
return d
def _read_triplets(filename):
with open(filename, 'r+') as f:
for line in f:
processed_line = line.strip().split('\t')
yield processed_line
def _read_triplets_as_list(filename, entity_dict, relation_dict):
l = []
for triplet in _read_triplets(filename):
s = entity_dict[triplet[0]]
r = relation_dict[triplet[1]]
o = entity_dict[triplet[2]]
l.append([s, r, o])
return l
# This file contains DGL distributed kvstore APIs.
from ..network import _create_sender, _create_receiver
from ..network import _finalize_sender, _finalize_receiver
from ..network import _network_wait, _add_receiver_addr
from ..network import _receiver_wait, _sender_connect
from ..network import _send_kv_msg, _recv_kv_msg
from ..network import _clear_kv_msg
from ..network import _fast_pull
from ..network import KVMsgType, KVStoreMsg
from .. import backend as F
from .._ffi.ndarray import empty_shared_mem
import os
import time
import random
import numpy as np
import socket
if os.name != 'nt':
import fcntl
import struct
def read_ip_config(filename):
"""Read network configuration information of kvstore from file.
The format of the configuration file should be:
[ip] [base_port] [server_count]
172.31.40.143 30050 2
172.31.36.140 30050 2
172.31.47.147 30050 2
172.31.30.180 30050 2
Note that DGL KVStore supports multiple servers on the same machine that share data
with each other via shared-memory tensors, so the server_count should be >= 1.
Parameters
----------
filename : str
name of configuration file.
Returns
-------
dict
server namebook. e.g.,
[server_id]:[machine_id, ip, port, group_count]
{0:[0, '172.31.40.143', 30050, 2],
1:[0, '172.31.40.143', 30051, 2],
2:[1, '172.31.36.140', 30050, 2],
3:[1, '172.31.36.140', 30051, 2],
4:[2, '172.31.47.147', 30050, 2],
5:[2, '172.31.47.147', 30051, 2],
6:[3, '172.31.30.180', 30050, 2],
7:[3, '172.31.30.180', 30051, 2]}
"""
assert len(filename) > 0, 'filename cannot be empty.'
server_namebook = {}
try:
server_id = 0
machine_id = 0
lines = [line.rstrip('\n') for line in open(filename)]
for line in lines:
ip, port, server_count = line.split(' ')
for s_count in range(int(server_count)):
server_namebook[server_id] = [int(machine_id), ip, int(port)+s_count, int(server_count)]
server_id += 1
machine_id += 1
except:
print("Error: data format on each line should be: [ip] [base_port] [server_count]")
return server_namebook
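# A minimal usage sketch (illustrative only; 'ip_config.txt' is a hypothetical
# file name): given a configuration file listing two machines with two servers
# each, e.g.
#
#     172.31.40.143 30050 2
#     172.31.36.140 30050 2
#
# >>> server_namebook = read_ip_config('ip_config.txt')
# >>> server_namebook
# {0: [0, '172.31.40.143', 30050, 2],
#  1: [0, '172.31.40.143', 30051, 2],
#  2: [1, '172.31.36.140', 30050, 2],
#  3: [1, '172.31.36.140', 30051, 2]}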
def get_type_str(dtype):
"""Get data type string
"""
if 'float16' in str(dtype):
return 'float16'
elif 'float32' in str(dtype):
return 'float32'
elif 'float64' in str(dtype):
return 'float64'
elif 'uint8' in str(dtype):
return 'uint8'
elif 'int8' in str(dtype):
return 'int8'
elif 'int16' in str(dtype):
return 'int16'
elif 'int32' in str(dtype):
return 'int32'
elif 'int64' in str(dtype):
return 'int64'
else:
raise RuntimeError('Unknown data type: %s' % str(dtype))
class KVServer(object):
"""KVServer is a lightweight key-value store service for DGL distributed training.
In practice, developers can use KVServer to hold large-scale graph features or
graph embeddings across machines in a distributed setting. Users can also re-write the _push_handler()
and _pull_handler() APIs to support flexible algorithms.
DGL kvstore supports multiple servers on a single machine. That means we can launch many servers on the same machine, and all of
these servers will share the same shared-memory tensor for load balancing.
Note that KVServer must NOT be used from multiple Python threads because this behavior is undefined.
For now, KVServer can only run on CPU. We will support GPU KVServer in the future.
Parameters
----------
server_id : int
KVServer's ID (start from 0).
server_namebook: dict
IP address namebook of KVServer, where key is the KVServer's ID
(starting from 0) and value is [machine_id, IP address, port, group_count], e.g.,
{0:[0, '172.31.40.143', 30050, 2],
1:[0, '172.31.40.143', 30051, 2],
2:[1, '172.31.36.140', 30050, 2],
3:[1, '172.31.36.140', 30051, 2],
4:[2, '172.31.47.147', 30050, 2],
5:[2, '172.31.47.147', 30051, 2],
6:[3, '172.31.30.180', 30050, 2],
7:[3, '172.31.30.180', 30051, 2]}
num_client : int
Total number of client nodes.
queue_size : int
Size (bytes) of the kvstore message queue buffer (~20 GB by default).
Note that 20 GB is just an upper bound; DGL will not actually allocate 20 GB of memory.
net_type : str
networking type, e.g., 'socket' (default) or 'mpi' (not supported yet).
"""
def __init__(self, server_id, server_namebook, num_client, queue_size=20*1024*1024*1024, net_type='socket'):
assert server_id >= 0, 'server_id (%d) cannot be a negative number.' % server_id
assert len(server_namebook) > 0, 'server_namebook cannot be empty.'
assert num_client >= 0, 'num_client (%d) cannot be a negative number.' % num_client
assert queue_size > 0, 'queue_size (%d) cannot be a negative number.' % queue_size
assert net_type == 'socket' or net_type == 'mpi', 'net_type (%s) can only be \'socket\' or \'mpi\'.' % net_type
# check if target data has been initialized
self._has_data = set()
# Store the tensor data with specified data name
self._data_store = {}
# Used for barrier() API on KVClient
self._barrier_count = 0
# Server information
self._server_id = server_id
self._server_namebook = server_namebook
self._machine_id = server_namebook[server_id][0]
self._ip = server_namebook[server_id][1]
self._port = server_namebook[server_id][2]
self._group_count = server_namebook[server_id][3]
# client_namebook will be sent from remote client nodes
self._client_namebook = {}
self._client_count = num_client
# Create C communicator of sender and receiver
self._sender = _create_sender(net_type, queue_size)
self._receiver = _create_receiver(net_type, queue_size)
# Delete temp file when kvstore service is closed
self._open_file_list = []
# record for total message count
self._msg_count = 0
# user-defined push handler
self._udf_push_handler = None
self._udf_push_param = None
# user-defined pull handler
self._udf_pull_handler = None
def __del__(self):
"""Finalize KVServer
"""
# Finalize C communicator of sender and receiver
_finalize_sender(self._sender)
_finalize_receiver(self._receiver)
# Delete temp file when kvstore service is closed
for file in self._open_file_list:
if (os.path.exists(file)):
os.remove(file)
def set_global2local(self, name, global2local=None):
"""Set data mapping of global ID to local ID.
Parameters
----------
name : str
data name
global2local : list or tensor (mx.ndarray or torch.tensor)
A data mapping of global ID to local ID. KVStore will use global IDs by default
if global2local has not been set.
Note that if global2local is None, KVServer will read the shared-tensor instead.
"""
assert len(name) > 0, 'name cannot be empty.'
if global2local is not None: # Create shared-tensor
if isinstance(global2local, list):
global2local = F.tensor(global2local)
assert 'int64' == get_type_str(F.dtype(global2local)), 'global2local must be int64 type.'
shared_data = empty_shared_mem(name+'-g2l-', True, global2local.shape, 'int64')
dlpack = shared_data.to_dlpack()
self._data_store[name+'-g2l-'] = F.zerocopy_from_dlpack(dlpack)
self._data_store[name+'-g2l-'][:] = global2local[:]
# write data information to temp file that can be read by other processes
self._write_data_shape_type(name+'-g2l-shape-'+str(self._machine_id), global2local)
self._open_file_list.append(name+'-g2l-shape-'+str(self._machine_id))
else: # Read shared-tensor
while True:
if (os.path.exists(name+'-g2l-shape-'+str(self._machine_id))):
time.sleep(2) # wait for the write to finish
break
else:
time.sleep(2) # wait until the file has been created
data_shape, data_type = self._read_data_shape_type(name+'-g2l-shape-'+str(self._machine_id))
assert data_type == 'int64'
shared_data = empty_shared_mem(name+'-g2l-', False, data_shape, 'int64')
dlpack = shared_data.to_dlpack()
self._data_store[name+'-g2l-'] = F.zerocopy_from_dlpack(dlpack)
self._has_data.add(name+'-g2l-')
def set_partition_book(self, name, partition_book=None):
"""Partition book contains the data mapping of global ID to machine ID.
Parameters
----------
name : str
data name
partition_book : list or tensor (mx.ndarray or torch.tensor)
Mapping global ID to target machine ID.
Note that if partition_book is None, KVServer will read the shared-tensor by name.
"""
assert len(name) > 0, 'name cannot be empty.'
if partition_book is not None: # Create shared-tensor
if isinstance(partition_book, list):
partition_book = F.tensor(partition_book)
assert 'int64' == get_type_str(F.dtype(partition_book)), 'partition_book must be int64 type.'
shared_data = empty_shared_mem(name+'-part-', True, partition_book.shape, 'int64')
dlpack = shared_data.to_dlpack()
self._data_store[name+'-part-'] = F.zerocopy_from_dlpack(dlpack)
self._data_store[name+'-part-'][:] = partition_book[:]
self._write_data_shape_type(name+'-part-shape-'+str(self._machine_id), partition_book)
self._open_file_list.append(name+'-part-shape-'+str(self._machine_id))
else: # Read shared-tensor
while True:
if (os.path.exists(name+'-part-shape-'+str(self._machine_id))):
time.sleep(2) # wait for the write to finish
break
else:
time.sleep(2) # wait until the file has been created
data_shape, data_type = self._read_data_shape_type(name+'-part-shape-'+str(self._machine_id))
assert data_type == 'int64'
shared_data = empty_shared_mem(name+'-part-', False, data_shape, 'int64')
dlpack = shared_data.to_dlpack()
self._data_store[name+'-part-'] = F.zerocopy_from_dlpack(dlpack)
self._has_data.add(name+'-part-')
def init_data(self, name, data_tensor=None):
"""Initialize data tensor on KVServe.
Parameters
----------
name : str
data name
data_tensor : tensor (mx.ndarray or torch.tensor)
data tensor
Note that if data_tensor is None, KVServer will read the shared-tensor.
"""
assert len(name) > 0, 'name cannot be empty.'
if data_tensor is not None: # Create shared-tensor
data_type = get_type_str(F.dtype(data_tensor))
shared_data = empty_shared_mem(name+'-data-', True, data_tensor.shape, data_type)
dlpack = shared_data.to_dlpack()
self._data_store[name+'-data-'] = F.zerocopy_from_dlpack(dlpack)
self._data_store[name+'-data-'][:] = data_tensor[:]
self._write_data_shape_type(name+'-data-shape-'+str(self._machine_id), data_tensor)
self._open_file_list.append(name+'-data-shape-'+str(self._machine_id))
else: # Read shared-tensor
while True:
if (os.path.exists(name+'-data-shape-'+str(self._machine_id))):
break
else:
time.sleep(2) # wait until the file has been created
data_shape, data_type = self._read_data_shape_type(name+'-data-shape-'+str(self._machine_id))
shared_data = empty_shared_mem(name+'-data-', False, data_shape, data_type)
dlpack = shared_data.to_dlpack()
self._data_store[name+'-data-'] = F.zerocopy_from_dlpack(dlpack)
self._has_data.add(name+'-data-')
def get_id(self):
"""Get current server id
Return
------
int
KVServer ID
"""
return self._server_id
def get_addr(self):
"""Get current server IP address and port
Return
------
str
IP address and port
"""
return self._ip + ':' + str(self._port)
def get_machine_id(self):
"""Get local machine ID
Return
-------
int
machine ID
"""
return self._machine_id
def get_group_count(self):
"""Get count of server inside a machine
Return
------
int
count of server
"""
return self._group_count
def get_message_count(self):
"""Get total message count on current KVServer
Return
------
int
count of message
"""
return self._msg_count
def print(self):
"""Print server information (Used by debug)
"""
print("----- KVStore Info -----")
print("server id: %d" % self.get_id())
print("data:")
for name, data in self._data_store.items():
print(name)
print(data)
print("------------------------")
def start(self):
"""Start service of KVServer.
The start() API performs the following steps:
1. Connect with all client nodes.
2. Receive client address information.
3. Assign a client ID to each client node.
4. Send shared-tensor information to each client node.
5. Enter the service loop to listen for requests from client nodes.
"""
# Get connected with all client nodes
_receiver_wait(self._receiver, self._ip, self._port, self._client_count)
print("%d clients connected!" % self._client_count)
# recv client address information
addr_list = []
for i in range(self._client_count):
msg = _recv_kv_msg(self._receiver)
assert msg.type == KVMsgType.IP_ID
addr_list.append(msg.name)
# Assign client ID to each client node
addr_list.sort()
for ID in range(len(addr_list)):
self._client_namebook[ID] = addr_list[ID]
_network_wait()
for ID, addr in self._client_namebook.items():
client_ip, client_port = addr.split(':')
_add_receiver_addr(self._sender, client_ip, int(client_port), ID)
_sender_connect(self._sender)
if self._server_id == 0:
for client_id in range(len(self._client_namebook)):
msg = KVStoreMsg(
type=KVMsgType.IP_ID,
rank=self._server_id,
name=str(client_id),
id=None,
data=None,
shape=None,
c_ptr=None)
_send_kv_msg(self._sender, msg, client_id)
# Send shared-tensor information to each client node
if self._server_id == 0:
shared_tensor = ''
for name in self._has_data:
shared_tensor += self._serialize_shared_tensor(
name, F.dtype(self._data_store[name]))
shared_tensor += '|'
msg = KVStoreMsg(
type=KVMsgType.IP_ID,
rank=self._server_id,
name=shared_tensor,
id=None,
data=None,
shape=None,
c_ptr=None)
for client_id in range(len(self._client_namebook)):
_send_kv_msg(self._sender, msg, client_id)
print('KVStore service %d started successfully! Listening for requests ...' % self.get_id())
# Service loop
while True:
msg = _recv_kv_msg(self._receiver)
# Push message
if msg.type == KVMsgType.PUSH:
if msg.name+'-g2l-' in self._has_data:
local_id = self._data_store[msg.name+'-g2l-'][msg.id]
else:
local_id = msg.id
if self._udf_push_handler is not None:
self._udf_push_handler(msg.name+'-data-', local_id, msg.data, self._data_store, self._udf_push_param)
else:
self._default_push_handler(msg.name+'-data-', local_id, msg.data, self._data_store)
# Pull message
elif msg.type == KVMsgType.PULL:
if msg.name+'-g2l-' in self._has_data:
local_id = self._data_store[msg.name+'-g2l-'][msg.id]
else:
local_id = msg.id
if self._udf_pull_handler is not None:
res_tensor = self._udf_pull_handler(msg.name+'-data-', local_id, self._data_store)
else:
res_tensor = self._default_pull_handler(msg.name+'-data-', local_id, self._data_store)
back_msg = KVStoreMsg(
type=KVMsgType.PULL_BACK,
rank=self._server_id,
name=msg.name,
id=msg.id,
data=res_tensor,
shape=None,
c_ptr=None)
_send_kv_msg(self._sender, back_msg, msg.rank)
# Init new data
elif msg.type == KVMsgType.INIT:
assert msg.rank == 0
data_str, target_name = msg.name.split('|')
data_name, data_type = self._deserialize_shared_tensor(data_str)
dtype = F.data_type_dict[data_type]
data_shape = F.asnumpy(msg.shape).tolist()
if self._server_id % self._group_count == 0: # master server
data_tensor = F.zeros(data_shape, dtype, F.cpu())
self.init_data(name=data_name, data_tensor=data_tensor)
else: # backup server
self.init_data(name=data_name)
g2l = self._data_store[target_name+'-g2l-']
self._data_store[data_name+'-g2l-'] = g2l
self._has_data.add(data_name+'-g2l-')
back_msg = KVStoreMsg(
type=KVMsgType.INIT,
rank=self._server_id,
name=msg.name,
id=None,
data=None,
shape=msg.shape,
c_ptr=None)
_send_kv_msg(self._sender, back_msg, 0)
# Get shape message
elif msg.type == KVMsgType.GET_SHAPE:
data_shape = F.tensor(F.shape(self._data_store[msg.name+'-data-']))
back_msg = KVStoreMsg(
type=KVMsgType.GET_SHAPE_BACK,
rank=self._server_id,
name=msg.name,
id=None,
data=None,
shape=data_shape,
c_ptr=None)
_send_kv_msg(self._sender, back_msg, msg.rank)
# Barrier message
elif msg.type == KVMsgType.BARRIER:
self._barrier_count += 1
if self._barrier_count == self._client_count:
back_msg = KVStoreMsg(
type=KVMsgType.BARRIER,
rank=self._server_id,
name=None,
id=None,
data=None,
shape=None,
c_ptr=None)
for client_id in range(self._client_count):
_send_kv_msg(self._sender, back_msg, client_id)
self._barrier_count = 0
# Final message
elif msg.type == KVMsgType.FINAL:
print("Exit KVStore service %d, solved message count: %d" % (self.get_id(), self.get_message_count()))
break # exit loop
else:
raise RuntimeError('Unknown type of kvstore message: %d' % msg.type.value)
_clear_kv_msg(msg)
self._msg_count += 1
def _serialize_shared_tensor(self, name, dtype):
"""Serialize shared tensor information.
Parameters
----------
name : str
tensor name
dtype : dtype
data type
Returns
-------
str
serialized string
"""
assert len(name) > 0, 'data name cannot be empty.'
str_data = name
str_data += '/'
str_data += get_type_str(dtype)
return str_data
def _deserialize_shared_tensor(self, data):
"""Deserialize shared tensor information sent from server
Parameters
----------
data : str
serialized string
Returns
-------
str
tensor name
str
data type
"""
data_list = data.split('/')
tensor_name = data_list[0]
data_type = data_list[-1]
return tensor_name, data_type
def _write_data_shape_type(self, filename, data):
"""Write data shape to a temp file.
Parameters
----------
filename : str
name of temp file.
data : tensor (mx.ndarray or torch.tensor)
data tensor
"""
assert len(filename) > 0, 'filename cannot be empty.'
if(os.path.exists(filename)):
os.remove(filename)
shape = F.shape(data)
str_data = ''
str_data += get_type_str(F.dtype(data))
str_data += '|'
f = open(filename, "a");
for s in shape:
str_data += str(s)
str_data += '|'
f.write(str_data)
f.close()
def _read_data_shape_type(self, filename):
"""Read data shape from a tmp file.
Parameters
----------
filename : str
name of temp file
Return
------
tuple
(data shape, data type)
"""
assert len(filename) > 0, 'filename cannot be empty.'
f = open(filename, "r")
str_data = f.read()
data_list = str_data.split('|')
data_type = data_list[0]
data_shape = []
for i in range(1, len(data_list)-1):
data_shape.append(int(data_list[i]))
f.close()
return data_shape, data_type
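# Illustrative only: _write_data_shape_type() stores "<dtype>|<dim0>|<dim1>|..."
# in the temp file and _read_data_shape_type() parses it back. For example, a
# float32 tensor of shape (100, 16) is written as "float32|100|16|" and read
# back as ([100, 16], 'float32').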
def _default_push_handler(self, name, ID, data, target):
"""Default handler for PUSH message.
By default, _push_handler performs an update (assignment) operation on the tensor.
Parameters
----------
name : str
data name
ID : tensor (mx.ndarray or torch.tensor)
a vector storing the ID list.
data : tensor (mx.ndarray or torch.tensor)
a tensor with the same row size as ID
target : dict of data
self._data_store
"""
target[name][ID] = data
def _default_pull_handler(self, name, ID, target):
"""Default handler for PULL operation.
By default, _pull_handler performs a get (read) operation on the tensor.
Parameters
----------
name : str
data name
ID : tensor (mx.ndarray or torch.tensor)
a vector storing the ID list.
target : dict of data
self._data_store
Return
------
tensor
a tensor with the same row size as ID.
"""
return target[name][ID]
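# A minimal server-side launch sketch (not part of the original module),
# illustrative only: it assumes PyTorch as the DGL backend, a hypothetical
# 'ip_config.txt' file and a hypothetical data name 'embed', and it requires
# matching client processes to connect before start() returns:
#
# >>> import torch
# >>> server_namebook = read_ip_config('ip_config.txt')
# >>> server = KVServer(server_id=0, server_namebook=server_namebook, num_client=1)
# >>> server.set_partition_book('embed', partition_book=torch.zeros(100, dtype=torch.int64))
# >>> server.set_global2local('embed', global2local=torch.arange(100))
# >>> server.init_data('embed', data_tensor=torch.zeros(100, 16))
# >>> server.start()   # blocks in the service loop until a FINAL message arrives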
class KVClient(object):
"""KVClient is used to push/pull tensors to/from KVServer. If the server node and client node are on the
same machine, they can communicate with each other via a local shared-memory tensor instead of TCP/IP connections.
Note that KVClient must NOT be used from multiple Python threads because this behavior is undefined.
For now, KVClient can only run on CPU; we will support GPU KVClient in the future.
Parameters
----------
server_namebook: dict
IP address namebook of KVServer, where key is the KVServer's ID
(starting from 0) and value is [machine_id, IP address, port, group_count], e.g.,
{0:[0, '172.31.40.143', 30050, 2],
1:[0, '172.31.40.143', 30051, 2],
2:[1, '172.31.36.140', 30050, 2],
3:[1, '172.31.36.140', 30051, 2],
4:[2, '172.31.47.147', 30050, 2],
5:[2, '172.31.47.147', 30051, 2],
6:[3, '172.31.30.180', 30050, 2],
7:[3, '172.31.30.180', 30051, 2]}
queue_size : int
Size (bytes) of the kvstore message queue buffer (~20 GB by default).
net_type : str
networking type, e.g., 'socket' (default) or 'mpi'.
"""
def __init__(self, server_namebook, queue_size=20*1024*1024*1024, net_type='socket'):
assert len(server_namebook) > 0, 'server_namebook cannot be empty.'
assert queue_size > 0, 'queue_size (%d) cannot be a negative number.' % queue_size
assert net_type == 'socket' or net_type == 'mpi', 'net_type (%s) can only be \'socket\' or \'mpi\'.' % net_type
# check if target data has been initialized
self._has_data = set()
# This is used to store local data, which can share memory with local KVServer.
self._data_store = {}
self._full_data_shape = {}
self._data_name_list = []
# Server information
self._server_namebook = server_namebook
self._server_count = len(server_namebook)
self._group_count = server_namebook[0][3]
self._machine_count = int(self._server_count / self._group_count)
# client ID will be assigned by the server after connecting to it
self._client_id = -1
# Get local machine id via server_namebook
self._machine_id = self._get_local_machine_id()
# create C communicator of sender and receiver
self._sender = _create_sender(net_type, queue_size)
self._receiver = _create_receiver(net_type, queue_size)
# Delete temp file when kvstore service is closed
self._open_file_list = []
# Garbage collection
self._garbage_msg = []
# User-defined pull handler
self._udf_pull_handler = None
# User-defined push handler
self._udf_push_handler = None
self._udf_push_param = None
# Used for load balancing
random.seed(time.time())
def __del__(self):
"""Finalize KVClient
"""
# finalize C communicator of sender and receiver
_finalize_sender(self._sender)
_finalize_receiver(self._receiver)
# Delete temp files when kvstore service is closed
for file in self._open_file_list:
if(os.path.exists(file)):
os.remove(file)
def connect(self):
"""Connect to all the KVServer nodes
The connect() API performs the following steps:
1. Connect with all server nodes.
2. Send the client address information to the servers.
3. Receive a client ID from the server.
4. Receive shared-tensor information from the server.
"""
# Get connected with all server nodes
for ID, addr in self._server_namebook.items():
server_ip = addr[1]
server_port = addr[2]
_add_receiver_addr(self._sender, server_ip, server_port, ID)
_sender_connect(self._sender)
# Send client address to server nodes
self._addr = self._get_local_usable_addr()
client_ip, client_port = self._addr.split(':')
msg = KVStoreMsg(
type=KVMsgType.IP_ID,
rank=0, # a tmp client ID
name=self._addr,
id=None,
data=None,
shape=None,
c_ptr=None)
for server_id in range(self._server_count):
_send_kv_msg(self._sender, msg, server_id)
_receiver_wait(self._receiver, client_ip, int(client_port), self._server_count)
# Recv client ID from server
msg = _recv_kv_msg(self._receiver)
assert msg.rank == 0
self._client_id = int(msg.name)
# Recv shared-tensor information from server
msg = _recv_kv_msg(self._receiver)
assert msg.rank == 0
data_str = msg.name.split('|')
for data in data_str:
if data != '':
tensor_name, dtype = self._deserialize_shared_tensor(data)
while True:
if (os.path.exists(tensor_name+'shape-'+str(self._machine_id))):
break
else:
time.sleep(1) # wait until the file has been created
shape, data_type = self._read_data_shape_type(tensor_name+'shape-'+str(self._machine_id))
assert data_type == dtype
shared_data = empty_shared_mem(tensor_name, False, shape, dtype)
dlpack = shared_data.to_dlpack()
self._data_store[tensor_name] = F.zerocopy_from_dlpack(dlpack)
if '-data-' in tensor_name:
self._data_name_list.append(tensor_name[0:-6])
self._has_data.add(tensor_name)
# Get full shape of each data
for name in self._data_name_list:
data_shape = list(F.shape(self._data_store[name+'-data-']))
data_shape[0] = 0
msg = KVStoreMsg(
type=KVMsgType.GET_SHAPE,
rank=self._client_id,
name=name,
id=None,
data=None,
shape=None,
c_ptr=None)
# send msg
for m_id in range(self._machine_count):
s_id = m_id * self._group_count
_send_kv_msg(self._sender, msg, s_id)
# recv msg
for m_id in range(self._machine_count):
back_msg = _recv_kv_msg(self._receiver)
assert back_msg.type == KVMsgType.GET_SHAPE_BACK
data_shape[0] += ((F.asnumpy(back_msg.shape)).tolist())[0]
self._full_data_shape[name] = tuple(data_shape)
print("KVClient %d connect to kvstore successfully!" % self.get_id())
def init_data(self, name, shape, dtype, target_name):
"""Send message to kvserver to initialize new data and
get corresponded shared-tensor (e.g., partition_book, g2l) on kvclient.
The new data will be initialized to zeros.
Note that, this API must be invoked after the conenct() API.
Parameters
----------
name : str
data name
shape : list or tuple of int
data shape
dtype : dtype
data type
target_name : str
the target name is used to find the existing partition_book and g2l mapping.
"""
assert len(name) > 0, 'name cannot be empty.'
assert len(shape) > 0, 'shape cannot be empty.'
assert len(target_name) > 0, 'target_name cannot be empty.'
if self._client_id == 0: # only client_0 sends messages to the servers
partition_book = self._data_store[target_name+'-part-']
machines, count = np.unique(F.asnumpy(partition_book), return_counts=True)
assert shape[0] == len(partition_book)
# send message to all of the server nodes
for idx in range(len(machines)):
m_id = machines[idx]
data_str = self._serialize_shared_tensor(name, dtype)
data_str = data_str + '|' + target_name
partitioned_shape = list(shape)
partitioned_shape[0] = count[idx]
for n in range(self._group_count):
server_id = m_id * self._group_count + n
msg = KVStoreMsg(
type=KVMsgType.INIT,
rank=0,
name=data_str,
id=None,
data=None,
shape=F.tensor(partitioned_shape),
c_ptr=None)
_send_kv_msg(self._sender, msg, server_id)
# recv confirmation message from server nodes
for server_id in range(self._server_count):
msg = _recv_kv_msg(self._receiver)
assert msg.type == KVMsgType.INIT
self.barrier() # wait for all clients and servers to finish their jobs
g2l = self._data_store[target_name+'-g2l-']
partition_book = self._data_store[target_name+'-part-']
self._data_store[name+'-g2l-'] = g2l
self._data_store[name+'-part-'] = partition_book
self._has_data.add(name+'-g2l-')
self._has_data.add(name+'-part-')
# Read new data from shared-memory created by server
shape, data_type = self._read_data_shape_type(name+'-data-shape-'+str(self._machine_id))
assert data_type == get_type_str(dtype)
shared_data = empty_shared_mem(name+'-data-', False, shape, data_type)
dlpack = shared_data.to_dlpack()
self._data_store[name+'-data-'] = F.zerocopy_from_dlpack(dlpack)
self._has_data.add(name+'-data-')
self._data_name_list.append(name)
self._full_data_shape[name] = tuple(shape)
def print(self):
"""Print client information (Used by debug)
"""
print("----- KVClient Info -----")
print("client id: %d" % self.get_id())
print("data:")
for name, data in self._data_store.items():
print(name)
print(data)
print("-------------------------")
def get_id(self):
"""Get current client id
Return
------
int
KVClient ID
"""
return self._client_id
def get_addr(self):
"""Get current client IP address
Return
------
str
IP address
"""
return self._addr
def get_machine_id(self):
"""Get local machine ID
Return
-------
int
machine ID
"""
return self._machine_id
def get_data_name_list(self):
"""Get all the data name
Return
------
list of str
name list
"""
return self._data_name_list
def get_data_meta(self, name):
"""Get meta data (data_type, data_shape, partition_book) of the target shared-tensor
Parameter
---------
name : str
data name
Return
------
tuple
(data_type, data_shape, partition_book)
"""
assert len(name) > 0, 'name cannot be empty.'
assert name + '-data-' in self._has_data, 'Data (%s) does not exist!' % name
data_type = F.dtype(self._data_store[name+'-data-'])
partition_book = self._data_store[name+'-part-']
data_shape = self._full_data_shape[name]
return (data_type, data_shape, partition_book)
def push(self, name, id_tensor, data_tensor):
"""Push data to KVServer.
Note that push() is an async operation that returns immediately after being called.
Parameters
----------
name : str
data name
id_tensor : tensor (mx.ndarray or torch.tensor)
a vector storing the global data ID
data_tensor : tensor (mx.ndarray or torch.tensor)
a tensor with the same row size as the data IDs
"""
assert len(name) > 0, 'name cannot be empty.'
assert F.ndim(id_tensor) == 1, 'ID must be a vector.'
assert F.shape(id_tensor)[0] == F.shape(data_tensor)[0], 'The data must have the same row size as ID.'
# partition data
machine_id = self._data_store[name+'-part-'][id_tensor]
# sort index by machine id
sorted_id = F.tensor(np.argsort(F.asnumpy(machine_id)))
id_tensor = id_tensor[sorted_id]
data_tensor = data_tensor[sorted_id]
machine, count = np.unique(F.asnumpy(machine_id), return_counts=True)
# push data to server by order
start = 0
local_id = None
local_data = None
for idx in range(len(machine)):
end = start + count[idx]
if start == end: # No data for target machine
continue
partial_id = id_tensor[start:end]
partial_data = data_tensor[start:end]
if machine[idx] == self._machine_id: # local push
# Note: do not push local data right now, so that remote communication
# can overlap with the local push performed below
if name+'-g2l-' in self._has_data:
local_id = self._data_store[name+'-g2l-'][partial_id]
else:
local_id = partial_id
local_data = partial_data
else: # push data to remote server
msg = KVStoreMsg(
type=KVMsgType.PUSH,
rank=self._client_id,
name=name,
id=partial_id,
data=partial_data,
shape=None,
c_ptr=None)
# randomly select a server node on the target machine for load balancing
s_id = random.randint(machine[idx]*self._group_count, (machine[idx]+1)*self._group_count-1)
_send_kv_msg(self._sender, msg, s_id)
start += count[idx]
if local_id is not None: # local push
if self._udf_push_handler is not None:
self._udf_push_handler(name+'-data-', local_id, local_data, self._data_store, self._udf_push_param)
else:
self._default_push_handler(name+'-data-', local_id, local_data, self._data_store)
def pull(self, name, id_tensor):
"""Pull message from KVServer.
Parameters
----------
name : str
data name
id_tensor : tensor (mx.ndarray or torch.tensor)
a vector storing the ID list
Returns
-------
tensor
a data tensor with the same row size as id_tensor.
"""
assert len(name) > 0, 'name cannot be empty.'
assert F.ndim(id_tensor) == 1, 'ID must be a vector.'
if self._udf_pull_handler is None: # Use fast-pull
g2l = None
if name+'-g2l-' in self._data_store:
g2l = self._data_store[name+'-g2l-']
return _fast_pull(name, id_tensor,
self._machine_count,
self._group_count,
self._machine_id,
self._client_id,
self._data_store[name+'-part-'],
g2l,
self._data_store[name+'-data-'],
self._sender,
self._receiver)
else:
for msg in self._garbage_msg:
_clear_kv_msg(msg)
self._garbage_msg = []
# partition data
machine_id = self._data_store[name+'-part-'][id_tensor]
# sort index by machine id
sorted_id = F.tensor(np.argsort(F.asnumpy(machine_id)))
back_sorted_id = F.tensor(np.argsort(F.asnumpy(sorted_id)))
id_tensor = id_tensor[sorted_id]
machine, count = np.unique(F.asnumpy(machine_id), return_counts=True)
# pull data from server by order
start = 0
pull_count = 0
local_id = None
for idx in range(len(machine)):
end = start + count[idx]
if start == end: # No data for target machine
continue
partial_id = id_tensor[start:end]
if machine[idx] == self._machine_id: # local pull
# Note: do not pull local data right now, so that remote communication
# can overlap with the local pull performed below
if name+'-g2l-' in self._has_data:
local_id = self._data_store[name+'-g2l-'][partial_id]
else:
local_id = partial_id
else: # pull data from remote server
msg = KVStoreMsg(
type=KVMsgType.PULL,
rank=self._client_id,
name=name,
id=partial_id,
data=None,
shape=None,
c_ptr=None)
# randomly select a server node on the target machine for load balancing
s_id = random.randint(machine[idx]*self._group_count, (machine[idx]+1)*self._group_count-1)
_send_kv_msg(self._sender, msg, s_id)
pull_count += 1
start += count[idx]
msg_list = []
if local_id is not None: # local pull
local_data = self._udf_pull_handler(name+'-data-', local_id, self._data_store)
s_id = random.randint(self._machine_id*self._group_count, (self._machine_id+1)*self._group_count-1)
local_msg = KVStoreMsg(
type=KVMsgType.PULL_BACK,
rank=s_id,
name=name,
id=None,
data=local_data,
shape=None,
c_ptr=None)
msg_list.append(local_msg)
self._garbage_msg.append(local_msg)
# wait message from server nodes
for idx in range(pull_count):
remote_msg = _recv_kv_msg(self._receiver)
msg_list.append(remote_msg)
self._garbage_msg.append(remote_msg)
# sort msg by server id and merge tensor together
msg_list.sort(key=self._takeId)
data_tensor = F.cat(seq=[msg.data for msg in msg_list], dim=0)
return data_tensor[back_sorted_id] # return data with original index order
def barrier(self):
"""Barrier for all client nodes
This API blocks until all clients have called it.
"""
msg = KVStoreMsg(
type=KVMsgType.BARRIER,
rank=self._client_id,
name=None,
id=None,
data=None,
shape=None,
c_ptr=None)
for server_id in range(self._server_count):
_send_kv_msg(self._sender, msg, server_id)
for server_id in range(self._server_count):
back_msg = _recv_kv_msg(self._receiver)
assert back_msg.type == KVMsgType.BARRIER, 'Recv kv msg error.'
def shut_down(self):
"""Shut down all KVServer nodes.
This API is usually invoked by just one client (e.g., client_0).
"""
if self._client_id == 0:
for server_id in range(self._server_count):
msg = KVStoreMsg(
type=KVMsgType.FINAL,
rank=self._client_id,
name=None,
id=None,
data=None,
shape=None,
c_ptr=None)
_send_kv_msg(self._sender, msg, server_id)
def _get_local_usable_addr(self):
"""Get local available IP and port
Return
------
str
IP address and port, e.g., '192.168.8.12:50051'
"""
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
try:
# doesn't even have to be reachable
s.connect(('10.255.255.255', 1))
IP = s.getsockname()[0]
except:
IP = '127.0.0.1'
finally:
s.close()
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind(("",0))
s.listen(1)
port = s.getsockname()[1]
s.close()
return IP + ':' + str(port)
def _get_local_machine_id(self):
"""Get local machine ID from server_namebook
Return
------
int
local machine ID
"""
res = 0
for ID, data in self._server_namebook.items():
machine_id = data[0]
ip = data[1]
if ip in self._local_ip4_addr_list():
res = machine_id
break
return res
def _local_ip4_addr_list(self):
"""Return a set of IPv4 address
"""
nic = set()
for ix in socket.if_nameindex():
name = ix[1]
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
ip = socket.inet_ntoa(fcntl.ioctl(
s.fileno(),
0x8915, # SIOCGIFADDR
struct.pack('256s', name[:15].encode("UTF-8")))[20:24])
nic.add(ip)
return nic
def _serialize_shared_tensor(self, name, dtype):
"""Serialize shared tensor information.
Parameters
----------
name : str
tensor name
dtype : dtype
data type
Returns
-------
str
serialized string
"""
assert len(name) > 0, 'data name cannot be empty.'
str_data = name
str_data += '/'
str_data += get_type_str(dtype)
return str_data
def _deserialize_shared_tensor(self, data):
"""Deserialize shared tensor information sent from server
Parameters
----------
data : str
serialized string
Returns
-------
str
tensor name
str
data type
"""
data_list = data.split('/')
tensor_name = data_list[0]
data_type = data_list[-1]
return tensor_name, data_type
def _write_data_shape(self, filename, data):
"""Write data shape to a temp file.
Parameters
----------
filename : str
name of temp file.
data : tensor (mx.ndarray or torch.tensor)
data tensor
"""
assert len(filename) > 0, 'filename cannot be empty.'
if(os.path.exists(filename)):
os.remove(filename)
shape = F.shape(data)
str_data = ''
f = open(filename, "a");
for s in shape:
str_data += str(s)
str_data += '|'
f.write(str_data)
f.close()
def _read_data_shape_type(self, filename):
"""Read data shape from a tmp file.
Parameters
----------
filename : str
name of temp file
Return
------
tuple
(data shape, data type)
"""
assert len(filename) > 0, 'filename cannot be empty.'
f = open(filename, "r")
str_data = f.read()
data_list = str_data.split('|')
data_type = data_list[0]
data_shape = []
for i in range(1, len(data_list)-1):
data_shape.append(int(data_list[i]))
f.close()
return data_shape, data_type
def _takeId(self, elem):
"""Used by sort message list
"""
return elem.rank
def _default_push_handler(self, name, ID, data, target):
"""Default handler for PUSH message.
By default, _push_handler performs an update (assignment) operation on the tensor.
Parameters
----------
name : str
data name
ID : tensor (mx.ndarray or torch.tensor)
a vector storing the ID list.
data : tensor (mx.ndarray or torch.tensor)
a tensor with the same row size as ID
target : dict of data
self._data_store
"""
target[name][ID] = data
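# A minimal client-side sketch (not part of the original module), mirroring the
# server example above and illustrative only: it reuses the same hypothetical
# 'ip_config.txt' and 'embed' names, assumes PyTorch as the DGL backend, and
# assumes the client runs on a machine that also hosts a server so the shared
# tensors can be attached during connect():
#
# >>> import torch
# >>> server_namebook = read_ip_config('ip_config.txt')
# >>> client = KVClient(server_namebook)
# >>> client.connect()
# >>> ids = torch.tensor([0, 1, 2])
# >>> client.push('embed', ids, torch.ones(3, 16))   # async update
# >>> client.barrier()                               # synchronize all clients
# >>> feat = client.pull('embed', ids)
# >>> client.shut_down()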