OpenDAS / dgl / Commits / c81efdf2

Commit c81efdf2 (unverified), parent 75d793a1
Authored Sep 06, 2021 by Jinjing Zhou; committed via GitHub on Sep 06, 2021

Remove deprecated kernels (#3316)

* remove
* remove
* fix
* remove
* remove
Changes: 46 files in total. Showing 20 changed files with 83 additions and 3173 deletions (+83, -3173).
File                                          Additions  Deletions
CMakeLists.txt                                       +0         -2
src/array/cuda/atomic.cuh                           +81         -0
src/array/cuda/rowwise_sampling.cu                   +2         -2
src/kernel/binary_reduce.cc                          +0       -612
src/kernel/binary_reduce.h                           +0       -253
src/kernel/binary_reduce_common.h                    +0       -541
src/kernel/binary_reduce_impl.h                      +0       -440
src/kernel/binary_reduce_impl_decl.h                 +0       -414
src/kernel/common.h                                  +0        -86
src/kernel/cpu/backward_binary_reduce_impl.h         +0       -318
src/kernel/cpu/binary_bcast_reduce_max.cc            +0        -28
src/kernel/cpu/binary_bcast_reduce_min.cc            +0        -28
src/kernel/cpu/binary_bcast_reduce_none.cc           +0        -28
src/kernel/cpu/binary_bcast_reduce_prod.cc           +0        -28
src/kernel/cpu/binary_bcast_reduce_sum.cc            +0        -28
src/kernel/cpu/binary_reduce_impl.cc                 +0        -56
src/kernel/cpu/binary_reduce_impl.h                  +0       -231
src/kernel/cpu/binary_reduce_max.cc                  +0        -26
src/kernel/cpu/binary_reduce_min.cc                  +0        -26
src/kernel/cpu/binary_reduce_none.cc                 +0        -26
CMakeLists.txt @ c81efdf2

@@ -145,8 +145,6 @@ file(GLOB DGL_SRC
   src/array/cpu/*.cc
   src/random/*.cc
   src/random/cpu/*.cc
-  src/kernel/*.cc
-  src/kernel/cpu/*.cc
   src/runtime/*.cc
   src/geometry/*.cc
   src/geometry/cpu/*.cc
src/array/cuda/atomic.cuh @ c81efdf2

@@ -10,6 +10,9 @@
 #include <cassert>
 #include "fp16.cuh"
+#if __CUDA_ARCH__ >= 600
+#include <cuda_fp16.h>
+#endif

 namespace dgl {
 namespace aten {
@@ -133,6 +136,84 @@ DEFINE_ATOMIC_HALF(Min)
 DEFINE_ATOMIC(Add)
 #undef OP

+/**
+ * \brief Performs an atomic compare-and-swap on 64 bit integers. That is,
+ * it reads the word `old` at the memory location `address`, computes
+ * `(old == compare ? val : old)`, and stores the result back to memory at
+ * the same address.
+ *
+ * \param address The address to perform the atomic operation on.
+ * \param compare The value to compare to.
+ * \param val The new value to conditionally store.
+ *
+ * \return The old value at the address.
+ */
+inline __device__ int64_t AtomicCAS(
+    int64_t* const address, const int64_t compare, const int64_t val) {
+  // match the type of "::atomicCAS", so ignore lint warning
+  using Type = unsigned long long int;  // NOLINT
+  static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
+  return atomicCAS(reinterpret_cast<Type*>(address),
+                   static_cast<Type>(compare),
+                   static_cast<Type>(val));
+}
+
+/**
+ * \brief Performs an atomic compare-and-swap on 32 bit integers. That is,
+ * it reads the word `old` at the memory location `address`, computes
+ * `(old == compare ? val : old)`, and stores the result back to memory at
+ * the same address.
+ *
+ * \param address The address to perform the atomic operation on.
+ * \param compare The value to compare to.
+ * \param val The new value to conditionally store.
+ *
+ * \return The old value at the address.
+ */
+inline __device__ int32_t AtomicCAS(
+    int32_t* const address, const int32_t compare, const int32_t val) {
+  // match the type of "::atomicCAS", so ignore lint warning
+  using Type = int;  // NOLINT
+  static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
+  return atomicCAS(reinterpret_cast<Type*>(address),
+                   static_cast<Type>(compare),
+                   static_cast<Type>(val));
+}
+
+inline __device__ int64_t AtomicMax(
+    int64_t* const address, const int64_t val) {
+  // match the type of "::atomicCAS", so ignore lint warning
+  using Type = unsigned long long int;  // NOLINT
+  static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
+  return atomicMax(reinterpret_cast<Type*>(address), static_cast<Type>(val));
+}
+
+inline __device__ int32_t AtomicMax(
+    int32_t* const address, const int32_t val) {
+  // match the type of "::atomicCAS", so ignore lint warning
+  using Type = int;  // NOLINT
+  static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
+  return atomicMax(reinterpret_cast<Type*>(address), static_cast<Type>(val));
+}
+
 template <>
 __device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) {
 #if __CUDA_ARCH__ >= 200
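The overloads above extend the header's Atomic* helpers to 32/64-bit integers by reinterpreting the address as the type that CUDA's ::atomicCAS / ::atomicMax accept. A minimal usage sketch, assuming the helpers live in dgl::aten::cuda as the using-directive in rowwise_sampling.cu below suggests (the kernel name and launch shape are illustrative, not part of this commit):

// bucket_max_example.cu -- illustrative only, not repository code
#include <cstdint>
#include "../../array/cuda/atomic.cuh"

// Records, per bucket, the largest 64-bit value observed by any thread.
__global__ void BucketMaxKernel(const int64_t* values, const int* bucket_of,
                                int64_t* bucket_max, int n) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < n) {
    // Resolves to the int64_t AtomicMax overload added in this commit.
    dgl::aten::cuda::AtomicMax(bucket_max + bucket_of[tid], values[tid]);
  }
}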
src/array/cuda/rowwise_sampling.cu @ c81efdf2

@@ -10,10 +10,10 @@
 #include <numeric>

 #include "./dgl_cub.cuh"
-#include "../../kernel/cuda/atomic.cuh"
+#include "../../array/cuda/atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"

-using namespace dgl::kernel::cuda;
+using namespace dgl::aten::cuda;

 namespace dgl {
 namespace aten {
src/kernel/binary_reduce.cc (deleted, 100644 → 0) @ 75d793a1

/*!
 * Copyright (c) 2019 by Contributors
 * \file kernel/binary_reduce.cc
 * \brief Binary reduce C APIs and definitions.
 */
#include <dgl/packed_func_ext.h>
#include <dgl/immutable_graph.h>

#include "./binary_reduce.h"
#include "./common.h"
#include "./binary_reduce_impl_decl.h"
#include "./utils.h"
#include "../c_api_common.h"
#include "../array/check.h"
#include "../graph/unit_graph.h"
#include "./csr_interface.h"

using namespace dgl::runtime;

namespace dgl {
namespace kernel {
namespace {

// convert ndarray shape to string
std::string ShapeString(NDArray nd) {
  std::ostringstream oss;
  oss << "(";
  for (int i = 1; i < nd->ndim; ++i) {
    oss << nd->shape[i];
    if (i != nd->ndim - 1) {
      oss << ",";
    }
  }
  oss << ")";
  return oss.str();
}

// compute stride vector given shape; assume row-major storage
std::vector<int64_t> ComputeStride(const std::vector<int64_t>& shape) {
  std::vector<int64_t> ret(shape.size(), 1);
  for (int i = shape.size() - 2; i >= 0; --i) {
    ret[i] = ret[i + 1] * shape[i + 1];
  }
  return ret;
}

// Return true if the feature shapes of the two ndarrays can be
// computed element-wisely *without* broadcasting.
// Examples:
//
// valid:
//   lhs.shape = (N, D1, D2)
//   rhs.shape = (M, D1, D2)  # the first dimension could be different
//
// invalid:
//   lhs.shape = (N, D1, D2)
//   rhs.shape = (M, D1)
bool IsValidBinaryOpShape(NDArray lhs, NDArray rhs) {
  if (lhs->ndim != rhs->ndim) {
    return false;
  }
  for (int i = 1; i < lhs->ndim; ++i) {
    if (lhs->shape[i] != rhs->shape[i]) {
      return false;
    }
  }
  return true;
}

// Return true if broadcasting might be required to compute the element-wise
// operation between the features of the two ndarrays.
// The broadcasting semantic strictly follows numpy.
// Note that the function could return true for invalid element-wise shapes
// (e.g. lhs.shape = (N, 3), rhs.shape = (N, 5)). This is fine since
// ``CalcBcastInfo`` will handle that.
bool HasBcast(NDArray lhs, NDArray rhs) {
  if (lhs->ndim != rhs->ndim) {
    return true;
  }
  for (int i = 1; i < lhs->ndim; ++i) {
    if (lhs->shape[i] != rhs->shape[i]) {
      return true;
    }
  }
  return false;
}

// Compute auxiliary information of broadcasting dimensions.
// The function preprocesses the feature shapes so that:
//  - The first dimension (for graph) is removed.
//  - Feature dimensions are aligned.
//    e.g. (4,) and (3, 4) become (1, 4) and (3, 4)
//  - Continuous non-broadcasting dimenions are flattened to reduce number of
//    integers used to represent the feature shape.
//    e.g. (4, 1, 3, 3) and (4, 5, 3, 3) become (4, 1, 9) and (4, 5, 9)
//
// See also: BcastInfo (kernel/binary_reduce.h)
BcastInfo CalcBcastInfo(const std::string& op, NDArray lhs, NDArray rhs) {
  BcastInfo ret;
  const int max_ndim = std::max(lhs->ndim, rhs->ndim) - 1;
  int64_t accum = 0;
  int j = 0;
  // for dot operation: vector [dot] vector
  // lhs_shape[ndim-1] == rhs_shape[ndim-1] = sizeof(vector)
  // out_shape[ndim-1] = 1
  if (op == binary_op::kDot) {
    // get size of vector
    ret.data_len = lhs->shape[lhs->ndim - 1];
    // skip vector size dim
    ++j;
    ret.real_out_shape.push_back(ret.data_len);
  } else {  // op != binary_op::kDot
    ret.data_len = 1;
  }
  for (; j < max_ndim; ++j) {
    const int dl = (lhs->ndim - 1 - j < 1) ? 1 : lhs->shape[lhs->ndim - 1 - j];
    const int dr = (rhs->ndim - 1 - j < 1) ? 1 : rhs->shape[rhs->ndim - 1 - j];
    if (dl != dr) {
      if (dl != 1 && dr != 1) {
        LOG(FATAL) << "Invalid broadcasting between feature shapes "
                   << ShapeString(lhs) << " and " << ShapeString(rhs);
      }
      if (accum != 0) {
        ret.lhs_shape.push_back(accum);
        ret.rhs_shape.push_back(accum);
        ret.out_shape.push_back(accum);
        accum = 0;
      }
      ret.lhs_shape.push_back(dl);
      ret.rhs_shape.push_back(dr);
      ret.out_shape.push_back(std::max(dl, dr));
    } else {
      if (accum == 0) {
        accum = dl;
      } else {
        accum *= dl;
      }
    }
    ret.real_out_shape.push_back(std::max(dl, dr));
  }
  if (accum != 0) {
    ret.lhs_shape.push_back(accum);
    ret.rhs_shape.push_back(accum);
    ret.out_shape.push_back(accum);
    accum = 0;
  }
  std::reverse(ret.real_out_shape.begin(), ret.real_out_shape.end());
  std::reverse(ret.lhs_shape.begin(), ret.lhs_shape.end());
  std::reverse(ret.rhs_shape.begin(), ret.rhs_shape.end());
  std::reverse(ret.out_shape.begin(), ret.out_shape.end());
  // stride
  ret.lhs_stride = ComputeStride(ret.lhs_shape);
  ret.rhs_stride = ComputeStride(ret.rhs_shape);
  ret.out_stride = ComputeStride(ret.out_shape);
  return ret;
}

// Function to convert an idarray to string
std::string IdArrayToStr(IdArray arr) {
  arr = arr.CopyTo(DLContext{kDLCPU, 0});
  int64_t len = arr->shape[0];
  std::ostringstream oss;
  oss << "(" << len << ")[";
  if (arr->dtype.bits == 32) {
    int32_t* data = static_cast<int32_t*>(arr->data);
    for (int64_t i = 0; i < len; ++i) {
      oss << data[i] << " ";
    }
  } else {
    int64_t* data = static_cast<int64_t*>(arr->data);
    for (int64_t i = 0; i < len; ++i) {
      oss << data[i] << " ";
    }
  }
  oss << "]";
  return oss.str();
}

// Check whether the given arguments use the same number of bits.
inline void CheckIdArray(
    const uint8_t bits,
    const std::vector<NDArray>& arrays,
    const std::vector<std::string>& names) {
  for (size_t i = 0; i < arrays.size(); ++i) {
    if (aten::IsNullArray(arrays[i]))
      continue;
    CHECK(arrays[i]->dtype.code == kDLInt);
    CHECK_EQ(arrays[i]->ndim, 1);
    CHECK_EQ(bits, arrays[i]->dtype.bits)
      << "Expected " << bits << " integer array. But got "
      << arrays[i]->dtype.bits << " for " << names[i] << ".";
  }
}

// Return true if the operator is commutative and lhs and rhs need
// to be switched. For example, Add(kDst, kSrc) needs to be changed
// to Add(kSrc, kDst).
// This is because we only generate kernels for
//   Add(kSrc, kDst), Add(kDst, kEdge), Add(kSrc, kDst)
// to save compilation time.
inline bool NeedSwitchOrder(const std::string& op,
                            binary_op::Target lhs, binary_op::Target rhs) {
  CHECK_NE(lhs, rhs);
  return (op == binary_op::kAdd || op == binary_op::kMul)
    && lhs > rhs;
}

class ImmutableGraphCSRWrapper : public CSRWrapper {
 public:
  explicit ImmutableGraphCSRWrapper(const ImmutableGraph* graph) :
    gptr_(graph) { }

  aten::CSRMatrix GetInCSRMatrix() const override {
    return gptr_->GetInCSR()->ToCSRMatrix();
  }

  aten::CSRMatrix GetOutCSRMatrix() const override {
    return gptr_->GetOutCSR()->ToCSRMatrix();
  }

  DGLContext Context() const override {
    return gptr_->Context();
  }

  int NumBits() const override {
    return gptr_->NumBits();
  }

 private:
  const ImmutableGraph* gptr_;
};

class UnitGraphCSRWrapper : public CSRWrapper {
 public:
  explicit UnitGraphCSRWrapper(const UnitGraph* graph) :
    gptr_(graph) { }

  aten::CSRMatrix GetInCSRMatrix() const override {
    return gptr_->GetCSCMatrix(0);
  }

  aten::CSRMatrix GetOutCSRMatrix() const override {
    return gptr_->GetCSRMatrix(0);
  }

  DGLContext Context() const override {
    return gptr_->Context();
  }

  int NumBits() const override {
    return gptr_->NumBits();
  }

 private:
  const UnitGraph* gptr_;
};

}  // namespace

std::vector<int64_t> InferBinaryFeatureShape(
    const std::string& op,
    NDArray lhs,
    NDArray rhs) {
  return CalcBcastInfo(op, lhs, rhs).real_out_shape;
}

DGL_REGISTER_GLOBAL("_deprecate.kernel._CAPI_DGLKernelInferBinaryFeatureShape")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    std::string op = args[0];
    NDArray lhs = args[1];
    NDArray rhs = args[2];
    const auto& shape = InferBinaryFeatureShape(op, lhs, rhs);
    const int64_t len = shape.size();
    NDArray ret = NDArray::Empty(
        {len}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0});
    int64_t* ret_data = static_cast<int64_t*>(ret->data);
    std::copy(shape.begin(), shape.end(), ret_data);
    *rv = ret;
  });

void BinaryOpReduce(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    NDArray lhs_data, NDArray rhs_data, NDArray out_data,
    NDArray lhs_mapping, NDArray rhs_mapping, NDArray out_mapping) {
  const auto& ctx = graph.Context();
  // sanity check
  aten::CheckCtx(ctx,
      {lhs_data, rhs_data, out_data, lhs_mapping, rhs_mapping, out_mapping},
      {"lhs_data", "rhs_data", "out_data",
       "lhs_mapping", "rhs_mapping", "out_mapping"});
  CheckIdArray(graph.NumBits(),
      {lhs_mapping, rhs_mapping, out_mapping},
      {"lhs_mapping", "rhs_mapping", "out_mapping"});
  // Switch order for commutative operation
  if (NeedSwitchOrder(op, lhs, rhs)) {
    BinaryOpReduce(reducer, op, graph,
        rhs, lhs,
        rhs_data, lhs_data, out_data,
        rhs_mapping, lhs_mapping, out_mapping);
  } else {
    if (HasBcast(lhs_data, rhs_data)) {
      BcastInfo info = CalcBcastInfo(op, lhs_data, rhs_data);
      DGL_XPU_SWITCH(ctx.device_type, BinaryReduceBcastImpl,
          info, reducer, op, graph,
          lhs, rhs,
          lhs_data, rhs_data, out_data,
          lhs_mapping, rhs_mapping, out_mapping);
    } else {
      CHECK(IsValidBinaryOpShape(lhs_data, rhs_data))
        << "Cannot compute binary operation between feature shapes "
        << ShapeString(lhs_data) << " and " << ShapeString(rhs_data);
      DGL_XPU_SWITCH(ctx.device_type, BinaryReduceImpl,
          reducer, op, graph,
          lhs, rhs,
          lhs_data, rhs_data, out_data,
          lhs_mapping, rhs_mapping, out_mapping);
    }
  }
}

void csrwrapper_switch(DGLArgValue argval,
                       std::function<void(const CSRWrapper&)> fn) {
  DGL_CHECK_TYPE_CODE(argval.type_code(), kObjectHandle);
  if (argval.IsObjectType<GraphRef>()) {
    GraphRef g = argval;
    auto igptr = std::dynamic_pointer_cast<ImmutableGraph>(g.sptr());
    CHECK_NOTNULL(igptr);
    ImmutableGraphCSRWrapper wrapper(igptr.get());
    fn(wrapper);
  } else if (argval.IsObjectType<HeteroGraphRef>()) {
    HeteroGraphRef g = argval;
    auto bgptr = std::dynamic_pointer_cast<UnitGraph>(g->GetRelationGraph(0));
    CHECK_NOTNULL(bgptr);
    UnitGraphCSRWrapper wrapper(bgptr.get());
    fn(wrapper);
  }
}

DGL_REGISTER_GLOBAL("_deprecate.kernel._CAPI_DGLKernelBinaryOpReduce")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    std::string reducer = args[0];
    std::string op = args[1];
    int lhs = args[3];
    int rhs = args[4];
    NDArray lhs_data = args[5];
    NDArray rhs_data = args[6];
    NDArray out_data = args[7];
    NDArray lhs_mapping = args[8];
    NDArray rhs_mapping = args[9];
    NDArray out_mapping = args[10];

    auto f = [&reducer, &op, &lhs, &rhs, &lhs_data, &rhs_data, &out_data,
              &lhs_mapping, &rhs_mapping, &out_mapping](const CSRWrapper& wrapper) {
      BinaryOpReduce(reducer, op, wrapper,
          static_cast<binary_op::Target>(lhs),
          static_cast<binary_op::Target>(rhs),
          lhs_data, rhs_data, out_data,
          lhs_mapping, rhs_mapping, out_mapping);
    };
    csrwrapper_switch(args[2], f);
  });

void BackwardLhsBinaryOpReduce(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    NDArray lhs_mapping, NDArray rhs_mapping, NDArray out_mapping,
    NDArray lhs_data, NDArray rhs_data, NDArray out_data,
    NDArray grad_out_data,
    NDArray grad_lhs_data) {
  const auto& ctx = graph.Context();
  // sanity check
  aten::CheckCtx(ctx,
      {lhs_data, rhs_data, out_data, grad_out_data, grad_lhs_data,
       lhs_mapping, rhs_mapping, out_mapping},
      {"lhs_data", "rhs_data", "out_data", "grad_out_data", "grad_lhs_data",
       "lhs_mapping", "rhs_mapping", "out_mapping"});
  CheckIdArray(graph.NumBits(),
      {lhs_mapping, rhs_mapping, out_mapping},
      {"lhs_mapping", "rhs_mapping", "out_mapping"});
  // Switch order for commutative operation
  if (NeedSwitchOrder(op, lhs, rhs)) {
    BackwardRhsBinaryOpReduce(reducer, op, graph,
        rhs, lhs,
        rhs_mapping, lhs_mapping, out_mapping,
        rhs_data, lhs_data, out_data,
        grad_out_data, grad_lhs_data);
  } else {
    if (HasBcast(lhs_data, rhs_data)) {
      BcastInfo info = CalcBcastInfo(op, lhs_data, rhs_data);
      DGL_XPU_SWITCH(ctx.device_type, BackwardBinaryReduceBcastImpl,
          info, reducer, op, graph,
          lhs, rhs,
          lhs_mapping, rhs_mapping, out_mapping,
          lhs_data, rhs_data, out_data, grad_out_data,
          grad_lhs_data, aten::NullArray());
    } else {
      DGL_XPU_SWITCH(ctx.device_type, BackwardBinaryReduceImpl,
          reducer, op, graph,
          lhs, rhs,
          lhs_mapping, rhs_mapping, out_mapping,
          lhs_data, rhs_data, out_data, grad_out_data,
          grad_lhs_data, aten::NullArray());
    }
  }
}

DGL_REGISTER_GLOBAL("_deprecate.kernel._CAPI_DGLKernelBackwardLhsBinaryOpReduce")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    std::string reducer = args[0];
    std::string op = args[1];
    int lhs = args[3];
    int rhs = args[4];
    NDArray lhs_mapping = args[5];
    NDArray rhs_mapping = args[6];
    NDArray out_mapping = args[7];
    NDArray lhs_data = args[8];
    NDArray rhs_data = args[9];
    NDArray out_data = args[10];
    NDArray grad_out_data = args[11];
    NDArray grad_lhs_data = args[12];

    auto f = [&reducer, &op, &lhs, &rhs,
              &lhs_mapping, &rhs_mapping, &out_mapping,
              &lhs_data, &rhs_data, &out_data,
              &grad_out_data, &grad_lhs_data](const CSRWrapper& wrapper) {
      BackwardLhsBinaryOpReduce(reducer, op, wrapper,
          static_cast<binary_op::Target>(lhs),
          static_cast<binary_op::Target>(rhs),
          lhs_mapping, rhs_mapping, out_mapping,
          lhs_data, rhs_data, out_data,
          grad_out_data, grad_lhs_data);
    };
    csrwrapper_switch(args[2], f);
  });

void BackwardRhsBinaryOpReduce(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    NDArray lhs_mapping, NDArray rhs_mapping, NDArray out_mapping,
    NDArray lhs_data, NDArray rhs_data, NDArray out_data,
    NDArray grad_out_data,
    NDArray grad_rhs_data) {
  const auto& ctx = graph.Context();
  // sanity check
  aten::CheckCtx(ctx,
      {lhs_data, rhs_data, out_data, grad_out_data, grad_rhs_data,
       lhs_mapping, rhs_mapping, out_mapping},
      {"lhs_data", "rhs_data", "out_data", "grad_out_data", "grad_rhs_data",
       "lhs_mapping", "rhs_mapping", "out_mapping"});
  CheckIdArray(graph.NumBits(),
      {lhs_mapping, rhs_mapping, out_mapping},
      {"lhs_mapping", "rhs_mapping", "out_mapping"});
  if (NeedSwitchOrder(op, lhs, rhs)) {
    BackwardLhsBinaryOpReduce(reducer, op, graph,
        rhs, lhs,
        rhs_mapping, lhs_mapping, out_mapping,
        rhs_data, lhs_data, out_data,
        grad_out_data, grad_rhs_data);
  } else {
    if (HasBcast(lhs_data, rhs_data)) {
      BcastInfo info = CalcBcastInfo(op, lhs_data, rhs_data);
      DGL_XPU_SWITCH(ctx.device_type, BackwardBinaryReduceBcastImpl,
          info, reducer, op, graph,
          lhs, rhs,
          lhs_mapping, rhs_mapping, out_mapping,
          lhs_data, rhs_data, out_data, grad_out_data,
          aten::NullArray(), grad_rhs_data);
    } else {
      DGL_XPU_SWITCH(ctx.device_type, BackwardBinaryReduceImpl,
          reducer, op, graph,
          lhs, rhs,
          lhs_mapping, rhs_mapping, out_mapping,
          lhs_data, rhs_data, out_data, grad_out_data,
          aten::NullArray(), grad_rhs_data);
    }
  }
}

DGL_REGISTER_GLOBAL("_deprecate.kernel._CAPI_DGLKernelBackwardRhsBinaryOpReduce")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    std::string reducer = args[0];
    std::string op = args[1];
    int lhs = args[3];
    int rhs = args[4];
    NDArray lhs_mapping = args[5];
    NDArray rhs_mapping = args[6];
    NDArray out_mapping = args[7];
    NDArray lhs_data = args[8];
    NDArray rhs_data = args[9];
    NDArray out_data = args[10];
    NDArray grad_out_data = args[11];
    NDArray grad_rhs_data = args[12];

    auto f = [&reducer, &op, &lhs, &rhs,
              &lhs_mapping, &rhs_mapping, &out_mapping,
              &lhs_data, &rhs_data, &out_data,
              &grad_out_data, &grad_rhs_data](const CSRWrapper& wrapper) {
      BackwardRhsBinaryOpReduce(reducer, op, wrapper,
          static_cast<binary_op::Target>(lhs),
          static_cast<binary_op::Target>(rhs),
          lhs_mapping, rhs_mapping, out_mapping,
          lhs_data, rhs_data, out_data,
          grad_out_data, grad_rhs_data);
    };
    csrwrapper_switch(args[2], f);
  });

void CopyReduce(
    const std::string& reducer,
    const CSRWrapper& graph,
    binary_op::Target target,
    NDArray in_data, NDArray out_data,
    NDArray in_mapping, NDArray out_mapping) {
  const auto& ctx = graph.Context();
  // sanity check
  aten::CheckCtx(ctx,
      {in_data, out_data, in_mapping, out_mapping},
      {"in_data", "out_data", "in_mapping", "out_mapping"});
  CheckIdArray(graph.NumBits(),
      {in_mapping, out_mapping},
      {"in_mapping", "out_mapping"});
  DGL_XPU_SWITCH(ctx.device_type, BinaryReduceImpl,
      reducer, binary_op::kUseLhs, graph,
      target, binary_op::kNone,
      in_data, aten::NullArray(), out_data,
      in_mapping, aten::NullArray(), out_mapping);
}

DGL_REGISTER_GLOBAL("_deprecate.kernel._CAPI_DGLKernelCopyReduce")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    std::string reducer = args[0];
    int target = args[2];
    NDArray in_data = args[3];
    NDArray out_data = args[4];
    NDArray in_mapping = args[5];
    NDArray out_mapping = args[6];

    auto f = [&reducer, &target, &in_data, &out_data,
              &in_mapping, &out_mapping](const CSRWrapper& wrapper) {
      CopyReduce(reducer, wrapper,
          static_cast<binary_op::Target>(target),
          in_data, out_data, in_mapping, out_mapping);
    };
    csrwrapper_switch(args[1], f);
  });

void BackwardCopyReduce(
    const std::string& reducer,
    const CSRWrapper& graph,
    binary_op::Target target,
    NDArray in_mapping, NDArray out_mapping,
    NDArray in_data, NDArray out_data,
    NDArray grad_out_data,
    NDArray grad_in_data) {
  const auto& ctx = graph.Context();
  // sanity check
  aten::CheckCtx(ctx,
      {in_data, out_data, grad_out_data, grad_in_data, in_mapping, out_mapping},
      {"in_data", "out_data", "grad_out_data", "grad_in_data",
       "in_mapping", "out_mapping"});
  CheckIdArray(graph.NumBits(),
      {in_mapping, out_mapping},
      {"in_mapping", "out_mapping"});
  if (!aten::IsNullArray(out_mapping)) {
    CHECK_EQ(ctx, out_mapping->ctx)
      << "Expected device context " << ctx
      << ". But got " << out_mapping->ctx << " for rhs_data.";
  }
  DGL_XPU_SWITCH(ctx.device_type, BackwardBinaryReduceImpl,
      reducer, binary_op::kUseLhs, graph,
      target, binary_op::kNone,
      in_mapping, aten::NullArray(), out_mapping,
      in_data, aten::NullArray(), out_data,
      grad_out_data,
      grad_in_data, aten::NullArray());
}

DGL_REGISTER_GLOBAL("_deprecate.kernel._CAPI_DGLKernelBackwardCopyReduce")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
    std::string reducer = args[0];
    int target = args[2];
    NDArray in_data = args[3];
    NDArray out_data = args[4];
    NDArray grad_out_data = args[5];
    NDArray grad_in_data = args[6];
    NDArray in_mapping = args[7];
    NDArray out_mapping = args[8];

    auto f = [&reducer, &target, &in_mapping, &out_mapping,
              &in_data, &out_data, &grad_out_data,
              &grad_in_data](const CSRWrapper& wrapper) {
      BackwardCopyReduce(reducer, wrapper,
          static_cast<binary_op::Target>(target),
          in_mapping, out_mapping, in_data, out_data,
          grad_out_data, grad_in_data);
    };
    csrwrapper_switch(args[1], f);
  });

}  // namespace kernel
}  // namespace dgl
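The trickiest part of the deleted file above is the shape preprocessing in CalcBcastInfo: shapes are walked from the last dimension, runs of equal dimensions are multiplied together, and only broadcast dimensions break the run. A standalone sketch of that flattening step written against plain std::vector rather than NDArray (illustrative only; the helper name is not from this repository):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Align two feature shapes to the same rank (prepend 1s) and flatten runs of
// equal, non-broadcast dimensions, mirroring what CalcBcastInfo does.
std::pair<std::vector<int64_t>, std::vector<int64_t>> AlignAndFlatten(
    std::vector<int64_t> lhs, std::vector<int64_t> rhs) {
  while (lhs.size() < rhs.size()) lhs.insert(lhs.begin(), 1);
  while (rhs.size() < lhs.size()) rhs.insert(rhs.begin(), 1);
  std::vector<int64_t> l, r;
  int64_t accum = 0;
  for (int i = static_cast<int>(lhs.size()) - 1; i >= 0; --i) {
    if (lhs[i] != rhs[i]) {            // broadcast dimension: flush the run
      if (accum != 0) { l.push_back(accum); r.push_back(accum); accum = 0; }
      l.push_back(lhs[i]); r.push_back(rhs[i]);
    } else {                           // equal dimension: extend the run
      accum = (accum == 0) ? lhs[i] : accum * lhs[i];
    }
  }
  if (accum != 0) { l.push_back(accum); r.push_back(accum); }
  std::reverse(l.begin(), l.end());
  std::reverse(r.begin(), r.end());
  return {l, r};
}

int main() {
  // (4, 1, 3, 3) and (4, 5, 3, 3) become (4, 1, 9) and (4, 5, 9),
  // as the comment in CalcBcastInfo describes.
  auto p = AlignAndFlatten({4, 1, 3, 3}, {4, 5, 3, 3});
  for (int64_t d : p.first) std::cout << d << ' ';
  std::cout << "| ";
  for (int64_t d : p.second) std::cout << d << ' ';
  std::cout << '\n';  // prints: 4 1 9 | 4 5 9
}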
src/kernel/binary_reduce.h (deleted, 100644 → 0) @ 75d793a1
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/binary_reduce.h
* \brief Binary reduce function C++ header.
*/
#ifndef DGL_KERNEL_BINARY_REDUCE_H_
#define DGL_KERNEL_BINARY_REDUCE_H_
#include <dgl/runtime/ndarray.h>
#include <vector>
#include <string>
#include "./binary_reduce_common.h"
#include "./csr_interface.h"
namespace dgl {
namespace kernel {

// Structure for broadcasting shapes
struct BcastInfo {
  // inferred output shape
  std::vector<int64_t> real_out_shape;
  // Following shapes here have been preprocessed, so that:
  //  - The first dimension (for graph) is removed. Shapes here are only for features.
  //  - They have the same number of dimensions.
  //    e.g. (4,) and (3, 4) become (1, 4) and (3, 4)
  //  - Continuous non-broadcasting dimenions are flattened.
  //    e.g. (4, 1, 3, 3) and (4, 5, 3, 3) become (4, 1, 9) and (4, 5, 9)
  std::vector<int64_t> lhs_shape, lhs_stride;
  std::vector<int64_t> rhs_shape, rhs_stride;
  std::vector<int64_t> out_shape, out_stride;
  int64_t data_len;
};
/*
* !\brief Compute the feature shape after binary reduce computation.
*/
std::vector<int64_t> InferBinaryFeatureShape(
    runtime::NDArray lhs,
    runtime::NDArray rhs);
/*!
* \brief Perform binary operation between the given data and reduce by the graph.
*
* If the reducer is one of "sum, "max, "min", "prod", the operator computes,
* for each node i,
*
* out[i] = Sigma_{j\in Neighbor(i)} ( A[s1(i, j, e)] op B[s2(i, j, e)] )
*
* , where A, B are two input feature tensors, op could be element-wise add/sub/div/mul.
* Depending on the lhs and rhs target, s1 and s2 will select the src/dst/edge
* ids of each neighbor.
*
* If the reducer is "none", the operator computes, for each edge e,
*
* out[e] = A[s1(i, j, e)] op B[s2(i, j, e)]
*
* Here, the node/edge feature (e.g., A[i], B[e]) could be dense tensor. In such
* case, broadcasting is supported on the feature dimensions.
*
* Examples:
*
* A.shape = (N, D1, D2) # N is the number of nodes
* B.shape = (M, D1, 1) # M is the number of edges
* C = BinaryOpReduce("sum", "add", graph, A, B, ...)
* C.shape = (N, D1, D2)
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
*/
void BinaryOpReduce(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping);
/*!
* \brief Compute the lhs gradient of BinaryOpReduce
*
* Broadcasting along feature dimensions is supported. However, the gradient
* of the being-broadcasted dimensions will *not* be reduced. Therefore, the
* gradient tensor has the same shape with the out tensor.
*
* Examples:
* A.shape = (N, D1, 1) # N is the number of nodes
* B.shape = (M, D1, D2) # M is the number of edges
* C = BinaryOpReduce("sum", "add", graph, A, B, ...)
* C.shape = (N, D1, D2)
* dC.shape = (N, D1, D2)
* dA = BackwardLhsBinaryOpReduce("sum", "add", graph, A, B, C, dC, ...)
* dA.shape = (N, D1, D2) # extra reduction should be handled afterwards
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
* \param grad_lhs_data The gradient lhs tensor.
*/
void BackwardLhsBinaryOpReduce(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray grad_out_data,
    runtime::NDArray grad_lhs_data);
/*!
* \brief Compute the rhs gradient of BinaryOpReduce
*
* Broadcasting along feature dimensions is supported. However, the gradient
* of the being-broadcasted dimensions will *not* be reduced. Therefore, the
* gradient tensor has the same shape with the out tensor.
*
* Examples:
* A.shape = (N, D1, D2) # N is the number of nodes
* B.shape = (M, D1, 1) # M is the number of edges
* C = BinaryOpReduce("sum", "add", graph, A, B, ...)
* C.shape = (N, D1, D2)
* dC.shape = (N, D1, D2)
* dB = BackwardRhsBinaryOpReduce("sum", "add", graph, A, B, C, dC, ...)
* dB.shape = (N, D1, D2) # extra reduction should be handled afterwards
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
* \param grad_rhs_data The gradient rhs tensor.
*/
void BackwardRhsBinaryOpReduce(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray grad_out_data,
    runtime::NDArray grad_rhs_data);
/*!
* \brief Copy the target data and reduce by graph structure.
*
* If the reducer is one of "sum, "max, "min", "prod", the operator computes,
* for each node i,
*
* out[i] = Sigma_{j\in Neighbor(i)} A[s1(i, j, e)]
*
* , where A, B are two input feature tensors.
* Depending on the lhs and rhs target, s1 and s2 will select the src/dst/edge
* ids of each neighbor.
*
* If the reducer is "none", the operator computes, for each edge e,
*
* out[e] = A[s1(i, j, e)]
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param graph The graph object.
* \param target The nput target (src, edge)
* \param in_data The input feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param in_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
*/
void CopyReduce(
    const std::string& reducer,
    const CSRWrapper& graph,
    binary_op::Target target,
    runtime::NDArray in_data, runtime::NDArray out_data,
    runtime::NDArray in_mapping, runtime::NDArray out_mapping);
/*!
* \brief Compute backward of the CopyReduce
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param graph The graph object.
* \param target The nput target (src, edge)
* \param in_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param in_data The input feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
* \param grad_in_data The gradient input tensor.
*/
void BackwardCopyReduce(
    const std::string& reducer,
    const CSRWrapper& graph,
    binary_op::Target target,
    runtime::NDArray in_mapping, runtime::NDArray out_mapping,
    runtime::NDArray in_data, runtime::NDArray out_data,
    runtime::NDArray grad_out_data,
    runtime::NDArray grad_in_data);

}  // namespace kernel
}  // namespace dgl
#endif // DGL_KERNEL_BINARY_REDUCE_H_
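For reference, the semantics documented above for BinaryOpReduce, out[i] = Σ_{j ∈ Neighbor(i)} (A[s1(i, j, e)] op B[s2(i, j, e)]), reduce to a simple nested loop when the features are scalar. A CPU sketch for reducer "sum", op "mul", lhs = src, rhs = edge over an in-edge CSR (illustrative only; the function name and the indptr/indices/eid layout are assumptions, not code from this repository):

#include <cstdint>
#include <vector>

// Reference loop for BinaryOpReduce("sum", "mul", lhs=src, rhs=edge) with
// scalar features; indptr/indices/eid describe the in-edge CSR of the graph.
std::vector<float> SumMulSrcEdge(const std::vector<int64_t>& indptr,
                                 const std::vector<int64_t>& indices,
                                 const std::vector<int64_t>& eid,
                                 const std::vector<float>& src_feat,
                                 const std::vector<float>& edge_feat) {
  std::vector<float> out(indptr.size() - 1, 0.f);
  for (size_t dst = 0; dst + 1 < indptr.size(); ++dst) {
    for (int64_t k = indptr[dst]; k < indptr[dst + 1]; ++k) {
      // A[s1(i, j, e)] op B[s2(i, j, e)] with s1 = source id, s2 = edge id
      out[dst] += src_feat[indices[k]] * edge_feat[eid[k]];
    }
  }
  return out;
}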
src/kernel/binary_reduce_common.h (deleted, 100644 → 0) @ 75d793a1
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/binary_reduce_common.h
* \brief Common utilities for binary reduce operation.
*/
#ifndef DGL_KERNEL_BINARY_REDUCE_COMMON_H_
#define DGL_KERNEL_BINARY_REDUCE_COMMON_H_
#include <dgl/runtime/ndarray.h>
#include <limits>
#include <string>
#include "./common.h"
namespace dgl {
namespace kernel {
namespace binary_op {

/*! \brief Reducer names. */
static const char kReduceSum[] = "sum";
static const char kReduceMax[] = "max";
static const char kReduceMin[] = "min";
static const char kReduceMean[] = "mean";
static const char kReduceProd[] = "prod";
static const char kReduceNone[] = "none";

/*! \brief Binary op names. */
static const char kAdd[] = "add";
static const char kSub[] = "sub";
static const char kMul[] = "mul";
static const char kDiv[] = "div";
static const char kDot[] = "dot";
static const char kUseLhs[] = "use_lhs";

/*!
 * \brief Enum code for operand targets.
 * \seealso BinaryOpReduce in binary_reduce_common.h
 */
enum Target {
  kSrc = 0,  // select src node
  kDst,      // select dst node
  kEdge,     // select edge
  kNone,     // select none
};

/*! \brief Enum code for backward operator mode. */
enum BackwardMode {
  kGradLhs = 0,  // compute lhs gradient
  kGradRhs,      // compute rhs gradient
  kGradBoth,     // compute both gradients
};

}  // namespace binary_op
//////////////////////////////////////////////////////////////////////////
// Defines operand target category. Each category is a structure with
// two static members:
// - target: The enum code of this category.
// - Call: The call functor that returns the selected target.
//////////////////////////////////////////////////////////////////////////
/*! \brief Select src category. */
struct SelectSrc {
  // Target value
  static constexpr binary_op::Target target = binary_op::kSrc;
  // Call functor.
  template <typename T>
  static DGLDEVICE DGLINLINE T Call(T src, T edge, T dst) {
    return src;
  }
};

/*! \brief Select dst category. */
struct SelectDst {
  // Target value
  static constexpr binary_op::Target target = binary_op::kDst;
  // Call functor.
  template <typename T>
  static DGLDEVICE DGLINLINE T Call(T src, T edge, T dst) {
    return dst;
  }
};

/*! \brief Select edge category. */
struct SelectEdge {
  // Target value
  static constexpr binary_op::Target target = binary_op::kEdge;
  // Call functor.
  template <typename T>
  static DGLDEVICE DGLINLINE T Call(T src, T edge, T dst) {
    return edge;
  }
};

/*! \brief Select none category. */
struct SelectNone {
  // Target value
  static constexpr binary_op::Target target = binary_op::kNone;
  // Call functor.
  template <typename T>
  static DGLDEVICE DGLINLINE T Call(T src, T edge, T dst) {
    return 0;
  }
};

/*! \brief Type functor to switch SelectSrc and SelectDst category.
 *  SelectEdge and SelectNone will remain the same. */
template <typename Selector>
struct SwitchSrcDst {
  typedef Selector Type;
};

template <>
struct SwitchSrcDst<SelectSrc> {
  typedef SelectDst Type;
};

template <>
struct SwitchSrcDst<SelectDst> {
  typedef SelectSrc Type;
};
//////////////////////////////////////////////////////////////////////////
// Defines binary op category. Each category is a structure with
// three static members:
// - Call: The forward computation given two operand.
// - BackwardLhs: Compute lhs gradient.
// - BackwardRhs: Compute rhs gradient.
//////////////////////////////////////////////////////////////////////////
// common binary functors
template <typename DType>
struct BinaryAdd {
  static DGLDEVICE DGLINLINE DType Call(const DType* lhs, const DType* rhs, int64_t len) {
    return lhs[0] + rhs[0];
  }
  static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) {
    return 1;
  }
  static DGLDEVICE DGLINLINE DType BackwardRhs(DType lhs, DType rhs, DType out) {
    return 1;
  }
};

template <typename DType>
struct BinaryMul {
  static DGLDEVICE DGLINLINE DType Call(const DType* lhs, const DType* rhs, int64_t len) {
    return lhs[0] * rhs[0];
  }
  static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) {
    return rhs;
  }
  static DGLDEVICE DGLINLINE DType BackwardRhs(DType lhs, DType rhs, DType out) {
    return lhs;
  }
};

template <typename DType>
struct BinarySub {
  static DGLDEVICE DGLINLINE DType Call(const DType* lhs, const DType* rhs, int64_t len) {
    return lhs[0] - rhs[0];
  }
  static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) {
    return 1;
  }
  static DGLDEVICE DGLINLINE DType BackwardRhs(DType lhs, DType rhs, DType out) {
    return -1;
  }
};

template <typename DType>
struct BinaryDiv {
  static DGLDEVICE DGLINLINE DType Call(const DType* lhs, const DType* rhs, int64_t len) {
    return lhs[0] / rhs[0];
  }
  static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) {
    return static_cast<DType>(1) / rhs;
  }
  static DGLDEVICE DGLINLINE DType BackwardRhs(DType lhs, DType rhs, DType out) {
    return -lhs / (rhs * rhs);
  }
};

template <typename DType>
struct BinaryUseLhs {
  static DGLDEVICE DGLINLINE DType Call(const DType* lhs, const DType* rhs, int64_t len) {
    return lhs[0];
  }
  static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) {
    return 1;
  }
  static DGLDEVICE DGLINLINE DType BackwardRhs(DType lhs, DType rhs, DType out) {
    return 0;
  }
};

template <typename DType>
struct BinaryDot {
  static DGLDEVICE DGLINLINE DType Call(const DType* lhs, const DType* rhs, int64_t len) {
    DType out = 0;
    // simple vector dot vector
#pragma unroll
    for (int i = 0; i < len; i++)
      out += lhs[i] * rhs[i];
    return out;
  }
  static DGLDEVICE DGLINLINE DType BackwardLhs(DType lhs, DType rhs, DType out) {
    return rhs;
  }
  static DGLDEVICE DGLINLINE DType BackwardRhs(DType lhs, DType rhs, DType out) {
    return lhs;
  }
};
// Macro for dispatching op enum code and target code into template arguments.
// The macro dispatches following combinations:
// - Add(Src, Dst), Add(Src, Edge), Add(Dst, Edge)
// - Mul(Src, Dst), Mul(Src, Edge), Mul(Dst, Edge)
// - Sub(Src, Dst), Sub(Src, Edge), Sub(Dst, Edge)
// Sub(Dst, Src), Sub(Edge, Src), Sub(Edge, Dst)
// - Div(Src, Dst), Div(Src, Edge), Div(Dst, Edge)
// Div(Dst, Src), Div(Edge, Src), Div(Edge, Dst)
// - UseLhs(Src, None), UseLhs(Edge, None)
// - Dot(Src, Dst), Dot(Src, Edge), Dot(Dst, Edge)
// - Dot(Dst, Src), Dot(Edge, Src), Dot(Edge, Dst)
// Note that for commutative operators (e.g. Add and Mul), we only generate
// kernels for lhs code smaller than rhs code.
#define OP_TARGET_SWITCH(op, lhs, rhs, DType, OpType, LeftType, RightType, ...) \
{ \
using namespace binary_op; \
if (op == kAdd && lhs == kSrc && rhs == kDst) { \
typedef BinaryAdd<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectDst RightType; \
{__VA_ARGS__} \
} else if (op == kAdd && lhs == kSrc && rhs == kEdge) { \
typedef BinaryAdd<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kAdd && lhs == kDst && rhs == kEdge) { \
typedef BinaryAdd<DType> OpType; \
typedef SelectDst LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kMul && lhs == kSrc && rhs == kDst) { \
typedef BinaryMul<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectDst RightType; \
{__VA_ARGS__} \
} else if (op == kMul && lhs == kSrc && rhs == kEdge) { \
typedef BinaryMul<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kMul && lhs == kDst && rhs == kEdge) { \
typedef BinaryMul<DType> OpType; \
typedef SelectDst LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kSub && lhs == kSrc && rhs == kDst) { \
typedef BinarySub<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectDst RightType; \
{__VA_ARGS__} \
} else if (op == kSub && lhs == kDst && rhs == kSrc) { \
typedef BinarySub<DType> OpType; \
typedef SelectDst LeftType; \
typedef SelectSrc RightType; \
{__VA_ARGS__} \
} else if (op == kSub && lhs == kSrc && rhs == kEdge) { \
typedef BinarySub<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kSub && lhs == kEdge && rhs == kSrc) { \
typedef BinarySub<DType> OpType; \
typedef SelectEdge LeftType; \
typedef SelectSrc RightType; \
{__VA_ARGS__} \
} else if (op == kSub && lhs == kDst && rhs == kEdge) { \
typedef BinarySub<DType> OpType; \
typedef SelectDst LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kSub && lhs == kEdge && rhs == kDst) { \
typedef BinarySub<DType> OpType; \
typedef SelectEdge LeftType; \
typedef SelectDst RightType; \
{__VA_ARGS__} \
} else if (op == kDiv && lhs == kSrc && rhs == kDst) { \
typedef BinaryDiv<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectDst RightType; \
{__VA_ARGS__} \
} else if (op == kDiv && lhs == kDst && rhs == kSrc) { \
typedef BinaryDiv<DType> OpType; \
typedef SelectDst LeftType; \
typedef SelectSrc RightType; \
{__VA_ARGS__} \
} else if (op == kDiv && lhs == kSrc && rhs == kEdge) { \
typedef BinaryDiv<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kDiv && lhs == kEdge && rhs == kSrc) { \
typedef BinaryDiv<DType> OpType; \
typedef SelectEdge LeftType; \
typedef SelectSrc RightType; \
{__VA_ARGS__} \
} else if (op == kDiv && lhs == kDst && rhs == kEdge) { \
typedef BinaryDiv<DType> OpType; \
typedef SelectDst LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kDiv && lhs == kEdge && rhs == kDst) { \
typedef BinaryDiv<DType> OpType; \
typedef SelectEdge LeftType; \
typedef SelectDst RightType; \
{__VA_ARGS__} \
} else if (op == kUseLhs && lhs == kSrc) { \
typedef BinaryUseLhs<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectNone RightType; \
{__VA_ARGS__} \
} else if (op == kUseLhs && lhs == kEdge) { \
typedef BinaryUseLhs<DType> OpType; \
typedef SelectEdge LeftType; \
typedef SelectNone RightType; \
{__VA_ARGS__} \
} else if (op == kDot && lhs == kSrc && rhs == kDst) { \
typedef BinaryDot<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectDst RightType; \
{__VA_ARGS__} \
} else if (op == kDot && lhs == kSrc && rhs == kEdge) { \
typedef BinaryDot<DType> OpType; \
typedef SelectSrc LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kDot && lhs == kDst && rhs == kEdge) { \
typedef BinaryDot<DType> OpType; \
typedef SelectDst LeftType; \
typedef SelectEdge RightType; \
{__VA_ARGS__} \
} else if (op == kDot && lhs == kDst && rhs == kSrc) { \
typedef BinaryDot<DType> OpType; \
typedef SelectDst LeftType; \
typedef SelectSrc RightType; \
{__VA_ARGS__} \
} else if (op == kDot && lhs == kEdge && rhs == kSrc) { \
typedef BinaryDot<DType> OpType; \
typedef SelectEdge LeftType; \
typedef SelectSrc RightType; \
{__VA_ARGS__} \
} else if (op == kDot && lhs == kEdge && rhs == kDst) { \
typedef BinaryDot<DType> OpType; \
typedef SelectEdge LeftType; \
typedef SelectDst RightType; \
{__VA_ARGS__} \
} else { \
LOG(FATAL) << "Unsupported operation: op=" << op \
<< " lhs=" << lhs << " rhs=" << rhs; \
} \
}
// Macro for unrolling with various template argument combinations
#define GEN_OP_TARGET(GEN, ...) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectDst, BinaryAdd)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectEdge, BinaryAdd)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinaryAdd)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectDst, BinaryMul)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectEdge, BinaryMul)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinaryMul)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectDst, BinarySub)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectSrc, BinarySub)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectEdge, BinarySub)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectSrc, BinarySub)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinarySub)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectDst, BinarySub)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectDst, BinaryDiv)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectSrc, BinaryDiv)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectEdge, BinaryDiv)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectSrc, BinaryDiv)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinaryDiv)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectDst, BinaryDiv)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectNone, BinaryUseLhs)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectNone, BinaryUseLhs)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectDst, BinaryDot)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectSrc, SelectEdge, BinaryDot)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectEdge, BinaryDot)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectDst, SelectSrc, BinaryDot)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectSrc, BinaryDot)) \
MSVC_EXPAND(GEN(__VA_ARGS__, SelectEdge, SelectDst, BinaryDot))
//////////////////////////////////////////////////////////////////////////
// Defines reducer category. Each category is an empty structure.
// The call functor is device dependent, so should be specialized
// in the each device's implementation.
// See Also:
// - kernel/cpu/functor.h
// - kernel/cuda/functor.cuh
//////////////////////////////////////////////////////////////////////////
// functors for reducers
template <int XPU, typename DType>
struct ReduceSum { };

template <int XPU, typename DType>
struct ReduceMax { };

template <int XPU, typename DType>
struct ReduceMin { };

template <int XPU, typename DType>
struct ReduceProd { };

template <int XPU, typename DType>
struct ReduceNone { };
// Macro for dispatching reducer names to Reducer op structure
#define REDUCER_SWITCH(val, XPU, DType, RedType, ...) \
if (val == binary_op::kReduceSum \
|| val == binary_op::kReduceMean) { \
typedef ReduceSum<XPU, DType> RedType; \
{__VA_ARGS__} \
} else if (val == binary_op::kReduceMax) { \
typedef ReduceMax<XPU, DType> RedType; \
{__VA_ARGS__} \
} else if (val == binary_op::kReduceMin) { \
typedef ReduceMin<XPU, DType> RedType; \
{__VA_ARGS__} \
} else if (val == binary_op::kReduceProd) { \
typedef ReduceProd<XPU, DType> RedType; \
{__VA_ARGS__} \
} else if (val == binary_op::kReduceNone) { \
typedef ReduceNone<XPU, DType> RedType; \
{__VA_ARGS__} \
} else { \
LOG(FATAL) << "Unsupported reducer: " << val; \
}
// Type trait for getting zero value of the given reducer type.
template <typename Reducer>
struct Zero { };

template <int XPU, typename DType>
struct Zero<ReduceSum<XPU, DType>> {
  static constexpr DType value = 0;
};

template <int XPU, typename DType>
struct Zero<ReduceMax<XPU, DType>> {
  static constexpr DType value = std::numeric_limits<DType>::lowest();
};

template <int XPU, typename DType>
struct Zero<ReduceMin<XPU, DType>> {
  static constexpr DType value = std::numeric_limits<DType>::max();
};

template <int XPU, typename DType>
struct Zero<ReduceProd<XPU, DType>> {
  static constexpr DType value = 1;
};

template <int XPU, typename DType>
struct Zero<ReduceNone<XPU, DType>> {
  static constexpr DType value = 0;
};

template <int XPU, typename DType>
constexpr DType Zero<ReduceSum<XPU, DType>>::value;

template <int XPU, typename DType>
constexpr DType Zero<ReduceMax<XPU, DType>>::value;

template <int XPU, typename DType>
constexpr DType Zero<ReduceMin<XPU, DType>>::value;

template <int XPU, typename DType>
constexpr DType Zero<ReduceProd<XPU, DType>>::value;

template <int XPU, typename DType>
constexpr DType Zero<ReduceNone<XPU, DType>>::value;
// Type functor for selecting output target based on reducer type.
/*! \brief For all the reducer types except ReduceNone, select dst as the output target. */
template <typename Reducer>
struct OutSelector {
  typedef SelectDst Type;
};

/*! \brief For ReduceNone, select edge as the output target. */
template <int XPU, typename DType>
struct OutSelector<ReduceNone<XPU, DType>> {
  typedef SelectEdge Type;
};
// macro for dispatching number of broadcasting dimensions to template argument
#define BCAST_NDIM_SWITCH(ndim, NDim, ...) \
if (ndim <= 2) { \
constexpr int NDim = 2; \
{__VA_ARGS__} \
} else if (ndim <= 4) { \
constexpr int NDim = 4; \
{__VA_ARGS__} \
} else if (ndim <= 8) { \
constexpr int NDim = 8; \
{__VA_ARGS__} \
} else { \
LOG(FATAL) << "Too many broadcasting dimensions."; \
}
// macro for unrolling different broadcasting dimensions
#define GEN_NDIM(GEN, ...) \
MSVC_EXPAND(GEN(__VA_ARGS__, 2)) \
MSVC_EXPAND(GEN(__VA_ARGS__, 4)) \
MSVC_EXPAND(GEN(__VA_ARGS__, 8))
// macro for dispatching backward mode enum to template argument
#define BACKWARD_MODE_SWITCH(req_lhs, req_rhs, Mode, ...) \
CHECK(!(req_lhs && req_rhs)); \
if (req_lhs) { \
constexpr int Mode = binary_op::kGradLhs; \
{__VA_ARGS__} \
} else { \
constexpr int Mode = binary_op::kGradRhs; \
{__VA_ARGS__} \
}
// macro for unrolling different backward mode
#define GEN_BACKWARD_MODE(GEN, ...) \
MSVC_EXPAND(GEN(__VA_ARGS__, binary_op::kGradLhs)) \
MSVC_EXPAND(GEN(__VA_ARGS__, binary_op::kGradRhs)) \
MSVC_EXPAND(GEN(__VA_ARGS__, binary_op::kGradBoth))
}  // namespace kernel
}  // namespace dgl
#endif // DGL_KERNEL_BINARY_REDUCE_COMMON_H_
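The *_SWITCH macros above all follow one pattern: enumerate every allowed runtime combination and bind each to concrete template types, so that the code passed in __VA_ARGS__ is instantiated once per combination. A minimal self-contained version of that dispatch pattern, using hypothetical names (MY_REDUCER_SWITCH, SumReducer, MaxReducer) rather than DGL's:

#include <iostream>
#include <string>

template <typename T> struct SumReducer { static T Combine(T a, T b) { return a + b; } };
template <typename T> struct MaxReducer { static T Combine(T a, T b) { return a > b ? a : b; } };

// Dispatch a runtime reducer name to a compile-time Reducer type, then run the
// body in __VA_ARGS__ with that type bound, mirroring REDUCER_SWITCH above.
#define MY_REDUCER_SWITCH(name, DType, Reducer, ...)        \
  if (name == "sum") {                                      \
    typedef SumReducer<DType> Reducer;                      \
    { __VA_ARGS__ }                                         \
  } else if (name == "max") {                               \
    typedef MaxReducer<DType> Reducer;                      \
    { __VA_ARGS__ }                                         \
  } else {                                                  \
    std::cerr << "Unsupported reducer: " << name << '\n';   \
  }

int main() {
  std::string reducer = "max";
  MY_REDUCER_SWITCH(reducer, float, Red, {
    std::cout << Red::Combine(2.f, 5.f) << '\n';  // prints 5
  });
}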
src/kernel/binary_reduce_impl.h (deleted, 100644 → 0) @ 75d793a1
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/binary_reduce_impl.h
* \brief Implementations of binary reduce operations.
*/
#ifndef DGL_KERNEL_BINARY_REDUCE_IMPL_H_
#define DGL_KERNEL_BINARY_REDUCE_IMPL_H_
#include <minigun/minigun.h>
#include <dgl/runtime/device_api.h>
#include <algorithm>
#include <string>
#ifdef __CUDACC__
#include "../runtime/cuda/cuda_common.h"
#endif
#include "./binary_reduce.h"
#include "./binary_reduce_impl_decl.h"
#include "./csr_interface.h"
#include "./utils.h"
namespace dgl {
namespace kernel {
///////////////////////////////////////////////////////////////////////////////
// BinaryReduce device-agnostic implementation
///////////////////////////////////////////////////////////////////////////////
template <int XPU, typename Idx, typename DType, typename Reducer>
GData<Idx, DType> AllocGData(
    const std::string& op,
    const DLContext& ctx, int64_t x_len,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_mapping, runtime::NDArray out_data) {
  // GData
  GData<Idx, DType> gdata;
  gdata.x_length = x_len;
  gdata.lhs_data = static_cast<DType*>(lhs_data->data);
  gdata.rhs_data = static_cast<DType*>(rhs_data->data);
  gdata.out_data = static_cast<DType*>(out_data->data);
  if (!aten::IsNullArray(lhs_mapping)) {
    gdata.lhs_mapping = static_cast<Idx*>(lhs_mapping->data);
  }
  if (!aten::IsNullArray(rhs_mapping)) {
    gdata.rhs_mapping = static_cast<Idx*>(rhs_mapping->data);
  }
  if (!aten::IsNullArray(out_mapping)) {
    gdata.out_mapping = static_cast<Idx*>(out_mapping->data);
  }
  // for dot operation: vector [dot] vector
  if (op == binary_op::kDot) {
    // get size of vector
    gdata.data_len = lhs_data->shape[lhs_data->ndim - 1];
  } else {
    gdata.data_len = 1;
  }
  // fill out data with zero values
  utils::Fill<XPU>(ctx, gdata.out_data, utils::NElements(out_data),
                   Zero<Reducer>::value);
  return gdata;
}
template <int XPU>
void BinaryReduceImpl(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping) {
  using runtime::NDArray;
  using minigun::Csr;
  // device
#ifdef __CUDACC__
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
#endif
  const int64_t x_len = utils::ComputeXLength(out_data);

  // advance config
  minigun::advance::RuntimeConfig rtcfg;
  rtcfg.ctx = out_data->ctx;
#ifdef __CUDACC__
  rtcfg.stream = thr_entry->stream;
  const int nt = utils::FindNumThreads(x_len, 64);
  rtcfg.data_num_threads = nt;
  // XXX(minjie): hard-code to let each thread compute two elements to increase
  // instruction level parallelism
  rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2);
#endif

  if (reducer == binary_op::kReduceMean) {
    // TODO(minjie): divide
    LOG(FATAL) << "reduce mean is not supported.";
  }
  const DLDataType& dtype = out_data->dtype;
  const auto bits = graph.NumBits();
  DGL_DTYPE_SWITCH(dtype, DType, {
    DGL_IDX_TYPE_SWITCH(bits, Idx, {
      REDUCER_SWITCH(reducer, XPU, DType, Reducer, {
        auto gdata = AllocGData<XPU, Idx, DType, Reducer>(
            op, rtcfg.ctx, x_len, lhs_mapping, rhs_mapping,
            lhs_data, rhs_data, out_mapping, out_data);
        OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, {
          CallBinaryReduce<XPU, Idx, DType, LeftTarget,
                           RightTarget, BinaryOp, Reducer>(rtcfg, graph, &gdata);
        });
      });
    });
  });
}
///////////////////////////////////////////////////////////////////////////////
// BackwardBinaryReduce device-agnostic implementation
///////////////////////////////////////////////////////////////////////////////
template <int XPU, typename Idx, typename DType>
BackwardGData<Idx, DType> AllocBackwardGData(
    const std::string& op,
    const DLContext& ctx, int64_t x_len,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray grad_out_data,
    runtime::NDArray grad_lhs_data, runtime::NDArray grad_rhs_data) {
  // GData
  BackwardGData<Idx, DType> gdata;
  gdata.x_length = x_len;
  gdata.lhs_data = static_cast<DType*>(lhs_data->data);
  gdata.rhs_data = static_cast<DType*>(rhs_data->data);
  gdata.out_data = static_cast<DType*>(out_data->data);
  gdata.grad_out_data = static_cast<DType*>(grad_out_data->data);
  if (!aten::IsNullArray(grad_lhs_data)) {
    gdata.grad_lhs_data = static_cast<DType*>(grad_lhs_data->data);
    // fill out data with zero values
    utils::Fill<XPU>(ctx, gdata.grad_lhs_data, utils::NElements(grad_lhs_data),
                     static_cast<DType>(0));
  }
  if (!aten::IsNullArray(grad_rhs_data)) {
    gdata.grad_rhs_data = static_cast<DType*>(grad_rhs_data->data);
    // fill out data with zero values
    utils::Fill<XPU>(ctx, gdata.grad_rhs_data, utils::NElements(grad_rhs_data),
                     static_cast<DType>(0));
  }
  if (!aten::IsNullArray(lhs_mapping)) {
    gdata.lhs_mapping = static_cast<Idx*>(lhs_mapping->data);
  }
  if (!aten::IsNullArray(rhs_mapping)) {
    gdata.rhs_mapping = static_cast<Idx*>(rhs_mapping->data);
  }
  if (!aten::IsNullArray(out_mapping)) {
    gdata.out_mapping = static_cast<Idx*>(out_mapping->data);
  }
  // for dot operation: vector [dot] vector
  if (op == binary_op::kDot) {
    // get size of vector
    gdata.data_len = lhs_data->shape[lhs_data->ndim - 1];
  } else {
    gdata.data_len = 1;
  }
  return gdata;
}
template
<
int
XPU
>
void
BackwardBinaryReduceImpl
(
const
std
::
string
&
reducer
,
const
std
::
string
&
op
,
const
CSRWrapper
&
graph
,
binary_op
::
Target
lhs
,
binary_op
::
Target
rhs
,
runtime
::
NDArray
lhs_mapping
,
runtime
::
NDArray
rhs_mapping
,
runtime
::
NDArray
out_mapping
,
runtime
::
NDArray
lhs_data
,
runtime
::
NDArray
rhs_data
,
runtime
::
NDArray
out_data
,
runtime
::
NDArray
grad_out_data
,
runtime
::
NDArray
grad_lhs_data
,
runtime
::
NDArray
grad_rhs_data
)
{
using
runtime
::
NDArray
;
using
minigun
::
Csr
;
#ifdef __CUDACC__
// device
auto
*
thr_entry
=
runtime
::
CUDAThreadEntry
::
ThreadLocal
();
#endif
// Graph
const
int64_t
x_len
=
utils
::
ComputeXLength
(
out_data
);
// advance config
minigun
::
advance
::
RuntimeConfig
rtcfg
;
rtcfg
.
ctx
=
out_data
->
ctx
;
#ifdef __CUDACC__
rtcfg
.
stream
=
thr_entry
->
stream
;
const
int
nt
=
utils
::
FindNumThreads
(
x_len
,
64
);
rtcfg
.
data_num_threads
=
nt
;
// XXX(minjie): hard-code to let each thread compute two elements to increase
// instruction level parallelism
rtcfg
.
data_num_blocks
=
(
x_len
+
(
nt
*
2
)
-
1
)
/
(
nt
*
2
);
#endif
const
DLDataType
&
dtype
=
out_data
->
dtype
;
const
bool
req_lhs
=
!
aten
::
IsNullArray
(
grad_lhs_data
);
const
bool
req_rhs
=
!
aten
::
IsNullArray
(
grad_rhs_data
);
const
auto
bits
=
graph
.
NumBits
();
if
(
reducer
==
binary_op
::
kReduceMean
)
{
// TODO(minjie): divide
LOG
(
FATAL
)
<<
"reduce mean is not supported."
;
}
DGL_DTYPE_SWITCH
(
dtype
,
DType
,
{
DGL_IDX_TYPE_SWITCH
(
bits
,
Idx
,
{
auto
gdata
=
AllocBackwardGData
<
XPU
,
Idx
,
DType
>
(
op
,
rtcfg
.
ctx
,
x_len
,
lhs_mapping
,
rhs_mapping
,
out_mapping
,
lhs_data
,
rhs_data
,
out_data
,
grad_out_data
,
grad_lhs_data
,
grad_rhs_data
);
BACKWARD_MODE_SWITCH
(
req_lhs
,
req_rhs
,
Mode
,
{
REDUCER_SWITCH
(
reducer
,
XPU
,
DType
,
Reducer
,
{
OP_TARGET_SWITCH
(
op
,
lhs
,
rhs
,
DType
,
BinaryOp
,
LeftTarget
,
RightTarget
,
{
CallBackwardBinaryReduce
<
XPU
,
Mode
,
Idx
,
DType
,
LeftTarget
,
RightTarget
,
BinaryOp
,
Reducer
>
(
rtcfg
,
graph
,
&
gdata
);
});
});
});
});
});
}
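Note: BACKWARD_MODE_SWITCH is defined elsewhere in the tree. As a rough, illustrative sketch of what the `req_lhs`/`req_rhs` pair selects (the enum names mirror the kGradLhs/kGradRhs/kGradBoth codes used in the UDFs below, but the values and dispatch here are assumptions, not the actual macro):

#include <iostream>

// Illustrative mode codes; the real ones live in binary_op::BackwardMode.
enum BackwardModeSketch { kGradLhsSketch = 0, kGradRhsSketch = 1, kGradBothSketch = 2 };

// Only the gradients that were actually requested get a kernel instantiation.
inline BackwardModeSketch SelectMode(bool req_lhs, bool req_rhs) {
  if (req_lhs && req_rhs) return kGradBothSketch;
  if (req_lhs) return kGradLhsSketch;
  return kGradRhsSketch;
}

int main() {
  std::cout << SelectMode(true, false) << " "   // only grad_lhs requested
            << SelectMode(false, true) << " "   // only grad_rhs requested
            << SelectMode(true, true) << "\n";  // both requested
  return 0;
}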
///////////////////////////////////////////////////////////////////////////////
// BinaryReduceBcast device-agnostic implementation
///////////////////////////////////////////////////////////////////////////////

template <int XPU, int NDim, typename Idx, typename DType, typename Reducer>
BcastGData<NDim, Idx, DType> AllocBcastGData(
    const DLContext& ctx, const BcastInfo& info,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_mapping, runtime::NDArray out_data) {
  // GData
  BcastGData<NDim, Idx, DType> gdata;
  // dim, shape and stride
  gdata.ndim = info.lhs_shape.size();
  std::copy(info.lhs_shape.begin(), info.lhs_shape.end(), gdata.lhs_shape);
  std::copy(info.lhs_stride.begin(), info.lhs_stride.end(), gdata.lhs_stride);
  std::copy(info.rhs_shape.begin(), info.rhs_shape.end(), gdata.rhs_shape);
  std::copy(info.rhs_stride.begin(), info.rhs_stride.end(), gdata.rhs_stride);
  std::copy(info.out_shape.begin(), info.out_shape.end(), gdata.out_shape);
  std::copy(info.out_stride.begin(), info.out_stride.end(), gdata.out_stride);
  gdata.lhs_len = utils::Prod(info.lhs_shape);
  gdata.rhs_len = utils::Prod(info.rhs_shape);
  gdata.out_len = utils::Prod(info.out_shape);
  // data
  gdata.lhs_data = static_cast<DType*>(lhs_data->data);
  gdata.rhs_data = static_cast<DType*>(rhs_data->data);
  gdata.out_data = static_cast<DType*>(out_data->data);
  if (!aten::IsNullArray(lhs_mapping)) {
    gdata.lhs_mapping = static_cast<Idx*>(lhs_mapping->data);
  }
  if (!aten::IsNullArray(rhs_mapping)) {
    gdata.rhs_mapping = static_cast<Idx*>(rhs_mapping->data);
  }
  if (!aten::IsNullArray(out_mapping)) {
    gdata.out_mapping = static_cast<Idx*>(out_mapping->data);
  }
  gdata.data_len = info.data_len;
  // fill out data with zero values
  utils::Fill<XPU>(ctx, gdata.out_data, utils::NElements(out_data), Zero<Reducer>::value);
  return gdata;
}

template <int XPU>
void BinaryReduceBcastImpl(
    const BcastInfo& info,
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping) {
  using runtime::NDArray;
  using minigun::Csr;
#ifdef __CUDACC__
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
#endif
  // advance config
  minigun::advance::RuntimeConfig rtcfg;
  rtcfg.ctx = out_data->ctx;
#ifdef __CUDACC__
  rtcfg.stream = thr_entry->stream;
  const int64_t x_len = utils::ComputeXLength(out_data);
  const int nt = utils::FindNumThreads(x_len, 64);
  rtcfg.data_num_threads = nt;
  // XXX(minjie): hard-code to let each thread compute two elements to increase
  // instruction level parallelism
  rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2);
#endif

  const DLDataType& dtype = out_data->dtype;
  const int bcast_ndim = info.out_shape.size();
  const auto bits = graph.NumBits();

  if (reducer == binary_op::kReduceMean) {
    // TODO(minjie): divide
    LOG(FATAL) << "reduce mean is not supported.";
  }
  DGL_DTYPE_SWITCH(dtype, DType, {
    DGL_IDX_TYPE_SWITCH(bits, Idx, {
      REDUCER_SWITCH(reducer, XPU, DType, Reducer, {
        BCAST_NDIM_SWITCH(bcast_ndim, NDim, {
          auto gdata = AllocBcastGData<XPU, NDim, Idx, DType, Reducer>(
              rtcfg.ctx, info, lhs_mapping, rhs_mapping,
              lhs_data, rhs_data, out_mapping, out_data);
          OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, {
            CallBinaryReduceBcast<XPU, NDim, Idx, DType, LeftTarget,
              RightTarget, BinaryOp, Reducer>(rtcfg, graph, &gdata);
          });
        });
      });
    });
  });
}
///////////////////////////////////////////////////////////////////////////////
// BackwardBinaryReduceBcast device-agnostic implementation
///////////////////////////////////////////////////////////////////////////////

template <int XPU, int NDim, typename Idx, typename DType>
BackwardBcastGData<NDim, Idx, DType> AllocBackwardBcastGData(
    const DLContext& ctx, const BcastInfo& info,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
    runtime::NDArray lhs, runtime::NDArray rhs, runtime::NDArray out, runtime::NDArray grad_out,
    runtime::NDArray grad_lhs, runtime::NDArray grad_rhs) {
  // GData
  BackwardBcastGData<NDim, Idx, DType> gdata;
  // dim, shape and stride
  gdata.ndim = info.lhs_shape.size();
  gdata.lhs_len = utils::Prod(info.lhs_shape);
  gdata.rhs_len = utils::Prod(info.rhs_shape);
  gdata.out_len = utils::Prod(info.out_shape);
  std::copy(info.lhs_shape.begin(), info.lhs_shape.end(), gdata.lhs_shape);
  std::copy(info.lhs_stride.begin(), info.lhs_stride.end(), gdata.lhs_stride);
  std::copy(info.rhs_shape.begin(), info.rhs_shape.end(), gdata.rhs_shape);
  std::copy(info.rhs_stride.begin(), info.rhs_stride.end(), gdata.rhs_stride);
  std::copy(info.out_shape.begin(), info.out_shape.end(), gdata.out_shape);
  std::copy(info.out_stride.begin(), info.out_stride.end(), gdata.out_stride);
  // mappings
  if (!aten::IsNullArray(lhs_mapping)) {
    gdata.lhs_mapping = static_cast<Idx*>(lhs_mapping->data);
  }
  if (!aten::IsNullArray(rhs_mapping)) {
    gdata.rhs_mapping = static_cast<Idx*>(rhs_mapping->data);
  }
  if (!aten::IsNullArray(out_mapping)) {
    gdata.out_mapping = static_cast<Idx*>(out_mapping->data);
  }
  gdata.data_len = info.data_len;
  // data
  gdata.lhs_data = static_cast<DType*>(lhs->data);
  gdata.rhs_data = static_cast<DType*>(rhs->data);
  gdata.out_data = static_cast<DType*>(out->data);
  gdata.grad_out_data = static_cast<DType*>(grad_out->data);
  if (!aten::IsNullArray(grad_lhs)) {
    gdata.grad_lhs_data = static_cast<DType*>(grad_lhs->data);
    // fill out data with zero values
    utils::Fill<XPU>(ctx, gdata.grad_lhs_data, utils::NElements(grad_lhs),
                     static_cast<DType>(0));
  }
  if (!aten::IsNullArray(grad_rhs)) {
    gdata.grad_rhs_data = static_cast<DType*>(grad_rhs->data);
    // fill out data with zero values
    utils::Fill<XPU>(ctx, gdata.grad_rhs_data, utils::NElements(grad_rhs),
                     static_cast<DType>(0));
  }
  return gdata;
}

template <int XPU>
void BackwardBinaryReduceBcastImpl(
    const BcastInfo& info,
    const std::string& reducer, const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs_tgt, binary_op::Target rhs_tgt,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
    runtime::NDArray lhs, runtime::NDArray rhs, runtime::NDArray out, runtime::NDArray grad_out,
    runtime::NDArray grad_lhs, runtime::NDArray grad_rhs) {
  using runtime::NDArray;
  using minigun::Csr;
#ifdef __CUDACC__
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
#endif
  // advance config
  minigun::advance::RuntimeConfig rtcfg;
  rtcfg.ctx = out->ctx;
#ifdef __CUDACC__
  rtcfg.stream = thr_entry->stream;
  const int64_t x_len = utils::ComputeXLength(out);
  const int nt = utils::FindNumThreads(x_len, 64);
  rtcfg.data_num_threads = nt;
  // XXX(minjie): hard-code to let each thread compute two elements to increase
  // instruction level parallelism
  rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2);
#endif

  const DLDataType& dtype = out->dtype;
  const int bcast_ndim = info.out_shape.size();
  const bool req_lhs = !aten::IsNullArray(grad_lhs);
  const bool req_rhs = !aten::IsNullArray(grad_rhs);
  const auto bits = graph.NumBits();

  if (reducer == binary_op::kReduceMean) {
    // TODO(minjie): divide
    LOG(FATAL) << "reduce mean is not supported.";
  }
  DGL_DTYPE_SWITCH(dtype, DType, {
    DGL_IDX_TYPE_SWITCH(bits, Idx, {
      BCAST_NDIM_SWITCH(bcast_ndim, NDim, {
        auto gdata = AllocBackwardBcastGData<XPU, NDim, Idx, DType>(
            rtcfg.ctx, info, lhs_mapping, rhs_mapping, out_mapping,
            lhs, rhs, out, grad_out, grad_lhs, grad_rhs);
        BACKWARD_MODE_SWITCH(req_lhs, req_rhs, Mode, {
          REDUCER_SWITCH(reducer, XPU, DType, Reducer, {
            OP_TARGET_SWITCH(op, lhs_tgt, rhs_tgt, DType, BinaryOp, LeftTarget, RightTarget, {
              CallBackwardBinaryReduceBcast<XPU, Mode, NDim, Idx, DType,
                LeftTarget, RightTarget, BinaryOp, Reducer>(rtcfg, graph, &gdata);
            });
          });
        });
      });
    });
  });
}

}  // namespace kernel
}  // namespace dgl

#endif  // DGL_KERNEL_BINARY_REDUCE_IMPL_H_
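Note: the Alloc*GData helpers above pre-fill the output buffer with `Zero<Reducer>::value` before any edge is reduced in. That trait is declared in binary_reduce_common.h, which is not part of this diff page, so the sketch below only illustrates the idea (identity element per reducer) under assumed tag and trait names.

#include <iostream>
#include <limits>

// Hypothetical reducer tags standing in for sum/max/min reducers.
struct SumTag {};
struct MaxTag {};
struct MinTag {};

// Identity element used to initialize the output before reduction.
template <typename Reducer, typename DType> struct ZeroSketch;
template <typename DType> struct ZeroSketch<SumTag, DType> {
  static constexpr DType value = 0;
};
template <typename DType> struct ZeroSketch<MaxTag, DType> {
  static constexpr DType value = std::numeric_limits<DType>::lowest();
};
template <typename DType> struct ZeroSketch<MinTag, DType> {
  static constexpr DType value = std::numeric_limits<DType>::max();
};

int main() {
  std::cout << ZeroSketch<SumTag, float>::value << " "
            << ZeroSketch<MaxTag, float>::value << " "
            << ZeroSketch<MinTag, float>::value << "\n";
  return 0;
}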
src/kernel/binary_reduce_impl_decl.h deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/binary_reduce_impl_decl.h
* \brief Data structure and function declarations for implementations.
*/
#ifndef DGL_KERNEL_BINARY_REDUCE_IMPL_DECL_H_
#define DGL_KERNEL_BINARY_REDUCE_IMPL_DECL_H_
#include <dgl/runtime/ndarray.h>
#include <string>
#include "./binary_reduce_common.h"
#include "./csr_interface.h"
namespace minigun {
namespace advance {
// forward declaration
struct RuntimeConfig;
}  // namespace advance
}  // namespace minigun
namespace dgl {
namespace kernel {

// forward declaration
struct BcastInfo;
///////////////////////////////////////////////////////////////////////////////
// BinaryReduce declarations
///////////////////////////////////////////////////////////////////////////////
/*!\brief Data structure used by computing BinaryOpReduce in Minigun. */
template <typename Idx, typename DType>
struct GData {
  // length along x(feature) dimension
  int64_t x_length{0};
  // size of data, can be single value or a vector
  int64_t data_len{0};
  // input data
  DType *lhs_data{nullptr}, *rhs_data{nullptr};
  // output data
  DType *out_data{nullptr};
  // input id mappings
  Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr};
  // output id mapping
  Idx *out_mapping{nullptr};
};
/*!
* \brief Template declaration for BinaryReduce operator.
*
* LeftSelector and RightSelector must be one of the four operand target
* categories.
*
* BinaryOp must be one of the binary operator types.
*
* Reducer must be one of the reducer types.
*
* The implementation of this template is device-dependent
* (see kernel/xpu/binary_reduce_impl.(cu)h).
*
* See definitions in binary_reduce_common.h
*
* \tparam XPU the device flag
* \tparam Idx type of node/edge index (e.g. int32_t, int64_t)
* \tparam DType type of the feature data (e.g. float32)
* \tparam LeftSelect lhs category type
* \tparam RightSelect rhs category type
* \tparam BinaryOp Binary operator type
* \tparam Reducer Reducer type
 * \param rtcfg Runtime configuration used by minigun
 * \param graph The graph object.
 * \param gdata The feature and mapping data used by the computation.
 */
template <int XPU, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBinaryReduce(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    GData<Idx, DType>* gdata);
/*!
* \brief Template declaration for common logics shared by different devices.
*
* \tparam XPU the device flag
* \param reducer The type of the reducer ("sum", "max", "mean", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
*/
template <int XPU>
void BinaryReduceImpl(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping);
///////////////////////////////////////////////////////////////////////////////
// BackwardBinaryReduce declarations
///////////////////////////////////////////////////////////////////////////////
/*!\brief Data structure used by computing BackwardBinaryReduce in Minigun. */
template <typename Idx, typename DType>
struct BackwardGData {
  // length along x(feature) dimension
  int64_t x_length{0};
  // size of data, can be single value or a vector
  int64_t data_len{0};
  // input data
  DType *lhs_data{nullptr}, *rhs_data{nullptr}, *out_data{nullptr};
  DType *grad_out_data{nullptr};
  // output data
  DType *grad_lhs_data{nullptr}, *grad_rhs_data{nullptr};
  // input id mappings
  Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr};
  // output id mapping
  Idx *out_mapping{nullptr};
};
/*!
* \brief Template declaration for BackwardBinaryReduce operator.
*
* Mode must be one of the enum code in binary_op::BackwardMode.
*
* LeftSelector and RightSelector must be one of the four operand target
* categories.
*
* BinaryOp must be one of the binary operator types.
*
* Reducer must be one of the reducer types.
*
* The implementation of this template is device-dependent
* (see kernel/xpu/backward_binary_reduce_impl.(cu)h).
*
* See definitions in binary_reduce_common.h
*
* \tparam XPU the device flag
* \tparam Mode the backward mode code
* \tparam Idx type of node/edge index (e.g. int32_t, int64_t)
* \tparam DType type of the feature data (e.g. float32)
* \tparam LeftSelect lhs category type
* \tparam RightSelect rhs category type
* \tparam BinaryOp Binary operator type
* \tparam Reducer Reducer type
 * \param rtcfg Runtime configuration used by minigun
 * \param graph The graph object.
 * \param gdata The feature and mapping data used by the computation.
 */
template <int XPU, int Mode, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduce(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BackwardGData<Idx, DType>* gdata);
/*!
* \brief Template declaration for common logics shared by different devices.
*
* \tparam XPU the device flag
* \param reducer The type of the reducer ("sum", "max", "mean", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
* \param grad_lhs_data The gradient lhs tensor.
*/
template <int XPU>
void BackwardBinaryReduceImpl(
    const std::string& reducer, const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
    runtime::NDArray grad_out_data,
    runtime::NDArray grad_lhs_data, runtime::NDArray grad_rhs_data);
///////////////////////////////////////////////////////////////////////////////
// BinaryReduce with broadcasting declarations
///////////////////////////////////////////////////////////////////////////////
/*!
* \brief Data structure used by computing BinaryOp with broadcasting in Minigun.
*
* Note that all the shapes and strides are for the feature dimensions.
*
* \tparam NDim maximum number of feature dimensions
* \tparam Idx id index type
* \tparam DType feature data type
*/
template <int NDim, typename Idx, typename DType>
struct BcastGData {
  // actual number of feature dimensions
  int ndim{0};
  // input feature shape and stride
  int64_t lhs_len{0}, rhs_len{0};
  int64_t lhs_shape[NDim]{0}, lhs_stride[NDim]{0};
  int64_t rhs_shape[NDim]{0}, rhs_stride[NDim]{0};
  // size of data, can be single value or a vector
  int64_t data_len{0};
  // input data
  DType *lhs_data{nullptr}, *rhs_data{nullptr};
  // input id mappings
  Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr};
  // output feature shape and stride
  int64_t out_len{0};  // output total feature length (equal to prod(out_shape));
  int64_t out_shape[NDim]{0}, out_stride[NDim]{0};
  // output data
  DType *out_data{nullptr};
  // output id mapping
  Idx *out_mapping{nullptr};
};
/*!
* \brief Template declaration for BinaryReduce with broadcasting operator.
*
* LeftSelector and RightSelector must be one of the four operand target
* categories.
*
* BinaryOp must be one of the binary operator types.
*
* Reducer must be one of the reducer types.
*
* The implementation of this template is device-dependent
* (see kernel/xpu/binary_reduce_impl.(cu)h).
*
* See definitions in binary_reduce_common.h
*
* \tparam XPU the device flag
* \tparam NDim maximum number of feature dimensions
* \tparam Idx type of node/edge index (e.g. int32_t, int64_t)
* \tparam DType type of the feature data (e.g. float32)
* \tparam LeftSelect lhs category type
* \tparam RightSelect rhs category type
 * \tparam BinaryOp binary operator type
 * \tparam Reducer reducer type
 * \param rtcfg runtime configuration used by minigun
 * \param graph The graph object.
 * \param gdata The feature and mapping data used by the computation.
 */
template <int XPU, int NDim, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBinaryReduceBcast(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BcastGData<NDim, Idx, DType>* gdata);
/*!
* \brief Template declaration for common logics shared by different devices.
*
* \tparam XPU the device flag
* \param reducer The type of the reducer ("sum", "max", "mean", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
*/
template <int XPU>
void BinaryReduceBcastImpl(
    const BcastInfo& info,
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping);
///////////////////////////////////////////////////////////////////////////////
// BackwardBinaryReduce with broadcasting declarations
///////////////////////////////////////////////////////////////////////////////
/*!
* \brief Data and auxiliary information for backward binary broadcasting op.
*
* Note that all the shapes and strides are for the feature dimensions.
*
* The gradients of the broadcasting dimensions are not reduced. As a result,
* The grad_lhs and grad_rhs have the same shape as grad_out.
*
* \tparam NDim maximum number of feature dimensions
* \tparam Idx id index type
* \tparam DType feature data type
*/
template <int NDim, typename Idx, typename DType>
struct BackwardBcastGData {
  // actual number of feature dimensions
  int ndim{0};
  // input shape and stride
  int64_t lhs_len{0}, rhs_len{0}, out_len{0};
  int64_t lhs_shape[NDim]{0}, lhs_stride[NDim]{0};
  int64_t rhs_shape[NDim]{0}, rhs_stride[NDim]{0};
  int64_t out_shape[NDim]{0}, out_stride[NDim]{0};
  // size of data, can be single value or a vector
  int64_t data_len{0};
  // input id mappings
  Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr}, *out_mapping{nullptr};
  // input data
  DType *lhs_data{nullptr}, *rhs_data{nullptr}, *out_data{nullptr};
  DType *grad_out_data{nullptr};
  // output data
  DType *grad_lhs_data{nullptr}, *grad_rhs_data{nullptr};
};
/*!
* \brief Template declaration for BackwardBinaryReduce with broadcasting operator.
*
* LeftSelector and RightSelector must be one of the four operand target
* categories.
*
* BinaryOp must be one of the binary operator types.
*
* Reducer must be one of the reducer types.
*
* The implementation of this template is device-dependent
* (see kernel/xpu/binary_reduce_impl.(cu)h).
*
* See definitions in binary_reduce_common.h
*
* \tparam XPU the device flag
* \tparam Mode the backward mode code
* \tparam NDim maximum number of feature dimensions
* \tparam Idx type of node/edge index (e.g. int32_t, int64_t)
* \tparam DType type of the feature data (e.g. float32)
* \tparam LeftSelect lhs category type
* \tparam RightSelect rhs category type
 * \tparam BinaryOp binary operator type
 * \tparam Reducer reducer type
 * \param rtcfg runtime configuration used by minigun
 * \param graph The graph object.
 * \param gdata The feature and mapping data used by the computation.
 */
template <int XPU, int Mode, int NDim, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduceBcast(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BackwardBcastGData<NDim, Idx, DType>* gdata);
/*!
* \brief Template declaration for common logics shared by different devices.
*
* \tparam XPU the device flag
* \param reducer The type of the reducer ("sum", "max", "mean", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
* \param grad_lhs_data The gradient lhs tensor.
*/
template <int XPU>
void BackwardBinaryReduceBcastImpl(
    const BcastInfo& info,
    const std::string& reducer, const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
    runtime::NDArray grad_out_data,
    runtime::NDArray grad_lhs_data, runtime::NDArray grad_rhs_data);

}  // namespace kernel
}  // namespace dgl

#endif  // DGL_KERNEL_BINARY_REDUCE_IMPL_DECL_H_
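Note: the doc comments in this header refer to "operand target categories" (source, destination, edge) for LeftSelector/RightSelector. The real selector types live in binary_reduce_common.h, which is not shown in this diff; the sketch below only illustrates the pattern with hypothetical names.

#include <cstdint>
#include <iostream>

using Idx = int64_t;

// Hypothetical selector functors: each picks which id an operand is indexed by.
struct PickSrc  { static Idx Call(Idx src, Idx edge, Idx dst) { return src;  } };
struct PickEdge { static Idx Call(Idx src, Idx edge, Idx dst) { return edge; } };
struct PickDst  { static Idx Call(Idx src, Idx edge, Idx dst) { return dst;  } };

// A kernel templated on selectors reads its lhs/rhs rows by whatever id the
// selector returns, which is how one template covers src*edge, src*dst, etc.
template <typename LeftSelector, typename RightSelector>
Idx RowPair(Idx src, Idx edge, Idx dst) {
  const Idx lid = LeftSelector::Call(src, edge, dst);
  const Idx rid = RightSelector::Call(src, edge, dst);
  return lid * 1000 + rid;  // pack both choices into one number for printing
}

int main() {
  // src=3, edge=7, dst=5
  std::cout << RowPair<PickSrc, PickDst>(3, 7, 5) << "\n";   // 3005
  std::cout << RowPair<PickSrc, PickEdge>(3, 7, 5) << "\n";  // 3007
  return 0;
}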
src/kernel/common.h deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/common.h
* \brief Kernel common utilities
*/
#ifndef DGL_KERNEL_COMMON_H_
#define DGL_KERNEL_COMMON_H_
#include <dgl/runtime/ndarray.h>
#include <cstdint>
#include "../c_api_common.h"
namespace dgl {
namespace kernel {
#ifdef __CUDACC__
#define DGLDEVICE __device__
#define DGLINLINE __forceinline__
#else
#define DGLDEVICE
#define DGLINLINE inline
#endif // __CUDACC__
// Macro for dispatch device flag to template function calls
#ifdef DGL_USE_CUDA
#define DGL_XPU_SWITCH(val, Method, ...) \
if (val == kDLCPU) { \
Method<kDLCPU>(__VA_ARGS__); \
} else if (val == kDLGPU) { \
Method<kDLGPU>(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unsupported device type: " << val; \
}
#else // DGL_USE_CUDA
#define DGL_XPU_SWITCH(val, Method, ...) \
if (val == kDLCPU) { \
Method<kDLCPU>(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unsupported device type: " << val; \
}
#endif // DGL_USE_CUDA
// MSVC does not expand __VA_ARGS__ correctly, and needs this expand hack
#define MSVC_EXPAND(x) x
// Macro for dispatch dtype flag to template argument. Currently only
// support float32.
#define DGL_DTYPE_SWITCH(val, DType, ...) \
if (val.code == kDLFloat && val.bits == 32) { \
typedef float DType; \
{ __VA_ARGS__ } \
} else { \
LOG(FATAL) << "Unsupported dtype: " << val; \
}
// Macro for unrolling with data type arguments.
#define GEN_DTYPE(GEN, ...) \
MSVC_EXPAND(GEN(__VA_ARGS__, float))
// Macro for dispatch index nbits to template argument.
#ifdef __CUDACC__
#define DGL_IDX_TYPE_SWITCH(bits, Idx, ...) \
if (bits == 32) { \
typedef int32_t Idx; \
{__VA_ARGS__} \
} else { \
LOG(FATAL) << "Unsupported idx bits: " << bits; \
}
#else
#define DGL_IDX_TYPE_SWITCH(bits, Idx, ...) \
if (bits == 32) { \
typedef int32_t Idx; \
{__VA_ARGS__} \
} else if (bits == 64) { \
typedef int64_t Idx; \
{__VA_ARGS__} \
} else { \
LOG(FATAL) << "Unsupported idx bits: " << bits; \
}
#endif
}  // namespace kernel
}  // namespace dgl
#endif // DGL_KERNEL_COMMON_H_
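Note: a small self-contained example of how this kind of SWITCH macro turns a runtime dtype tag into a template argument. The toy macro and tag type below are illustrative, not the DGL ones (which dispatch on DLDataType and index bit width as defined above).

#include <cstdint>
#include <iostream>

// Toy runtime type tag; DGL uses DLDataType / NumBits() for the same purpose.
enum class ToyDType { kFloat32, kInt32 };

#define TOY_DTYPE_SWITCH(val, DType, ...)              \
  if ((val) == ToyDType::kFloat32) {                   \
    typedef float DType;                               \
    { __VA_ARGS__ }                                    \
  } else if ((val) == ToyDType::kInt32) {              \
    typedef int32_t DType;                             \
    { __VA_ARGS__ }                                    \
  } else {                                             \
    std::cerr << "Unsupported dtype\n";                \
  }

template <typename DType>
DType AddOne(DType x) { return x + DType(1); }

int main() {
  ToyDType tag = ToyDType::kFloat32;  // decided at runtime
  TOY_DTYPE_SWITCH(tag, DType, {
    // Inside the braces DType is a concrete type, so templates can be instantiated.
    std::cout << AddOne<DType>(DType(2)) << "\n";  // prints 3
  });
  return 0;
}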
src/kernel/cpu/backward_binary_reduce_impl.h deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
 * \file kernel/cpu/backward_binary_reduce_impl.h
 * \brief Minigun CPU UDFs for backward binary reduce
*/
#ifndef DGL_KERNEL_CPU_BACKWARD_BINARY_REDUCE_IMPL_H_
#define DGL_KERNEL_CPU_BACKWARD_BINARY_REDUCE_IMPL_H_
#include <minigun/minigun.h>
#include "../binary_reduce_impl_decl.h"
#include "../utils.h"
#include "./functor.h"
#include "../csr_interface.h"
namespace dgl {
namespace kernel {
namespace cpu {
// Minigun UDF to compute backward binary reduce.
template <int Mode, typename Idx, typename DType, typename Functors>
struct BackwardBinaryReduce {
  static inline bool CondEdge(
      Idx src, Idx dst, Idx eid, BackwardGData<Idx, DType>* gdata) {
    return true;
  }
  static inline void ApplyEdge(
      Idx src, Idx dst, Idx eid, BackwardGData<Idx, DType>* gdata) {
    const int64_t D = gdata->x_length;
    const int64_t len = gdata->data_len;
    Idx lid = Functors::SelectLeft(src, eid, dst);
    Idx rid = Functors::SelectRight(src, eid, dst);
    Idx oid = Functors::SelectOut(src, eid, dst);
    if (gdata->lhs_mapping) {
      lid = Functors::GetId(lid, gdata->lhs_mapping);
    }
    if (gdata->rhs_mapping) {
      rid = Functors::GetId(rid, gdata->rhs_mapping);
    }
    if (gdata->out_mapping) {
      oid = Functors::GetId(oid, gdata->out_mapping);
    }
    DType* lhsoff = gdata->lhs_data + lid * D * len;
    DType* rhsoff = gdata->rhs_data + rid * D * len;
    DType* outoff = gdata->out_data + oid * D;
    DType* gradlhsoff = gdata->grad_lhs_data + lid * D * len;
    DType* gradrhsoff = gdata->grad_rhs_data + rid * D * len;
    DType* gradoutoff = gdata->grad_out_data + oid * D;
    for (int64_t tx = 0; tx < D; ++tx) {
      DType out = Functors::Read(outoff + tx);
      DType grad_out = Functors::Read(gradoutoff + tx);
      DType e = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len);
      DType grad_e = grad_out * Functors::BackwardWrite(e, out);
      if (0 == grad_e) continue;
      DType* lhs_base = lhsoff + tx * len;
      DType* rhs_base = rhsoff + tx * len;
      if (Mode == binary_op::kGradBoth) {
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
          DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
          DType grad = grad_lhs + grad_rhs;
#pragma omp atomic
          gradlhsoff[tx * len + i] += grad;
        }
      } else if (Mode == binary_op::kGradLhs) {
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
#pragma omp atomic
          gradlhsoff[tx * len + i] += grad_lhs;
        }
      } else if (Mode == binary_op::kGradRhs) {
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
#pragma omp atomic
          gradrhsoff[tx * len + i] += grad_rhs;
        }
      }
    }
  }
};
// Minigun UDF to compute backward binary reduce with broadcasting.
template <int Mode, int NDim, typename Idx, typename DType, typename Functors>
struct BackwardBinaryReduceBcast {
  static inline bool CondEdge(
      Idx src, Idx dst, Idx eid, BackwardBcastGData<NDim, Idx, DType>* gdata) {
    return true;
  }
  static inline void ApplyEdge(
      Idx src, Idx dst, Idx eid, BackwardBcastGData<NDim, Idx, DType>* gdata) {
    const int64_t len = gdata->data_len;
    Idx lid = Functors::SelectLeft(src, eid, dst);
    Idx rid = Functors::SelectRight(src, eid, dst);
    Idx oid = Functors::SelectOut(src, eid, dst);
    if (gdata->lhs_mapping) {
      lid = Functors::GetId(lid, gdata->lhs_mapping);
    }
    if (gdata->rhs_mapping) {
      rid = Functors::GetId(rid, gdata->rhs_mapping);
    }
    if (gdata->out_mapping) {
      oid = Functors::GetId(oid, gdata->out_mapping);
    }
    DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len;
    DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len;
    DType* outoff = gdata->out_data + oid * gdata->out_len;
    DType* gradlhsoff = gdata->grad_lhs_data + lid * gdata->out_len * len;
    DType* gradrhsoff = gdata->grad_rhs_data + rid * gdata->out_len * len;
    DType* gradoutoff = gdata->grad_out_data + oid * gdata->out_len;
    int64_t tmp[NDim];  // store unraveled idx.
    for (int64_t tx = 0; tx < gdata->out_len; ++tx) {
      Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp);
      DType out = Functors::Read(outoff + tx);
      DType grad_out = Functors::Read(gradoutoff + tx);
      DType e = Functors::Op(
          lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len,
          rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len,
          len);
      DType grad_e = grad_out * Functors::BackwardWrite(e, out);
      // (pawelpiotrowicz) Although we can technically add the same condition for
      // skipping atomic additions as in BackwardBinaryReduce, doing so made the
      // speed 2% slower in GCMC training on MovieLens-1M with 24 OpenMP threads.
      // For more details, see https://github.com/dmlc/dgl/pull/1527.
      // TODO(BarclayII): Needs further investigation and benchmarking.
      DType* lhs_base = lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len;
      DType* rhs_base = rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len;
      if (Mode == binary_op::kGradBoth) {
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
          DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
          DType grad = grad_lhs + grad_rhs;
#pragma omp atomic
          gradlhsoff[tx * len + i] += grad;
        }
      } else if (Mode == binary_op::kGradLhs) {
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
#pragma omp atomic
          gradlhsoff[tx * len + i] += grad_lhs;
        }
      } else if (Mode == binary_op::kGradRhs) {
        for (int64_t i = 0; i < len; ++i) {
          DType lhs = Functors::Read(lhs_base + i);
          DType rhs = Functors::Read(rhs_base + i);
          DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
#pragma omp atomic
          gradrhsoff[tx * len + i] += grad_rhs;
        }
      }
    }
  }
};
// Auxiliary template used in UDF.
template <typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
struct BackwardFunctorsTempl {
  static inline Idx SelectOut(Idx src, Idx edge, Idx dst) {
    typedef typename OutSelector<Reducer>::Type OutTarget;
    return SwitchSrcDst<OutTarget>::Type::Call(src, edge, dst);
  }
  static inline Idx SelectLeft(Idx src, Idx edge, Idx dst) {
    return LeftSelector::Call(src, edge, dst);
  }
  static inline Idx SelectRight(Idx src, Idx edge, Idx dst) {
    return RightSelector::Call(src, edge, dst);
  }
  static inline DType Op(DType* lhs, DType* rhs, int64_t len) {
    return BinaryOp::Call(lhs, rhs, len);
  }
  static inline DType Read(DType* addr) {
    return *addr;
  }
  static inline void Write(DType* addr, DType val) {
    Reducer::Call(addr, val);
  }
  static inline Idx GetId(Idx id, Idx* id_map) {
    return *(id_map + id);
  }
  static inline DType BackwardWrite(DType val, DType accum) {
    return Reducer::BackwardCall(val, accum);
  }
  static inline DType BackwardOpLhs(DType lhs, DType rhs, DType out) {
    return BinaryOp::BackwardLhs(lhs, rhs, out);
  }
  static inline DType BackwardOpRhs(DType lhs, DType rhs, DType out) {
    return BinaryOp::BackwardRhs(lhs, rhs, out);
  }
};

typedef minigun::advance::Config<true, minigun::advance::kV2N> AdvanceConfig;

}  // namespace cpu
// Template implementation of BackwardBinaryReduce operator.
template <int XPU, int Mode, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduce(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BackwardGData<Idx, DType>* gdata) {
  // For backward computation, we use reverse csr and switch dst and src.
  // This benefits the most common src_op_edge or copy_src case, because the
  // gradients of src are now aggregated into destination buffer to reduce
  // competition of atomic add.
  auto incsr = graph.GetInCSRMatrix();
  minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(incsr.indptr, incsr.indices);
  typedef cpu::BackwardFunctorsTempl<Idx, DType,
          typename SwitchSrcDst<LeftSelector>::Type,
          typename SwitchSrcDst<RightSelector>::Type,
          BinaryOp, Reducer> Functors;
  typedef cpu::BackwardBinaryReduce<Mode, Idx, DType, Functors> UDF;
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping by the edge ids in the csr graph so that the edge
  // data is correctly read/written.
  if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) {
    gdata->lhs_mapping = static_cast<Idx*>(incsr.data->data);
  }
  if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) {
    gdata->rhs_mapping = static_cast<Idx*>(incsr.data->data);
  }
  if (OutSelector<Reducer>::Type::target == binary_op::kEdge
      && gdata->out_mapping == nullptr) {
    gdata->out_mapping = static_cast<Idx*>(incsr.data->data);
  }
  // TODO(minjie): allocator
  minigun::advance::Advance<XPU, Idx, cpu::AdvanceConfig, BackwardGData<Idx, DType>, UDF>(
      rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// Following macro is used to generate explicit-specialization of the template
// operator.
#define GEN_BACKWARD_DEFINE(mode, dtype, lhs_tgt, rhs_tgt, op) \
template void CallBackwardBinaryReduce<XPU, \
mode, IDX, dtype, \
lhs_tgt, rhs_tgt, \
op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
BackwardGData<IDX, dtype>* gdata);
// Template implementation of BackwardBinaryReduce with broadcasting operator.
template <int XPU, int Mode, int NDim, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduceBcast(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BackwardBcastGData<NDim, Idx, DType>* gdata) {
  // For backward computation, we use reverse csr and switch dst and src.
  // This benefits the most common src_op_edge or copy_src case, because the
  // gradients of src are now aggregated into destination buffer to reduce
  // competition of atomic add.
  auto incsr = graph.GetInCSRMatrix();
  minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(incsr.indptr, incsr.indices);
  typedef cpu::BackwardFunctorsTempl<Idx, DType,
          typename SwitchSrcDst<LeftSelector>::Type,
          typename SwitchSrcDst<RightSelector>::Type,
          BinaryOp, Reducer> Functors;
  typedef cpu::BackwardBinaryReduceBcast<Mode, NDim, Idx, DType, Functors> UDF;
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping by the edge ids in the csr graph so that the edge
  // data is correctly read/written.
  if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) {
    gdata->lhs_mapping = static_cast<Idx*>(incsr.data->data);
  }
  if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) {
    gdata->rhs_mapping = static_cast<Idx*>(incsr.data->data);
  }
  if (OutSelector<Reducer>::Type::target == binary_op::kEdge
      && gdata->out_mapping == nullptr) {
    gdata->out_mapping = static_cast<Idx*>(incsr.data->data);
  }
  // TODO(minjie): allocator
  minigun::advance::Advance<XPU, Idx, cpu::AdvanceConfig,
    BackwardBcastGData<NDim, Idx, DType>, UDF>(
      rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// Following macro is used to generate explicit-specialization of the template
// operator.
#define GEN_BACKWARD_BCAST_DEFINE(mode, ndim, dtype, lhs_tgt, rhs_tgt, op) \
template void CallBackwardBinaryReduceBcast<XPU, \
mode, ndim, IDX, dtype, \
lhs_tgt, rhs_tgt, \
op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
BackwardBcastGData<ndim, IDX, dtype>* gdata);
}  // namespace kernel
}  // namespace dgl
#endif // DGL_KERNEL_CPU_BACKWARD_BINARY_REDUCE_IMPL_H_
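Note: the gradient UDFs above scatter into shared buffers with `#pragma omp atomic` because several edges can map to the same row. A minimal standalone sketch of that accumulation pattern (compile with -fopenmp; without it the pragmas are ignored and the loop runs serially):

#include <cstdio>
#include <vector>

int main() {
  // Toy "graph": each edge adds its value into the row of its destination node.
  const int num_nodes = 4;
  const std::vector<int> edge_dst = {0, 1, 1, 3, 3, 3};
  const std::vector<float> edge_val = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  std::vector<float> node_accum(num_nodes, 0.f);

  // Different edges may share a destination, so the += must be atomic when the
  // loop runs in parallel -- the same reason the backward kernels above use it.
  #pragma omp parallel for
  for (int e = 0; e < static_cast<int>(edge_dst.size()); ++e) {
    #pragma omp atomic
    node_accum[edge_dst[e]] += edge_val[e];
  }

  for (int n = 0; n < num_nodes; ++n)
    std::printf("node %d: %.1f\n", n, node_accum[n]);  // 1.0, 5.0, 0.0, 15.0
  return 0;
}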
src/kernel/cpu/binary_bcast_reduce_max.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_max.cc
 * \brief CPU kernels for broadcasting binary reduce max
 */
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"

namespace dgl {
namespace kernel {

#define REDUCER ReduceMax
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl
src/kernel/cpu/binary_bcast_reduce_min.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_min.cc
 * \brief CPU kernels for broadcasting binary reduce min
 */
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"

namespace dgl {
namespace kernel {

#define REDUCER ReduceMin
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl
src/kernel/cpu/binary_bcast_reduce_none.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_none.cc
 * \brief CPU kernels for broadcasting binary reduce none
 */
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"

namespace dgl {
namespace kernel {

#define REDUCER ReduceNone
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl
src/kernel/cpu/binary_bcast_reduce_prod.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_prod.cc
 * \brief CPU kernels for broadcasting binary reduce prod
 */
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"

namespace dgl {
namespace kernel {

#define REDUCER ReduceProd
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl
src/kernel/cpu/binary_bcast_reduce_sum.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_sum.cc
 * \brief CPU kernels for broadcasting binary reduce sum
 */
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"

namespace dgl {
namespace kernel {

#define REDUCER ReduceSum
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_BCAST_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl
src/kernel/cpu/binary_reduce_impl.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_impl.cc
* \brief Binary reduce implementation on CPU.
*/
#include "../binary_reduce_impl.h"
#include "../csr_interface.h"
using dgl::runtime::NDArray;

namespace dgl {
namespace kernel {

template void BinaryReduceImpl<kDLCPU>(
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping);

template void BinaryReduceBcastImpl<kDLCPU>(
    const BcastInfo& info,
    const std::string& reducer,
    const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    runtime::NDArray lhs_data, runtime::NDArray rhs_data,
    runtime::NDArray out_data,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
    runtime::NDArray out_mapping);

template void BackwardBinaryReduceImpl<kDLCPU>(
    const std::string& reducer, const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs, binary_op::Target rhs,
    NDArray lhs_mapping, NDArray rhs_mapping, NDArray out_mapping,
    NDArray lhs_data, NDArray rhs_data, NDArray out_data,
    NDArray grad_out_data,
    NDArray grad_lhs_data, NDArray grad_rhs_data);

template void BackwardBinaryReduceBcastImpl<kDLCPU>(
    const BcastInfo& info,
    const std::string& reducer, const std::string& op,
    const CSRWrapper& graph,
    binary_op::Target lhs_tgt, binary_op::Target rhs_tgt,
    runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
    runtime::NDArray lhs, runtime::NDArray rhs, runtime::NDArray out, runtime::NDArray grad_out,
    runtime::NDArray grad_lhs, runtime::NDArray grad_rhs);

}  // namespace kernel
}  // namespace dgl
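Note: this .cc file contains only explicit template instantiations; the generic bodies live in the header and this translation unit pins down the <kDLCPU> versions. A toy reduction of the same split, with hypothetical names, shown as one compilable unit:

#include <iostream>

// In DGL the template definition sits in a header and the explicit
// instantiation sits in a .cc file; both are shown together here so the
// sketch compiles on its own.
template <int Device>
void Report(int x) {
  std::cout << "device " << Device << " value " << x << "\n";
}

// Forces code generation for Device=0 in exactly one translation unit,
// mirroring `template void BinaryReduceImpl<kDLCPU>(...)` above.
template void Report<0>(int);

int main() {
  Report<0>(42);  // links against the explicitly instantiated symbol
  return 0;
}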
src/kernel/cpu/binary_reduce_impl.h deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_impl.h
* \brief Minigun CPU UDFs for binary reduce
*/
#ifndef DGL_KERNEL_CPU_BINARY_REDUCE_IMPL_H_
#define DGL_KERNEL_CPU_BINARY_REDUCE_IMPL_H_
#include <minigun/minigun.h>
#include <algorithm>
#include "../binary_reduce_impl_decl.h"
#include "../utils.h"
#include "./functor.h"
#include "../csr_interface.h"
namespace dgl {
namespace kernel {
namespace cpu {
// Minigun UDF to compute binary reduce.
template <typename Idx, typename DType, typename Functors>
struct BinaryReduce {
  static inline bool CondEdge(
      Idx src, Idx dst, Idx eid, GData<Idx, DType>* gdata) {
    return true;
  }
  static inline void ApplyEdge(
      Idx src, Idx dst, Idx eid, GData<Idx, DType>* gdata) {
    const int64_t D = gdata->x_length;
    const int64_t len = gdata->data_len;
    Idx lid = Functors::SelectLeft(src, eid, dst);
    Idx rid = Functors::SelectRight(src, eid, dst);
    Idx oid = Functors::SelectOut(src, eid, dst);
    if (gdata->lhs_mapping) {
      lid = Functors::GetId(lid, gdata->lhs_mapping);
    }
    if (gdata->rhs_mapping) {
      rid = Functors::GetId(rid, gdata->rhs_mapping);
    }
    if (gdata->out_mapping) {
      oid = Functors::GetId(oid, gdata->out_mapping);
    }
    DType* lhsoff = gdata->lhs_data + lid * D * len;
    DType* rhsoff = gdata->rhs_data + rid * D * len;
    DType* outoff = gdata->out_data + oid * D;
    for (int64_t tx = 0; tx < D; ++tx) {
      DType out = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len);
      Functors::Write(outoff + tx, out);
    }
  }
};
// Convert flattened index to multi-dimension index (assume row-major).
inline void Unravel(int64_t idx, int ndim,
    const int64_t* shape, const int64_t* stride, int64_t* out) {
  for (int d = 0; d < ndim; ++d) {
    out[d] = (idx / stride[d]) % shape[d];
  }
}
// Convert multi-dimension index to flattened index (assume row-major).
inline int64_t Ravel(const int64_t* idx, int ndim,
    const int64_t* shape, const int64_t* stride) {
  int64_t out = 0;
  for (int d = 0; d < ndim; ++d) {
    out += std::min(idx[d], shape[d] - 1) * stride[d];
  }
  return out;
}
// Minigun UDF to compute binary reduce with broadcasting.
template <int NDim, typename Idx, typename DType, typename Functors>
struct BinaryReduceBcast {
  static inline bool CondEdge(
      Idx src, Idx dst, Idx eid, BcastGData<NDim, Idx, DType>* gdata) {
    return true;
  }
  static inline void ApplyEdge(
      Idx src, Idx dst, Idx eid, BcastGData<NDim, Idx, DType>* gdata) {
    const int64_t len = gdata->data_len;
    Idx lid = Functors::SelectLeft(src, eid, dst);
    Idx rid = Functors::SelectRight(src, eid, dst);
    Idx oid = Functors::SelectOut(src, eid, dst);
    if (gdata->lhs_mapping) {
      lid = Functors::GetId(lid, gdata->lhs_mapping);
    }
    if (gdata->rhs_mapping) {
      rid = Functors::GetId(rid, gdata->rhs_mapping);
    }
    if (gdata->out_mapping) {
      oid = Functors::GetId(oid, gdata->out_mapping);
    }
    DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len;  // data with len size
    DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len;
    DType* outoff = gdata->out_data + oid * gdata->out_len;
    int64_t tmp[NDim];  // store unraveled idx.
    for (int64_t tx = 0; tx < gdata->out_len; ++tx) {
      Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp);
      DType out = Functors::Op(
          lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len,
          rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len,
          len);
      Functors::Write(outoff + tx, out);
    }
  }
};
// Auxiliary template used in UDF.
template <typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
struct FunctorsTempl {
  static inline Idx SelectOut(Idx src, Idx edge, Idx dst) {
    return OutSelector<Reducer>::Type::Call(src, edge, dst);
  }
  static inline Idx SelectLeft(Idx src, Idx edge, Idx dst) {
    return LeftSelector::Call(src, edge, dst);
  }
  static inline Idx SelectRight(Idx src, Idx edge, Idx dst) {
    return RightSelector::Call(src, edge, dst);
  }
  static inline DType Op(DType* lhs, DType* rhs, int64_t len) {
    return BinaryOp::Call(lhs, rhs, len);
  }
  static inline void Write(DType* addr, DType val) {
    Reducer::Call(addr, val);
  }
  static inline Idx GetId(Idx id, Idx* id_map) {
    return *(id_map + id);
  }
};

typedef minigun::advance::Config<true, minigun::advance::kV2N> AdvanceConfig;

}  // namespace cpu
// Template implementation of BinaryReduce operator.
template <int XPU, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBinaryReduce(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    GData<Idx, DType>* gdata) {
  typedef cpu::FunctorsTempl<Idx, DType, LeftSelector,
                        RightSelector, BinaryOp, Reducer>
          Functors;
  typedef cpu::BinaryReduce<Idx, DType, Functors> UDF;
  // csr
  auto outcsr = graph.GetOutCSRMatrix();
  minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(outcsr.indptr, outcsr.indices);
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping by the edge ids in the csr graph so that the edge
  // data is correctly read/written.
  if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) {
    gdata->lhs_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) {
    gdata->rhs_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  if (OutSelector<Reducer>::Type::target == binary_op::kEdge
      && gdata->out_mapping == nullptr) {
    gdata->out_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  // TODO(minjie): allocator
  minigun::advance::Advance<XPU, Idx, cpu::AdvanceConfig, GData<Idx, DType>, UDF>(
      rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// Template implementation of BinaryReduce broadcasting operator.
template <int XPU, int NDim, typename Idx, typename DType,
          typename LeftSelector, typename RightSelector,
          typename BinaryOp, typename Reducer>
void CallBinaryReduceBcast(
    const minigun::advance::RuntimeConfig& rtcfg,
    const CSRWrapper& graph,
    BcastGData<NDim, Idx, DType>* gdata) {
  typedef cpu::FunctorsTempl<Idx, DType, LeftSelector,
                        RightSelector, BinaryOp, Reducer>
          Functors;
  typedef cpu::BinaryReduceBcast<NDim, Idx, DType, Functors> UDF;
  // csr
  auto outcsr = graph.GetOutCSRMatrix();
  minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(outcsr.indptr, outcsr.indices);
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping by the edge ids in the csr graph so that the edge
  // data is correctly read/written.
  if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) {
    gdata->lhs_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) {
    gdata->rhs_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  if (OutSelector<Reducer>::Type::target == binary_op::kEdge
      && gdata->out_mapping == nullptr) {
    gdata->out_mapping = static_cast<Idx*>(outcsr.data->data);
  }
  // TODO(minjie): allocator
  minigun::advance::Advance<XPU, Idx, cpu::AdvanceConfig,
    BcastGData<NDim, Idx, DType>, UDF>(
      rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// Following macro is used to generate explicit-specialization of the template
// operator.
#define GEN_DEFINE(dtype, lhs_tgt, rhs_tgt, op) \
template void CallBinaryReduce<XPU, IDX, \
dtype, lhs_tgt, rhs_tgt, op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
GData<IDX, dtype>* gdata);
#define GEN_BCAST_DEFINE(ndim, dtype, lhs_tgt, rhs_tgt, op) \
template void CallBinaryReduceBcast<XPU, ndim, IDX, dtype, \
lhs_tgt, rhs_tgt, \
op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
BcastGData<ndim, IDX, dtype>* gdata);
#define EVAL(F, ...) MSVC_EXPAND(F(__VA_ARGS__))
}  // namespace kernel
}  // namespace dgl
#endif // DGL_KERNEL_CPU_BINARY_REDUCE_IMPL_H_
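Note: the Unravel/Ravel pair in this header is what implements NumPy-style broadcasting over the feature dimensions: an output offset is unraveled against the output shape, then re-raveled against each input's shape, with `min(idx, shape - 1)` clamping size-1 dimensions. A standalone check of that index math (the two helpers are copied verbatim so the sketch compiles on its own):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Same logic as the Unravel/Ravel helpers above (row-major shapes and strides).
void Unravel(int64_t idx, int ndim, const int64_t* shape,
             const int64_t* stride, int64_t* out) {
  for (int d = 0; d < ndim; ++d) out[d] = (idx / stride[d]) % shape[d];
}

int64_t Ravel(const int64_t* idx, int ndim, const int64_t* shape,
              const int64_t* stride) {
  int64_t out = 0;
  for (int d = 0; d < ndim; ++d)
    out += std::min(idx[d], shape[d] - 1) * stride[d];  // broadcast size-1 dims
  return out;
}

int main() {
  // out shape (2, 3) broadcast from an rhs of shape (1, 3).
  const int64_t out_shape[2] = {2, 3}, out_stride[2] = {3, 1};
  const int64_t rhs_shape[2] = {1, 3}, rhs_stride[2] = {3, 1};
  int64_t tmp[2];
  for (int64_t tx = 0; tx < 6; ++tx) {
    Unravel(tx, 2, out_shape, out_stride, tmp);
    // rhs rows are broadcast: output rows 0 and 1 both read rhs row 0.
    std::printf("out %lld -> rhs %lld\n", static_cast<long long>(tx),
                static_cast<long long>(Ravel(tmp, 2, rhs_shape, rhs_stride)));
  }
  return 0;
}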
src/kernel/cpu/binary_reduce_max.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_max.cc
* \brief CPU kernels for binary reduce max
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {

#define REDUCER ReduceMax
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl
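Note: each of these per-reducer .cc files stamps out every (dtype, operand-target, operator) combination through the nested EVAL/GEN_* macros defined in the headers above. A toy reduction of the same trick (one generator macro applied over a list of operators) so the expansion is visible; all names here are hypothetical:

#include <iostream>

template <typename DType, typename Op>
struct Kernel {
  static DType Run(DType a, DType b) { return Op::Call(a, b); }
};

struct AddOp { static float Call(float a, float b) { return a + b; } };
struct MulOp { static float Call(float a, float b) { return a * b; } };

// GEN defines one explicit instantiation; GEN_OPS applies GEN to every operator,
// mirroring EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE) on a much smaller scale.
#define GEN(dtype, op) template struct Kernel<dtype, op>;
#define GEN_OPS(GEN_MACRO, dtype) GEN_MACRO(dtype, AddOp) GEN_MACRO(dtype, MulOp)

GEN_OPS(GEN, float)  // expands to explicit instantiations for AddOp and MulOp

int main() {
  std::cout << Kernel<float, AddOp>::Run(2.f, 3.f) << " "
            << Kernel<float, MulOp>::Run(2.f, 3.f) << "\n";  // 5 6
  return 0;
}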
src/kernel/cpu/binary_reduce_min.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_min.cc
* \brief CPU kernels for binary reduce min
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {

#define REDUCER ReduceMin
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl
src/kernel/cpu/binary_reduce_none.cc deleted 100644 → 0
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_none.cc
* \brief CPU kernels for binary reduce none
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {

#define REDUCER ReduceNone
#define XPU kDLCPU

#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX

#define IDX int64_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX

}  // namespace kernel
}  // namespace dgl