"torchvision/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "dbb767910fac67d996aaeb4e0fff630f0419edae"
Unverified commit c81efdf2 authored by Jinjing Zhou, committed by GitHub

Remove deprecated kernels (#3316)

* remove

* remove

* fix

* remove

* remove
parent 75d793a1
@@ -145,8 +145,6 @@ file(GLOB DGL_SRC
src/array/cpu/*.cc
src/random/*.cc
src/random/cpu/*.cc
src/kernel/*.cc
src/kernel/cpu/*.cc
src/runtime/*.cc
src/geometry/*.cc
src/geometry/cpu/*.cc
......
@@ -10,6 +10,9 @@
#include <cassert>
#include "fp16.cuh"
#if __CUDA_ARCH__ >= 600
#include <cuda_fp16.h>
#endif
namespace dgl {
namespace aten {
@@ -133,6 +136,84 @@ DEFINE_ATOMIC_HALF(Min)
DEFINE_ATOMIC(Add)
#undef OP
/**
 * \brief Performs an atomic compare-and-swap on 64-bit integers. That is,
 * it reads the word `old` at the memory location `address`, computes
 * `(old == compare ? val : old)`, and stores the result back to memory at
 * the same address.
*
* \param address The address to perform the atomic operation on.
* \param compare The value to compare to.
* \param val The new value to conditionally store.
*
* \return The old value at the address.
*/
inline __device__ int64_t AtomicCAS(
int64_t * const address,
const int64_t compare,
const int64_t val) {
// match the type of "::atomicCAS", so ignore lint warning
using Type = unsigned long long int; // NOLINT
static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
return atomicCAS(reinterpret_cast<Type*>(address),
static_cast<Type>(compare),
static_cast<Type>(val));
}
/**
 * \brief Performs an atomic compare-and-swap on 32-bit integers. That is,
 * it reads the word `old` at the memory location `address`, computes
 * `(old == compare ? val : old)`, and stores the result back to memory at
 * the same address.
*
* \param address The address to perform the atomic operation on.
* \param compare The value to compare to.
* \param val The new value to conditionally store.
*
* \return The old value at the address.
*/
inline __device__ int32_t AtomicCAS(
int32_t * const address,
const int32_t compare,
const int32_t val) {
// match the type of "::atomicCAS", so ignore lint warning
using Type = int; // NOLINT
static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
return atomicCAS(reinterpret_cast<Type*>(address),
static_cast<Type>(compare),
static_cast<Type>(val));
}
inline __device__ int64_t AtomicMax(
int64_t * const address,
const int64_t val) {
// match the type of "::atomicMax", so ignore lint warning
using Type = unsigned long long int; // NOLINT
static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
return atomicMax(reinterpret_cast<Type*>(address),
static_cast<Type>(val));
}
inline __device__ int32_t AtomicMax(
int32_t * const address,
const int32_t val) {
// match the type of "::atomicMax", so ignore lint warning
using Type = int; // NOLINT
static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
return atomicMax(reinterpret_cast<Type*>(address),
static_cast<Type>(val));
}
template <>
__device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) {
#if __CUDA_ARCH__ >= 200
......
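The AtomicCAS overloads added above can also serve as building blocks for atomics that CUDA does not expose natively for signed 64-bit integers. As a minimal sketch (not part of this commit; AtomicMinInt64 is a hypothetical name), an atomic minimum can be written as the usual compare-and-swap loop:

// Hypothetical CAS-loop atomic minimum built on the AtomicCAS(int64_t*, ...)
// overload introduced above. Returns the value stored at *address before the
// call, mirroring the convention of the other atomics in this header.
inline __device__ int64_t AtomicMinInt64(
    int64_t * const address,
    const int64_t val) {
  int64_t old = *address;
  while (val < old) {
    const int64_t assumed = old;
    old = AtomicCAS(address, assumed, val);
    if (old == assumed) break;  // swap succeeded, minimum updated
  }
  return old;
}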
@@ -10,10 +10,10 @@
#include <numeric>
#include "./dgl_cub.cuh"
-#include "../../kernel/cuda/atomic.cuh"
+#include "../../array/cuda/atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
-using namespace dgl::kernel::cuda;
+using namespace dgl::aten::cuda;
namespace dgl {
namespace aten {
......
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/binary_reduce.h
* \brief Binary reduce function C++ header.
*/
#ifndef DGL_KERNEL_BINARY_REDUCE_H_
#define DGL_KERNEL_BINARY_REDUCE_H_
#include <dgl/runtime/ndarray.h>
#include <vector>
#include <string>
#include "./binary_reduce_common.h"
#include "./csr_interface.h"
namespace dgl {
namespace kernel {
// Structure for broadcasting shapes
struct BcastInfo {
// inferred output shape
std::vector<int64_t> real_out_shape;
// Following shapes here have been preprocessed, so that:
// - The first dimension (for graph) is removed. Shapes here are only for features.
// - They have the same number of dimensions.
// e.g. (4,) and (3, 4) become (1, 4) and (3, 4)
// - Continuous non-broadcasting dimensions are flattened.
// e.g. (4, 1, 3, 3) and (4, 5, 3, 3) become (4, 1, 9) and (4, 5, 9)
std::vector<int64_t> lhs_shape, lhs_stride;
std::vector<int64_t> rhs_shape, rhs_stride;
std::vector<int64_t> out_shape, out_stride;
int64_t data_len;
};
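For illustration only (this helper is hypothetical and is not the removed DGL code), the preprocessing described in the comments above can be sketched as: pad both feature shapes to a common rank with leading 1s, then merge adjacent dimensions on which neither operand broadcasts.

#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative sketch of the BcastInfo shape preprocessing described above.
void PreprocessShapes(std::vector<int64_t> lhs, std::vector<int64_t> rhs,
                      std::vector<int64_t>* out_lhs, std::vector<int64_t>* out_rhs) {
  const size_t ndim = std::max(lhs.size(), rhs.size());
  lhs.insert(lhs.begin(), ndim - lhs.size(), 1);   // (4,)   -> (1, 4)
  rhs.insert(rhs.begin(), ndim - rhs.size(), 1);   // (3, 4) -> (3, 4)
  out_lhs->clear();
  out_rhs->clear();
  for (size_t i = 0; i < ndim; ++i) {
    const bool bcast = (lhs[i] != rhs[i]);
    const bool prev_bcast = out_lhs->empty() || (out_lhs->back() != out_rhs->back());
    if (!bcast && !prev_bcast) {
      // Merge with the previous non-broadcasting dimension:
      // (4, 1, 3, 3) & (4, 5, 3, 3) -> (4, 1, 9) & (4, 5, 9)
      out_lhs->back() *= lhs[i];
      out_rhs->back() *= rhs[i];
    } else {
      out_lhs->push_back(lhs[i]);
      out_rhs->push_back(rhs[i]);
    }
  }
}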
/*!
* \brief Compute the feature shape after binary reduce computation.
*/
std::vector<int64_t> InferBinaryFeatureShape(
runtime::NDArray lhs,
runtime::NDArray rhs);
/*!
* \brief Perform binary operation between the given data and reduce by the graph.
*
 * If the reducer is one of "sum", "max", "min", "prod", the operator computes,
* for each node i,
*
* out[i] = Sigma_{j\in Neighbor(i)} ( A[s1(i, j, e)] op B[s2(i, j, e)] )
*
* , where A, B are two input feature tensors, op could be element-wise add/sub/div/mul.
* Depending on the lhs and rhs target, s1 and s2 will select the src/dst/edge
* ids of each neighbor.
*
* If the reducer is "none", the operator computes, for each edge e,
*
* out[e] = A[s1(i, j, e)] op B[s2(i, j, e)]
*
 * Here, the node/edge feature (e.g., A[i], B[e]) could be a dense tensor. In such
 * a case, broadcasting is supported on the feature dimensions.
*
* Examples:
*
* A.shape = (N, D1, D2) # N is the number of nodes
* B.shape = (M, D1, 1) # M is the number of edges
* C = BinaryOpReduce("sum", "add", graph, A, B, ...)
* C.shape = (N, D1, D2)
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
*/
void BinaryOpReduce(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_data, runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping);
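As a plain reference for the semantics documented above (a sketch only, not the removed Minigun-based implementation), the "sum" reducer with the "add" operator, lhs = source node and rhs = edge, reduces to a loop over an in-edge CSR:

#include <cstdint>
#include <vector>

// Reference semantics: out[dst] = sum over in-edges e = (src, dst) of
// (src_feat[src] + edge_feat[e]), with feature dimension `dim`.
// indptr/indices/eids form an in-edge CSR: the in-edges of node i are at
// positions indptr[i]..indptr[i+1) with source ids indices[k] and edge ids eids[k].
std::vector<float> SumAddSrcEdge(
    const std::vector<int64_t>& indptr,
    const std::vector<int64_t>& indices,
    const std::vector<int64_t>& eids,
    const std::vector<float>& src_feat,
    const std::vector<float>& edge_feat,
    int64_t dim) {
  const int64_t num_nodes = static_cast<int64_t>(indptr.size()) - 1;
  std::vector<float> out(num_nodes * dim, 0.f);
  for (int64_t dst = 0; dst < num_nodes; ++dst) {
    for (int64_t k = indptr[dst]; k < indptr[dst + 1]; ++k) {
      const int64_t src = indices[k];
      const int64_t eid = eids[k];
      for (int64_t d = 0; d < dim; ++d) {
        out[dst * dim + d] += src_feat[src * dim + d] + edge_feat[eid * dim + d];
      }
    }
  }
  return out;
}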
/*!
* \brief Compute the lhs gradient of BinaryOpReduce
*
 * Broadcasting along feature dimensions is supported. However, the gradient
 * of the broadcast dimensions will *not* be reduced. Therefore, the
 * gradient tensor has the same shape as the out tensor.
*
* Examples:
* A.shape = (N, D1, 1) # N is the number of nodes
* B.shape = (M, D1, D2) # M is the number of edges
* C = BinaryOpReduce("sum", "add", graph, A, B, ...)
* C.shape = (N, D1, D2)
* dC.shape = (N, D1, D2)
* dA = BackwardLhsBinaryOpReduce("sum", "add", graph, A, B, C, dC, ...)
* dA.shape = (N, D1, D2) # extra reduction should be handled afterwards
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
* \param grad_lhs_data The gradient lhs tensor.
*/
void BackwardLhsBinaryOpReduce(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_mapping,
runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping,
runtime::NDArray lhs_data,
runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray grad_out_data,
runtime::NDArray grad_lhs_data);
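Since the broadcast dimensions are not reduced here (note the "extra reduction should be handled afterwards" remark above), a caller that fed an (N, D1, 1) lhs must itself sum the returned (N, D1, D2) gradient over the broadcast axis. A minimal sketch of that post-reduction on plain buffers, with a hypothetical helper name:

#include <cstdint>
#include <vector>

// Collapse the gradient of a broadcast trailing axis: (N, D1, D2) -> (N, D1, 1).
std::vector<float> ReduceLastAxis(const std::vector<float>& grad,
                                  int64_t n, int64_t d1, int64_t d2) {
  std::vector<float> out(n * d1, 0.f);
  for (int64_t i = 0; i < n * d1; ++i) {
    for (int64_t k = 0; k < d2; ++k) {
      out[i] += grad[i * d2 + k];
    }
  }
  return out;
}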
/*!
* \brief Compute the rhs gradient of BinaryOpReduce
*
 * Broadcasting along feature dimensions is supported. However, the gradient
 * of the broadcast dimensions will *not* be reduced. Therefore, the
 * gradient tensor has the same shape as the out tensor.
*
* Examples:
* A.shape = (N, D1, D2) # N is the number of nodes
* B.shape = (M, D1, 1) # M is the number of edges
* C = BinaryOpReduce("sum", "add", graph, A, B, ...)
* C.shape = (N, D1, D2)
* dC.shape = (N, D1, D2)
* dB = BackwardRhsBinaryOpReduce("sum", "add", graph, A, B, C, dC, ...)
* dB.shape = (N, D1, D2) # extra reduction should be handled afterwards
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
* \param grad_rhs_data The gradient rhs tensor.
*/
void BackwardRhsBinaryOpReduce(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_mapping,
runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping,
runtime::NDArray lhs_data,
runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray grad_out_data,
runtime::NDArray grad_rhs_data);
/*!
* \brief Copy the target data and reduce by graph structure.
*
 * If the reducer is one of "sum", "max", "min", "prod", the operator computes,
* for each node i,
*
* out[i] = Sigma_{j\in Neighbor(i)} A[s1(i, j, e)]
*
 * , where A is the input feature tensor.
 * Depending on the target, s1 will select the src/dst/edge
 * ids of each neighbor.
*
* If the reducer is "none", the operator computes, for each edge e,
*
* out[e] = A[s1(i, j, e)]
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param graph The graph object.
 * \param target The input target (src, edge)
* \param in_data The input feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param in_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
*/
void CopyReduce(
const std::string& reducer,
const CSRWrapper& graph,
binary_op::Target target,
runtime::NDArray in_data, runtime::NDArray out_data,
runtime::NDArray in_mapping, runtime::NDArray out_mapping);
/*!
* \brief Compute backward of the CopyReduce
*
* \param reducer The type of the reducer ("sum", "max", "prod", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param graph The graph object.
 * \param target The input target (src, edge)
* \param in_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param in_data The input feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
* \param grad_in_data The gradient input tensor.
*/
void BackwardCopyReduce(
const std::string& reducer,
const CSRWrapper& graph,
binary_op::Target target,
runtime::NDArray in_mapping,
runtime::NDArray out_mapping,
runtime::NDArray in_data,
runtime::NDArray out_data,
runtime::NDArray grad_out_data,
runtime::NDArray grad_in_data);
} // namespace kernel
} // namespace dgl
#endif // DGL_KERNEL_BINARY_REDUCE_H_
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/binary_reduce_impl.h
* \brief Implementations of binary reduce operations.
*/
#ifndef DGL_KERNEL_BINARY_REDUCE_IMPL_H_
#define DGL_KERNEL_BINARY_REDUCE_IMPL_H_
#include <minigun/minigun.h>
#include <dgl/runtime/device_api.h>
#include <algorithm>
#include <string>
#ifdef __CUDACC__
#include "../runtime/cuda/cuda_common.h"
#endif
#include "./binary_reduce.h"
#include "./binary_reduce_impl_decl.h"
#include "./csr_interface.h"
#include "./utils.h"
namespace dgl {
namespace kernel {
///////////////////////////////////////////////////////////////////////////////
// BinaryReduce device-agnostic implementation
///////////////////////////////////////////////////////////////////////////////
template <int XPU, typename Idx, typename DType, typename Reducer>
GData<Idx, DType> AllocGData(const std::string& op,
const DLContext& ctx, int64_t x_len,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
runtime::NDArray lhs_data, runtime::NDArray rhs_data,
runtime::NDArray out_mapping, runtime::NDArray out_data) {
// GData
GData<Idx, DType> gdata;
gdata.x_length = x_len;
gdata.lhs_data = static_cast<DType*>(lhs_data->data);
gdata.rhs_data = static_cast<DType*>(rhs_data->data);
gdata.out_data = static_cast<DType*>(out_data->data);
if (!aten::IsNullArray(lhs_mapping)) {
gdata.lhs_mapping = static_cast<Idx*>(lhs_mapping->data);
}
if (!aten::IsNullArray(rhs_mapping)) {
gdata.rhs_mapping = static_cast<Idx*>(rhs_mapping->data);
}
if (!aten::IsNullArray(out_mapping)) {
gdata.out_mapping = static_cast<Idx*>(out_mapping->data);
}
// for dot operation: vector [dot] vector
if (op == binary_op::kDot) {
// get size of vector
gdata.data_len = lhs_data->shape[lhs_data->ndim - 1];
} else {
gdata.data_len = 1;
}
// fill out data with zero values
utils::Fill<XPU>(ctx, gdata.out_data, utils::NElements(out_data), Zero<Reducer>::value);
return gdata;
}
template <int XPU>
void BinaryReduceImpl(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_data, runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping) {
using runtime::NDArray;
using minigun::Csr;
// device
#ifdef __CUDACC__
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
#endif
const int64_t x_len = utils::ComputeXLength(out_data);
// advance config
minigun::advance::RuntimeConfig rtcfg;
rtcfg.ctx = out_data->ctx;
#ifdef __CUDACC__
rtcfg.stream = thr_entry->stream;
const int nt = utils::FindNumThreads(x_len, 64);
rtcfg.data_num_threads = nt;
// XXX(minjie): hard-code to let each thread compute two elements to increase
// instruction level parallelism
rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2);
#endif
if (reducer == binary_op::kReduceMean) {
// TODO(minjie): divide
LOG(FATAL) << "reduce mean is not supported.";
}
const DLDataType& dtype = out_data->dtype;
const auto bits = graph.NumBits();
DGL_DTYPE_SWITCH(dtype, DType, {
DGL_IDX_TYPE_SWITCH(bits, Idx, {
REDUCER_SWITCH(reducer, XPU, DType, Reducer, {
auto gdata = AllocGData<XPU, Idx, DType, Reducer>(op,
rtcfg.ctx, x_len, lhs_mapping, rhs_mapping,
lhs_data, rhs_data, out_mapping, out_data);
OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, {
CallBinaryReduce<XPU, Idx, DType, LeftTarget,
RightTarget, BinaryOp, Reducer>(rtcfg, graph, &gdata);
});
});
});
});
}
///////////////////////////////////////////////////////////////////////////////
// BackwardBinaryReduce device-agnostic implementation
///////////////////////////////////////////////////////////////////////////////
template <int XPU, typename Idx, typename DType>
BackwardGData<Idx, DType> AllocBackwardGData(
const std::string& op, const DLContext& ctx, int64_t x_len,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
runtime::NDArray grad_out_data,
runtime::NDArray grad_lhs_data, runtime::NDArray grad_rhs_data) {
// GData
BackwardGData<Idx, DType> gdata;
gdata.x_length = x_len;
gdata.lhs_data = static_cast<DType*>(lhs_data->data);
gdata.rhs_data = static_cast<DType*>(rhs_data->data);
gdata.out_data = static_cast<DType*>(out_data->data);
gdata.grad_out_data = static_cast<DType*>(grad_out_data->data);
if (!aten::IsNullArray(grad_lhs_data)) {
gdata.grad_lhs_data = static_cast<DType*>(grad_lhs_data->data);
// fill out data with zero values
utils::Fill<XPU>(ctx, gdata.grad_lhs_data, utils::NElements(grad_lhs_data),
static_cast<DType>(0));
}
if (!aten::IsNullArray(grad_rhs_data)) {
gdata.grad_rhs_data = static_cast<DType*>(grad_rhs_data->data);
// fill out data with zero values
utils::Fill<XPU>(ctx, gdata.grad_rhs_data, utils::NElements(grad_rhs_data),
static_cast<DType>(0));
}
if (!aten::IsNullArray(lhs_mapping)) {
gdata.lhs_mapping = static_cast<Idx*>(lhs_mapping->data);
}
if (!aten::IsNullArray(rhs_mapping)) {
gdata.rhs_mapping = static_cast<Idx*>(rhs_mapping->data);
}
if (!aten::IsNullArray(out_mapping)) {
gdata.out_mapping = static_cast<Idx*>(out_mapping->data);
}
// for dot operation: vector [dot] vector
if (op == binary_op::kDot) {
// get size of vector
gdata.data_len = lhs_data->shape[lhs_data->ndim - 1];
} else {
gdata.data_len = 1;
}
return gdata;
}
template <int XPU>
void BackwardBinaryReduceImpl(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
runtime::NDArray grad_out_data,
runtime::NDArray grad_lhs_data, runtime::NDArray grad_rhs_data) {
using runtime::NDArray;
using minigun::Csr;
#ifdef __CUDACC__
// device
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
#endif
// Graph
const int64_t x_len = utils::ComputeXLength(out_data);
// advance config
minigun::advance::RuntimeConfig rtcfg;
rtcfg.ctx = out_data->ctx;
#ifdef __CUDACC__
rtcfg.stream = thr_entry->stream;
const int nt = utils::FindNumThreads(x_len, 64);
rtcfg.data_num_threads = nt;
// XXX(minjie): hard-code to let each thread compute two elements to increase
// instruction level parallelism
rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2);
#endif
const DLDataType& dtype = out_data->dtype;
const bool req_lhs = !aten::IsNullArray(grad_lhs_data);
const bool req_rhs = !aten::IsNullArray(grad_rhs_data);
const auto bits = graph.NumBits();
if (reducer == binary_op::kReduceMean) {
// TODO(minjie): divide
LOG(FATAL) << "reduce mean is not supported.";
}
DGL_DTYPE_SWITCH(dtype, DType, {
DGL_IDX_TYPE_SWITCH(bits, Idx, {
auto gdata = AllocBackwardGData<XPU, Idx, DType>(op,
rtcfg.ctx, x_len, lhs_mapping, rhs_mapping, out_mapping,
lhs_data, rhs_data, out_data, grad_out_data,
grad_lhs_data, grad_rhs_data);
BACKWARD_MODE_SWITCH(req_lhs, req_rhs, Mode, {
REDUCER_SWITCH(reducer, XPU, DType, Reducer, {
OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, {
CallBackwardBinaryReduce<XPU, Mode, Idx, DType, LeftTarget,
RightTarget, BinaryOp, Reducer>(rtcfg, graph, &gdata);
});
});
});
});
});
}
///////////////////////////////////////////////////////////////////////////////
// BinaryReduceBcast device-agnostic implementation
///////////////////////////////////////////////////////////////////////////////
template <int XPU, int NDim, typename Idx, typename DType, typename Reducer>
BcastGData<NDim, Idx, DType> AllocBcastGData(
const DLContext& ctx, const BcastInfo& info,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
runtime::NDArray lhs_data, runtime::NDArray rhs_data,
runtime::NDArray out_mapping, runtime::NDArray out_data) {
// GData
BcastGData<NDim, Idx, DType> gdata;
// dim, shape and stride
gdata.ndim = info.lhs_shape.size();
std::copy(info.lhs_shape.begin(), info.lhs_shape.end(), gdata.lhs_shape);
std::copy(info.lhs_stride.begin(), info.lhs_stride.end(), gdata.lhs_stride);
std::copy(info.rhs_shape.begin(), info.rhs_shape.end(), gdata.rhs_shape);
std::copy(info.rhs_stride.begin(), info.rhs_stride.end(), gdata.rhs_stride);
std::copy(info.out_shape.begin(), info.out_shape.end(), gdata.out_shape);
std::copy(info.out_stride.begin(), info.out_stride.end(), gdata.out_stride);
gdata.lhs_len = utils::Prod(info.lhs_shape);
gdata.rhs_len = utils::Prod(info.rhs_shape);
gdata.out_len = utils::Prod(info.out_shape);
// data
gdata.lhs_data = static_cast<DType*>(lhs_data->data);
gdata.rhs_data = static_cast<DType*>(rhs_data->data);
gdata.out_data = static_cast<DType*>(out_data->data);
if (!aten::IsNullArray(lhs_mapping)) {
gdata.lhs_mapping = static_cast<Idx*>(lhs_mapping->data);
}
if (!aten::IsNullArray(rhs_mapping)) {
gdata.rhs_mapping = static_cast<Idx*>(rhs_mapping->data);
}
if (!aten::IsNullArray(out_mapping)) {
gdata.out_mapping = static_cast<Idx*>(out_mapping->data);
}
gdata.data_len = info.data_len;
// fill out data with zero values
utils::Fill<XPU>(ctx, gdata.out_data, utils::NElements(out_data), Zero<Reducer>::value);
return gdata;
}
template <int XPU>
void BinaryReduceBcastImpl(
const BcastInfo& info,
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs,
binary_op::Target rhs,
runtime::NDArray lhs_data,
runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray lhs_mapping,
runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping) {
using runtime::NDArray;
using minigun::Csr;
#ifdef __CUDACC__
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
#endif
// advance config
minigun::advance::RuntimeConfig rtcfg;
rtcfg.ctx = out_data->ctx;
#ifdef __CUDACC__
rtcfg.stream = thr_entry->stream;
const int64_t x_len = utils::ComputeXLength(out_data);
const int nt = utils::FindNumThreads(x_len, 64);
rtcfg.data_num_threads = nt;
// XXX(minjie): hard-code to let each thread compute two elements to increase
// instruction level parallelism
rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2);
#endif
const DLDataType& dtype = out_data->dtype;
const int bcast_ndim = info.out_shape.size();
const auto bits = graph.NumBits();
if (reducer == binary_op::kReduceMean) {
// TODO(minjie): divide
LOG(FATAL) << "reduce mean is not supported.";
}
DGL_DTYPE_SWITCH(dtype, DType, {
DGL_IDX_TYPE_SWITCH(bits, Idx, {
REDUCER_SWITCH(reducer, XPU, DType, Reducer, {
BCAST_NDIM_SWITCH(bcast_ndim, NDim, {
auto gdata = AllocBcastGData<XPU, NDim, Idx, DType, Reducer>(
rtcfg.ctx, info, lhs_mapping, rhs_mapping,
lhs_data, rhs_data, out_mapping, out_data);
OP_TARGET_SWITCH(op, lhs, rhs, DType, BinaryOp, LeftTarget, RightTarget, {
CallBinaryReduceBcast<XPU, NDim, Idx, DType, LeftTarget,
RightTarget, BinaryOp, Reducer>(rtcfg, graph, &gdata);
});
});
});
});
});
}
///////////////////////////////////////////////////////////////////////////////
// BackwardBinaryReduceBcast device-agnostic implementation
///////////////////////////////////////////////////////////////////////////////
template <int XPU, int NDim, typename Idx, typename DType>
BackwardBcastGData<NDim, Idx, DType> AllocBackwardBcastGData(
const DLContext& ctx, const BcastInfo& info,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
runtime::NDArray lhs, runtime::NDArray rhs, runtime::NDArray out, runtime::NDArray grad_out,
runtime::NDArray grad_lhs, runtime::NDArray grad_rhs) {
// GData
BackwardBcastGData<NDim, Idx, DType> gdata;
// dim, shape and stride
gdata.ndim = info.lhs_shape.size();
gdata.lhs_len = utils::Prod(info.lhs_shape);
gdata.rhs_len = utils::Prod(info.rhs_shape);
gdata.out_len = utils::Prod(info.out_shape);
std::copy(info.lhs_shape.begin(), info.lhs_shape.end(), gdata.lhs_shape);
std::copy(info.lhs_stride.begin(), info.lhs_stride.end(), gdata.lhs_stride);
std::copy(info.rhs_shape.begin(), info.rhs_shape.end(), gdata.rhs_shape);
std::copy(info.rhs_stride.begin(), info.rhs_stride.end(), gdata.rhs_stride);
std::copy(info.out_shape.begin(), info.out_shape.end(), gdata.out_shape);
std::copy(info.out_stride.begin(), info.out_stride.end(), gdata.out_stride);
// mappings
if (!aten::IsNullArray(lhs_mapping)) {
gdata.lhs_mapping = static_cast<Idx*>(lhs_mapping->data);
}
if (!aten::IsNullArray(rhs_mapping)) {
gdata.rhs_mapping = static_cast<Idx*>(rhs_mapping->data);
}
if (!aten::IsNullArray(out_mapping)) {
gdata.out_mapping = static_cast<Idx*>(out_mapping->data);
}
gdata.data_len = info.data_len;
// data
gdata.lhs_data = static_cast<DType*>(lhs->data);
gdata.rhs_data = static_cast<DType*>(rhs->data);
gdata.out_data = static_cast<DType*>(out->data);
gdata.grad_out_data = static_cast<DType*>(grad_out->data);
if (!aten::IsNullArray(grad_lhs)) {
gdata.grad_lhs_data = static_cast<DType*>(grad_lhs->data);
// fill out data with zero values
utils::Fill<XPU>(ctx, gdata.grad_lhs_data, utils::NElements(grad_lhs),
static_cast<DType>(0));
}
if (!aten::IsNullArray(grad_rhs)) {
gdata.grad_rhs_data = static_cast<DType*>(grad_rhs->data);
// fill out data with zero values
utils::Fill<XPU>(ctx, gdata.grad_rhs_data, utils::NElements(grad_rhs),
static_cast<DType>(0));
}
return gdata;
}
template <int XPU>
void BackwardBinaryReduceBcastImpl(
const BcastInfo& info,
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs_tgt, binary_op::Target rhs_tgt,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
runtime::NDArray lhs, runtime::NDArray rhs, runtime::NDArray out, runtime::NDArray grad_out,
runtime::NDArray grad_lhs, runtime::NDArray grad_rhs) {
using runtime::NDArray;
using minigun::Csr;
#ifdef __CUDACC__
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
#endif
// advance config
minigun::advance::RuntimeConfig rtcfg;
rtcfg.ctx = out->ctx;
#ifdef __CUDACC__
rtcfg.stream = thr_entry->stream;
const int64_t x_len = utils::ComputeXLength(out);
const int nt = utils::FindNumThreads(x_len, 64);
rtcfg.data_num_threads = nt;
// XXX(minjie): hard-code to let each thread compute two elements to increase
// instruction level parallelism
rtcfg.data_num_blocks = (x_len + (nt * 2) - 1) / (nt * 2);
#endif
const DLDataType& dtype = out->dtype;
const int bcast_ndim = info.out_shape.size();
const bool req_lhs = !aten::IsNullArray(grad_lhs);
const bool req_rhs = !aten::IsNullArray(grad_rhs);
const auto bits = graph.NumBits();
if (reducer == binary_op::kReduceMean) {
// TODO(minjie): divide
LOG(FATAL) << "reduce mean is not supported.";
}
DGL_DTYPE_SWITCH(dtype, DType, {
DGL_IDX_TYPE_SWITCH(bits, Idx, {
BCAST_NDIM_SWITCH(bcast_ndim, NDim, {
auto gdata = AllocBackwardBcastGData<XPU, NDim, Idx, DType>(
rtcfg.ctx, info,
lhs_mapping, rhs_mapping, out_mapping,
lhs, rhs, out, grad_out,
grad_lhs, grad_rhs);
BACKWARD_MODE_SWITCH(req_lhs, req_rhs, Mode, {
REDUCER_SWITCH(reducer, XPU, DType, Reducer, {
OP_TARGET_SWITCH(op, lhs_tgt, rhs_tgt, DType, BinaryOp, LeftTarget, RightTarget, {
CallBackwardBinaryReduceBcast<XPU, Mode, NDim, Idx, DType,
LeftTarget, RightTarget, BinaryOp, Reducer>(rtcfg, graph, &gdata);
});
});
});
});
});
});
}
} // namespace kernel
} // namespace dgl
#endif // DGL_KERNEL_BINARY_REDUCE_IMPL_H_
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/binary_reduce_impl_decl.h
* \brief Data structure and function declarations for implementations.
*/
#ifndef DGL_KERNEL_BINARY_REDUCE_IMPL_DECL_H_
#define DGL_KERNEL_BINARY_REDUCE_IMPL_DECL_H_
#include <dgl/runtime/ndarray.h>
#include <string>
#include "./binary_reduce_common.h"
#include "./csr_interface.h"
namespace minigun {
namespace advance {
// forward declaration
struct RuntimeConfig;
} // namespace advance
} // namespace minigun
namespace dgl {
namespace kernel {
// forward declaration
struct BcastInfo;
///////////////////////////////////////////////////////////////////////////////
// BinaryReduce declarations
///////////////////////////////////////////////////////////////////////////////
/*!\brief Data structure used by computing BinaryOpReduce in Minigun. */
template <typename Idx, typename DType>
struct GData {
// length along x(feature) dimension
int64_t x_length{0};
// size of data, can be single value or a vector
int64_t data_len{0};
// input data
DType *lhs_data{nullptr}, *rhs_data{nullptr};
// output data
DType *out_data{nullptr};
// input id mappings
Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr};
// output id mapping
Idx *out_mapping{nullptr};
};
/*!
* \brief Template declaration for BinaryReduce operator.
*
* LeftSelector and RightSelector must be one of the four operand target
* categories.
*
* BinaryOp must be one of the binary operator types.
*
* Reducer must be one of the reducer types.
*
* The implementation of this template is device-dependent
* (see kernel/xpu/binary_reduce_impl.(cu)h).
*
* See definitions in binary_reduce_common.h
*
* \tparam XPU the device flag
* \tparam Idx type of node/edge index (e.g. int32_t, int64_t)
* \tparam DType type of the feature data (e.g. float32)
* \tparam LeftSelect lhs category type
* \tparam RightSelect rhs category type
* \tparam BinaryOp Binary operator type
* \tparam Reducer Reducer type
 * \param rtcfg Runtime configuration used by minigun
* \param graph The graph object.
* \param gdata The feature and mapping data used by the computation.
*/
template <int XPU, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBinaryReduce(
const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
GData<Idx, DType>* gdata);
/*!
* \brief Template declaration for common logics shared by different devices.
*
* \tparam XPU the device flag
* \param reducer The type of the reducer ("sum", "max", "mean", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
*/
template <int XPU>
void BinaryReduceImpl(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping);
///////////////////////////////////////////////////////////////////////////////
// BackwardBinaryReduce declarations
///////////////////////////////////////////////////////////////////////////////
/*!\brief Data structure used by computing BackwardBinaryReduce in Minigun. */
template <typename Idx, typename DType>
struct BackwardGData {
// length along x(feature) dimension
int64_t x_length{0};
// size of data, can be single value or a vector
int64_t data_len{0};
// input data
DType *lhs_data{nullptr}, *rhs_data{nullptr}, *out_data{nullptr};
DType *grad_out_data{nullptr};
// output data
DType *grad_lhs_data{nullptr}, *grad_rhs_data{nullptr};
// input id mappings
Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr};
// output id mapping
Idx *out_mapping{nullptr};
};
/*!
* \brief Template declaration for BackwardBinaryReduce operator.
*
* Mode must be one of the enum code in binary_op::BackwardMode.
*
* LeftSelector and RightSelector must be one of the four operand target
* categories.
*
* BinaryOp must be one of the binary operator types.
*
* Reducer must be one of the reducer types.
*
* The implementation of this template is device-dependent
* (see kernel/xpu/backward_binary_reduce_impl.(cu)h).
*
* See definitions in binary_reduce_common.h
*
* \tparam XPU the device flag
* \tparam Mode the backward mode code
* \tparam Idx type of node/edge index (e.g. int32_t, int64_t)
* \tparam DType type of the feature data (e.g. float32)
* \tparam LeftSelect lhs category type
* \tparam RightSelect rhs category type
* \tparam BinaryOp Binary operator type
* \tparam Reducer Reducer type
 * \param rtcfg Runtime configuration used by minigun
* \param graph The graph object.
* \param gdata The feature and mapping data used by the computation.
*/
template <int XPU, int Mode, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduce(
const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BackwardGData<Idx, DType>* gdata);
/*!
* \brief Template declaration for common logics shared by different devices.
*
* \tparam XPU the device flag
* \param reducer The type of the reducer ("sum", "max", "mean", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
 * \param grad_lhs_data The gradient lhs tensor.
 * \param grad_rhs_data The gradient rhs tensor.
*/
template <int XPU>
void BackwardBinaryReduceImpl(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
runtime::NDArray grad_out_data,
runtime::NDArray grad_lhs_data, runtime::NDArray grad_rhs_data);
///////////////////////////////////////////////////////////////////////////////
// BinaryReduce with broadcasting declarations
///////////////////////////////////////////////////////////////////////////////
/*!
* \brief Data structure used by computing BinaryOp with broadcasting in Minigun.
*
* Note that all the shapes and strides are for the feature dimensions.
*
* \tparam NDim maximum number of feature dimensions
* \tparam Idx id index type
* \tparam DType feature data type
*/
template <int NDim, typename Idx, typename DType>
struct BcastGData {
// actual number of feature dimensions
int ndim{0};
// input feature shape and stride
int64_t lhs_len{0}, rhs_len{0};
int64_t lhs_shape[NDim]{0}, lhs_stride[NDim]{0};
int64_t rhs_shape[NDim]{0}, rhs_stride[NDim]{0};
// size of data, can be single value or a vector
int64_t data_len{0};
// input data
DType *lhs_data{nullptr}, *rhs_data{nullptr};
// input id mappings
Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr};
// output feature shape and stride
int64_t out_len{0}; // output total feature length (equal to prod(out_shape));
int64_t out_shape[NDim]{0}, out_stride[NDim]{0};
// output data
DType *out_data{nullptr};
// output id mapping
Idx *out_mapping{nullptr};
};
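The kernels that consume BcastGData (see the CPU UDFs further down in this commit, which call Unravel and Ravel) address each operand by unraveling the flat output index against out_shape/out_stride and re-raveling it against that operand's own shape and stride, so a dimension of size 1 contributes nothing to the operand offset. A minimal sketch of that addressing scheme, assuming row-major strides (sketch names only, not the removed utility code):

#include <algorithm>
#include <cstdint>

// Sketch: decompose a flat output index into per-dimension coordinates.
inline void UnravelSketch(int64_t idx, int ndim,
                          const int64_t* shape, const int64_t* stride,
                          int64_t* coord) {
  for (int d = 0; d < ndim; ++d) {
    coord[d] = (idx / stride[d]) % shape[d];
  }
}

// Sketch: recombine coordinates into an operand offset; a broadcast
// dimension (shape[d] == 1) is clamped to coordinate 0 and adds nothing.
inline int64_t RavelSketch(const int64_t* coord, int ndim,
                           const int64_t* shape, const int64_t* stride) {
  int64_t offset = 0;
  for (int d = 0; d < ndim; ++d) {
    offset += std::min(coord[d], shape[d] - 1) * stride[d];
  }
  return offset;
}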
/*!
* \brief Template declaration for BinaryReduce with broadcasting operator.
*
* LeftSelector and RightSelector must be one of the four operand target
* categories.
*
* BinaryOp must be one of the binary operator types.
*
* Reducer must be one of the reducer types.
*
* The implementation of this template is device-dependent
* (see kernel/xpu/binary_reduce_impl.(cu)h).
*
* See definitions in binary_reduce_common.h
*
* \tparam XPU the device flag
* \tparam NDim maximum number of feature dimensions
* \tparam Idx type of node/edge index (e.g. int32_t, int64_t)
* \tparam DType type of the feature data (e.g. float32)
* \tparam LeftSelect lhs category type
* \tparam RightSelect rhs category type
 * \tparam BinaryOp binary operator type
 * \tparam Reducer reducer type
 * \param rtcfg runtime configuration used by minigun
* \param graph The graph object.
* \param gdata The feature and mapping data used by the computation.
*/
template <int XPU, int NDim, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBinaryReduceBcast(
const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BcastGData<NDim, Idx, DType>* gdata);
/*!
* \brief Template declaration for common logics shared by different devices.
*
* \tparam XPU the device flag
* \param reducer The type of the reducer ("sum", "max", "mean", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
*/
template <int XPU>
void BinaryReduceBcastImpl(
const BcastInfo& info,
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_data, runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping);
///////////////////////////////////////////////////////////////////////////////
// BackwardBinaryReduce with broadcasting declarations
///////////////////////////////////////////////////////////////////////////////
/*!
* \brief Data and auxiliary information for backward binary broadcasting op.
*
* Note that all the shapes and strides are for the feature dimensions.
*
* The gradients of the broadcasting dimensions are not reduced. As a result,
 * the grad_lhs and grad_rhs have the same shape as grad_out.
*
* \tparam NDim maximum number of feature dimensions
* \tparam Idx id index type
* \tparam DType feature data type
*/
template <int NDim, typename Idx, typename DType>
struct BackwardBcastGData {
// actual number of feature dimensions
int ndim{0};
// input shape and stride
int64_t lhs_len{0}, rhs_len{0}, out_len{0};
int64_t lhs_shape[NDim]{0}, lhs_stride[NDim]{0};
int64_t rhs_shape[NDim]{0}, rhs_stride[NDim]{0};
int64_t out_shape[NDim]{0}, out_stride[NDim]{0};
// size of data, can be single value or a vector
int64_t data_len{0};
// input id mappings
Idx *lhs_mapping{nullptr}, *rhs_mapping{nullptr}, *out_mapping{nullptr};
// input data
DType *lhs_data{nullptr}, *rhs_data{nullptr}, *out_data{nullptr};
DType *grad_out_data{nullptr};
// output data
DType *grad_lhs_data{nullptr}, *grad_rhs_data{nullptr};
};
/*!
* \brief Template declaration for BackwardBinaryReduce with broadcasting operator.
*
* LeftSelector and RightSelector must be one of the four operand target
* categories.
*
* BinaryOp must be one of the binary operator types.
*
* Reducer must be one of the reducer types.
*
* The implementation of this template is device-dependent
* (see kernel/xpu/binary_reduce_impl.(cu)h).
*
* See definitions in binary_reduce_common.h
*
* \tparam XPU the device flag
* \tparam Mode the backward mode code
* \tparam NDim maximum number of feature dimensions
* \tparam Idx type of node/edge index (e.g. int32_t, int64_t)
* \tparam DType type of the feature data (e.g. float32)
* \tparam LeftSelect lhs category type
* \tparam RightSelect rhs category type
 * \tparam BinaryOp binary operator type
 * \tparam Reducer reducer type
 * \param rtcfg runtime configuration used by minigun
* \param graph The graph object.
* \param gdata The feature and mapping data used by the computation.
*/
template <int XPU, int Mode, int NDim, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduceBcast(
const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BackwardBcastGData<NDim, Idx, DType>* gdata);
/*!
* \brief Template declaration for common logics shared by different devices.
*
* \tparam XPU the device flag
* \param reducer The type of the reducer ("sum", "max", "mean", "min", "none").
* If the reducer is "none", the output is an edge feature tensor.
* Otherwise, a node feature tensor is returned.
* \param op The type of the binary operator ("mul", "add").
* \param graph The graph object.
* \param lhs The lhs target (src, dst, edge)
* \param rhs The rhs target (src, dst, edge)
* \param lhs_mapping An optional int64 id mapping array.
* \param rhs_mapping An optional int64 id mapping array.
* \param out_mapping An optional int64 id mapping array.
* \param lhs_data The lhs feature tensor.
* \param rhs_data The rhs feature tensor.
* \param out_data The output tensor. Could be either node or edge feature
* tensor depending on the reducer.
* \param grad_out_data The gradient output tensor.
 * \param grad_lhs_data The gradient lhs tensor.
 * \param grad_rhs_data The gradient rhs tensor.
*/
template <int XPU>
void BackwardBinaryReduceBcastImpl(
const BcastInfo& info,
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
runtime::NDArray lhs_data, runtime::NDArray rhs_data, runtime::NDArray out_data,
runtime::NDArray grad_out_data,
runtime::NDArray grad_lhs_data, runtime::NDArray grad_rhs_data);
} // namespace kernel
} // namespace dgl
#endif // DGL_KERNEL_BINARY_REDUCE_IMPL_DECL_H_
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/common.h
* \brief Kernel common utilities
*/
#ifndef DGL_KERNEL_COMMON_H_
#define DGL_KERNEL_COMMON_H_
#include <dgl/runtime/ndarray.h>
#include <cstdint>
#include "../c_api_common.h"
namespace dgl {
namespace kernel {
#ifdef __CUDACC__
#define DGLDEVICE __device__
#define DGLINLINE __forceinline__
#else
#define DGLDEVICE
#define DGLINLINE inline
#endif // __CUDACC__
// Macro for dispatch device flag to template function calls
#ifdef DGL_USE_CUDA
#define DGL_XPU_SWITCH(val, Method, ...) \
if (val == kDLCPU) { \
Method<kDLCPU>(__VA_ARGS__); \
} else if (val == kDLGPU) { \
Method<kDLGPU>(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unsupported device type: " << val; \
}
#else // DGL_USE_CUDA
#define DGL_XPU_SWITCH(val, Method, ...) \
if (val == kDLCPU) { \
Method<kDLCPU>(__VA_ARGS__); \
} else { \
LOG(FATAL) << "Unsupported device type: " << val; \
}
#endif // DGL_USE_CUDA
// MSVC does not expand __VA_ARGS__ correctly, and needs this expand hack
#define MSVC_EXPAND(x) x
// Macro for dispatch dtype flag to template argument. Currently only
// support float32.
#define DGL_DTYPE_SWITCH(val, DType, ...) \
if (val.code == kDLFloat && val.bits == 32) { \
typedef float DType; \
{ __VA_ARGS__ } \
} else { \
LOG(FATAL) << "Unsupported dtype: " << val; \
}
// Macro for unrolling with data type arguments.
#define GEN_DTYPE(GEN, ...) \
MSVC_EXPAND(GEN(__VA_ARGS__, float))
// Macro for dispatch index nbits to template argument.
#ifdef __CUDACC__
#define DGL_IDX_TYPE_SWITCH(bits, Idx, ...) \
if (bits == 32) { \
typedef int32_t Idx; \
{__VA_ARGS__} \
} else { \
LOG(FATAL) << "Unsupported idx bits: " << bits; \
}
#else
#define DGL_IDX_TYPE_SWITCH(bits, Idx, ...) \
if (bits == 32) { \
typedef int32_t Idx; \
{__VA_ARGS__} \
} else if (bits == 64) { \
typedef int64_t Idx; \
{__VA_ARGS__} \
} else { \
LOG(FATAL) << "Unsupported idx bits: " << bits; \
}
#endif
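These switch macros are meant to be nested, turning runtime device, dtype, and index-width flags into compile-time template parameters. A hedged usage sketch (FillZeros/FillZerosImpl are made-up names; the real call sites are the *Impl functions elsewhere in this commit, and DLContext/DLDataType come from the headers already included here):

// Hypothetical device-agnostic entry point showing how the macros nest:
// DGL_XPU_SWITCH picks the XPU template argument, then DGL_DTYPE_SWITCH and
// DGL_IDX_TYPE_SWITCH bind DType and Idx inside the chosen instantiation.
template <int XPU>
void FillZerosImpl(DLDataType dtype, int bits, void* data, int64_t n) {
  DGL_DTYPE_SWITCH(dtype, DType, {
    DGL_IDX_TYPE_SWITCH(bits, Idx, {
      DType* ptr = static_cast<DType*>(data);
      for (Idx i = 0; i < static_cast<Idx>(n); ++i) {
        ptr[i] = static_cast<DType>(0);  // CPU-style loop, for illustration only
      }
    });
  });
}

inline void FillZeros(DLContext ctx, DLDataType dtype, int bits, void* data, int64_t n) {
  // Expands to FillZerosImpl<kDLCPU>(...) or, with DGL_USE_CUDA, FillZerosImpl<kDLGPU>(...).
  DGL_XPU_SWITCH(ctx.device_type, FillZerosImpl, dtype, bits, data, n);
}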
} // namespace kernel
} // namespace dgl
#endif // DGL_KERNEL_COMMON_H_
/*!
* Copyright (c) 2019 by Contributors
 * \file kernel/cpu/backward_binary_reduce_impl.h
 * \brief Minigun CPU UDFs for backward binary reduce
*/
#ifndef DGL_KERNEL_CPU_BACKWARD_BINARY_REDUCE_IMPL_H_
#define DGL_KERNEL_CPU_BACKWARD_BINARY_REDUCE_IMPL_H_
#include <minigun/minigun.h>
#include "../binary_reduce_impl_decl.h"
#include "../utils.h"
#include "./functor.h"
#include "../csr_interface.h"
namespace dgl {
namespace kernel {
namespace cpu {
// Minigun UDF to compute backward binary reduce.
template <int Mode, typename Idx, typename DType, typename Functors>
struct BackwardBinaryReduce {
static inline bool CondEdge(
Idx src, Idx dst, Idx eid, BackwardGData<Idx, DType>* gdata) {
return true;
}
static inline void ApplyEdge(
Idx src, Idx dst, Idx eid, BackwardGData<Idx, DType>* gdata) {
const int64_t D = gdata->x_length;
const int64_t len = gdata->data_len;
Idx lid = Functors::SelectLeft(src, eid, dst);
Idx rid = Functors::SelectRight(src, eid, dst);
Idx oid = Functors::SelectOut(src, eid, dst);
if (gdata->lhs_mapping) {
lid = Functors::GetId(lid, gdata->lhs_mapping);
}
if (gdata->rhs_mapping) {
rid = Functors::GetId(rid, gdata->rhs_mapping);
}
if (gdata->out_mapping) {
oid = Functors::GetId(oid, gdata->out_mapping);
}
DType* lhsoff = gdata->lhs_data + lid * D * len;
DType* rhsoff = gdata->rhs_data + rid * D * len;
DType* outoff = gdata->out_data + oid * D;
DType* gradlhsoff = gdata->grad_lhs_data + lid * D * len;
DType* gradrhsoff = gdata->grad_rhs_data + rid * D * len;
DType* gradoutoff = gdata->grad_out_data + oid * D;
for (int64_t tx = 0; tx < D; ++tx) {
DType out = Functors::Read(outoff + tx);
DType grad_out = Functors::Read(gradoutoff + tx);
DType e = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len);
DType grad_e = grad_out * Functors::BackwardWrite(e, out);
if (0 == grad_e)
continue;
DType* lhs_base = lhsoff + tx * len;
DType* rhs_base = rhsoff + tx * len;
if (Mode == binary_op::kGradBoth) {
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
DType grad = grad_lhs + grad_rhs;
#pragma omp atomic
gradlhsoff[tx * len + i] += grad;
}
} else if (Mode == binary_op::kGradLhs) {
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
#pragma omp atomic
gradlhsoff[tx * len + i] += grad_lhs;
}
} else if (Mode == binary_op::kGradRhs) {
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
#pragma omp atomic
gradrhsoff[tx * len + i] += grad_rhs;
}
}
}
}
};
// Minigun UDF to compute backward binary reduce with broadcasting.
template <int Mode, int NDim,
typename Idx, typename DType, typename Functors>
struct BackwardBinaryReduceBcast {
static inline bool CondEdge(
Idx src, Idx dst, Idx eid, BackwardBcastGData<NDim, Idx, DType>* gdata) {
return true;
}
static inline void ApplyEdge(
Idx src, Idx dst, Idx eid, BackwardBcastGData<NDim, Idx, DType>* gdata) {
const int64_t len = gdata->data_len;
Idx lid = Functors::SelectLeft(src, eid, dst);
Idx rid = Functors::SelectRight(src, eid, dst);
Idx oid = Functors::SelectOut(src, eid, dst);
if (gdata->lhs_mapping) {
lid = Functors::GetId(lid, gdata->lhs_mapping);
}
if (gdata->rhs_mapping) {
rid = Functors::GetId(rid, gdata->rhs_mapping);
}
if (gdata->out_mapping) {
oid = Functors::GetId(oid, gdata->out_mapping);
}
DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len;
DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len;
DType* outoff = gdata->out_data + oid * gdata->out_len;
DType* gradlhsoff = gdata->grad_lhs_data + lid * gdata->out_len * len;
DType* gradrhsoff = gdata->grad_rhs_data + rid * gdata->out_len * len;
DType* gradoutoff = gdata->grad_out_data + oid * gdata->out_len;
int64_t tmp[NDim]; // store unraveled idx.
for (int64_t tx = 0; tx < gdata->out_len; ++tx) {
Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp);
DType out = Functors::Read(outoff + tx);
DType grad_out = Functors::Read(gradoutoff + tx);
DType e = Functors::Op(
lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len,
rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len,
len);
DType grad_e = grad_out * Functors::BackwardWrite(e, out);
// (pawelpiotrowicz) Although we can technically add the same condition for
// skipping atomic additions as in BackwardBinaryReduce, doing so made the
// speed 2% slower in GCMC training on MovieLens-1M with 24 OpenMP threads.
// For more details, see https://github.com/dmlc/dgl/pull/1527.
// TODO(BarclayII): Needs further investigation and benchmarking.
DType* lhs_base = lhsoff +
Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len;
DType* rhs_base = rhsoff +
Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len;
if (Mode == binary_op::kGradBoth) {
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
DType grad = grad_lhs + grad_rhs;
#pragma omp atomic
gradlhsoff[tx * len + i] += grad;
}
} else if (Mode == binary_op::kGradLhs) {
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
#pragma omp atomic
gradlhsoff[tx * len + i] += grad_lhs;
}
} else if (Mode == binary_op::kGradRhs) {
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
#pragma omp atomic
gradrhsoff[tx * len + i] += grad_rhs;
}
}
}
}
};
// Auxiliary template used in UDF.
template <typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
struct BackwardFunctorsTempl {
static inline Idx SelectOut(
Idx src, Idx edge, Idx dst) {
typedef typename OutSelector<Reducer>::Type OutTarget;
return SwitchSrcDst<OutTarget>::Type::Call(src, edge, dst);
}
static inline Idx SelectLeft(
Idx src, Idx edge, Idx dst) {
return LeftSelector::Call(src, edge, dst);
}
static inline Idx SelectRight(
Idx src, Idx edge, Idx dst) {
return RightSelector::Call(src, edge, dst);
}
static inline DType Op(DType* lhs, DType* rhs, int64_t len) {
return BinaryOp::Call(lhs, rhs, len);
}
static inline DType Read(DType* addr) {
return *addr;
}
static inline void Write(DType* addr, DType val) {
Reducer::Call(addr, val);
}
static inline Idx GetId(Idx id, Idx* id_map) {
return *(id_map + id);
}
static inline DType BackwardWrite(DType val, DType accum) {
return Reducer::BackwardCall(val, accum);
}
static inline DType BackwardOpLhs(DType lhs, DType rhs, DType out) {
return BinaryOp::BackwardLhs(lhs, rhs, out);
}
static inline DType BackwardOpRhs(DType lhs, DType rhs, DType out) {
return BinaryOp::BackwardRhs(lhs, rhs, out);
}
};
typedef minigun::advance::Config<true, minigun::advance::kV2N> AdvanceConfig;
} // namespace cpu
// Template implementation of BackwardBinaryReduce operator.
template <int XPU, int Mode, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduce(
const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BackwardGData<Idx, DType>* gdata) {
// For backward computation, we use reverse csr and switch dst and src.
// This benefits the most common src_op_edge or copy_src case, because the
// gradients of src are now aggregated into destination buffer to reduce
// contention of atomic adds.
auto incsr = graph.GetInCSRMatrix();
minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(incsr.indptr, incsr.indices);
typedef cpu::BackwardFunctorsTempl<Idx, DType,
typename SwitchSrcDst<LeftSelector>::Type,
typename SwitchSrcDst<RightSelector>::Type,
BinaryOp, Reducer> Functors;
typedef cpu::BackwardBinaryReduce<Mode, Idx, DType, Functors> UDF;
// If the user-given mapping is none and the target is edge data, we need to
// replace the mapping by the edge ids in the csr graph so that the edge
// data is correctly read/written.
if (LeftSelector::target == binary_op::kEdge
&& gdata->lhs_mapping == nullptr) {
gdata->lhs_mapping = static_cast<Idx*>(incsr.data->data);
}
if (RightSelector::target == binary_op::kEdge
&& gdata->rhs_mapping == nullptr) {
gdata->rhs_mapping = static_cast<Idx*>(incsr.data->data);
}
if (OutSelector<Reducer>::Type::target == binary_op::kEdge
&& gdata->out_mapping == nullptr) {
gdata->out_mapping = static_cast<Idx*>(incsr.data->data);
}
// TODO(minjie): allocator
minigun::advance::Advance<XPU, Idx, cpu::AdvanceConfig, BackwardGData<Idx, DType>, UDF>(
rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// The following macro is used to generate explicit specializations of the
// template operator.
#define GEN_BACKWARD_DEFINE(mode, dtype, lhs_tgt, rhs_tgt, op) \
template void CallBackwardBinaryReduce<XPU, \
mode, IDX, dtype, \
lhs_tgt, rhs_tgt, \
op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
BackwardGData<IDX, dtype>* gdata);
// Template implementation of BackwardBinaryReduce with broadcasting operator.
template <int XPU, int Mode, int NDim, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduceBcast(
const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BackwardBcastGData<NDim, Idx, DType>* gdata) {
// For backward computation, we use reverse csr and switch dst and src.
// This benefits the most common src_op_edge or copy_src case, because the
// gradients of src are now aggregated into destination buffer to reduce
// contention of atomic adds.
auto incsr = graph.GetInCSRMatrix();
minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(incsr.indptr, incsr.indices);
typedef cpu::BackwardFunctorsTempl<Idx, DType,
typename SwitchSrcDst<LeftSelector>::Type,
typename SwitchSrcDst<RightSelector>::Type,
BinaryOp, Reducer> Functors;
typedef cpu::BackwardBinaryReduceBcast<Mode, NDim, Idx, DType, Functors> UDF;
// If the user-given mapping is none and the target is edge data, we need to
// replace the mapping by the edge ids in the csr graph so that the edge
// data is correctly read/written.
if (LeftSelector::target == binary_op::kEdge
&& gdata->lhs_mapping == nullptr) {
gdata->lhs_mapping = static_cast<Idx*>(incsr.data->data);
}
if (RightSelector::target == binary_op::kEdge
&& gdata->rhs_mapping == nullptr) {
gdata->rhs_mapping = static_cast<Idx*>(incsr.data->data);
}
if (OutSelector<Reducer>::Type::target == binary_op::kEdge
&& gdata->out_mapping == nullptr) {
gdata->out_mapping = static_cast<Idx*>(incsr.data->data);
}
// TODO(minjie): allocator
minigun::advance::Advance<XPU, Idx, cpu::AdvanceConfig,
BackwardBcastGData<NDim, Idx, DType>, UDF>(
rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// The following macro is used to generate explicit specializations of the
// template operator.
#define GEN_BACKWARD_BCAST_DEFINE(mode, ndim, dtype, lhs_tgt, rhs_tgt, op) \
template void CallBackwardBinaryReduceBcast<XPU, \
mode, ndim, IDX, dtype, \
lhs_tgt, rhs_tgt, \
op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
BackwardBcastGData<ndim, IDX, dtype>* gdata);
} // namespace kernel
} // namespace dgl
#endif // DGL_KERNEL_CPU_BACKWARD_BINARY_REDUCE_IMPL_H_
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_max.cc
 * \brief CPU kernels for broadcasting binary reduce max
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceMax
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_min.cc
 * \brief CPU kernels for broadcasting binary reduce min
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceMin
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_none.cc
 * \brief CPU kernels for broadcasting binary reduce none
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceNone
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_prod.cc
 * \brief CPU kernels for broadcasting binary reduce prod
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceProd
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_bcast_reduce_sum.cc
 * \brief CPU kernels for broadcasting binary reduce sum
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceSum
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_impl.cc
* \brief Binary reduce implementation on CPU.
*/
#include "../binary_reduce_impl.h"
#include "../csr_interface.h"
using dgl::runtime::NDArray;
namespace dgl {
namespace kernel {
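// The definitions below are explicit instantiation definitions: they force the
// templated implementations included from ../binary_reduce_impl.h to be
// compiled for the CPU (kDLCPU) backend in this translation unit.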
template void BinaryReduceImpl<kDLCPU>(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_data, runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping);
template void BinaryReduceBcastImpl<kDLCPU>(
const BcastInfo& info,
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_data, runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping);
template void BackwardBinaryReduceImpl<kDLCPU>(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
NDArray lhs_mapping, NDArray rhs_mapping, NDArray out_mapping,
NDArray lhs_data, NDArray rhs_data, NDArray out_data,
NDArray grad_out_data,
NDArray grad_lhs_data, NDArray grad_rhs_data);
template void BackwardBinaryReduceBcastImpl<kDLCPU>(
const BcastInfo& info,
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs_tgt, binary_op::Target rhs_tgt,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
runtime::NDArray lhs, runtime::NDArray rhs, runtime::NDArray out, runtime::NDArray grad_out,
runtime::NDArray grad_lhs, runtime::NDArray grad_rhs);
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_impl.h
* \brief Minigun CPU UDFs for binary reduce
*/
#ifndef DGL_KERNEL_CPU_BINARY_REDUCE_IMPL_H_
#define DGL_KERNEL_CPU_BINARY_REDUCE_IMPL_H_
#include <minigun/minigun.h>
#include <algorithm>
#include "../binary_reduce_impl_decl.h"
#include "../utils.h"
#include "./functor.h"
#include "../csr_interface.h"
namespace dgl {
namespace kernel {
namespace cpu {
// Minigun UDF to compute binary reduce.
template <typename Idx, typename DType, typename Functors>
struct BinaryReduce {
static inline bool CondEdge(
Idx src, Idx dst, Idx eid, GData<Idx, DType>* gdata) {
return true;
}
static inline void ApplyEdge(
Idx src, Idx dst, Idx eid, GData<Idx, DType>* gdata) {
const int64_t D = gdata->x_length;
const int64_t len = gdata->data_len;
Idx lid = Functors::SelectLeft(src, eid, dst);
Idx rid = Functors::SelectRight(src, eid, dst);
Idx oid = Functors::SelectOut(src, eid, dst);
if (gdata->lhs_mapping) {
lid = Functors::GetId(lid, gdata->lhs_mapping);
}
if (gdata->rhs_mapping) {
rid = Functors::GetId(rid, gdata->rhs_mapping);
}
if (gdata->out_mapping) {
oid = Functors::GetId(oid, gdata->out_mapping);
}
DType* lhsoff = gdata->lhs_data + lid * D * len;
DType* rhsoff = gdata->rhs_data + rid * D * len;
DType* outoff = gdata->out_data + oid * D;
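    // Note: the binary op consumes `len` consecutive values per feature and
    // produces a single scalar (len > 1 typically corresponds to ops that
    // reduce over a small vector, e.g. a dot product), so the input offsets
    // scale by D * len while the output offset scales by D only.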
for (int64_t tx = 0; tx < D; ++tx) {
DType out = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len);
Functors::Write(outoff + tx, out);
}
}
};
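// Note: minigun's advance routine is expected to invoke CondEdge/ApplyEdge
// once per (src, dst, eid) edge of the CSR it is given, so the structs above
// and below only describe the per-edge computation.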
// Convert a flattened index to a multi-dimensional index (assumes row-major layout).
inline void Unravel(int64_t idx, int ndim,
const int64_t* shape, const int64_t* stride, int64_t* out) {
for (int d = 0; d < ndim; ++d) {
out[d] = (idx / stride[d]) % shape[d];
}
}
// Convert a multi-dimensional index to a flattened index (assumes row-major layout).
inline int64_t Ravel(const int64_t* idx, int ndim,
const int64_t* shape, const int64_t* stride) {
int64_t out = 0;
for (int d = 0; d < ndim; ++d) {
out += std::min(idx[d], shape[d] - 1) * stride[d];
}
return out;
}
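// Worked example (illustrative): with a row-major shape {2, 3} and strides
// {3, 1}, Unravel(4, ...) yields {1, 1}. Raveling that index against a
// broadcast operand of shape {1, 3} gives min(1, 0) * 3 + min(1, 2) * 1 = 1,
// i.e. a size-1 dimension is clamped to index 0, which is how broadcasting is
// realized by Ravel's std::min.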
// Minigun UDF to compute binary reduce with broadcasting.
template <int NDim, typename Idx, typename DType, typename Functors>
struct BinaryReduceBcast {
static inline bool CondEdge(
Idx src, Idx dst, Idx eid, BcastGData<NDim, Idx, DType>* gdata) {
return true;
}
static inline void ApplyEdge(
Idx src, Idx dst, Idx eid, BcastGData<NDim, Idx, DType>* gdata) {
const int64_t len = gdata->data_len;
Idx lid = Functors::SelectLeft(src, eid, dst);
Idx rid = Functors::SelectRight(src, eid, dst);
Idx oid = Functors::SelectOut(src, eid, dst);
if (gdata->lhs_mapping) {
lid = Functors::GetId(lid, gdata->lhs_mapping);
}
if (gdata->rhs_mapping) {
rid = Functors::GetId(rid, gdata->rhs_mapping);
}
if (gdata->out_mapping) {
oid = Functors::GetId(oid, gdata->out_mapping);
}
    DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len;  // each feature element spans `len` values
DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len;
DType* outoff = gdata->out_data + oid * gdata->out_len;
int64_t tmp[NDim]; // store unraveled idx.
for (int64_t tx = 0; tx < gdata->out_len; ++tx) {
Unravel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride, tmp);
DType out = Functors::Op(
lhsoff + Ravel(tmp, gdata->ndim, gdata->lhs_shape, gdata->lhs_stride) * len,
rhsoff + Ravel(tmp, gdata->ndim, gdata->rhs_shape, gdata->rhs_stride) * len,
len);
Functors::Write(outoff + tx, out);
}
}
};
// Auxiliary functor template used by the UDFs.
template <typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
struct FunctorsTempl {
static inline Idx SelectOut(
Idx src, Idx edge, Idx dst) {
return OutSelector<Reducer>::Type::Call(src, edge, dst);
}
static inline Idx SelectLeft(
Idx src, Idx edge, Idx dst) {
return LeftSelector::Call(src, edge, dst);
}
static inline Idx SelectRight(
Idx src, Idx edge, Idx dst) {
return RightSelector::Call(src, edge, dst);
}
static inline DType Op(DType *lhs, DType *rhs, int64_t len) {
return BinaryOp::Call(lhs, rhs, len);
}
static inline void Write(DType* addr, DType val) {
Reducer::Call(addr, val);
}
static inline Idx GetId(Idx id, Idx* id_map) {
return *(id_map + id);
}
};
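// Illustration (hypothetical instantiation): with LeftSelector choosing the
// source node, RightSelector choosing the edge, BinaryOp = multiply and a sum
// Reducer, the ApplyEdge bodies above effectively compute
// out[target] += src_feat * edge_feat for every edge, i.e. a u_mul_e message
// followed by a sum reduction; OutSelector<Reducer> decides whether `target`
// is the destination node or the edge itself.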
typedef minigun::advance::Config<true, minigun::advance::kV2N> AdvanceConfig;
} // namespace cpu
// Template implementation of BinaryReduce operator.
template <int XPU, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBinaryReduce(const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
GData<Idx, DType>* gdata) {
typedef cpu::FunctorsTempl<Idx, DType, LeftSelector,
RightSelector, BinaryOp, Reducer>
Functors;
typedef cpu::BinaryReduce<Idx, DType, Functors> UDF;
  // Build the CSR view (out-edges) used by the advance kernel.
auto outcsr = graph.GetOutCSRMatrix();
minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(outcsr.indptr, outcsr.indices);
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping with the edge ids stored in the CSR graph so that the
  // edge data is read and written correctly.
if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) {
gdata->lhs_mapping = static_cast<Idx*>(outcsr.data->data);
}
if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) {
gdata->rhs_mapping = static_cast<Idx*>(outcsr.data->data);
}
if (OutSelector<Reducer>::Type::target == binary_op::kEdge
&& gdata->out_mapping == nullptr) {
gdata->out_mapping = static_cast<Idx*>(outcsr.data->data);
}
// TODO(minjie): allocator
minigun::advance::Advance<XPU, Idx, cpu::AdvanceConfig, GData<Idx, DType>, UDF>(
rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// Template implementation of BinaryReduce broadcasting operator.
template <int XPU, int NDim, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBinaryReduceBcast(
const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BcastGData<NDim, Idx, DType>* gdata) {
typedef cpu::FunctorsTempl<Idx, DType, LeftSelector,
RightSelector, BinaryOp, Reducer>
Functors;
typedef cpu::BinaryReduceBcast<NDim, Idx, DType, Functors> UDF;
  // Build the CSR view (out-edges) used by the advance kernel.
auto outcsr = graph.GetOutCSRMatrix();
minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(outcsr.indptr, outcsr.indices);
  // If the user-given mapping is none and the target is edge data, we need to
  // replace the mapping with the edge ids stored in the CSR graph so that the
  // edge data is read and written correctly.
if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) {
gdata->lhs_mapping = static_cast<Idx*>(outcsr.data->data);
}
if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) {
gdata->rhs_mapping = static_cast<Idx*>(outcsr.data->data);
}
if (OutSelector<Reducer>::Type::target == binary_op::kEdge
&& gdata->out_mapping == nullptr) {
gdata->out_mapping = static_cast<Idx*>(outcsr.data->data);
}
// TODO(minjie): allocator
minigun::advance::Advance<XPU, Idx, cpu::AdvanceConfig,
BcastGData<NDim, Idx, DType>, UDF>(
rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// The following macros generate explicit instantiations of the template
// functions.
#define GEN_DEFINE(dtype, lhs_tgt, rhs_tgt, op) \
template void CallBinaryReduce<XPU, IDX, \
dtype, lhs_tgt, rhs_tgt, op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
GData<IDX, dtype>* gdata);
#define GEN_BCAST_DEFINE(ndim, dtype, lhs_tgt, rhs_tgt, op) \
template void CallBinaryReduceBcast<XPU, ndim, IDX, dtype, \
lhs_tgt, rhs_tgt, \
op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
BcastGData<ndim, IDX, dtype>* gdata);
#define EVAL(F, ...) MSVC_EXPAND(F(__VA_ARGS__))
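// Example expansion (illustrative; Lhs, Rhs and Op stand for whatever selector
// and operator types the GEN_* argument lists supply): with XPU = kDLCPU,
// IDX = int32_t and REDUCER = ReduceSum, GEN_DEFINE(float, Lhs, Rhs, Op)
// emits
//   template void CallBinaryReduce<kDLCPU, int32_t, float, Lhs, Rhs,
//       Op<float>, ReduceSum<kDLCPU, float>>(
//       const minigun::advance::RuntimeConfig& rtcfg,
//       const CSRWrapper& graph, GData<int32_t, float>* gdata);
// EVAL forces an extra macro-expansion pass (via MSVC_EXPAND) so that the
// nested GEN_* macros in the *.cc files also expand correctly under MSVC's
// __VA_ARGS__ handling.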
} // namespace kernel
} // namespace dgl
#endif // DGL_KERNEL_CPU_BINARY_REDUCE_IMPL_H_
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_max.cc
* \brief CPU kernels for binary reduce max
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceMax
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_min.cc
* \brief CPU kernels for binary reduce min
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceMin
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_none.cc
* \brief CPU kernels for binary reduce none
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceNone
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl