Unverified commit c81efdf2, authored by Jinjing Zhou, committed by GitHub

Remove deprecated kernels (#3316)

* remove

* remove

* fix

* remove

* remove
parent 75d793a1
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_prod.cc
* \brief CPU kernels for binary reduce prod
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceProd
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/binary_reduce_sum.cc
* \brief CPU kernels for binary reduce sum
*/
#include "./binary_reduce_impl.h"
#include "./backward_binary_reduce_impl.h"
namespace dgl {
namespace kernel {
#define REDUCER ReduceSum
#define XPU kDLCPU
#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX
#define IDX int64_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
#undef IDX
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/functor.h
* \brief Functors for template on CPU
*/
#ifndef DGL_KERNEL_CPU_FUNCTOR_H_
#define DGL_KERNEL_CPU_FUNCTOR_H_
#include <dmlc/omp.h>
#include <algorithm>
#include "../binary_reduce_common.h"
namespace dgl {
namespace kernel {
// Reducer functor specialization
template <typename DType>
struct ReduceSum<kDLCPU, DType> {
static void Call(DType* addr, DType val) {
if (0 == val)
return;
#pragma omp atomic
*addr += val;
}
static DType BackwardCall(DType val, DType accum) {
return 1;
}
};
template <typename DType>
struct ReduceMax<kDLCPU, DType> {
static void Call(DType* addr, DType val) {
#pragma omp critical
*addr = std::max(*addr, val);
}
static DType BackwardCall(DType val, DType accum) {
return static_cast<DType>(val == accum);
}
};
template <typename DType>
struct ReduceMin<kDLCPU, DType> {
static void Call(DType* addr, DType val) {
#pragma omp critical
*addr = std::min(*addr, val);
}
static DType BackwardCall(DType val, DType accum) {
return static_cast<DType>(val == accum);
}
};
template <typename DType>
struct ReduceProd<kDLCPU, DType> {
static void Call(DType* addr, DType val) {
#pragma omp atomic
*addr *= val;
}
static DType BackwardCall(DType val, DType accum) {
return accum / val;
}
};
template <typename DType>
struct ReduceNone<kDLCPU, DType> {
static void Call(DType* addr, DType val) {
*addr = val;
}
static DType BackwardCall(DType val, DType accum) {
return 1;
}
};
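// Usage sketch (hypothetical names, for illustration only): a reducer
// accumulates one value per edge into the destination buffer via Call(),
// while BackwardCall(val, accum) returns the factor d(accum)/d(val) used in
// the backward pass (1 for sum/none, an arg-max/min mask for max/min,
// accum / val for prod):
//
//   DType accum = 0;  // identity element for ReduceSum
//   for (int64_t e = 0; e < num_edges; ++e)
//     ReduceSum<kDLCPU, DType>::Call(&accum, edge_vals[e]);
//
// Here num_edges and edge_vals are placeholder names, not part of this file.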
} // namespace kernel
} // namespace dgl
#endif // DGL_KERNEL_CPU_FUNCTOR_H_
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cpu/utils.cc
* \brief Utility function implementations on CPU
*/
#include "../utils.h"
namespace dgl {
namespace kernel {
namespace utils {
template <int XPU, typename DType>
void Fill(const DLContext& ctx, DType* ptr, size_t length, DType val) {
for (size_t i = 0; i < length; ++i) {
*(ptr + i) = val;
}
}
template void Fill<kDLCPU, float>(const DLContext& ctx, float* ptr, size_t length, float val);
template void Fill<kDLCPU, double>(const DLContext& ctx, double* ptr, size_t length, double val);
} // namespace utils
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/csr_interface.h
* \brief Kernel common utilities
*/
#ifndef DGL_KERNEL_CSR_INTERFACE_H_
#define DGL_KERNEL_CSR_INTERFACE_H_
#include <dgl/array.h>
#include <dgl/runtime/c_runtime_api.h>
namespace dgl {
namespace kernel {
/*!
* \brief Wrapper class that unifies ImmutableGraph and Bipartite, which do
* not share a base class.
*
* \note This is an ugly temporary solution, and shall be removed after
* refactoring ImmutableGraph and Bipartite to use the same data structure.
*/
class CSRWrapper {
public:
virtual aten::CSRMatrix GetInCSRMatrix() const = 0;
virtual aten::CSRMatrix GetOutCSRMatrix() const = 0;
virtual DGLContext Context() const = 0;
virtual int NumBits() const = 0;
};
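// Note: the kernels read the graph only through this interface. Forward
// kernels (CallBinaryReduce*) fetch GetOutCSRMatrix(), while backward kernels
// (CallBackwardBinaryReduce*) fetch GetInCSRMatrix(), i.e. the reverse CSR,
// so that source gradients are accumulated on the destination side.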
}  // namespace kernel
}  // namespace dgl
#endif // DGL_KERNEL_CSR_INTERFACE_H_
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cuda/atomic.cuh
* \brief Atomic functions
*/
#ifndef DGL_KERNEL_CUDA_ATOMIC_H_
#define DGL_KERNEL_CUDA_ATOMIC_H_
#include <cuda_runtime.h>
#if __CUDA_ARCH__ >= 600
#include <cuda_fp16.h>
#endif
namespace dgl {
namespace kernel {
namespace cuda {
// Type trait for selecting code type
template <int Bytes> struct Code { };
template <> struct Code<4> {
typedef unsigned int Type;
};
template <> struct Code<8> {
typedef unsigned long long int Type;
};
// Helper class for converting to/from atomicCAS compatible types.
template <typename T> struct Cast {
typedef typename Code<sizeof(T)>::Type Type;
static __device__ __forceinline__ Type Encode(T val) {
return static_cast<Type>(val);
}
static __device__ __forceinline__ T Decode(Type code) {
return static_cast<T>(code);
}
};
template <> struct Cast<float> {
typedef Code<sizeof(float)>::Type Type;
static __device__ __forceinline__ Type Encode(float val) {
return __float_as_uint(val);
}
static __device__ __forceinline__ float Decode(Type code) {
return __uint_as_float(code);
}
};
template <> struct Cast<double> {
typedef Code<sizeof(double)>::Type Type;
static __device__ __forceinline__ Type Encode(double val) {
return __double_as_longlong(val);
}
static __device__ __forceinline__ double Decode(Type code) {
return __longlong_as_double(code);
}
};
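// Note that the float/double specializations reinterpret the bits via
// __float_as_uint / __double_as_longlong rather than static_cast: atomicCAS
// compares raw bit patterns, so the value must be encoded bit-for-bit (not
// converted numerically) before and after the compare-and-swap.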
#define DEFINE_ATOMIC(NAME) \
template <typename T> \
__device__ __forceinline__ T Atomic##NAME(T* addr, T val) { \
typedef typename Cast<T>::Type CT; \
CT* addr_as_ui = reinterpret_cast<CT*>(addr); \
CT old = *addr_as_ui; \
CT assumed = old; \
do { \
assumed = old; \
old = atomicCAS(addr_as_ui, assumed, \
Cast<T>::Encode(OP(val, Cast<T>::Decode(old)))); \
} while (assumed != old); \
return Cast<T>::Decode(old); \
}
#define OP(a, b) max(a, b)
DEFINE_ATOMIC(Max)
#undef OP
#define OP(a, b) min(a, b)
DEFINE_ATOMIC(Min)
#undef OP
#define OP(a, b) a + b
DEFINE_ATOMIC(Add)
#undef OP
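// For illustration, DEFINE_ATOMIC(Max) above expands (roughly) to a standard
// compare-and-swap retry loop on the encoded bit pattern:
//
//   template <typename T>
//   __device__ __forceinline__ T AtomicMax(T* addr, T val) {
//     typedef typename Cast<T>::Type CT;
//     CT* addr_as_ui = reinterpret_cast<CT*>(addr);
//     CT old = *addr_as_ui;
//     CT assumed = old;
//     do {
//       assumed = old;
//       old = atomicCAS(addr_as_ui, assumed,
//                       Cast<T>::Encode(max(val, Cast<T>::Decode(old))));
//     } while (assumed != old);
//     return Cast<T>::Decode(old);
//   }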
#if __CUDA_ARCH__ >= 200
template <>
__device__ __forceinline__ float AtomicAdd<float>(float* addr, float val) {
return atomicAdd(addr, val);
}
#endif // __CUDA_ARCH__
#if __CUDA_ARCH__ >= 600
template <>
__device__ __forceinline__ double AtomicAdd<double>(double* addr, double val) {
return atomicAdd(addr, val);
}
#endif
#if defined(CUDART_VERSION) && CUDART_VERSION >= 10000
#if __CUDA_ARCH__ >= 600
template <>
__device__ __forceinline__ __half2 AtomicAdd<__half2>(__half2* addr, __half2 val) {
return atomicAdd(addr, val);
}
#endif // __CUDA_ARCH__
#if __CUDA_ARCH__ >= 700
template <>
__device__ __forceinline__ __half AtomicAdd<__half>(__half* addr, __half val) {
return atomicAdd(addr, val);
}
#endif // __CUDA_ARCH__
#endif
#define OP(a, b) a * b
DEFINE_ATOMIC(Mul)
#undef OP
/**
* \brief Performs an atomic compare-and-swap on 64-bit integers. That is,
* it reads the word `old` at the memory location `address`, computes
* `(old == compare ? val : old)`, and stores the result back to memory at
* the same address.
*
* \param address The address to perform the atomic operation on.
* \param compare The value to compare to.
* \param val The new value to conditionally store.
*
* \return The old value at the address.
*/
inline __device__ int64_t AtomicCAS(
int64_t * const address,
const int64_t compare,
const int64_t val) {
// match the type of "::atomicCAS", so ignore lint warning
using Type = unsigned long long int; // NOLINT
static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
return atomicCAS(reinterpret_cast<Type*>(address),
static_cast<Type>(compare),
static_cast<Type>(val));
}
/**
* \brief Performs an atomic compare-and-swap on 32-bit integers. That is,
* it reads the word `old` at the memory location `address`, computes
* `(old == compare ? val : old)`, and stores the result back to memory at
* the same address.
*
* \param address The address to perform the atomic operation on.
* \param compare The value to compare to.
* \param val The new value to conditionally store.
*
* \return The old value at the address.
*/
inline __device__ int32_t AtomicCAS(
int32_t * const address,
const int32_t compare,
const int32_t val) {
// match the type of "::atomicCAS", so ignore lint warning
using Type = int; // NOLINT
static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
return atomicCAS(reinterpret_cast<Type*>(address),
static_cast<Type>(compare),
static_cast<Type>(val));
}
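// Usage sketch (hypothetical helper, for illustration only): AtomicCAS
// returns the old value, so a caller can atomically claim a slot that still
// holds a sentinel value:
//
//   __device__ bool TryClaim(int32_t* slot, int32_t my_id) {
//     return AtomicCAS(slot, /*compare=*/-1, /*val=*/my_id) == -1;
//   }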
inline __device__ int64_t AtomicMax(
int64_t * const address,
const int64_t val) {
// match the type of "::atomicMax", so ignore lint warning
using Type = unsigned long long int; // NOLINT
static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
return atomicMax(reinterpret_cast<Type*>(address),
static_cast<Type>(val));
}
inline __device__ int32_t AtomicMax(
int32_t * const address,
const int32_t val) {
// match the type of "::atomicMax", so ignore lint warning
using Type = int; // NOLINT
static_assert(sizeof(Type) == sizeof(*address), "Type width must match");
return atomicMax(reinterpret_cast<Type*>(address),
static_cast<Type>(val));
}
} // namespace cuda
} // namespace kernel
} // namespace dgl
#endif // DGL_KERNEL_CUDA_ATOMIC_H_
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cuda/backward_binary_reduce_impl.cuh
* \brief Minigun CUDA UDFs for backward binary reduce
*/
#ifndef DGL_KERNEL_CUDA_BACKWARD_BINARY_REDUCE_IMPL_CUH_
#define DGL_KERNEL_CUDA_BACKWARD_BINARY_REDUCE_IMPL_CUH_
#include <minigun/minigun.h>
#include "../binary_reduce_impl_decl.h"
#include "../utils.h"
#include "./functor.cuh"
#include "../csr_interface.h"
namespace dgl {
namespace kernel {
namespace cuda {
// Minigun UDF to compute backward binary reduce.
template <int Mode, typename Idx, typename DType, typename Functors>
struct BackwardBinaryReduce {
static __device__ __forceinline__ bool CondEdge(
Idx src, Idx dst, Idx eid, BackwardGData<Idx, DType>* gdata) {
return true;
}
static __device__ __forceinline__ void ApplyEdge(
Idx src, Idx dst, Idx eid, BackwardGData<Idx, DType>* gdata) {
const int64_t D = gdata->x_length;
int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = blockDim.x * gridDim.x;
const int64_t len = gdata->data_len;
Idx lid = Functors::SelectLeft(src, eid, dst);
Idx rid = Functors::SelectRight(src, eid, dst);
Idx oid = Functors::SelectOut(src, eid, dst);
if (gdata->lhs_mapping) {
lid = Functors::GetId(lid, gdata->lhs_mapping);
}
if (gdata->rhs_mapping) {
rid = Functors::GetId(rid, gdata->rhs_mapping);
}
if (gdata->out_mapping) {
oid = Functors::GetId(oid, gdata->out_mapping);
}
DType* lhsoff = gdata->lhs_data + lid * D * len;
DType* rhsoff = gdata->rhs_data + rid * D * len;
DType* outoff = gdata->out_data + oid * D;
DType* gradlhsoff = gdata->grad_lhs_data + lid * D * len;
DType* gradrhsoff = gdata->grad_rhs_data + rid * D * len;
DType* gradoutoff = gdata->grad_out_data + oid * D;
while (tx < D) {
DType out = Functors::Read(outoff + tx);
DType grad_out = Functors::Read(gradoutoff + tx);
DType e = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len);
DType grad_e = grad_out * Functors::BackwardWrite(e, out);
DType* lhs_base = lhsoff + tx * len;
DType* rhs_base = rhsoff + tx * len;
if (Mode == binary_op::kGradBoth) {
#pragma unroll
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
DType grad = grad_lhs + grad_rhs;
AtomicAdd(gradlhsoff + tx * len + i, grad);
}
} else if (Mode == binary_op::kGradLhs) {
#pragma unroll
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
AtomicAdd(gradlhsoff + tx * len + i, grad_lhs);
}
} else if (Mode == binary_op::kGradRhs) {
#pragma unroll
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
AtomicAdd(gradrhsoff + tx * len + i, grad_rhs);
}
}
tx += stride_x;
}
}
};
// Minigun UDF to compute backward binary reduce with broadcasting.
template <int Mode, int NDim, typename Idx, typename DType, typename Functors>
struct BackwardBinaryReduceBcast {
static __device__ __forceinline__ bool CondEdge(
Idx src, Idx dst, Idx eid, BackwardBcastGData<NDim, Idx, DType>* gdata) {
return true;
}
static __device__ __forceinline__ void ApplyEdge(
Idx src, Idx dst, Idx eid, BackwardBcastGData<NDim, Idx, DType>* gdata) {
int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
int stride_x = blockDim.x * gridDim.x;
const int64_t len = gdata->data_len;
Idx lid = Functors::SelectLeft(src, eid, dst);
Idx rid = Functors::SelectRight(src, eid, dst);
Idx oid = Functors::SelectOut(src, eid, dst);
if (gdata->lhs_mapping) {
lid = Functors::GetId(lid, gdata->lhs_mapping);
}
if (gdata->rhs_mapping) {
rid = Functors::GetId(rid, gdata->rhs_mapping);
}
if (gdata->out_mapping) {
oid = Functors::GetId(oid, gdata->out_mapping);
}
DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len;
DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len;
DType* outoff = gdata->out_data + oid * gdata->out_len;
DType* gradlhsoff = gdata->grad_lhs_data + lid * gdata->out_len * len;
DType* gradrhsoff = gdata->grad_rhs_data + rid * gdata->out_len * len;
DType* gradoutoff = gdata->grad_out_data + oid * gdata->out_len;
while (tx < gdata->out_len) {
int64_t lhs_add = 0;
int64_t rhs_add = 0;
UnravelRavel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride,
gdata->lhs_shape, gdata->lhs_stride,
gdata->rhs_shape, gdata->rhs_stride, &lhs_add, &rhs_add);
DType out = Functors::Read(outoff + tx);
DType grad_out = Functors::Read(gradoutoff + tx);
DType e = Functors::Op(lhsoff + lhs_add * len, rhsoff + rhs_add * len, len);
DType grad_e = grad_out * Functors::BackwardWrite(e, out);
DType* lhs_base = lhsoff + lhs_add * len;
DType* rhs_base = rhsoff + rhs_add * len;
if (Mode == binary_op::kGradBoth) {
#pragma unroll
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
DType grad = grad_lhs + grad_rhs;
AtomicAdd(gradlhsoff + tx * len + i, grad);
}
} else if (Mode == binary_op::kGradLhs) {
#pragma unroll
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_lhs = grad_e * Functors::BackwardOpLhs(lhs, rhs, e);
AtomicAdd(gradlhsoff + tx * len + i, grad_lhs);
}
} else if (Mode == binary_op::kGradRhs) {
#pragma unroll
for (int64_t i = 0; i < len; ++i) {
DType lhs = Functors::Read(lhs_base + i);
DType rhs = Functors::Read(rhs_base + i);
DType grad_rhs = grad_e * Functors::BackwardOpRhs(lhs, rhs, e);
AtomicAdd(gradrhsoff + tx * len + i, grad_rhs);
}
}
tx += stride_x;
}
}
};
// Auxiliary template used in UDF.
template <typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
struct BackwardFunctorsTempl {
static __device__ __forceinline__ Idx SelectOut(
Idx src, Idx edge, Idx dst) {
typedef typename OutSelector<Reducer>::Type OutTarget;
return SwitchSrcDst<OutTarget>::Type::Call(src, edge, dst);
}
static __device__ __forceinline__ Idx SelectLeft(
Idx src, Idx edge, Idx dst) {
return LeftSelector::Call(src, edge, dst);
}
static __device__ __forceinline__ Idx SelectRight(
Idx src, Idx edge, Idx dst) {
return RightSelector::Call(src, edge, dst);
}
static __device__ __forceinline__ DType Op(DType* lhs, DType* rhs, int64_t len) {
return BinaryOp::Call(lhs, rhs, len);
}
static __device__ __forceinline__ DType Read(DType* addr) {
return LDGReader<DType>::Call(addr);
}
static __device__ __forceinline__ void Write(DType* addr, DType val) {
Reducer::Call(addr, val);
}
static __device__ __forceinline__ Idx GetId(Idx id, Idx* id_map) {
return LDGReader<Idx>::Call(id_map + id);
}
static __device__ __forceinline__ DType BackwardWrite(DType val, DType accum) {
return Reducer::BackwardCall(val, accum);
}
static __device__ __forceinline__ DType BackwardOpLhs(DType lhs, DType rhs, DType out) {
return BinaryOp::BackwardLhs(lhs, rhs, out);
}
static __device__ __forceinline__ DType BackwardOpRhs(DType lhs, DType rhs, DType out) {
return BinaryOp::BackwardRhs(lhs, rhs, out);
}
};
typedef minigun::advance::Config<true, minigun::advance::kV2N> AdvanceConfig;
} // namespace cuda
// Template implementation of BackwardBinaryReduce operator.
template <int XPU, int Mode, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduce(
const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BackwardGData<Idx, DType>* gdata) {
// For backward computation, we use reverse csr and switch dst and src.
// This benefits the most common src_op_edge or copy_src case, because the
// gradients of src are now aggregated into the destination buffer, which
// reduces contention among atomic adds.
auto incsr = graph.GetInCSRMatrix();
minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(incsr.indptr, incsr.indices);
typedef cuda::BackwardFunctorsTempl<Idx, DType,
typename SwitchSrcDst<LeftSelector>::Type,
typename SwitchSrcDst<RightSelector>::Type,
BinaryOp, Reducer> Functors;
typedef cuda::BackwardBinaryReduce<Mode, Idx, DType, Functors> UDF;
// If the user-given mapping is none and the target is edge data, we need to
// replace the mapping by the edge ids in the csr graph so that the edge
// data is correctly read/written.
if (LeftSelector::target == binary_op::kEdge
&& gdata->lhs_mapping == nullptr) {
gdata->lhs_mapping = static_cast<Idx*>(incsr.data->data);
}
if (RightSelector::target == binary_op::kEdge
&& gdata->rhs_mapping == nullptr) {
gdata->rhs_mapping = static_cast<Idx*>(incsr.data->data);
}
if (OutSelector<Reducer>::Type::target == binary_op::kEdge
&& gdata->out_mapping == nullptr) {
gdata->out_mapping = static_cast<Idx*>(incsr.data->data);
}
// TODO(minjie): allocator
minigun::advance::Advance<XPU, Idx, cuda::AdvanceConfig, BackwardGData<Idx, DType>, UDF>(
rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// Following macro is used to generate explicit-specialization of the template
// operator.
#define GEN_BACKWARD_DEFINE(mode, dtype, lhs_tgt, rhs_tgt, op) \
template void CallBackwardBinaryReduce<XPU, \
mode, IDX, dtype, \
lhs_tgt, rhs_tgt, \
op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
BackwardGData<IDX, dtype>* gdata);
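// For illustration: with XPU = kDLGPU, IDX = int32_t and REDUCER = ReduceSum
// (as defined by the including .cu file), the invocation
// GEN_BACKWARD_DEFINE(binary_op::kGradLhs, float, SelectSrc, SelectNone, BinaryUseLhs)
// expands to the explicit instantiation
//
//   template void CallBackwardBinaryReduce<kDLGPU,
//       binary_op::kGradLhs, int32_t, float,
//       SelectSrc, SelectNone,
//       BinaryUseLhs<float>, ReduceSum<kDLGPU, float>>(
//     const minigun::advance::RuntimeConfig& rtcfg,
//     const CSRWrapper& graph,
//     BackwardGData<int32_t, float>* gdata);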
// Template implementation of BackwardBinaryReduce with broadcasting operator.
template <int XPU, int Mode, int NDim, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBackwardBinaryReduceBcast(
const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BackwardBcastGData<NDim, Idx, DType>* gdata) {
// For backward computation, we use reverse csr and switch dst and src.
// This benefits the most common src_op_edge or copy_src case, because the
// gradients of src are now aggregated into the destination buffer, which
// reduces contention among atomic adds.
auto incsr = graph.GetInCSRMatrix();
minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(incsr.indptr, incsr.indices);
typedef cuda::BackwardFunctorsTempl<Idx, DType,
typename SwitchSrcDst<LeftSelector>::Type,
typename SwitchSrcDst<RightSelector>::Type,
BinaryOp, Reducer> Functors;
typedef cuda::BackwardBinaryReduceBcast<Mode, NDim, Idx, DType, Functors> UDF;
// If the user-given mapping is none and the target is edge data, we need to
// replace the mapping by the edge ids in the csr graph so that the edge
// data is correctly read/written.
if (LeftSelector::target == binary_op::kEdge
&& gdata->lhs_mapping == nullptr) {
gdata->lhs_mapping = static_cast<Idx*>(incsr.data->data);
}
if (RightSelector::target == binary_op::kEdge
&& gdata->rhs_mapping == nullptr) {
gdata->rhs_mapping = static_cast<Idx*>(incsr.data->data);
}
if (OutSelector<Reducer>::Type::target == binary_op::kEdge
&& gdata->out_mapping == nullptr) {
gdata->out_mapping = static_cast<Idx*>(incsr.data->data);
}
// TODO(minjie): allocator
minigun::advance::Advance<XPU, Idx, cuda::AdvanceConfig,
BackwardBcastGData<NDim, Idx, DType>, UDF>(
rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// Following macro is used to generate explicit-specialization of the template
// operator.
#define GEN_BACKWARD_BCAST_DEFINE(mode, ndim, dtype, lhs_tgt, rhs_tgt, op) \
template void CallBackwardBinaryReduceBcast<XPU, \
mode, ndim, IDX, dtype, \
lhs_tgt, rhs_tgt, \
op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
BackwardBcastGData<ndim, IDX, dtype>* gdata);
} // namespace kernel
} // namespace dgl
#endif // DGL_KERNEL_CUDA_BACKWARD_BINARY_REDUCE_IMPL_CUH_
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cuda/binary_bcast_reduce_max.cu
* \brief CUDA kernels for broadcasting binary reduce max
*/
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"
namespace dgl {
namespace kernel {
namespace cuda {
} // namespace cuda
#define REDUCER ReduceMax
#define XPU kDLGPU
#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE)
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cuda/binary_bcast_reduce_min.cu
* \brief CUDA kernels for broadcasting binary reduce min
*/
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"
namespace dgl {
namespace kernel {
namespace cuda {
} // namespace cuda
#define REDUCER ReduceMin
#define XPU kDLGPU
#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cuda/binary_bcast_reduce_none.cu
* \brief CUDA kernels for broadcasting binary reduce none
*/
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"
namespace dgl {
namespace kernel {
namespace cuda {
} // namespace cuda
#define REDUCER ReduceNone
#define XPU kDLGPU
#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE)
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cuda/binary_bcast_reduce_prod.cu
* \brief CUDA kernels for broadcasting binary reduce prod
*/
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"
namespace dgl {
namespace kernel {
namespace cuda {
} // namespace cuda
#define REDUCER ReduceProd
#define XPU kDLGPU
#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE)
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cuda/binary_bcast_reduce_sum.cu
* \brief CUDA kernels for broadcasting binary reduce sum
*/
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"
namespace dgl {
namespace kernel {
namespace cuda {
} // namespace cuda
#define REDUCER ReduceSum
#define XPU kDLGPU
#define IDX int32_t
EVAL(GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET, GEN_BCAST_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_NDIM, GEN_DTYPE, GEN_OP_TARGET,
GEN_BACKWARD_BCAST_DEFINE);
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cuda/binary_reduce_impl.cu
* \brief Binary reduce implementation on cuda.
*/
#include "../binary_reduce_impl.h"
#include "../csr_interface.h"
using dgl::runtime::NDArray;
namespace dgl {
namespace kernel {
template void BinaryReduceImpl<kDLGPU>(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_data, runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping);
template void BinaryReduceBcastImpl<kDLGPU>(
const BcastInfo& info,
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
runtime::NDArray lhs_data, runtime::NDArray rhs_data,
runtime::NDArray out_data,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping,
runtime::NDArray out_mapping);
template void BackwardBinaryReduceImpl<kDLGPU>(
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs, binary_op::Target rhs,
NDArray lhs_mapping, NDArray rhs_mapping, NDArray out_mapping,
NDArray lhs_data, NDArray rhs_data, NDArray out_data,
NDArray grad_out_data,
NDArray grad_lhs_data, NDArray grad_rhs_data);
template void BackwardBinaryReduceBcastImpl<kDLGPU>(
const BcastInfo& info,
const std::string& reducer,
const std::string& op,
const CSRWrapper& graph,
binary_op::Target lhs_tgt, binary_op::Target rhs_tgt,
runtime::NDArray lhs_mapping, runtime::NDArray rhs_mapping, runtime::NDArray out_mapping,
runtime::NDArray lhs, runtime::NDArray rhs, runtime::NDArray out, runtime::NDArray grad_out,
runtime::NDArray grad_lhs, runtime::NDArray grad_rhs);
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cuda/binary_reduce_impl.cuh
* \brief Minigun CUDA UDFs for binary reduce
*/
#ifndef DGL_KERNEL_CUDA_BINARY_REDUCE_IMPL_CUH_
#define DGL_KERNEL_CUDA_BINARY_REDUCE_IMPL_CUH_
#include <minigun/minigun.h>
#include "../binary_reduce_impl_decl.h"
#include "../utils.h"
#include "./functor.cuh"
#include "../csr_interface.h"
namespace dgl {
namespace kernel {
namespace cuda {
// Minigun UDF to compute binary reduce.
template <typename Idx, typename DType, typename Functors>
struct BinaryReduce {
static __device__ __forceinline__ bool CondEdge(
Idx src, Idx dst, Idx eid, GData<Idx, DType>* gdata) {
return true;
}
static __device__ __forceinline__ void ApplyEdge(
Idx src, Idx dst, Idx eid, GData<Idx, DType>* gdata) {
const int64_t D = gdata->x_length;
int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t stride_x = blockDim.x * gridDim.x;
const int64_t len = gdata->data_len;
Idx lid = Functors::SelectLeft(src, eid, dst);
Idx rid = Functors::SelectRight(src, eid, dst);
Idx oid = Functors::SelectOut(src, eid, dst);
if (gdata->lhs_mapping) {
lid = Functors::GetId(lid, gdata->lhs_mapping);
}
if (gdata->rhs_mapping) {
rid = Functors::GetId(rid, gdata->rhs_mapping);
}
if (gdata->out_mapping) {
oid = Functors::GetId(oid, gdata->out_mapping);
}
DType* lhsoff = gdata->lhs_data + lid * D * len;
DType* rhsoff = gdata->rhs_data + rid * D * len;
DType* outoff = gdata->out_data + oid * D;
while (tx < D) {
DType out = Functors::Op(lhsoff + tx * len, rhsoff + tx * len, len);
Functors::Write(outoff + tx, out);
tx += stride_x;
}
}
};
/*
* This function does the following:
* 1. Converts the flattened index into a multi-dimensional index
*    according to the output shape (assumed row-major).
* 2. Converts that multi-dimensional index into a flattened index for lhs.
* 3. Converts that multi-dimensional index into a flattened index for rhs.
*/
__device__ __forceinline__ void UnravelRavel(
const int64_t idx, const int ndim, const int64_t* out_shape, const int64_t* out_stride,
const int64_t* lhs_shape, const int64_t* lhs_stride,
const int64_t* rhs_shape, const int64_t* rhs_stride, int64_t *lhs_out, int64_t *rhs_out) {
if (out_stride[0] == lhs_stride[0]) {
#pragma unroll
for (int d = 0; d < ndim; ++d) {
int64_t o_sh = out_shape[d];
int64_t o_st = out_stride[d];
int64_t rhs_sh = rhs_shape[d];
int64_t rhs_st = rhs_stride[d];
int64_t i = (idx / o_st) % o_sh;
/*
* Simplified from rhs_out += min(i, rhs_sh - 1) * rhs_st;
* rhs_sh is either o_sh or 1
*/
if (rhs_sh > i) {
*rhs_out += i * rhs_st;
}
}
*lhs_out = idx;
} else {
#pragma unroll
for (int d = 0; d < ndim; ++d) {
int64_t o_sh = out_shape[d];
int64_t o_st = out_stride[d];
int64_t lhs_sh = lhs_shape[d];
int64_t lhs_st = lhs_stride[d];
int64_t i = (idx / o_st) % o_sh;
/*
* Simplified from lhs_out += min(i, lhs_sh - 1) * lhs_st;
* lhs_sh is either o_sh or 1
*/
if (lhs_sh > i) {
*lhs_out += i * lhs_st;
}
}
*rhs_out = idx;
}
}
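// Worked example (hypothetical shapes, for illustration only): ndim = 2,
// out_shape = {4, 3}, out_stride = {3, 1}, lhs_shape = {4, 3},
// lhs_stride = {3, 1}, rhs_shape = {4, 1}, rhs_stride = {1, 1}.
// Since out_stride[0] == lhs_stride[0], lhs is addressed directly: for
// idx = 7, *lhs_out = 7 (row 2, col 1). For rhs: d = 0 gives
// i = (7 / 3) % 4 = 2 and rhs_shape[0] = 4 > 2, so *rhs_out += 2 * 1;
// d = 1 gives i = 7 % 3 = 1 but rhs_shape[1] = 1 is not > 1, so the broadcast
// dimension contributes nothing. Result: *rhs_out = 2, i.e. element (2, 0)
// of rhs is reused across the broadcast last dimension.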
// Minigun UDF to compute binary reduce with broadcasting.
template <int NDim, typename Idx, typename DType, typename Functors>
struct BinaryReduceBcast {
static __device__ __forceinline__ bool CondEdge(
Idx src, Idx dst, Idx eid, BcastGData<NDim, Idx, DType>* gdata) {
return true;
}
static __device__ __forceinline__ void ApplyEdge(
Idx src, Idx dst, Idx eid, BcastGData<NDim, Idx, DType>* gdata) {
int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
const int64_t stride_x = blockDim.x * gridDim.x;
const int64_t len = gdata->data_len;
Idx lid = Functors::SelectLeft(src, eid, dst);
Idx rid = Functors::SelectRight(src, eid, dst);
Idx oid = Functors::SelectOut(src, eid, dst);
if (gdata->lhs_mapping) {
lid = Functors::GetId(lid, gdata->lhs_mapping);
}
if (gdata->rhs_mapping) {
rid = Functors::GetId(rid, gdata->rhs_mapping);
}
if (gdata->out_mapping) {
oid = Functors::GetId(oid, gdata->out_mapping);
}
DType* lhsoff = gdata->lhs_data + lid * gdata->lhs_len * len;  // each element is a vector of length len
DType* rhsoff = gdata->rhs_data + rid * gdata->rhs_len * len;
DType* outoff = gdata->out_data + oid * gdata->out_len;
while (tx < gdata->out_len) {
int64_t lhs_add = 0;
int64_t rhs_add = 0;
UnravelRavel(tx, gdata->ndim, gdata->out_shape, gdata->out_stride,
gdata->lhs_shape, gdata->lhs_stride,
gdata->rhs_shape, gdata->rhs_stride, &lhs_add, &rhs_add);
DType out = Functors::Op(lhsoff + lhs_add * len, rhsoff + rhs_add * len, len);
Functors::Write(outoff + tx, out);
tx += stride_x;
}
}
};
// Auxiliary template used in UDF.
template <typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
struct FunctorsTempl {
static __device__ __forceinline__ Idx SelectOut(
Idx src, Idx edge, Idx dst) {
return OutSelector<Reducer>::Type::Call(src, edge, dst);
}
static __device__ __forceinline__ Idx SelectLeft(
Idx src, Idx edge, Idx dst) {
return LeftSelector::Call(src, edge, dst);
}
static __device__ __forceinline__ Idx SelectRight(
Idx src, Idx edge, Idx dst) {
return RightSelector::Call(src, edge, dst);
}
static __device__ __forceinline__ DType Op(DType *lhs, DType *rhs, int64_t len) {
return BinaryOp::Call(lhs, rhs, len);
}
static __device__ __forceinline__ void Write(DType* addr, DType val) {
Reducer::Call(addr, val);
}
static __device__ __forceinline__ Idx GetId(Idx id, Idx* id_map) {
return LDGReader<Idx>::Call(id_map + id);
}
};
typedef minigun::advance::Config<true, minigun::advance::kV2N> AdvanceConfig;
} // namespace cuda
// Template implementation of BinaryReduce operator.
template <int XPU, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBinaryReduce(const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
GData<Idx, DType>* gdata) {
typedef cuda::FunctorsTempl<Idx, DType, LeftSelector,
RightSelector, BinaryOp, Reducer>
Functors;
typedef cuda::BinaryReduce<Idx, DType, Functors> UDF;
// csr
auto outcsr = graph.GetOutCSRMatrix();
minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(outcsr.indptr, outcsr.indices);
// If the user-given mapping is none and the target is edge data, we need to
// replace the mapping by the edge ids in the csr graph so that the edge
// data is correctly read/written.
if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) {
gdata->lhs_mapping = static_cast<Idx*>(outcsr.data->data);
}
if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) {
gdata->rhs_mapping = static_cast<Idx*>(outcsr.data->data);
}
if (OutSelector<Reducer>::Type::target == binary_op::kEdge
&& gdata->out_mapping == nullptr) {
gdata->out_mapping = static_cast<Idx*>(outcsr.data->data);
}
// TODO(minjie): allocator
minigun::advance::Advance<XPU, Idx, cuda::AdvanceConfig, GData<Idx, DType>, UDF>(
rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// Template implementation of BinaryReduce broadcasting operator.
template <int XPU, int NDim, typename Idx, typename DType,
typename LeftSelector, typename RightSelector,
typename BinaryOp, typename Reducer>
void CallBinaryReduceBcast(
const minigun::advance::RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BcastGData<NDim, Idx, DType>* gdata) {
typedef cuda::FunctorsTempl<Idx, DType, LeftSelector,
RightSelector, BinaryOp, Reducer>
Functors;
typedef cuda::BinaryReduceBcast<NDim, Idx, DType, Functors> UDF;
// csr
auto outcsr = graph.GetOutCSRMatrix();
minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(outcsr.indptr, outcsr.indices);
// If the user-given mapping is none and the target is edge data, we need to
// replace the mapping by the edge ids in the csr graph so that the edge
// data is correctly read/written.
if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) {
gdata->lhs_mapping = static_cast<Idx*>(outcsr.data->data);
}
if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) {
gdata->rhs_mapping = static_cast<Idx*>(outcsr.data->data);
}
if (OutSelector<Reducer>::Type::target == binary_op::kEdge
&& gdata->out_mapping == nullptr) {
gdata->out_mapping = static_cast<Idx*>(outcsr.data->data);
}
// TODO(minjie): allocator
minigun::advance::Advance<XPU, Idx, cuda::AdvanceConfig,
BcastGData<NDim, Idx, DType>, UDF>(
rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
// Following macro is used to generate explicit-specialization of the template
// operator.
#define GEN_DEFINE(dtype, lhs_tgt, rhs_tgt, op) \
template void CallBinaryReduce<XPU, IDX, \
dtype, lhs_tgt, rhs_tgt, op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
GData<IDX, dtype>* gdata);
#define GEN_BCAST_DEFINE(ndim, dtype, lhs_tgt, rhs_tgt, op) \
template void CallBinaryReduceBcast<XPU, ndim, IDX, dtype, \
lhs_tgt, rhs_tgt, \
op<dtype>, REDUCER<XPU, dtype>>( \
const minigun::advance::RuntimeConfig& rtcfg, \
const CSRWrapper& graph, \
BcastGData<ndim, IDX, dtype>* gdata);
#define EVAL(F, ...) MSVC_EXPAND(F(__VA_ARGS__))
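// For illustration: in a .cu file that defines REDUCER = ReduceSum,
// XPU = kDLGPU and IDX = int32_t, GEN_DEFINE(float, SelectSrc, SelectNone, BinaryUseLhs)
// expands to the explicit instantiation
//
//   template void CallBinaryReduce<kDLGPU, int32_t,
//       float, SelectSrc, SelectNone, BinaryUseLhs<float>,
//       ReduceSum<kDLGPU, float>>(
//     const minigun::advance::RuntimeConfig& rtcfg,
//     const CSRWrapper& graph,
//     GData<int32_t, float>* gdata);
//
// EVAL(F, ...) applies F to the remaining arguments (wrapped in MSVC_EXPAND
// to work around MSVC's __VA_ARGS__ expansion), letting the GEN_* helper
// macros, defined elsewhere, enumerate the dtype/target/op combinations.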
} // namespace kernel
} // namespace dgl
#endif // DGL_KERNEL_CUDA_BINARY_REDUCE_IMPL_CUH_
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cuda/binary_reduce_max.cu
* \brief CUDA kernels for binary reduce max
*/
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"
namespace dgl {
namespace kernel {
#define REDUCER ReduceMax
#define XPU kDLGPU
#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE)
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE)
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cuda/binary_reduce_min.cu
* \brief CUDA kernels for binary reduce min
*/
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"
namespace dgl {
namespace kernel {
#define REDUCER ReduceMin
#define XPU kDLGPU
#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE)
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE)
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cuda/binary_reduce_none.cu
* \brief CUDA kernels for binary reduce none
*/
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"
namespace dgl {
namespace kernel {
#define REDUCER ReduceNone
#define XPU kDLGPU
#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE)
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE)
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cuda/binary_reduce_prod.cu
* \brief CUDA kernels for binary reduce prod
*/
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"
namespace dgl {
namespace kernel {
#define REDUCER ReduceProd
#define XPU kDLGPU
#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE)
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE)
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cuda/binary_reduce_sum.cu
* \brief CUDA kernels for binary reduce sum
*/
#include <dgl/runtime/device_api.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./binary_reduce_impl.cuh"
#include "./backward_binary_reduce_impl.cuh"
#include "../utils.h"
#include "../csr_interface.h"
using minigun::advance::RuntimeConfig;
namespace dgl {
namespace kernel {
namespace cuda {
// specialization for cusparse
#if CUDART_VERSION < 11000
template <typename DType>
cusparseStatus_t Xcsrmm2(cusparseHandle_t handle, cusparseOperation_t transA,
cusparseOperation_t transB, int m, int n, int k, int nnz,
const DType* alpha, const cusparseMatDescr_t descrA,
const DType* csrValA, const int* csrRowPtrA, const int* csrColIndA,
const DType* B, int ldb, const DType* beta, DType* C, int ldc) {
LOG(INFO) << "Unsupported dtype";
return CUSPARSE_STATUS_EXECUTION_FAILED;
}
template <>
cusparseStatus_t Xcsrmm2<float>(cusparseHandle_t handle, cusparseOperation_t transA,
cusparseOperation_t transB, int m, int n, int k, int nnz,
const float* alpha, const cusparseMatDescr_t descrA,
const float* csrValA, const int* csrRowPtrA, const int* csrColIndA,
const float* B, int ldb, const float* beta, float* C, int ldc) {
return cusparseScsrmm2(handle, transA, transB, m, n, k, nnz,
alpha, descrA, csrValA, csrRowPtrA, csrColIndA,
B, ldb, beta, C, ldc);
}
template <>
cusparseStatus_t Xcsrmm2<double>(cusparseHandle_t handle, cusparseOperation_t transA,
cusparseOperation_t transB, int m, int n, int k, int nnz,
const double* alpha, const cusparseMatDescr_t descrA,
const double* csrValA, const int* csrRowPtrA, const int* csrColIndA,
const double* B, int ldb, const double* beta, double* C, int ldc) {
return cusparseDcsrmm2(handle, transA, transB, m, n, k, nnz,
alpha, descrA, csrValA, csrRowPtrA, csrColIndA,
B, ldb, beta, C, ldc);
}
#endif
template <typename DType>
cublasStatus_t Xgeam(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n,
const DType* alpha, const DType* A, int lda,
const DType* beta, const DType* B, int ldb,
DType* C, int ldc) {
LOG(INFO) << "Unsupported dtype";
return CUBLAS_STATUS_EXECUTION_FAILED;
}
template <>
cublasStatus_t Xgeam<float>(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n,
const float* alpha, const float* A, int lda,
const float* beta, const float* B, int ldb,
float* C, int ldc) {
return cublasSgeam(handle, transa, transb, m, n, alpha, A, lda,
beta, B, ldb, C, ldc);
}
template <>
cublasStatus_t Xgeam<double>(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n,
const double* alpha, const double* A, int lda,
const double* beta, const double* B, int ldb,
double* C, int ldc) {
return cublasDgeam(handle, transa, transb, m, n, alpha, A, lda,
beta, B, ldb, C, ldc);
}
template <typename DType>
void CusparseCsrmm2(
const RuntimeConfig& rtcfg,
const aten::CSRMatrix& csr,
const DType* B_data, DType* C_data,
int x_length) {
// We use csrmm2 to perform the following operation:
// C = A x B, where A is a sparse matrix in CSR format and B is the dense
// node-feature matrix. However, since cusparse only supports column-major
// layout while our tensors are stored row-major, the actual computation is
// C = trans(A x trans(B)).
// Currently, we use cublasXgeam to perform the final transposition and
// allocate intermediate workspace memory for it.
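// Shape bookkeeping (A: m x k in CSR, B: k x n row-major, C: m x n row-major):
// csrmm2/SpMM sees the row-major B as a column-major n x k matrix, hence the
// transB flag; it writes a column-major m x n result into the temporary
// buffer trans_out, which Xgeam below transposes into the row-major C_data.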
const int m = csr.num_rows;
const int n = x_length;
const int k = csr.num_cols;
const int nnz = csr.indices->shape[0];
const DType alpha = 1.0;
const DType beta = 0.0;
// device
auto device = runtime::DeviceAPI::Get(rtcfg.ctx);
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
// allocate cusparse handle if needed
if (!thr_entry->cusparse_handle) {
CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle)));
}
CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, rtcfg.stream));
// allocate matrix for temporary transposed output
DType* trans_out = static_cast<DType*>(device->AllocWorkspace(rtcfg.ctx, m * n * sizeof(DType)));
// all one data array
DType* valptr = static_cast<DType*>(device->AllocWorkspace(rtcfg.ctx, nnz * sizeof(DType)));
utils::Fill<kDLGPU>(rtcfg.ctx, valptr, nnz, static_cast<DType>(1.));
#if CUDART_VERSION >= 11000
auto ctx = rtcfg.ctx;
cusparseSpMatDescr_t matA;
cusparseDnMatDescr_t matB, matC;
constexpr auto cuda_dtype = std::is_same<DType, float>::value ? CUDA_R_32F: CUDA_R_64F;
CUSPARSE_CALL(cusparseCreateCsr(&matA,
m, k, nnz,
static_cast<int32_t*>(csr.indptr->data),
static_cast<int32_t*>(csr.indices->data),
const_cast<DType*>(valptr),
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, cuda_dtype));
CUSPARSE_CALL(cusparseCreateDnMat(&matB,
n, k, n,
const_cast<DType*>(B_data), cuda_dtype, CUSPARSE_ORDER_COL));
CUSPARSE_CALL(cusparseCreateDnMat(&matC,
m, n, m,
trans_out, cuda_dtype, CUSPARSE_ORDER_COL));
auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE;
auto transB = CUSPARSE_OPERATION_TRANSPOSE;
size_t workspace_size;
CUSPARSE_CALL(cusparseSpMM_bufferSize(
thr_entry->cusparse_handle, transA, transB,
&alpha, matA, matB, &beta, matC,
cuda_dtype, CUSPARSE_CSRMM_ALG1,
&workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUSPARSE_CALL(cusparseSpMM(
thr_entry->cusparse_handle, transA, transB,
&alpha, matA, matB, &beta, matC,
cuda_dtype, CUSPARSE_CSRMM_ALG1,
workspace));
device->FreeWorkspace(ctx, workspace);
CUSPARSE_CALL(cusparseDestroySpMat(matA));
CUSPARSE_CALL(cusparseDestroyDnMat(matB));
CUSPARSE_CALL(cusparseDestroyDnMat(matC));
#else
cusparseMatDescr_t descr;
CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
CUSPARSE_CALL(Xcsrmm2<DType>(
thr_entry->cusparse_handle,
CUSPARSE_OPERATION_NON_TRANSPOSE,
CUSPARSE_OPERATION_TRANSPOSE,
m, n, k, nnz, &alpha,
descr, valptr,
static_cast<int32_t*>(csr.indptr->data),
static_cast<int32_t*>(csr.indices->data),
B_data, n, &beta, trans_out, m));
CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
#endif
device->FreeWorkspace(rtcfg.ctx, valptr);
// transpose the output matrix
if (!thr_entry->cublas_handle) {
CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle)));
}
CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, rtcfg.stream));
CUBLAS_CALL(Xgeam<DType>(
thr_entry->cublas_handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
n, m,
&alpha, trans_out, m,
&beta, nullptr, n,
C_data, n));
device->FreeWorkspace(rtcfg.ctx, trans_out);
}
// forward
template <typename DType>
void FallbackCallBinaryReduce(
const RuntimeConfig& rtcfg,
const CSRWrapper& graph,
GData<int32_t, DType>* gdata) {
constexpr int XPU = kDLGPU;
typedef int32_t Idx;
typedef SelectSrc LeftSelector;
typedef SelectNone RightSelector;
typedef BinaryUseLhs<DType> BinaryOp;
typedef ReduceSum<kDLGPU, DType> Reducer;
typedef cuda::FunctorsTempl<Idx, DType, LeftSelector,
RightSelector, BinaryOp, Reducer>
Functors;
typedef cuda::BinaryReduce<Idx, DType, Functors> UDF;
// csr
auto outcsr = graph.GetOutCSRMatrix();
minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(outcsr.indptr, outcsr.indices);
// If the user-given mapping is none and the target is edge data, we need to
// replace the mapping by the edge ids in the csr graph so that the edge
// data is correctly read/written.
if (LeftSelector::target == binary_op::kEdge && gdata->lhs_mapping == nullptr) {
gdata->lhs_mapping = static_cast<Idx*>(outcsr.data->data);
}
if (RightSelector::target == binary_op::kEdge && gdata->rhs_mapping == nullptr) {
gdata->rhs_mapping = static_cast<Idx*>(outcsr.data->data);
}
if (OutSelector<Reducer>::Type::target == binary_op::kEdge
&& gdata->out_mapping == nullptr) {
gdata->out_mapping = static_cast<Idx*>(outcsr.data->data);
}
// TODO(minjie): allocator
minigun::advance::Advance<XPU, Idx, cuda::AdvanceConfig, GData<Idx, DType>, UDF>(
rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
template <typename DType>
void FallbackCallBackwardBinaryReduce(
const RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BackwardGData<int32_t, DType>* gdata) {
constexpr int XPU = kDLGPU;
constexpr int Mode = binary_op::kGradLhs;
typedef int32_t Idx;
typedef SelectSrc LeftSelector;
typedef SelectNone RightSelector;
typedef BinaryUseLhs<DType> BinaryOp;
typedef ReduceSum<kDLGPU, DType> Reducer;
// For backward computation, we use reverse csr and switch dst and src.
// This benefits the most common src_op_edge or copy_src case, because the
// gradients of src are now aggregated into the destination buffer, which
// reduces contention among atomic adds.
auto incsr = graph.GetInCSRMatrix();
minigun::Csr<Idx> csr = utils::CreateCsr<Idx>(incsr.indptr, incsr.indices);
typedef cuda::BackwardFunctorsTempl<Idx, DType,
typename SwitchSrcDst<LeftSelector>::Type,
typename SwitchSrcDst<RightSelector>::Type,
BinaryOp, Reducer> Functors;
typedef cuda::BackwardBinaryReduce<Mode, Idx, DType, Functors> UDF;
// If the user-given mapping is none and the target is edge data, we need to
// replace the mapping by the edge ids in the csr graph so that the edge
// data is correctly read/written.
if (LeftSelector::target == binary_op::kEdge
&& gdata->lhs_mapping == nullptr) {
gdata->lhs_mapping = static_cast<Idx*>(incsr.data->data);
}
if (RightSelector::target == binary_op::kEdge
&& gdata->rhs_mapping == nullptr) {
gdata->rhs_mapping = static_cast<Idx*>(incsr.data->data);
}
if (OutSelector<Reducer>::Type::target == binary_op::kEdge
&& gdata->out_mapping == nullptr) {
gdata->out_mapping = static_cast<Idx*>(incsr.data->data);
}
// TODO(minjie): allocator
minigun::advance::Advance<XPU, Idx, cuda::AdvanceConfig, BackwardGData<Idx, DType>, UDF>(
rtcfg, csr, gdata, minigun::IntArray1D<Idx>());
}
} // namespace cuda
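// The specializations below handle the copy_src + sum case, which is exactly
// an SpMM: summing source features over in-edges equals multiplying the
// (reverse) adjacency matrix, with all-ones values, by the node-feature
// matrix. When no custom mappings are given they dispatch to cuSPARSE via
// CusparseCsrmm2; otherwise they fall back to the generic minigun kernels.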
template <>
void CallBinaryReduce<kDLGPU, int32_t, float, SelectSrc, SelectNone,
BinaryUseLhs<float>, ReduceSum<kDLGPU, float>>(
const RuntimeConfig& rtcfg,
const CSRWrapper& graph,
GData<int32_t, float>* gdata) {
if (gdata->lhs_mapping || gdata->rhs_mapping || gdata->out_mapping) {
cuda::FallbackCallBinaryReduce<float>(rtcfg, graph, gdata);
} else {
// cusparse uses the reverse (in-edge) CSR for csrmm
auto csr = graph.GetInCSRMatrix();
cuda::CusparseCsrmm2(rtcfg, csr, gdata->lhs_data, gdata->out_data,
gdata->x_length);
}
}
template <>
void CallBinaryReduce<kDLGPU, int32_t, double, SelectSrc, SelectNone,
BinaryUseLhs<double>, ReduceSum<kDLGPU, double>>(
const RuntimeConfig& rtcfg,
const CSRWrapper& graph,
GData<int32_t, double>* gdata) {
if (gdata->lhs_mapping || gdata->rhs_mapping || gdata->out_mapping) {
cuda::FallbackCallBinaryReduce<double>(rtcfg, graph, gdata);
} else {
// cusparse uses the reverse (in-edge) CSR for csrmm
auto csr = graph.GetInCSRMatrix();
cuda::CusparseCsrmm2(rtcfg, csr, gdata->lhs_data, gdata->out_data,
gdata->x_length);
}
}
// backward
template <>
void CallBackwardBinaryReduce<kDLGPU, binary_op::kGradLhs, int32_t, float,
SelectSrc, SelectNone,
BinaryUseLhs<float>, ReduceSum<kDLGPU, float>>(
const RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BackwardGData<int32_t, float>* gdata) {
if (gdata->lhs_mapping || gdata->rhs_mapping || gdata->out_mapping) {
cuda::FallbackCallBackwardBinaryReduce<float>(rtcfg, graph, gdata);
} else {
auto csr = graph.GetOutCSRMatrix();
cuda::CusparseCsrmm2(rtcfg, csr, gdata->grad_out_data, gdata->grad_lhs_data,
gdata->x_length);
}
}
template <>
void CallBackwardBinaryReduce<kDLGPU, binary_op::kGradLhs, int32_t, double,
SelectSrc, SelectNone,
BinaryUseLhs<double>, ReduceSum<kDLGPU, double>>(
const RuntimeConfig& rtcfg,
const CSRWrapper& graph,
BackwardGData<int32_t, double>* gdata) {
if (gdata->lhs_mapping || gdata->rhs_mapping || gdata->out_mapping) {
cuda::FallbackCallBackwardBinaryReduce<double>(rtcfg, graph, gdata);
} else {
auto csr = graph.GetOutCSRMatrix();
cuda::CusparseCsrmm2(rtcfg, csr, gdata->grad_out_data, gdata->grad_lhs_data,
gdata->x_length);
}
}
// generate definitions
#define REDUCER ReduceSum
#define XPU kDLGPU
#define IDX int32_t
EVAL(GEN_DTYPE, GEN_OP_TARGET, GEN_DEFINE);
EVAL(GEN_BACKWARD_MODE, GEN_DTYPE, GEN_OP_TARGET, GEN_BACKWARD_DEFINE);
} // namespace kernel
} // namespace dgl
/*!
* Copyright (c) 2019 by Contributors
* \file kernel/cuda/functor.cuh
* \brief Functors for template on CUDA
*/
#ifndef DGL_KERNEL_CUDA_FUNCTOR_CUH_
#define DGL_KERNEL_CUDA_FUNCTOR_CUH_
#include "../binary_reduce_common.h"
#include "./atomic.cuh"
namespace dgl {
namespace kernel {
namespace cuda {
// Cache load from global memory
template <typename DType>
struct LDGReader {
static __device__ __forceinline__ DType Call(DType* addr) {
#if __CUDA_ARCH__ >= 350
return __ldg(addr);
#else
return *addr;
#endif
}
};
} // namespace cuda
// Reducer functor specialization
template <typename DType>
struct ReduceSum<kDLGPU, DType> {
static __device__ __forceinline__ void Call(DType* addr, DType val) {
cuda::AtomicAdd(addr, val);
}
static __device__ __forceinline__ DType BackwardCall(DType val, DType accum) {
return 1;
}
};
template <typename DType>
struct ReduceMax<kDLGPU, DType> {
static __device__ __forceinline__ void Call(DType* addr, DType val) {
cuda::AtomicMax(addr, val);
}
static __device__ __forceinline__ DType BackwardCall(DType val, DType accum) {
return static_cast<DType>(val == accum);
}
};
template <typename DType>
struct ReduceMin<kDLGPU, DType> {
static __device__ __forceinline__ void Call(DType* addr, DType val) {
cuda::AtomicMin(addr, val);
}
static __device__ __forceinline__ DType BackwardCall(DType val, DType accum) {
return static_cast<DType>(val == accum);
}
};
template <typename DType>
struct ReduceProd<kDLGPU, DType> {
static __device__ __forceinline__ void Call(DType* addr, DType val) {
cuda::AtomicMul(addr, val);
}
static __device__ __forceinline__ DType BackwardCall(DType val, DType accum) {
return accum / val;
}
};
template <typename DType>
struct ReduceNone<kDLGPU, DType> {
static __device__ __forceinline__ void Call(DType* addr, DType val) {
*addr = val;
}
static __device__ __forceinline__ DType BackwardCall(DType val, DType accum) {
return 1;
}
};
} // namespace kernel
} // namespace dgl
#endif // DGL_KERNEL_CUDA_FUNCTOR_CUH_