Unverified Commit f8ebcd7f authored by pawelpiotrowicz, committed by GitHub

[Performance] Dynamic cpu kernel V3 for SpMMSumCsr all Ops (#2309)



* support AVX512

* env DGL_CPU_INTEL_KERNEL_ENABLED=1

* env DGL_CPU_INTEL_KERNEL_LOG=1

* Add unittest test_spmm.cc
Co-authored-by: Izabela Mazur <izabela.mazur@intel.com>
Co-authored-by: Michal Szarmach <michal.szarmach@intel.com>

Review patch
parent 62b4bbb4
@@ -23,3 +23,6 @@
[submodule "third_party/thrust"]
path = third_party/thrust
url = https://github.com/NVIDIA/thrust.git
[submodule "third_party/xbyak"]
path = third_party/xbyak
url = https://github.com/herumi/xbyak
@@ -59,6 +59,7 @@ include_directories("third_party/dmlc-core/include")
include_directories("third_party/minigun/minigun")
include_directories("third_party/minigun/third_party/moderngpu/src")
include_directories("third_party/phmap/")
include_directories("third_party/xbyak/")
# initial variables
set(DGL_LINKER_LIBS "")
@@ -22,3 +22,15 @@ Data Repository
* ``DGL_DOWNLOAD_DIR``:
  * Values: String (default="${HOME}/.dgl")
  * The local directory to cache the downloaded data.
Intel CPU Performance Options
-----------------------------
* ``DGL_CPU_INTEL_KERNEL_ENABLED``:
  * Values: int (default=0)
  * Enable the dynamically generated (JIT) AVX-512 CPU kernels.
  * Suggested values: 1
* ``DGL_CPU_INTEL_KERNEL_LOG``:
  * Values: int (default=0)
  * Print diagnostic messages from the Intel kernels (debug mode).
  * Suggested values: 1

A short sketch of reading these variables from C++ follows the list.
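For illustration, a minimal sketch (not DGL's code path, though the header below reads the variables the same way) of checking both flags with ``std::getenv``:

#include <cstdio>
#include <cstdlib>

int main() {
  // Launch with e.g. DGL_CPU_INTEL_KERNEL_ENABLED=1 DGL_CPU_INTEL_KERNEL_LOG=1
  const char* enabled = std::getenv("DGL_CPU_INTEL_KERNEL_ENABLED");
  const char* log = std::getenv("DGL_CPU_INTEL_KERNEL_LOG");
  std::printf("intel kernel enabled: %lld, logging: %s\n",
              enabled ? std::atoll(enabled) : 0LL, log ? "on" : "off");
  return 0;
}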
/*!
* Copyright (c) 2019 by Contributors
* \file intel/cpu_support.h
* \brief Intel CPU support
* \author Pawel Piotrowicz <pawel.piotrowicz@intel.com>
*/
#ifndef INTEL_CPU_SUPPORT_H_
#define INTEL_CPU_SUPPORT_H_
#include <cstdlib>
#include <memory>
#include <tuple>
#include <type_traits>
#include "dmlc/logging.h"
#include "meta_utils.h"
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"
namespace dgl {
typedef std::tuple<float, double> supported_types;
#ifndef log_intel
#define log_intel(x) \
if (IntelKernel<>::IsLogEnabled()) { \
LOG(INFO) << x; \
}
#endif
static inline Xbyak::Zmm make_zmm(const Xbyak::Xmm &v) {
return Xbyak::Zmm(v.getIdx());
}
template <int version = 0>
struct IntelKernel {
static int64_t GetValue() {
int64_t v = 0;
const char *label = "DGL_CPU_INTEL_KERNEL_ENABLED";
const char *ptr = std::getenv(label);
if (ptr) {
v = atoll(ptr);
log_intel(label << "=>" << v);
}
return v;
}
static int64_t IsEnabled() {
static int64_t r = IntelKernel<version>::GetValue();
return r;
}
static int IsLogEnabled() {
static int r = (std::getenv("DGL_CPU_INTEL_KERNEL_LOG")) ? 1 : 0;
return r;
}
};
/*!
* \brief JIT-assembled element-wise "out += lhs <op> rhs" kernel using Intel
*        AVX512 instructions.
* \note The kernel is generated only if the CPU reports AVX512F support;
*       otherwise applicable() returns false and callers should fall back to
*       the scalar path.
*/
template <class Op>
class ElemWiseAddUpdate : public Xbyak::CodeGenerator {
public:
typedef typename Op::type DType;
static_assert(
std::is_base_of<std::true_type,
utils::has_type<DType, supported_types>>::value,
"dgl::ElemWiseAddUpdate<Op>: Op::type (DType) is not a supported type");
protected:
const Xbyak::Reg64 &r_out_;
const Xbyak::Reg64 &r_left_;
const Xbyak::Reg64 &r_right;
const Xbyak::Reg64 &r_size_;
/* [functional] Is the kernel applicable on this machine? */
bool applicable_;
public:
static constexpr int UNIT_SIZE_BYTES = sizeof(DType);
static constexpr int BITS_IN_BYTES = 8;
static constexpr int REG_BIT_SIZE = 512;
static constexpr int UNIT_PER_REG =
REG_BIT_SIZE / (UNIT_SIZE_BYTES * BITS_IN_BYTES);
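// Worked example (assuming 512-bit zmm registers): for float,
// UNIT_PER_REG = 512 / (4 * 8) = 16 elements per register; for double,
// UNIT_PER_REG = 512 / (8 * 8) = 8 elements per register. The main loop below
// advances r9 by UNIT_PER_REG and the tail is handled with a k-mask.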
template <class TType, class R1, class R2,
utils::CheckCmp<TType, float> = true>
void alias_load(R1 r1, R2 r2) {
vmovups(r1, r2);
}
template <class TType, class R1, class R2,
utils::CheckCmp<TType, double> = true>
void alias_load(R1 r1, R2 r2) {
vmovupd(r1, r2);
}
template <class TType, class R1, class R2,
utils::CheckCmp<TType, float> = true>
void alias_save(R1 r1, R2 r2) {
alias_load<TType>(r1, r2);
}
template <class TType, class R1, class R2,
utils::CheckCmp<TType, double> = true>
void alias_save(R1 r1, R2 r2) {
alias_load<TType>(r1, r2);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, float> = true>
void alias_ADD(R1 r1, R2 r2, R3 r3) {
vaddps(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, double> = true>
void alias_ADD(R1 r1, R2 r2, R3 r3) {
vaddpd(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, float> = true>
void alias_SUB(R1 r1, R2 r2, R3 r3) {
vsubps(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, double> = true>
void alias_SUB(R1 r1, R2 r2, R3 r3) {
vsubpd(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, float> = true>
void alias_DIV(R1 r1, R2 r2, R3 r3) {
vdivps(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, double> = true>
void alias_DIV(R1 r1, R2 r2, R3 r3) {
vdivpd(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, float> = true>
void alias_MUL(R1 r1, R2 r2, R3 r3) {
vmulps(r1, r2, r3);
}
template <class TType, class R1, class R2, class R3,
utils::CheckCmp<TType, double> = true>
void alias_MUL(R1 r1, R2 r2, R3 r3) {
vmulpd(r1, r2, r3);
}
template <class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyLhs,
supported_types> = true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
alias_load<IType>(zmm0, ptr[r_out_ + r9 * sizeof(IType)]);
alias_load<IType>(zmm1, ptr[r_left_ + r9 * sizeof(IType)]);
alias_ADD<IType>(zmm2, zmm0, zmm1);
alias_save<IType>(ptr[r_out_ + r9 * sizeof(IType)], zmm2);
}
template <class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyRhs,
supported_types> = true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
alias_load<IType>(zmm0, ptr[r_out_ + r9 * sizeof(IType)]);
alias_load<IType>(zmm1, ptr[r_right + r9 * sizeof(IType)]);
alias_ADD<IType>(zmm2, zmm0, zmm1);
alias_save<IType>(ptr[r_out_ + r9 * sizeof(IType)], zmm2);
}
template <class T>
void loop_pre() {
alias_load<T>(zmm0, ptr[r_out_ + r9 * sizeof(T)]);
alias_load<T>(zmm1, ptr[r_left_ + r9 * sizeof(T)]);
alias_load<T>(zmm2, ptr[r_right + r9 * sizeof(T)]);
}
template <class T>
void loop_post() {
alias_ADD<T>(zmm2, zmm0, zmm2);
alias_save<T>(ptr[r_out_ + r9 * sizeof(T)], zmm2);
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Add,
supported_types> = true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
loop_pre<IType>();
alias_ADD<IType>(zmm2, zmm1, zmm2);
loop_post<IType>();
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Sub,
supported_types> = true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
loop_pre<IType>();
alias_SUB<IType>(zmm2, zmm1, zmm2);
loop_post<IType>();
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Div,
supported_types> = true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
loop_pre<IType>();
alias_DIV<IType>(zmm2, zmm1, zmm2);
loop_post<IType>();
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Mul,
supported_types> = true>
void full_chunk_loop_operations() {
typedef typename Operator::type IType;
loop_pre<IType>();
alias_MUL<IType>(zmm2, zmm1, zmm2);
loop_post<IType>();
}
template <class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyLhs,
supported_types> = true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
alias_load<IType>(make_zmm(zmm2) | mask, ptr[r_left_ + r9 * sizeof(IType)]);
}
template <class Operator,
utils::Verify<Operator, ::dgl::aten::cpu::op::CopyRhs,
supported_types> = true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
alias_load<IType>(make_zmm(zmm2) | mask, ptr[r_right + r9 * sizeof(IType)]);
}
template <class T>
void remainder_fetch_LR(const Xbyak::Opmask mask) {
alias_load<T>(make_zmm(zmm2) | mask, ptr[r_left_ + r9 * sizeof(T)]);
alias_load<T>(make_zmm(zmm1) | mask, ptr[r_right + r9 * sizeof(T)]);
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Mul,
supported_types> = true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
remainder_fetch_LR<IType>(mask);
alias_MUL<IType>(zmm2, zmm2, zmm1);
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Add,
supported_types> = true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
remainder_fetch_LR<IType>(mask);
alias_ADD<DType>(zmm2, zmm2, zmm1);
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Div,
supported_types> = true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
remainder_fetch_LR<IType>(mask);
alias_DIV<DType>(zmm2, zmm2, zmm1);
}
template <class Operator, utils::Verify<Operator, ::dgl::aten::cpu::op::Sub,
supported_types> = true>
void remainder_operations(const Xbyak::Opmask mask) {
typedef typename Operator::type IType;
remainder_fetch_LR<IType>(mask);
alias_SUB<DType>(zmm2, zmm2, zmm1);
}
ElemWiseAddUpdate()
: r_out_(rdi),
r_left_(rsi),
r_right(rdx),
r_size_(rcx),
applicable_(false) {
static Xbyak::util::Cpu current_cpu;
/* Default case for all */
if (current_cpu.has(Xbyak::util::Cpu::tAVX512F)) {
/* prepare REMAINDER */
mov(r8, r_size_);
and_(r8,
UNIT_PER_REG - 1);  // r8 = size % UNIT_PER_REG (remainder elements)
xor_(r9, r9); // reset r9
cmp(r_size_, UNIT_PER_REG);  // if (size < UNIT_PER_REG) there are no full chunks
jl("remainder");
/* drop the remainder so r_size_ covers only full-register chunks */
sub(r_size_, r8);  // r_size_ -= r8
cmp(r_size_, 0); // do we have any full chunks ?
jz("remainder");
L("for_i");
full_chunk_loop_operations<Op>();
add(r9, UNIT_PER_REG);  // r9 += UNIT_PER_REG (elements per zmm register)
cmp(r_size_, r9); // more full chunks ?
jnz("for_i");
L("remainder");
cmp(r8, 0); // do we have a remainder ?
jz("done");
/* prepare a bitmask for k1 */
mov(rax, 1);
mov(r_size_, r8);
sal(rax, cl);
dec(rax); // k1= (1 << r8 )-1
kmovw(k1, eax); // set bitmask
alias_load<DType>(make_zmm(zmm0) | k1,
ptr[r_out_ + r9 * UNIT_SIZE_BYTES]);
remainder_operations<Op>(k1);
alias_ADD<DType>(zmm3, zmm2, zmm0);
alias_save<DType>(ptr[r_out_ + r9 * UNIT_SIZE_BYTES],
make_zmm(zmm3) | k1);
L("done");
applicable_ = true;
log_intel("AVX512F cpu kernel is ready");
}
ret();
}
bool applicable() const { return applicable_; }
template <class... P>
void run(P... args) {
((void (*)(P...))(this)->getCode())(args...);
}
};
} // namespace dgl
#endif // INTEL_CPU_SUPPORT_H_
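For orientation, a minimal sketch of how the JIT kernel is meant to be driven (it mirrors the unit test at the bottom of this patch; the include paths are assumptions that depend on how the DGL source tree is added to the compiler's include path):

#include <cstdint>
#include <vector>
#include "array/cpu/spmm_binary_ops.h"  // dgl::aten::cpu::op::Add (path assumed)
#include "intel/cpu_support.h"          // dgl::ElemWiseAddUpdate (path assumed)

int main() {
  using Op = dgl::aten::cpu::op::Add<float>;
  dgl::ElemWiseAddUpdate<Op> kernel;  // assembles the AVX512 code at construction
  std::vector<float> out(33, 0.f), lhs(33, 1.f), rhs(33, 2.f);
  if (kernel.applicable()) {  // false when the CPU lacks AVX512F
    // out[i] += lhs[i] + rhs[i]; 33 = 2 full zmm chunks of 16 floats + 1 remainder
    kernel.run(out.data(), lhs.data(), rhs.data(),
               static_cast<int64_t>(out.size()));
  }
  return 0;
}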
/*!
* Copyright (c) 2019 by Contributors
* \file intel/meta_utils.h
* \brief Meta programming utils
* \author Pawel Piotrowicz <pawel.piotrowicz@intel.com>
*/
#ifndef INTEL_META_UTILS_H_
#define INTEL_META_UTILS_H_
#include <tuple>
#include <type_traits>
namespace dgl {
namespace utils {
template <typename T, typename Tuple>
struct has_type;
template <typename T>
struct has_type<T, std::tuple<>> : std::false_type {};
template <typename T, typename U, typename... Ts>
struct has_type<T, std::tuple<U, Ts...>> : has_type<T, std::tuple<Ts...>> {};
template <typename T, typename... Ts>
struct has_type<T, std::tuple<T, Ts...>> : std::true_type {};
template <class OCmp, template <class> class ToP, class Tup,
int ok = std::tuple_size<Tup>::value>
struct DeepType;
template <class OCmp, template <class> class ToP, class Tup>
struct DeepType<OCmp, ToP, Tup, 1> {
typedef typename std::tuple_element<0, Tup>::type EL1;
enum { value = std::is_same<OCmp, ToP<EL1>>::value };
};
template <class OCmp, template <class> class ToP, class Tup>
struct DeepType<OCmp, ToP, Tup, 2> {
typedef typename std::tuple_element<0, Tup>::type EL1;
typedef typename std::tuple_element<1, Tup>::type EL2;
enum {
value = (std::is_same<OCmp, ToP<EL1>>::value ||
std::is_same<OCmp, ToP<EL2>>::value)
};
};
template <class OCmp, template <class> class ToP, class Tup>
struct DeepType<OCmp, ToP, Tup, 3> {
typedef typename std::tuple_element<0, Tup>::type EL1;
typedef typename std::tuple_element<1, Tup>::type EL2;
typedef typename std::tuple_element<2, Tup>::type EL3;
enum {
value = (std::is_same<OCmp, ToP<EL1>>::value ||
std::is_same<OCmp, ToP<EL2>>::value ||
std::is_same<OCmp, ToP<EL3>>::value)
};
};
template <bool b>
using Required = typename std::enable_if<b, bool>::type;
template <class L, class R>
using CheckCmp = Required<std::is_same<L, R>::value>;
template <class L, class R1, class R2>
using CheckCmp_2 =
Required<std::is_same<L, R1>::value || std::is_same<L, R2>::value>;
template <class OpType, template <class> class TPP, class Tup>
using Verify = Required<utils::DeepType<OpType, TPP, Tup>::value>;
} // namespace utils
} // namespace dgl
#endif // INTEL_META_UTILS_H_
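A few compile-time checks (a sketch; the include path is an assumption) showing what these helpers compute: ``has_type`` asks whether a tuple contains a type, while ``DeepType`` (and the ``Verify`` alias built on it) asks whether an operator class is an instantiation of a given template over any element of the tuple:

#include <tuple>
#include <type_traits>
#include "intel/meta_utils.h"  // path assumed

template <class T>
struct Box {};  // stand-in for an operator template such as op::Add<T>

static_assert(dgl::utils::has_type<float, std::tuple<float, double>>::value,
              "float is in the tuple");
static_assert(!dgl::utils::has_type<int, std::tuple<float, double>>::value,
              "int is not in the tuple");
static_assert(dgl::utils::DeepType<Box<double>, Box,
                                   std::tuple<float, double>>::value,
              "Box<double> matches Box<EL> for EL = double");

int main() { return 0; }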
@@ -8,9 +8,13 @@
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <algorithm>
#include <limits>
#include <memory>
#include "spmm_binary_ops.h"
#if !defined(_WIN32)
#include "intel/cpu_support.h"
#endif
namespace dgl {
namespace aten {
namespace cpu {
@@ -26,40 +30,62 @@ namespace cpu {
 * for the computation of different nodes.
 */
template <typename IdType, typename DType, typename Op>
void SpMMSumCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
                NDArray efeat, NDArray out) {
  const bool has_idx = !IsNullArray(csr.data);
  const IdType* indptr = csr.indptr.Ptr<IdType>();
  const IdType* indices = csr.indices.Ptr<IdType>();
  const IdType* edges = csr.data.Ptr<IdType>();
  const DType* X = ufeat.Ptr<DType>();
  const DType* W = efeat.Ptr<DType>();
  int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
  DType* O = out.Ptr<DType>();
#if !defined(_WIN32)
  typedef dgl::ElemWiseAddUpdate<Op> ElemWiseUpd;
  /* Prepare an assembler kernel */
  static std::unique_ptr<ElemWiseUpd> asm_kernel_ptr(
      (dgl::IntelKernel<>::IsEnabled()) ? new ElemWiseUpd() : nullptr);
  /* Distribute the kernel among OMP threads */
  ElemWiseUpd* cpu_spec = (asm_kernel_ptr && asm_kernel_ptr->applicable())
                              ? asm_kernel_ptr.get()
                              : nullptr;
  if (cpu_spec && dim > 16 && !bcast.use_bcast) {
#pragma omp parallel for
    for (IdType rid = 0; rid < csr.num_rows; ++rid) {
      const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
      DType* out_off = O + rid * dim;
      std::fill(out_off, out_off + dim, 0);
      for (IdType j = row_start; j < row_end; ++j) {
        const IdType cid = indices[j];
        const IdType eid = has_idx ? edges[j] : j;
        cpu_spec->run(out_off, X + cid * lhs_dim, W + eid * rhs_dim, dim);
      }
    }
  } else {
#endif
#pragma omp parallel for
    for (IdType rid = 0; rid < csr.num_rows; ++rid) {
      const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
      DType* out_off = O + rid * dim;
      std::fill(out_off, out_off + dim, 0);
      for (IdType j = row_start; j < row_end; ++j) {
        const IdType cid = indices[j];
        const IdType eid = has_idx ? edges[j] : j;
        for (int64_t k = 0; k < dim; ++k) {
          const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
          const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
          const DType* lhs_off =
              Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
          const DType* rhs_off =
              Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
          out_off[k] += Op::Call(lhs_off, rhs_off);
        }
      }
    }
#if !defined(_WIN32)
  }
#endif
}
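To make the loop structure concrete, here is a small self-contained worked example (plain C++, not DGL's API) of the scalar path for the "mul" operator: two destination rows, three edges, two-dimensional features, no broadcasting, and eid == j because there is no edge-id mapping.

#include <cstdio>
#include <vector>

int main() {
  // CSR with two destination rows: row 0 has in-edges from nodes {0, 1},
  // row 1 has one in-edge from node 2.
  std::vector<int> indptr = {0, 2, 3};
  std::vector<int> indices = {0, 1, 2};
  const int dim = 2;                                  // feature length
  std::vector<float> ufeat = {1, 2, 3, 4, 5, 6};      // 3 source nodes x dim
  std::vector<float> efeat = {1, 1, 2, 2, 3, 3};      // 3 edges x dim
  std::vector<float> out(2 * dim, 0.f);
  for (int rid = 0; rid < 2; ++rid) {                 // parallelized per row in DGL
    for (int j = indptr[rid]; j < indptr[rid + 1]; ++j) {
      const int cid = indices[j];                     // source node, eid == j
      for (int k = 0; k < dim; ++k)
        out[rid * dim + k] += ufeat[cid * dim + k] * efeat[j * dim + k];
    }
  }
  std::printf("out = [%g %g | %g %g]\n", out[0], out[1], out[2], out[3]);
  // row 0: (1,2)*1 + (3,4)*2 = (7,10); row 1: (5,6)*3 = (15,18)
}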
/*!
@@ -74,20 +100,15 @@ void SpMMSumCsr(
 * we use atomic operators in the reduction phase.
 */
template <typename IdType, typename DType, typename Op>
void SpMMSumCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
                NDArray efeat, NDArray out) {
  const bool has_idx = !IsNullArray(coo.data);
  const IdType* row = coo.row.Ptr<IdType>();
  const IdType* col = coo.col.Ptr<IdType>();
  const IdType* edges = coo.data.Ptr<IdType>();
  const DType* X = ufeat.Ptr<DType>();
  const DType* W = efeat.Ptr<DType>();
  int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
  DType* O = out.Ptr<DType>();
  const int64_t nnz = coo.row->shape[0];
  // fill zero elements
@@ -97,13 +118,15 @@ void SpMMSumCoo(
  for (IdType i = 0; i < nnz; ++i) {
    const IdType rid = row[i];
    const IdType cid = col[i];
    const IdType eid = has_idx ? edges[i] : i;
    DType* out_off = O + cid * dim;
    for (int64_t k = 0; k < dim; ++k) {
      const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
      const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
      const DType* lhs_off =
          Op::use_lhs ? X + rid * lhs_dim + lhs_add : nullptr;
      const DType* rhs_off =
          Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
      const DType val = Op::Call(lhs_off, rhs_off);
      if (val != 0) {
#pragma omp atomic
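The COO variant parallelizes over edges instead of destination nodes, so two threads can hit the same output row; a standalone sketch (plain C++, not DGL's API) of that strategy with the copy_lhs operator and scalar features:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> row = {0, 0, 1};        // source node of each edge
  std::vector<int> col = {1, 1, 1};        // destination node of each edge
  std::vector<float> ufeat = {2.f, 5.f};   // one scalar feature per source node
  std::vector<float> out(2, 0.f);
#pragma omp parallel for
  for (int i = 0; i < static_cast<int>(row.size()); ++i) {
    const float val = ufeat[row[i]];       // Op::Call for copy_lhs
    if (val != 0) {                        // skip writing zeros, as in SpMMSumCoo
#pragma omp atomic
      out[col[i]] += val;                  // edges sharing col[i] may race, so atomic
    }
  }
  std::printf("out = [%g %g]\n", out[0], out[1]);  // [0 9]: 2 + 2 + 5 into node 1
}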
@@ -120,34 +143,31 @@
 * \param ufeat The feature on source nodes.
 * \param efeat The feature on edges.
 * \param out The result feature on destination nodes.
 * \param argu Arg-Min/Max on source nodes, which refers to the source node
 *        indices corresponding to the minimum/maximum values of the reduction
 *        result on destination nodes. It's useful in computing gradients of
 *        the Min/Max reducer.
 * \param arge Arg-Min/Max on edges, which refers to the edge indices
 *        corresponding to the minimum/maximum values of the reduction result
 *        on destination nodes. It's useful in computing gradients of the
 *        Min/Max reducer.
 * \note It uses node parallel strategy, different threads are responsible
 *       for the computation of different nodes.
 * \note The result will contain infinity for zero-degree nodes.
 */
template <typename IdType, typename DType, typename Op, typename Cmp>
void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
                NDArray efeat, NDArray out, NDArray argu, NDArray arge) {
  const bool has_idx = !IsNullArray(csr.data);
  const IdType* indptr = static_cast<IdType*>(csr.indptr->data);
  const IdType* indices = static_cast<IdType*>(csr.indices->data);
  const IdType* edges =
      has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
  const DType* X = Op::use_lhs ? static_cast<DType*>(ufeat->data) : nullptr;
  const DType* W = Op::use_rhs ? static_cast<DType*>(efeat->data) : nullptr;
  const int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len,
                rhs_dim = bcast.rhs_len;
  DType* O = static_cast<DType*>(out->data);
  IdType* argX = Op::use_lhs ? static_cast<IdType*>(argu->data) : nullptr;
  IdType* argW = Op::use_rhs ? static_cast<IdType*>(arge->data) : nullptr;
#pragma omp parallel for
  for (IdType rid = 0; rid < csr.num_rows; ++rid) {
    const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
@@ -155,25 +175,23 @@ void SpMMCmpCsr(
    IdType* argx_off = argX + rid * dim;
    IdType* argw_off = argW + rid * dim;
    std::fill(out_off, out_off + dim, Cmp::zero);
    if (Op::use_lhs) std::fill(argx_off, argx_off + dim, 0);
    if (Op::use_rhs) std::fill(argw_off, argw_off + dim, 0);
    for (IdType j = row_start; j < row_end; ++j) {
      const IdType cid = indices[j];
      const IdType eid = has_idx ? edges[j] : j;
      for (int64_t k = 0; k < dim; ++k) {
        const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
        const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
        const DType* lhs_off =
            Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
        const DType* rhs_off =
            Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
        const DType val = Op::Call(lhs_off, rhs_off);
        if (Cmp::Call(out_off[k], val)) {
          out_off[k] = val;
          if (Op::use_lhs) argx_off[k] = cid;
          if (Op::use_rhs) argw_off[k] = eid;
        }
      }
    }
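The Cmp reduction keeps, for every output element, the best value seen so far together with the indices that produced it; a scalar sketch (plain C++, not DGL's API) of one CSR row with the Max comparator and the copy_lhs operator:

#include <cstdio>
#include <limits>
#include <vector>

int main() {
  std::vector<int> indptr = {0, 3};               // one destination row, 3 in-edges
  std::vector<int> indices = {4, 7, 2};           // source node ids
  std::vector<float> vals = {0.5f, 2.0f, 1.5f};   // value gathered per in-edge
  float out = -std::numeric_limits<float>::infinity();  // Cmp::zero for Max
  int argu = 0;                                   // arg-max: winning source id
  for (int j = indptr[0]; j < indptr[1]; ++j) {
    if (out < vals[j]) {                          // Cmp::Call(accum, val) for Max
      out = vals[j];
      argu = indices[j];                          // remember which source won
    }
  }
  std::printf("max = %g from source %d\n", out, argu);  // max = 2 from source 7
}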
@@ -187,35 +205,32 @@
 * \param ufeat The feature on source nodes.
 * \param efeat The feature on edges.
 * \param out The result feature on destination nodes.
 * \param argu Arg-Min/Max on source nodes, which refers to the source node
 *        indices corresponding to the minimum/maximum values of the reduction
 *        result on destination nodes. It's useful in computing gradients of
 *        the Min/Max reducer.
 * \param arge Arg-Min/Max on edges, which refers to the edge indices
 *        corresponding to the minimum/maximum values of the reduction result
 *        on destination nodes. It's useful in computing gradients of the
 *        Min/Max reducer.
 * \note It uses node parallel strategy, different threads are responsible
 *       for the computation of different nodes. To avoid possible data hazard,
 *       we use atomic operators in the reduction phase.
 * \note The result will contain infinity for zero-degree nodes.
 */
template <typename IdType, typename DType, typename Op, typename Cmp>
void SpMMCmpCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
                NDArray efeat, NDArray out, NDArray argu, NDArray arge) {
  const bool has_idx = !IsNullArray(coo.data);
  const IdType* row = static_cast<IdType*>(coo.row->data);
  const IdType* col = static_cast<IdType*>(coo.col->data);
  const IdType* edges =
      has_idx ? static_cast<IdType*>(coo.data->data) : nullptr;
  const DType* X = Op::use_lhs ? static_cast<DType*>(ufeat->data) : nullptr;
  const DType* W = Op::use_rhs ? static_cast<DType*>(efeat->data) : nullptr;
  const int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len,
                rhs_dim = bcast.rhs_len;
  DType* O = static_cast<DType*>(out->data);
  IdType* argX = Op::use_lhs ? static_cast<IdType*>(argu->data) : nullptr;
  IdType* argW = Op::use_rhs ? static_cast<IdType*>(arge->data) : nullptr;
  const int64_t nnz = coo.row->shape[0];
  // fill zero elements
  std::fill(O, O + out.NumElements(), Cmp::zero);
@@ -224,145 +239,28 @@ void SpMMCmpCoo(
  for (IdType i = 0; i < nnz; ++i) {
    const IdType rid = row[i];
    const IdType cid = col[i];
    const IdType eid = has_idx ? edges[i] : i;
    DType* out_off = O + cid * dim;
    IdType* argx_off = Op::use_lhs ? argX + cid * dim : nullptr;
    IdType* argw_off = Op::use_rhs ? argW + cid * dim : nullptr;
    for (int64_t k = 0; k < dim; ++k) {
      const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
      const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
      const DType* lhs_off =
          Op::use_lhs ? X + rid * lhs_dim + lhs_add : nullptr;
      const DType* rhs_off =
          Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
      const DType val = Op::Call(lhs_off, rhs_off);
#pragma omp critical
      if (Cmp::Call(out_off[k], val)) {
        out_off[k] = val;
        if (Op::use_lhs) argx_off[k] = rid;
        if (Op::use_rhs) argw_off[k] = eid;
      }
    }
  }
}
namespace op {
//////////////////////////////// binary operators on CPU ////////////////////////////////
template <typename DType>
struct Add {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off) {
return *lhs_off + *rhs_off;
}
};
template <typename DType> constexpr bool Add<DType>::use_lhs;
template <typename DType> constexpr bool Add<DType>::use_rhs;
template <typename DType>
struct Sub {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off) {
return *lhs_off - *rhs_off;
}
};
template <typename DType> constexpr bool Sub<DType>::use_lhs;
template <typename DType> constexpr bool Sub<DType>::use_rhs;
template <typename DType>
struct Mul {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off) {
return *lhs_off * *rhs_off;
}
};
template <typename DType> constexpr bool Mul<DType>::use_lhs;
template <typename DType> constexpr bool Mul<DType>::use_rhs;
template <typename DType>
struct Div {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off) {
return *lhs_off / *rhs_off;
}
};
template <typename DType> constexpr bool Div<DType>::use_lhs;
template <typename DType> constexpr bool Div<DType>::use_rhs;
template <typename DType>
struct CopyLhs {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = false;
inline static DType Call(const DType* lhs_off, const DType* ) {
return *lhs_off;
}
};
template <typename DType> constexpr bool CopyLhs<DType>::use_lhs;
template <typename DType> constexpr bool CopyLhs<DType>::use_rhs;
template <typename DType>
struct CopyRhs {
static constexpr bool use_lhs = false;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* , const DType* rhs_off) {
return *rhs_off;
}
};
template <typename DType> constexpr bool CopyRhs<DType>::use_lhs;
template <typename DType> constexpr bool CopyRhs<DType>::use_rhs;
//////////////////////////////// Reduce operators on CPU ////////////////////////////////
template <typename DType>
struct Max {
static constexpr DType zero = -std::numeric_limits<DType>::infinity();
// return true if accum should be replaced
inline static DType Call(DType accum, DType val) {
return accum < val;
}
};
template <typename DType> constexpr DType Max<DType>::zero;
template <typename DType>
struct Min {
static constexpr DType zero = std::numeric_limits<DType>::infinity();
// return true if accum should be replaced
inline static DType Call(DType accum, DType val) {
return accum > val;
}
};
template <typename DType> constexpr DType Min<DType>::zero;
#define SWITCH_OP(op, Op, ...) \
do { \
if ((op) == "add") { \
typedef dgl::aten::cpu::op::Add<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "sub") { \
typedef dgl::aten::cpu::op::Sub<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "mul") { \
typedef dgl::aten::cpu::op::Mul<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "div") { \
typedef dgl::aten::cpu::op::Div<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_lhs") { \
typedef dgl::aten::cpu::op::CopyLhs<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_rhs") { \
typedef dgl::aten::cpu::op::CopyRhs<DType> Op; \
{ __VA_ARGS__ } \
} else { \
LOG(FATAL) << "Unsupported SpMM binary operator: " << op; \
} \
} while (0)
} // namespace op
}  // namespace cpu
}  // namespace aten
}  // namespace dgl
/*!
* Copyright (c) 2020 by Contributors
* \file array/cpu/spmm_binary_ops.h
* \brief SPMM CPU Binary ops.
*/
#ifndef DGL_ARRAY_CPU_SPMM_BINARY_OPS_H_
#define DGL_ARRAY_CPU_SPMM_BINARY_OPS_H_
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <limits>
namespace dgl {
namespace aten {
namespace cpu {
namespace op {
//////////////////////////////// binary operators on CPU
///////////////////////////////////
template <typename DType>
struct Add {
typedef DType type;
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off) {
return *lhs_off + *rhs_off;
}
};
template <typename DType>
constexpr bool Add<DType>::use_lhs;
template <typename DType>
constexpr bool Add<DType>::use_rhs;
template <typename DType>
struct Sub {
typedef DType type;
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off) {
return *lhs_off - *rhs_off;
}
};
template <typename DType>
constexpr bool Sub<DType>::use_lhs;
template <typename DType>
constexpr bool Sub<DType>::use_rhs;
template <typename DType>
struct Mul {
typedef DType type;
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off) {
return *lhs_off * *rhs_off;
}
};
template <typename DType>
constexpr bool Mul<DType>::use_lhs;
template <typename DType>
constexpr bool Mul<DType>::use_rhs;
template <typename DType>
struct Div {
typedef DType type;
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off) {
return *lhs_off / *rhs_off;
}
};
template <typename DType>
constexpr bool Div<DType>::use_lhs;
template <typename DType>
constexpr bool Div<DType>::use_rhs;
template <typename DType>
struct CopyLhs {
typedef DType type;
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = false;
inline static DType Call(const DType* lhs_off, const DType*) {
return *lhs_off;
}
};
template <typename DType>
constexpr bool CopyLhs<DType>::use_lhs;
template <typename DType>
constexpr bool CopyLhs<DType>::use_rhs;
template <typename DType>
struct CopyRhs {
typedef DType type;
static constexpr bool use_lhs = false;
static constexpr bool use_rhs = true;
inline static DType Call(const DType*, const DType* rhs_off) {
return *rhs_off;
}
};
template <typename DType>
constexpr bool CopyRhs<DType>::use_lhs;
template <typename DType>
constexpr bool CopyRhs<DType>::use_rhs;
//////////////////////////////// Reduce operators on CPU
///////////////////////////////////
template <typename DType>
struct Max {
typedef DType type;
static constexpr DType zero = -std::numeric_limits<DType>::infinity();
// return true if accum should be replaced
inline static DType Call(DType accum, DType val) { return accum < val; }
};
template <typename DType>
constexpr DType Max<DType>::zero;
template <typename DType>
struct Min {
typedef DType type;
static constexpr DType zero = std::numeric_limits<DType>::infinity();
// return true if accum should be replaced
inline static DType Call(DType accum, DType val) { return accum > val; }
};
template <typename DType>
constexpr DType Min<DType>::zero;
#define SWITCH_OP(op, Op, ...) \
do { \
if ((op) == "add") { \
typedef dgl::aten::cpu::op::Add<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "sub") { \
typedef dgl::aten::cpu::op::Sub<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "mul") { \
typedef dgl::aten::cpu::op::Mul<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "div") { \
typedef dgl::aten::cpu::op::Div<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_lhs") { \
typedef dgl::aten::cpu::op::CopyLhs<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_rhs") { \
typedef dgl::aten::cpu::op::CopyRhs<DType> Op; \
{ __VA_ARGS__ } \
} else { \
LOG(FATAL) << "Unsupported SpMM binary operator: " << op; \
} \
} while (0)
} // namespace op
} // namespace cpu
} // namespace aten
} // namespace dgl
#endif // DGL_ARRAY_CPU_SPMM_BINARY_OPS_H_
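A minimal compile-and-run check (the include path is an assumption) of the operator contracts these kernels rely on: binary Ops expose use_lhs/use_rhs and Call over pointers, while the reduce Cmps expose an identity value zero and a Call(accum, val) predicate that says whether accum should be replaced:

#include <cassert>
#include <limits>
#include "array/cpu/spmm_binary_ops.h"  // path assumed

int main() {
  namespace op = dgl::aten::cpu::op;
  const double a = 3.0, b = 4.0;
  assert(op::Add<double>::Call(&a, &b) == 7.0);
  assert(op::Mul<double>::Call(&a, &b) == 12.0);
  assert(op::CopyRhs<double>::Call(nullptr, &b) == 4.0);  // use_lhs is false
  assert(op::Max<double>::Call(1.0, 2.0));                // 2.0 should replace 1.0
  assert(op::Min<double>::zero == std::numeric_limits<double>::infinity());
  return 0;
}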
#if !defined(_WIN32)
#include <../../src/array/cpu/spmm.h>
#include <dgl/array.h>
#include <gtest/gtest.h>
#include <time.h>
#include <random>
#include "./common.h"
using namespace dgl;
using namespace dgl::runtime;
int sizes[] = {1, 7, 8, 9, 31, 32, 33, 54, 63, 64, 65, 256, 257};
namespace ns_op = dgl::aten::cpu::op;
namespace {
template <class T>
void GenerateData(T* data, int dim, T mul) {
for (int i = 0; i < dim; i++) {
data[i] = (i + 1) * mul;
}
}
template <class T>
void GenerateRandomData(T* data, int dim) {
std::mt19937 rng(std::random_device{}());
std::uniform_int_distribution<> dist(0, 10000);
for (int i = 0; i < dim; i++) {
data[i] = (dist(rng) / 100);
}
}
template <class T>
void GenerateZeroData(T* data, int dim) {
for (int i = 0; i < dim; i++) {
data[i] = 0;
}
}
template <class T>
void Copy(T* exp, T* out, T* hs, int dim) {
for (int i = 0; i < dim; i++) {
exp[i] = out[i] + hs[i];
}
}
template <class T>
void Add(T* exp, T* out, T* lhs, T* rhs, int dim) {
for (int i = 0; i < dim; i++) {
exp[i] = out[i] + lhs[i] + rhs[i];
}
}
template <class T>
void Sub(T* exp, T* out, T* lhs, T* rhs, int dim) {
for (int i = 0; i < dim; i++) {
exp[i] = out[i] + lhs[i] - rhs[i];
}
}
template <class T>
void Mul(T* exp, T* out, T* lhs, T* rhs, int dim) {
for (int i = 0; i < dim; i++) {
exp[i] = (out[i] + (lhs[i] * rhs[i]));
}
}
template <class T>
void Div(T* exp, T* out, T* lhs, T* rhs, int dim) {
for (int i = 0; i < dim; i++) {
exp[i] = (out[i] + (lhs[i] / rhs[i]));
}
}
template <class T>
void CheckResult(T* exp, T* out, T* out_intel_kernel, int dim) {
for (int i = 0; i < dim; i++) {
ASSERT_TRUE(exp[i] == out[i]);
if (out_intel_kernel != nullptr) {
ASSERT_TRUE(out[i] == out_intel_kernel[i]);
}
}
}
} // namespace
template <class ElemWiseUpd>
ElemWiseUpd* generic_ElemWiseUpd() {
static std::unique_ptr<ElemWiseUpd> asm_kernel_ptr(
(dgl::IntelKernel<>::IsEnabled()) ? new ElemWiseUpd() : nullptr);
ElemWiseUpd* cpu_spec = (asm_kernel_ptr && asm_kernel_ptr->applicable())
? asm_kernel_ptr.get()
: nullptr;
return cpu_spec;
}
template <typename IDX>
void _TestSpmmCopyLhs() {
for (size_t i = 0; i < sizeof(sizes) / sizeof(int); i++) {
int dim = sizes[i];
IDX out[dim], exp[dim], lhs[dim];
GenerateZeroData(out, dim);
GenerateRandomData(lhs, dim);
// Calculation of expected output - 'exp'
Copy(exp, out, lhs, dim);
// Calculation of output using legacy path - 'out'
for (int k = 0; k < dim; k++) {
out[k] += ns_op::CopyLhs<IDX>::Call(lhs + k, nullptr);
}
// Calculation of output using intel path - 'out_intel_kernel'
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::CopyLhs<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, nullptr, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
}
}
TEST(SpmmTest, TestSpmmCopyLhs) {
_TestSpmmCopyLhs<float>();
_TestSpmmCopyLhs<double>();
}
template <typename IDX>
void _TestSpmmCopyRhs() {
for (size_t i = 0; i < sizeof(sizes) / sizeof(int); i++) {
int dim = sizes[i];
IDX out[dim], exp[dim], rhs[dim];
GenerateZeroData(out, dim);
GenerateRandomData(rhs, dim);
// Calculation of expected output - 'exp'
Copy(exp, out, rhs, dim);
// Calculation of output using legacy path - 'out'
for (int k = 0; k < dim; k++) {
out[k] += ns_op::CopyRhs<IDX>::Call(nullptr, rhs + k);
}
// Calculation of output using intel path - 'out_intel_kernel'
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::CopyRhs<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, nullptr, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
}
}
TEST(SpmmTest, TestSpmmCopyRhs) {
_TestSpmmCopyRhs<float>();
_TestSpmmCopyRhs<double>();
}
template <typename IDX>
void _TestSpmmAdd() {
for (size_t i = 0; i < sizeof(sizes) / sizeof(int); i++) {
int dim = sizes[i];
IDX out[dim], exp[dim], lhs[dim], rhs[dim];
GenerateZeroData(out, dim);
GenerateRandomData(lhs, dim);
GenerateRandomData(rhs, dim);
// Calculation of expected output - 'exp'
Add(exp, out, lhs, rhs, dim);
// Calculation of output using legacy path - 'out'
for (int k = 0; k < dim; k++) {
out[k] += ns_op::Add<IDX>::Call(lhs + k, rhs + k);
}
// Calculation of output using intel path - 'out_intel_kernel'
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::Add<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
}
}
TEST(SpmmTest, TestSpmmAdd) {
_TestSpmmAdd<float>();
_TestSpmmAdd<double>();
}
template <typename IDX>
void _TestSpmmSub() {
for (size_t i = 0; i < sizeof(sizes) / sizeof(int); i++) {
int dim = sizes[i];
IDX out[dim], exp[dim], lhs[dim], rhs[dim];
GenerateZeroData(out, dim);
GenerateRandomData(lhs, dim);
GenerateRandomData(rhs, dim);
// Calculation of expected output - 'exp'
Sub(exp, out, lhs, rhs, dim);
// Calculation of output using legacy path - 'out'
for (int k = 0; k < dim; k++) {
out[k] += ns_op::Sub<IDX>::Call(lhs + k, rhs + k);
}
// Calculation of output using intel path - 'out_intel_kernel'
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::Sub<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
}
}
TEST(SpmmTest, TestSpmmSub) {
_TestSpmmSub<float>();
_TestSpmmSub<double>();
}
template <typename IDX>
void _TestSpmmMul() {
for (size_t i = 0; i < sizeof(sizes) / sizeof(int); i++) {
int dim = sizes[i];
IDX out[dim], exp[dim], lhs[dim], rhs[dim];
GenerateZeroData(out, dim);
GenerateRandomData(lhs, dim);
GenerateRandomData(rhs, dim);
// Calculation of expected output - 'exp'
Mul(exp, out, lhs, rhs, dim);
// Calculation of output using legacy path - 'out'
for (int k = 0; k < dim; k++) {
out[k] += ns_op::Mul<IDX>::Call(lhs + k, rhs + k);
}
// Calculation of output using intel path - 'out_intel_kernel'
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::Mul<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
}
}
TEST(SpmmTest, TestSpmmMul) {
_TestSpmmMul<float>();
_TestSpmmMul<double>();
}
template <typename IDX>
void _TestSpmmDiv() {
for (size_t i = 0; i < sizeof(sizes) / sizeof(int); i++) {
int dim = sizes[i];
IDX out[dim], exp[dim], lhs[dim], rhs[dim];
GenerateZeroData(out, dim);
GenerateData(lhs, dim, (IDX)15);
GenerateData(rhs, dim, (IDX)1);
// Calculation of expected output - 'exp'
Div(exp, out, lhs, rhs, dim);
// Calculation of output using legacy path - 'out'
for (int k = 0; k < dim; k++) {
out[k] += ns_op::Div<IDX>::Call(lhs + k, rhs + k);
}
// Calculation of output using intel path - 'out_intel_kernel'
auto* cpu_spec =
generic_ElemWiseUpd<dgl::ElemWiseAddUpdate<ns_op::Div<IDX>>>();
if (cpu_spec) {
IDX out_intel_kernel[dim];
GenerateZeroData(out_intel_kernel, dim);
cpu_spec->run(out_intel_kernel, lhs, rhs, dim);
CheckResult(exp, out, out_intel_kernel, dim);
} else {
IDX* out_intel_kernel = nullptr;
CheckResult(exp, out, out_intel_kernel, dim);
}
}
}
TEST(SpmmTest, TestSpmmDiv) {
_TestSpmmDiv<float>();
_TestSpmmDiv<double>();
}
#endif
Subproject commit 0140eeff1fffcf5069dea3abb57095695320971c