Unverified commit 8ae50c42, authored by Hongzhi (Steve), Chen and committed by GitHub

[Misc] clang-format auto fix. (#4804)



* [Misc] clang-format auto fix.

* manual

* manual

* manual

* manual

* todo

* fix
Co-authored-by: Steve <ubuntu@ip-172-31-34-29.ap-northeast-1.compute.internal>
parent 81831111
......@@ -6,10 +6,11 @@
#include <dgl/array.h>
#include <dgl/array_iterator.h>
#include <dgl/runtime/parallel_for.h>
#include <dgl/random.h>
#include <utility>
#include <dgl/runtime/parallel_for.h>
#include <algorithm>
#include <utility>
using namespace dgl::runtime;
......@@ -19,15 +20,12 @@ namespace impl {
template <DGLDeviceType XPU, typename IdType>
std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
const CSRMatrix &csr,
int64_t num_samples,
int num_trials,
bool exclude_self_loops,
bool replace,
double redundancy) {
const CSRMatrix& csr, int64_t num_samples, int num_trials,
bool exclude_self_loops, bool replace, double redundancy) {
const int64_t num_row = csr.num_rows;
const int64_t num_col = csr.num_cols;
const int64_t num_actual_samples = static_cast<int64_t>(num_samples * (1 + redundancy));
const int64_t num_actual_samples =
static_cast<int64_t>(num_samples * (1 + redundancy));
IdArray row = Full<IdType>(-1, num_actual_samples, csr.indptr->ctx);
IdArray col = Full<IdType>(-1, num_actual_samples, csr.indptr->ctx);
IdType* row_data = row.Ptr<IdType>();
......@@ -48,23 +46,30 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
});
PairIterator<IdType> begin(row_data, col_data);
PairIterator<IdType> end = std::remove_if(begin, begin + num_actual_samples,
PairIterator<IdType> end = std::remove_if(
begin, begin + num_actual_samples,
[](const std::pair<IdType, IdType>& val) { return val.first == -1; });
if (!replace) {
std::sort(begin, end,
[](const std::pair<IdType, IdType>& a, const std::pair<IdType, IdType>& b) {
return a.first < b.first || (a.first == b.first && a.second < b.second);
});;
std::sort(
begin, end,
[](const std::pair<IdType, IdType>& a,
const std::pair<IdType, IdType>& b) {
return a.first < b.first ||
(a.first == b.first && a.second < b.second);
});
end = std::unique(begin, end);
}
int64_t num_sampled = std::min(static_cast<int64_t>(end - begin), num_samples);
return {row.CreateView({num_sampled}, row->dtype), col.CreateView({num_sampled}, col->dtype)};
int64_t num_sampled =
std::min(static_cast<int64_t>(end - begin), num_samples);
return {
row.CreateView({num_sampled}, row->dtype),
col.CreateView({num_sampled}, col->dtype)};
}
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<kDGLCPU, int32_t>(
const CSRMatrix&, int64_t, int, bool, bool, double);
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<kDGLCPU, int64_t>(
const CSRMatrix&, int64_t, int, bool, bool, double);
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<
kDGLCPU, int32_t>(const CSRMatrix&, int64_t, int, bool, bool, double);
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<
kDGLCPU, int64_t>(const CSRMatrix&, int64_t, int, bool, bool, double);
}; // namespace impl
}; // namespace aten
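CSRGlobalUniformNegativeSampling above oversamples candidate pairs by a redundancy factor, leaves failed slots at -1, filters those out, deduplicates with sort + unique when sampling without replacement, and finally truncates to num_samples. The following standalone sketch reproduces that oversample-filter-deduplicate strategy with plain std::vector and a caller-supplied edge-existence callback instead of DGL's CSRMatrix/IdArray types; all names are illustrative, not part of the DGL API.

#include <algorithm>
#include <cstdint>
#include <random>
#include <utility>
#include <vector>

// Caller-supplied adjacency test; stands in for consulting the CSRMatrix.
using EdgeExists = bool (*)(int64_t src, int64_t dst);

std::vector<std::pair<int64_t, int64_t>> UniformNegativeSample(
    int64_t num_rows, int64_t num_cols, int64_t num_samples, int num_trials,
    bool replace, double redundancy, EdgeExists edge_exists) {
  // Oversample so enough candidates survive filtering and deduplication.
  const int64_t num_actual =
      static_cast<int64_t>(num_samples * (1 + redundancy));
  std::vector<std::pair<int64_t, int64_t>> cand(
      num_actual, std::pair<int64_t, int64_t>(-1, -1));
  std::mt19937_64 rng(0);
  std::uniform_int_distribution<int64_t> row_dist(0, num_rows - 1);
  std::uniform_int_distribution<int64_t> col_dist(0, num_cols - 1);
  for (auto& p : cand) {
    for (int t = 0; t < num_trials; ++t) {
      const int64_t u = row_dist(rng), v = col_dist(rng);
      if (!edge_exists(u, v)) {  // keep the first non-edge hit
        p = {u, v};
        break;
      }
    }
  }
  // Drop slots that never found a negative pair (still marked -1).
  auto end = std::remove_if(
      cand.begin(), cand.end(),
      [](const std::pair<int64_t, int64_t>& p) { return p.first == -1; });
  if (!replace) {  // without replacement: sort + unique deduplicates pairs
    std::sort(cand.begin(), end);
    end = std::unique(cand.begin(), end);
  }
  cand.resize(std::min<int64_t>(end - cand.begin(), num_samples));
  return cand;
}

The redundancy parameter trades extra rejection-sampling work for a higher chance of still holding num_samples distinct pairs after filtering.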
......
......@@ -9,6 +9,7 @@
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <dgl/runtime/parallel_for.h>
#include "../selector.h"
namespace dgl {
......@@ -25,38 +26,41 @@ namespace cpu {
* \note it uses node parallel strategy, different threads are responsible
* for the computation of different nodes.
*/
template <typename IdType, typename DType, typename Op,
int LhsTarget = 0, int RhsTarget = 2>
void SDDMMCsr(const BcastOff& bcast,
const CSRMatrix& csr,
NDArray lhs, NDArray rhs, NDArray out) {
template <
typename IdType, typename DType, typename Op, int LhsTarget = 0,
int RhsTarget = 2>
void SDDMMCsr(
const BcastOff& bcast, const CSRMatrix& csr, NDArray lhs, NDArray rhs,
NDArray out) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = csr.indptr.Ptr<IdType>();
const IdType* indices = csr.indices.Ptr<IdType>();
const IdType* edges = csr.data.Ptr<IdType>();
const DType* X = lhs.Ptr<DType>();
const DType* Y = rhs.Ptr<DType>();
const int64_t dim = bcast.out_len,
lhs_dim = bcast.lhs_len,
rhs_dim = bcast.rhs_len,
reduce_size = bcast.reduce_size;
const int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len,
rhs_dim = bcast.rhs_len, reduce_size = bcast.reduce_size;
DType* O = out.Ptr<DType>();
runtime::parallel_for(0, csr.num_rows, [=](IdType b, IdType e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
for (IdType j = row_start; j < row_end; ++j) {
const IdType cid = indices[j];
const IdType eid = has_idx? edges[j] : j;
const IdType eid = has_idx ? edges[j] : j;
DType* out_off = O + eid * dim;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off = Op::use_lhs
? X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size
: nullptr;
const DType* rhs_off = Op::use_rhs
? Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size
: nullptr;
const DType* lhs_off =
Op::use_lhs
? X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim +
lhs_add * reduce_size
: nullptr;
const DType* rhs_off =
Op::use_rhs
? Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim +
rhs_add * reduce_size
: nullptr;
out_off[k] = Op::Call(lhs_off, rhs_off, reduce_size);
}
}
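The hunk above only reflows SDDMMCsr; for reference, the kernel computes, for every nonzero (rid, cid), Op applied to operands selected by LhsTarget/RhsTarget with optional broadcasting, and writes the result at the edge position. A reduced sketch that keeps only the node-parallel CSR traversal and hard-codes the dot-product case (illustrative names, no broadcasting, no Op template):

#include <cstdint>
#include <vector>

// Simplified SDDMM on a CSR graph: for every edge (rid, cid) it writes
// dot(X[rid], Y[cid]) into out[eid].  The Op functor, broadcasting and the
// lhs/rhs target selection of the real kernel are omitted.
void SddmmCsrDot(
    const std::vector<int64_t>& indptr,   // length num_rows + 1
    const std::vector<int64_t>& indices,  // destination id per edge
    const std::vector<float>& X,          // source features, num_rows x dim
    const std::vector<float>& Y,          // destination features, num_cols x dim
    int64_t dim, std::vector<float>* out) {
  const int64_t num_rows = static_cast<int64_t>(indptr.size()) - 1;
  out->assign(indices.size(), 0.f);
  // Node-parallel strategy: each row (node) is handled independently, and
  // every row writes to a disjoint range of edge ids.
  #pragma omp parallel for
  for (int64_t rid = 0; rid < num_rows; ++rid) {
    for (int64_t eid = indptr[rid]; eid < indptr[rid + 1]; ++eid) {
      const int64_t cid = indices[eid];
      float acc = 0.f;
      for (int64_t k = 0; k < dim; ++k)
        acc += X[rid * dim + k] * Y[cid * dim + k];
      (*out)[eid] = acc;
    }
  }
}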
......@@ -74,35 +78,38 @@ void SDDMMCsr(const BcastOff& bcast,
* \note it uses edge parallel strategy, different threads are responsible
* for the computation of different edges.
*/
template <typename IdType, typename DType, typename Op,
int LhsTarget = 0, int RhsTarget = 2>
void SDDMMCoo(const BcastOff& bcast,
const COOMatrix& coo,
NDArray lhs, NDArray rhs, NDArray out) {
template <
typename IdType, typename DType, typename Op, int LhsTarget = 0,
int RhsTarget = 2>
void SDDMMCoo(
const BcastOff& bcast, const COOMatrix& coo, NDArray lhs, NDArray rhs,
NDArray out) {
const bool has_idx = !IsNullArray(coo.data);
const IdType* row = coo.row.Ptr<IdType>();
const IdType* col = coo.col.Ptr<IdType>();
const IdType* edges = coo.data.Ptr<IdType>();
const DType* X = lhs.Ptr<DType>();
const DType* Y = rhs.Ptr<DType>();
const int64_t dim = bcast.out_len,
lhs_dim = bcast.lhs_len,
rhs_dim = bcast.rhs_len,
reduce_size = bcast.reduce_size;
const int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len,
rhs_dim = bcast.rhs_len, reduce_size = bcast.reduce_size;
DType* O = out.Ptr<DType>();
#pragma omp parallel for
for (int64_t i = 0; i < coo.row->shape[0]; ++i) {
const IdType rid = row[i];
const IdType cid = col[i];
const IdType eid = has_idx? edges[i] : i;
const IdType eid = has_idx ? edges[i] : i;
DType* out_off = O + eid * dim;
for (int64_t k = 0; k < dim; ++k) {
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off = Op::use_lhs ?
X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim + lhs_add * reduce_size : nullptr;
const DType* rhs_off = Op::use_rhs ?
Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim + rhs_add * reduce_size : nullptr;
const DType* lhs_off =
Op::use_lhs ? X + Selector<LhsTarget>::Call(rid, eid, cid) * lhs_dim +
lhs_add * reduce_size
: nullptr;
const DType* rhs_off =
Op::use_rhs ? Y + Selector<RhsTarget>::Call(rid, eid, cid) * rhs_dim +
rhs_add * reduce_size
: nullptr;
out_off[k] = Op::Call(lhs_off, rhs_off, bcast.reduce_size);
}
}
......@@ -110,12 +117,13 @@ void SDDMMCoo(const BcastOff& bcast,
namespace op {
//////////////////////////////// binary operators on CPU ////////////////////////////////
////////////////////////// binary operators on CPU /////////////////////////////
template <typename DType>
struct Add {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
return *lhs_off + *rhs_off;
}
};
......@@ -124,7 +132,8 @@ template <typename DType>
struct Sub {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
return *lhs_off - *rhs_off;
}
};
......@@ -133,7 +142,8 @@ template <typename DType>
struct Mul {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
return *lhs_off * *rhs_off;
}
};
......@@ -142,7 +152,8 @@ template <typename DType>
struct Div {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
return *lhs_off / *rhs_off;
}
};
......@@ -151,7 +162,8 @@ template <typename DType>
struct CopyLhs {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = false;
inline static DType Call(const DType* lhs_off, const DType*, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType*, int64_t len = 1) {
return *lhs_off;
}
};
......@@ -160,7 +172,8 @@ template <typename DType>
struct CopyRhs {
static constexpr bool use_lhs = false;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* , const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType*, const DType* rhs_off, int64_t len = 1) {
return *rhs_off;
}
};
......@@ -169,7 +182,8 @@ template <typename DType>
struct Dot {
static constexpr bool use_lhs = true;
static constexpr bool use_rhs = true;
inline static DType Call(const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
inline static DType Call(
const DType* lhs_off, const DType* rhs_off, int64_t len = 1) {
DType rst = 0;
for (int64_t l = 0; l < len; ++l) {
rst += lhs_off[l] * rhs_off[l];
......@@ -178,32 +192,32 @@ struct Dot {
}
};
#define SWITCH_OP(op, Op, ...) \
do { \
if ((op) == "add") { \
typedef dgl::aten::cpu::op::Add<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "sub") { \
typedef dgl::aten::cpu::op::Sub<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "mul") { \
typedef dgl::aten::cpu::op::Mul<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "div") { \
typedef dgl::aten::cpu::op::Div<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_lhs") { \
typedef dgl::aten::cpu::op::CopyLhs<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_rhs") { \
typedef dgl::aten::cpu::op::CopyRhs<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "dot") { \
typedef dgl::aten::cpu::op::Dot<DType> Op; \
{ __VA_ARGS__ } \
} else { \
LOG(FATAL) << "Unsupported SDDMM binary operator: " << op; \
} \
#define SWITCH_OP(op, Op, ...) \
do { \
if ((op) == "add") { \
typedef dgl::aten::cpu::op::Add<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "sub") { \
typedef dgl::aten::cpu::op::Sub<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "mul") { \
typedef dgl::aten::cpu::op::Mul<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "div") { \
typedef dgl::aten::cpu::op::Div<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_lhs") { \
typedef dgl::aten::cpu::op::CopyLhs<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "copy_rhs") { \
typedef dgl::aten::cpu::op::CopyRhs<DType> Op; \
{ __VA_ARGS__ } \
} else if ((op) == "dot") { \
typedef dgl::aten::cpu::op::Dot<DType> Op; \
{ __VA_ARGS__ } \
} else { \
LOG(FATAL) << "Unsupported SDDMM binary operator: " << op; \
} \
} while (0)
} // namespace op
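SWITCH_OP binds a runtime operator name to a concrete functor type Op and runs the body with that typedef in scope. A sketch of how such a dispatch is typically wired to the kernels above; the helper below is hypothetical, assumed to sit inside dgl::aten::cpu next to the definitions it uses, and is not the exact DGL call site.

// Hypothetical dispatch helper: SWITCH_OP selects a functor type for the
// given operator string, which then becomes the Op template argument.
template <typename IdType, typename DType>
void SDDMMCsrDispatch(
    const std::string& op, const BcastOff& bcast, const CSRMatrix& csr,
    NDArray lhs, NDArray rhs, NDArray out) {
  SWITCH_OP(op, Op, {
    // Here Op is e.g. dgl::aten::cpu::op::Add<DType> or op::Dot<DType>.
    SDDMMCsr<IdType, DType, Op>(bcast, csr, lhs, rhs, out);
  });
}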
......
......@@ -7,10 +7,11 @@
#define DGL_ARRAY_CPU_SEGMENT_REDUCE_H_
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <dgl/base_heterograph.h>
#include <vector>
#include <dgl/runtime/parallel_for.h>
#include <string>
#include <vector>
namespace dgl {
namespace aten {
......@@ -26,11 +27,10 @@ template <typename IdType, typename DType>
void SegmentSum(NDArray feat, NDArray offsets, NDArray out) {
int n = out->shape[0];
int dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
const DType* feat_data = feat.Ptr<DType>();
const IdType* offsets_data = offsets.Ptr<IdType>();
DType *out_data = out.Ptr<DType>();
DType* out_data = out.Ptr<DType>();
runtime::parallel_for(0, n, [=](int b, int e) {
for (auto i = b; i < e; ++i) {
for (IdType j = offsets_data[i]; j < offsets_data[i + 1]; ++j) {
......@@ -51,16 +51,14 @@ void SegmentSum(NDArray feat, NDArray offsets, NDArray out) {
* used in backward phase.
*/
template <typename IdType, typename DType, typename Cmp>
void SegmentCmp(NDArray feat, NDArray offsets,
NDArray out, NDArray arg) {
void SegmentCmp(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
int n = out->shape[0];
int dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
const DType* feat_data = feat.Ptr<DType>();
const IdType* offsets_data = offsets.Ptr<IdType>();
DType *out_data = out.Ptr<DType>();
IdType *arg_data = arg.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
IdType* arg_data = arg.Ptr<IdType>();
std::fill(out_data, out_data + out.NumElements(), Cmp::zero);
std::fill(arg_data, arg_data + arg.NumElements(), -1);
runtime::parallel_for(0, n, [=](int b, int e) {
......@@ -89,8 +87,7 @@ template <typename IdType, typename DType>
void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
int n = feat->shape[0];
int dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
const DType* feat_data = feat.Ptr<DType>();
const IdType* idx_data = idx.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
......@@ -114,24 +111,26 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
* \param list_out List of the output tensors.
*/
template <typename IdType, typename DType>
void UpdateGradMinMax_hetero(HeteroGraphPtr graph,
const std::string& op,
const std::vector<NDArray>& list_feat,
const std::vector<NDArray>& list_idx,
const std::vector<NDArray>& list_idx_types,
std::vector<NDArray>* list_out) {
void UpdateGradMinMax_hetero(
HeteroGraphPtr graph, const std::string& op,
const std::vector<NDArray>& list_feat, const std::vector<NDArray>& list_idx,
const std::vector<NDArray>& list_idx_types,
std::vector<NDArray>* list_out) {
if (op == "copy_lhs" || op == "copy_rhs") {
std::vector<std::vector<dgl_id_t>> src_dst_ntypes(graph->NumVertexTypes(),
std::vector<dgl_id_t>());
std::vector<std::vector<dgl_id_t>> src_dst_ntypes(
graph->NumVertexTypes(), std::vector<dgl_id_t>());
for (dgl_type_t etype = 0; etype < graph->NumEdgeTypes(); ++etype) {
auto pair = graph->meta_graph()->FindEdge(etype);
const dgl_id_t dst_ntype = pair.first; // graph is reversed
const dgl_id_t src_ntype = pair.second;
auto same_src_dst_ntype = std::find(std::begin(src_dst_ntypes[dst_ntype]),
std::end(src_dst_ntypes[dst_ntype]), src_ntype);
// if op is "copy_lhs", relation type with same src and dst node type will be updated once
if (op == "copy_lhs" && same_src_dst_ntype != std::end(src_dst_ntypes[dst_ntype]))
auto same_src_dst_ntype = std::find(
std::begin(src_dst_ntypes[dst_ntype]),
std::end(src_dst_ntypes[dst_ntype]), src_ntype);
// if op is "copy_lhs", relation type with same src and dst node type will
// be updated once
if (op == "copy_lhs" &&
same_src_dst_ntype != std::end(src_dst_ntypes[dst_ntype]))
continue;
src_dst_ntypes[dst_ntype].push_back(src_ntype);
const DType* feat_data = list_feat[dst_ntype].Ptr<DType>();
......@@ -149,7 +148,8 @@ void UpdateGradMinMax_hetero(HeteroGraphPtr graph,
if (type == idx_type_data[i * dim + k]) {
const int write_row = idx_data[i * dim + k];
#pragma omp atomic
out_data[write_row * dim + k] += feat_data[i * dim + k]; // feat = dZ
out_data[write_row * dim + k] +=
feat_data[i * dim + k]; // feat = dZ
}
}
}
......@@ -170,8 +170,7 @@ template <typename IdType, typename DType>
void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
int n = feat->shape[0];
int dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
const DType* feat_data = feat.Ptr<DType>();
const IdType* arg_data = arg.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
......
......@@ -3,13 +3,15 @@
* \file array/cpu/spmat_op_impl.cc
* \brief CPU implementation of COO sparse matrix operators
*/
#include <dmlc/omp.h>
#include <dgl/runtime/parallel_for.h>
#include <vector>
#include <unordered_set>
#include <unordered_map>
#include <tuple>
#include <dmlc/omp.h>
#include <numeric>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "array_utils.h"
namespace dgl {
......@@ -33,11 +35,10 @@ template <DGLDeviceType XPU, typename IdType>
bool COOIsNonZero(COOMatrix coo, int64_t row, int64_t col) {
CHECK(row >= 0 && row < coo.num_rows) << "Invalid row index: " << row;
CHECK(col >= 0 && col < coo.num_cols) << "Invalid col index: " << col;
const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
const IdType *coo_row_data = static_cast<IdType *>(coo.row->data);
const IdType *coo_col_data = static_cast<IdType *>(coo.col->data);
for (int64_t i = 0; i < coo.row->shape[0]; ++i) {
if (coo_row_data[i] == row && coo_col_data[i] == col)
return true;
if (coo_row_data[i] == row && coo_col_data[i] == col) return true;
}
return false;
}
......@@ -51,9 +52,9 @@ NDArray COOIsNonZero(COOMatrix coo, NDArray row, NDArray col) {
const auto collen = col->shape[0];
const auto rstlen = std::max(rowlen, collen);
NDArray rst = NDArray::Empty({rstlen}, row->dtype, row->ctx);
IdType* rst_data = static_cast<IdType*>(rst->data);
const IdType* row_data = static_cast<IdType*>(row->data);
const IdType* col_data = static_cast<IdType*>(col->data);
IdType *rst_data = static_cast<IdType *>(rst->data);
const IdType *row_data = static_cast<IdType *>(row->data);
const IdType *col_data = static_cast<IdType *>(col->data);
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
const int64_t kmax = std::max(rowlen, collen);
......@@ -61,7 +62,8 @@ NDArray COOIsNonZero(COOMatrix coo, NDArray row, NDArray col) {
for (auto k = b; k < e; ++k) {
int64_t i = row_stride * k;
int64_t j = col_stride * k;
rst_data[k] = COOIsNonZero<XPU, IdType>(coo, row_data[i], col_data[j])? 1 : 0;
rst_data[k] =
COOIsNonZero<XPU, IdType>(coo, row_data[i], col_data[j]) ? 1 : 0;
}
});
return rst;
......@@ -75,11 +77,11 @@ template NDArray COOIsNonZero<kDGLCPU, int64_t>(COOMatrix, NDArray, NDArray);
template <DGLDeviceType XPU, typename IdType>
bool COOHasDuplicate(COOMatrix coo) {
std::unordered_set<std::pair<IdType, IdType>, PairHash> hashmap;
const IdType* src_data = static_cast<IdType*>(coo.row->data);
const IdType* dst_data = static_cast<IdType*>(coo.col->data);
const IdType *src_data = static_cast<IdType *>(coo.row->data);
const IdType *dst_data = static_cast<IdType *>(coo.col->data);
const auto nnz = coo.row->shape[0];
for (IdType eid = 0; eid < nnz; ++eid) {
const auto& p = std::make_pair(src_data[eid], dst_data[eid]);
const auto &p = std::make_pair(src_data[eid], dst_data[eid]);
if (hashmap.count(p)) {
return true;
} else {
......@@ -97,11 +99,10 @@ template bool COOHasDuplicate<kDGLCPU, int64_t>(COOMatrix coo);
template <DGLDeviceType XPU, typename IdType>
int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
CHECK(row >= 0 && row < coo.num_rows) << "Invalid row index: " << row;
const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
const IdType *coo_row_data = static_cast<IdType *>(coo.row->data);
int64_t result = 0;
for (int64_t i = 0; i < coo.row->shape[0]; ++i) {
if (coo_row_data[i] == row)
++result;
if (coo_row_data[i] == row) ++result;
}
return result;
}
......@@ -113,9 +114,9 @@ template <DGLDeviceType XPU, typename IdType>
NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
CHECK_SAME_DTYPE(coo.col, rows);
const auto len = rows->shape[0];
const IdType* vid_data = static_cast<IdType*>(rows->data);
const IdType *vid_data = static_cast<IdType *>(rows->data);
NDArray rst = NDArray::Empty({len}, rows->dtype, rows->ctx);
IdType* rst_data = static_cast<IdType*>(rst->data);
IdType *rst_data = static_cast<IdType *>(rst->data);
#pragma omp parallel for
for (int64_t i = 0; i < len; ++i) {
rst_data[i] = COOGetRowNNZ<XPU, IdType>(coo, vid_data[i]);
......@@ -126,16 +127,17 @@ NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
template NDArray COOGetRowNNZ<kDGLCPU, int32_t>(COOMatrix, NDArray);
template NDArray COOGetRowNNZ<kDGLCPU, int64_t>(COOMatrix, NDArray);
///////////////////////////// COOGetRowDataAndIndices /////////////////////////////
////////////////////////// COOGetRowDataAndIndices /////////////////////////////
template <DGLDeviceType XPU, typename IdType>
std::pair<NDArray, NDArray> COOGetRowDataAndIndices(
COOMatrix coo, int64_t row) {
CHECK(row >= 0 && row < coo.num_rows) << "Invalid row index: " << row;
const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
const IdType* coo_data = COOHasData(coo) ? static_cast<IdType*>(coo.data->data) : nullptr;
const IdType *coo_row_data = static_cast<IdType *>(coo.row->data);
const IdType *coo_col_data = static_cast<IdType *>(coo.col->data);
const IdType *coo_data =
COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;
std::vector<IdType> indices;
std::vector<IdType> data;
......@@ -147,13 +149,14 @@ std::pair<NDArray, NDArray> COOGetRowDataAndIndices(
}
}
return std::make_pair(NDArray::FromVector(data), NDArray::FromVector(indices));
return std::make_pair(
NDArray::FromVector(data), NDArray::FromVector(indices));
}
template std::pair<NDArray, NDArray>
COOGetRowDataAndIndices<kDGLCPU, int32_t>(COOMatrix, int64_t);
template std::pair<NDArray, NDArray>
COOGetRowDataAndIndices<kDGLCPU, int64_t>(COOMatrix, int64_t);
template std::pair<NDArray, NDArray> COOGetRowDataAndIndices<kDGLCPU, int32_t>(
COOMatrix, int64_t);
template std::pair<NDArray, NDArray> COOGetRowDataAndIndices<kDGLCPU, int64_t>(
COOMatrix, int64_t);
///////////////////////////// COOGetData /////////////////////////////
......@@ -162,34 +165,35 @@ IdArray COOGetData(COOMatrix coo, IdArray rows, IdArray cols) {
const int64_t rowlen = rows->shape[0];
const int64_t collen = cols->shape[0];
CHECK((rowlen == collen) || (rowlen == 1) || (collen == 1))
<< "Invalid row and col Id array:" << rows << " " << cols;
<< "Invalid row and col Id array:" << rows << " " << cols;
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
const IdType* row_data = rows.Ptr<IdType>();
const IdType* col_data = cols.Ptr<IdType>();
const IdType *row_data = rows.Ptr<IdType>();
const IdType *col_data = cols.Ptr<IdType>();
const IdType* coo_row = coo.row.Ptr<IdType>();
const IdType* coo_col = coo.col.Ptr<IdType>();
const IdType* data = COOHasData(coo) ? coo.data.Ptr<IdType>() : nullptr;
const IdType *coo_row = coo.row.Ptr<IdType>();
const IdType *coo_col = coo.col.Ptr<IdType>();
const IdType *data = COOHasData(coo) ? coo.data.Ptr<IdType>() : nullptr;
const int64_t nnz = coo.row->shape[0];
const int64_t retlen = std::max(rowlen, collen);
IdArray ret = Full(-1, retlen, rows->dtype.bits, rows->ctx);
IdType* ret_data = ret.Ptr<IdType>();
IdType *ret_data = ret.Ptr<IdType>();
// TODO(minjie): We might need to consider sorting the COO beforehand especially
// when the number of (row, col) pairs is large. Need more benchmarks to justify
// the choice.
// TODO(minjie): We might need to consider sorting the COO beforehand
// especially when the number of (row, col) pairs is large. Need more
// benchmarks to justify the choice.
if (coo.row_sorted) {
parallel_for(0, retlen, [&](size_t b, size_t e) {
for (auto p = b; p < e; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
const IdType row_id = row_data[p * row_stride],
col_id = col_data[p * col_stride];
auto it = std::lower_bound(coo_row, coo_row + nnz, row_id);
for (; it < coo_row + nnz && *it == row_id; ++it) {
const auto idx = it - coo_row;
if (coo_col[idx] == col_id) {
ret_data[p] = data? data[idx] : idx;
ret_data[p] = data ? data[idx] : idx;
break;
}
}
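When coo.row_sorted holds, the branch above binary-searches the row array with std::lower_bound and then scans the contiguous run of that row, instead of scanning all nnz entries per query. A reduced sketch of that lookup (hypothetical helper, no coo.data indirection, edge id taken as the position in the arrays):

#include <algorithm>
#include <cstdint>
#include <vector>

// Look up the edge id of (row_id, col_id) in a row-sorted COO: binary-search
// the first occurrence of row_id, then scan that row's contiguous run for a
// matching column.  Returns -1 when the pair is absent.
int64_t SortedCooLookup(
    const std::vector<int64_t>& coo_row, const std::vector<int64_t>& coo_col,
    int64_t row_id, int64_t col_id) {
  auto it = std::lower_bound(coo_row.begin(), coo_row.end(), row_id);
  for (; it != coo_row.end() && *it == row_id; ++it) {
    const int64_t idx = it - coo_row.begin();
    if (coo_col[idx] == col_id) return idx;
  }
  return -1;
}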
......@@ -198,10 +202,11 @@ IdArray COOGetData(COOMatrix coo, IdArray rows, IdArray cols) {
} else {
#pragma omp parallel for
for (int64_t p = 0; p < retlen; ++p) {
const IdType row_id = row_data[p * row_stride], col_id = col_data[p * col_stride];
const IdType row_id = row_data[p * row_stride],
col_id = col_data[p * col_stride];
for (int64_t idx = 0; idx < nnz; ++idx) {
if (coo_row[idx] == row_id && coo_col[idx] == col_id) {
ret_data[p] = data? data[idx] : idx;
ret_data[p] = data ? data[idx] : idx;
break;
}
}
......@@ -217,8 +222,8 @@ template IdArray COOGetData<kDGLCPU, int64_t>(COOMatrix, IdArray, IdArray);
///////////////////////////// COOGetDataAndIndices /////////////////////////////
template <DGLDeviceType XPU, typename IdType>
std::vector<NDArray> COOGetDataAndIndices(COOMatrix coo, NDArray rows,
NDArray cols) {
std::vector<NDArray> COOGetDataAndIndices(
COOMatrix coo, NDArray rows, NDArray cols) {
CHECK_SAME_DTYPE(coo.col, rows);
CHECK_SAME_DTYPE(coo.col, cols);
const int64_t rowlen = rows->shape[0];
......@@ -226,16 +231,17 @@ std::vector<NDArray> COOGetDataAndIndices(COOMatrix coo, NDArray rows,
const int64_t len = std::max(rowlen, collen);
CHECK((rowlen == collen) || (rowlen == 1) || (collen == 1))
<< "Invalid row and col id array.";
<< "Invalid row and col id array.";
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
const IdType* row_data = static_cast<IdType*>(rows->data);
const IdType* col_data = static_cast<IdType*>(cols->data);
const IdType *row_data = static_cast<IdType *>(rows->data);
const IdType *col_data = static_cast<IdType *>(cols->data);
const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
const IdType* data = COOHasData(coo) ? static_cast<IdType*>(coo.data->data) : nullptr;
const IdType *coo_row_data = static_cast<IdType *>(coo.row->data);
const IdType *coo_col_data = static_cast<IdType *>(coo.col->data);
const IdType *data =
COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;
std::vector<IdType> ret_rows, ret_cols;
std::vector<IdType> ret_data;
......@@ -244,21 +250,27 @@ std::vector<NDArray> COOGetDataAndIndices(COOMatrix coo, NDArray rows,
ret_data.reserve(len);
// NOTE(BarclayII): With a small number of lookups, linear scan is faster.
// The threshold 200 comes from benchmarking both algorithms on a P3.8x instance.
// I also tried sorting plus binary search. The speed gain is only significant for
// medium-sized graphs and lookups, so I didn't include it.
// The threshold 200 comes from benchmarking both algorithms on a P3.8x
// instance. I also tried sorting plus binary search. The speed gain is only
// significant for medium-sized graphs and lookups, so I didn't include it.
if (len >= 200) {
// TODO(BarclayII) Ideally we would want to cache this object. However I'm not sure
// what is the best way to do so since this object is valid for CPU only.
std::unordered_multimap<std::pair<IdType, IdType>, IdType, PairHash> pair_map;
// TODO(BarclayII) Ideally we would want to cache this object. However I'm
// not sure what is the best way to do so since this object is valid for CPU
// only.
std::unordered_multimap<std::pair<IdType, IdType>, IdType, PairHash>
pair_map;
pair_map.reserve(coo.row->shape[0]);
for (int64_t k = 0; k < coo.row->shape[0]; ++k)
pair_map.emplace(std::make_pair(coo_row_data[k], coo_col_data[k]), data ? data[k]: k);
pair_map.emplace(
std::make_pair(coo_row_data[k], coo_col_data[k]), data ? data[k] : k);
for (int64_t i = 0, j = 0; i < rowlen && j < collen; i += row_stride, j += col_stride) {
for (int64_t i = 0, j = 0; i < rowlen && j < collen;
i += row_stride, j += col_stride) {
const IdType row_id = row_data[i], col_id = col_data[j];
CHECK(row_id >= 0 && row_id < coo.num_rows) << "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < coo.num_cols) << "Invalid col index: " << col_id;
CHECK(row_id >= 0 && row_id < coo.num_rows)
<< "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < coo.num_cols)
<< "Invalid col index: " << col_id;
auto range = pair_map.equal_range({row_id, col_id});
for (auto it = range.first; it != range.second; ++it) {
ret_rows.push_back(row_id);
......@@ -267,10 +279,13 @@ std::vector<NDArray> COOGetDataAndIndices(COOMatrix coo, NDArray rows,
}
}
} else {
for (int64_t i = 0, j = 0; i < rowlen && j < collen; i += row_stride, j += col_stride) {
for (int64_t i = 0, j = 0; i < rowlen && j < collen;
i += row_stride, j += col_stride) {
const IdType row_id = row_data[i], col_id = col_data[j];
CHECK(row_id >= 0 && row_id < coo.num_rows) << "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < coo.num_cols) << "Invalid col index: " << col_id;
CHECK(row_id >= 0 && row_id < coo.num_rows)
<< "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < coo.num_cols)
<< "Invalid col index: " << col_id;
for (int64_t k = 0; k < coo.row->shape[0]; ++k) {
if (coo_row_data[k] == row_id && coo_col_data[k] == col_id) {
ret_rows.push_back(row_id);
......@@ -281,9 +296,9 @@ std::vector<NDArray> COOGetDataAndIndices(COOMatrix coo, NDArray rows,
}
}
return {NDArray::FromVector(ret_rows),
NDArray::FromVector(ret_cols),
NDArray::FromVector(ret_data)};
return {
NDArray::FromVector(ret_rows), NDArray::FromVector(ret_cols),
NDArray::FromVector(ret_data)};
}
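Per the NOTE above, COOGetDataAndIndices switches strategy by query count: below roughly 200 lookups it linearly scans the nnz entries for each query, above that it first builds a hash index keyed on (row, col) pairs. A sketch of the index-building half, with a hand-rolled pair hash standing in for DGL's PairHash (illustrative names):

#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <utility>
#include <vector>

// Hash functor for (row, col) keys; stands in for DGL's PairHash.
struct EdgeKeyHash {
  std::size_t operator()(const std::pair<int64_t, int64_t>& p) const {
    return std::hash<int64_t>()(p.first) * 1000003u ^
           std::hash<int64_t>()(p.second);
  }
};

// Index every (row, col) -> edge id once so that each of the many subsequent
// queries becomes an expected O(1) equal_range lookup.  For only a handful of
// queries, the per-query linear scan wins because it skips this build cost.
std::unordered_multimap<std::pair<int64_t, int64_t>, int64_t, EdgeKeyHash>
BuildEdgeIndex(
    const std::vector<int64_t>& row, const std::vector<int64_t>& col) {
  std::unordered_multimap<std::pair<int64_t, int64_t>, int64_t, EdgeKeyHash> m;
  m.reserve(row.size());
  for (std::size_t k = 0; k < row.size(); ++k)
    m.emplace(std::make_pair(row[k], col[k]), static_cast<int64_t>(k));
  return m;
}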
template std::vector<NDArray> COOGetDataAndIndices<kDGLCPU, int32_t>(
......@@ -304,7 +319,8 @@ template COOMatrix COOTranspose<kDGLCPU, int64_t>(COOMatrix coo);
///////////////////////////// COOToCSR /////////////////////////////
namespace {
template <class IdType> CSRMatrix SortedCOOToCSR(const COOMatrix &coo) {
template <class IdType>
CSRMatrix SortedCOOToCSR(const COOMatrix &coo) {
const int64_t N = coo.num_rows;
const int64_t NNZ = coo.row->shape[0];
const IdType *const row_data = static_cast<IdType *>(coo.row->data);
......@@ -389,11 +405,13 @@ template <class IdType> CSRMatrix SortedCOOToCSR(const COOMatrix &coo) {
std::fill(Bp, Bp + N + 1, 0);
}
return CSRMatrix(coo.num_rows, coo.num_cols, ret_indptr, ret_indices,
ret_data, coo.col_sorted);
return CSRMatrix(
coo.num_rows, coo.num_cols, ret_indptr, ret_indices, ret_data,
coo.col_sorted);
}
template <class IdType> CSRMatrix UnSortedSparseCOOToCSR(const COOMatrix &coo) {
template <class IdType>
CSRMatrix UnSortedSparseCOOToCSR(const COOMatrix &coo) {
const int64_t N = coo.num_rows;
const int64_t NNZ = coo.row->shape[0];
const IdType *const row_data = static_cast<IdType *>(coo.row->data);
......@@ -507,11 +525,13 @@ template <class IdType> CSRMatrix UnSortedSparseCOOToCSR(const COOMatrix &coo) {
Bp[i + 1] += i_start;
}
}
return CSRMatrix(coo.num_rows, coo.num_cols, ret_indptr, ret_indices,
ret_data, coo.col_sorted);
return CSRMatrix(
coo.num_rows, coo.num_cols, ret_indptr, ret_indices, ret_data,
coo.col_sorted);
}
template <class IdType> CSRMatrix UnSortedDenseCOOToCSR(const COOMatrix &coo) {
template <class IdType>
CSRMatrix UnSortedDenseCOOToCSR(const COOMatrix &coo) {
const int64_t N = coo.num_rows;
const int64_t NNZ = coo.row->shape[0];
const IdType *const row_data = static_cast<IdType *>(coo.row->data);
......@@ -597,8 +617,9 @@ template <class IdType> CSRMatrix UnSortedDenseCOOToCSR(const COOMatrix &coo) {
}
CHECK_EQ(Bp[N], NNZ);
return CSRMatrix(coo.num_rows, coo.num_cols, ret_indptr, ret_indices,
ret_data, coo.col_sorted);
return CSRMatrix(
coo.num_rows, coo.num_cols, ret_indptr, ret_indices, ret_data,
coo.col_sorted);
}
} // namespace
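The three conversions above (sorted, unsorted-sparse, unsorted-dense) all reduce to building a CSR indptr from the COO row ids and then placing column ids and edge data. A minimal illustration of the indptr construction by per-row counting plus a prefix sum; for a row-sorted COO the col array can then be reused verbatim as the CSR indices, which is what makes the sorted path cheap. This helper is illustrative, not the DGL implementation.

#include <cstdint>
#include <vector>

// Build a CSR indptr from COO row ids: histogram the rows, then prefix-sum.
std::vector<int64_t> CooRowsToCsrIndptr(
    const std::vector<int64_t>& coo_row, int64_t num_rows) {
  std::vector<int64_t> indptr(num_rows + 1, 0);
  for (int64_t r : coo_row) ++indptr[r + 1];  // count entries per row
  for (int64_t i = 0; i < num_rows; ++i)      // prefix sum -> row offsets
    indptr[i + 1] += indptr[i];
  return indptr;
}

For coo_row = {0, 0, 2} and num_rows = 3 this yields indptr = {0, 2, 2, 3}.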
......@@ -643,9 +664,10 @@ COOMatrix COOSliceRows(COOMatrix coo, int64_t start, int64_t end) {
CHECK(start >= 0 && start < coo.num_rows) << "Invalid start row " << start;
CHECK(end > 0 && end <= coo.num_rows) << "Invalid end row " << end;
const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
const IdType* coo_data = COOHasData(coo) ? static_cast<IdType*>(coo.data->data) : nullptr;
const IdType *coo_row_data = static_cast<IdType *>(coo.row->data);
const IdType *coo_col_data = static_cast<IdType *>(coo.col->data);
const IdType *coo_data =
COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;
std::vector<IdType> ret_row, ret_col;
std::vector<IdType> ret_data;
......@@ -660,13 +682,9 @@ COOMatrix COOSliceRows(COOMatrix coo, int64_t start, int64_t end) {
}
}
return COOMatrix(
end - start,
coo.num_cols,
NDArray::FromVector(ret_row),
NDArray::FromVector(ret_col),
NDArray::FromVector(ret_data),
coo.row_sorted,
coo.col_sorted);
end - start, coo.num_cols, NDArray::FromVector(ret_row),
NDArray::FromVector(ret_col), NDArray::FromVector(ret_data),
coo.row_sorted, coo.col_sorted);
}
template COOMatrix COOSliceRows<kDGLCPU, int32_t>(COOMatrix, int64_t, int64_t);
......@@ -674,9 +692,10 @@ template COOMatrix COOSliceRows<kDGLCPU, int64_t>(COOMatrix, int64_t, int64_t);
template <DGLDeviceType XPU, typename IdType>
COOMatrix COOSliceRows(COOMatrix coo, NDArray rows) {
const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
const IdType* coo_data = COOHasData(coo) ? static_cast<IdType*>(coo.data->data) : nullptr;
const IdType *coo_row_data = static_cast<IdType *>(coo.row->data);
const IdType *coo_col_data = static_cast<IdType *>(coo.col->data);
const IdType *coo_data =
COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;
std::vector<IdType> ret_row, ret_col;
std::vector<IdType> ret_data;
......@@ -695,24 +714,27 @@ COOMatrix COOSliceRows(COOMatrix coo, NDArray rows) {
}
return COOMatrix{
rows->shape[0],
coo.num_cols,
NDArray::FromVector(ret_row),
NDArray::FromVector(ret_col),
NDArray::FromVector(ret_data),
coo.row_sorted, coo.col_sorted};
rows->shape[0],
coo.num_cols,
NDArray::FromVector(ret_row),
NDArray::FromVector(ret_col),
NDArray::FromVector(ret_data),
coo.row_sorted,
coo.col_sorted};
}
template COOMatrix COOSliceRows<kDGLCPU, int32_t>(COOMatrix , NDArray);
template COOMatrix COOSliceRows<kDGLCPU, int64_t>(COOMatrix , NDArray);
template COOMatrix COOSliceRows<kDGLCPU, int32_t>(COOMatrix, NDArray);
template COOMatrix COOSliceRows<kDGLCPU, int64_t>(COOMatrix, NDArray);
///////////////////////////// COOSliceMatrix /////////////////////////////
template <DGLDeviceType XPU, typename IdType>
COOMatrix COOSliceMatrix(COOMatrix coo, runtime::NDArray rows, runtime::NDArray cols) {
const IdType* coo_row_data = static_cast<IdType*>(coo.row->data);
const IdType* coo_col_data = static_cast<IdType*>(coo.col->data);
const IdType* coo_data = COOHasData(coo) ? static_cast<IdType*>(coo.data->data) : nullptr;
COOMatrix COOSliceMatrix(
COOMatrix coo, runtime::NDArray rows, runtime::NDArray cols) {
const IdType *coo_row_data = static_cast<IdType *>(coo.row->data);
const IdType *coo_col_data = static_cast<IdType *>(coo.col->data);
const IdType *coo_data =
COOHasData(coo) ? static_cast<IdType *>(coo.data->data) : nullptr;
IdHashMap<IdType> row_map(rows), col_map(cols);
......@@ -733,11 +755,10 @@ COOMatrix COOSliceMatrix(COOMatrix coo, runtime::NDArray rows, runtime::NDArray
}
}
return COOMatrix(rows->shape[0], cols->shape[0],
NDArray::FromVector(ret_row),
NDArray::FromVector(ret_col),
NDArray::FromVector(ret_data),
coo.row_sorted, coo.col_sorted);
return COOMatrix(
rows->shape[0], cols->shape[0], NDArray::FromVector(ret_row),
NDArray::FromVector(ret_col), NDArray::FromVector(ret_data),
coo.row_sorted, coo.col_sorted);
}
template COOMatrix COOSliceMatrix<kDGLCPU, int32_t>(
......@@ -745,36 +766,38 @@ template COOMatrix COOSliceMatrix<kDGLCPU, int32_t>(
template COOMatrix COOSliceMatrix<kDGLCPU, int64_t>(
COOMatrix coo, runtime::NDArray rows, runtime::NDArray cols);
///////////////////////////// COOReorder /////////////////////////////
template <DGLDeviceType XPU, typename IdType>
COOMatrix COOReorder(COOMatrix coo, runtime::NDArray new_row_id_arr,
runtime::NDArray new_col_id_arr) {
COOMatrix COOReorder(
COOMatrix coo, runtime::NDArray new_row_id_arr,
runtime::NDArray new_col_id_arr) {
CHECK_SAME_DTYPE(coo.row, new_row_id_arr);
CHECK_SAME_DTYPE(coo.col, new_col_id_arr);
// Input COO
const IdType* in_rows = static_cast<IdType*>(coo.row->data);
const IdType* in_cols = static_cast<IdType*>(coo.col->data);
const IdType *in_rows = static_cast<IdType *>(coo.row->data);
const IdType *in_cols = static_cast<IdType *>(coo.col->data);
int64_t num_rows = coo.num_rows;
int64_t num_cols = coo.num_cols;
int64_t nnz = coo.row->shape[0];
CHECK_EQ(num_rows, new_row_id_arr->shape[0])
<< "The new row Id array needs to be the same as the number of rows of COO";
<< "The new row Id array needs to be the same as the number of rows of "
"COO";
CHECK_EQ(num_cols, new_col_id_arr->shape[0])
<< "The new col Id array needs to be the same as the number of cols of COO";
<< "The new col Id array needs to be the same as the number of cols of "
"COO";
// New row/col Ids.
const IdType* new_row_ids = static_cast<IdType*>(new_row_id_arr->data);
const IdType* new_col_ids = static_cast<IdType*>(new_col_id_arr->data);
const IdType *new_row_ids = static_cast<IdType *>(new_row_id_arr->data);
const IdType *new_col_ids = static_cast<IdType *>(new_col_id_arr->data);
// Output COO
NDArray out_row_arr = NDArray::Empty({nnz}, coo.row->dtype, coo.row->ctx);
NDArray out_col_arr = NDArray::Empty({nnz}, coo.col->dtype, coo.col->ctx);
NDArray out_data_arr = COOHasData(coo) ? coo.data : NullArray();
IdType *out_row = static_cast<IdType*>(out_row_arr->data);
IdType *out_col = static_cast<IdType*>(out_col_arr->data);
IdType *out_row = static_cast<IdType *>(out_row_arr->data);
IdType *out_col = static_cast<IdType *>(out_col_arr->data);
parallel_for(0, nnz, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
......@@ -785,10 +808,10 @@ COOMatrix COOReorder(COOMatrix coo, runtime::NDArray new_row_id_arr,
return COOMatrix(num_rows, num_cols, out_row_arr, out_col_arr, out_data_arr);
}
template COOMatrix COOReorder<kDGLCPU, int64_t>(COOMatrix csr, runtime::NDArray new_row_ids,
runtime::NDArray new_col_ids);
template COOMatrix COOReorder<kDGLCPU, int32_t>(COOMatrix csr, runtime::NDArray new_row_ids,
runtime::NDArray new_col_ids);
template COOMatrix COOReorder<kDGLCPU, int64_t>(
COOMatrix csr, runtime::NDArray new_row_ids, runtime::NDArray new_col_ids);
template COOMatrix COOReorder<kDGLCPU, int32_t>(
COOMatrix csr, runtime::NDArray new_row_ids, runtime::NDArray new_col_ids);
} // namespace impl
} // namespace aten
......
......@@ -5,10 +5,12 @@
*/
#include <dgl/array.h>
#include <dgl/runtime/parallel_for.h>
#include <vector>
#include <unordered_set>
#include <numeric>
#include <atomic>
#include <numeric>
#include <unordered_set>
#include <vector>
#include "array_utils.h"
namespace dgl {
......@@ -26,8 +28,8 @@ bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) {
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const IdType* indices_data = static_cast<IdType*>(csr.indices->data);
if (csr.sorted) {
const IdType *start = indices_data + indptr_data[row];
const IdType *end = indices_data + indptr_data[row + 1];
const IdType* start = indices_data + indptr_data[row];
const IdType* end = indices_data + indptr_data[row + 1];
return std::binary_search(start, end, col);
} else {
for (IdType i = indptr_data[row]; i < indptr_data[row + 1]; ++i) {
......@@ -53,12 +55,15 @@ NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) {
const IdType* col_data = static_cast<IdType*>(col->data);
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
runtime::parallel_for(0, std::max(rowlen, collen), 1, [=](int64_t b, int64_t e) {
int64_t i = (row_stride == 0) ? 0 : b;
int64_t j = (col_stride == 0) ? 0 : b;
for (int64_t k = b; i < e && j < e; i += row_stride, j += col_stride, ++k)
rst_data[k] = CSRIsNonZero<XPU, IdType>(csr, row_data[i], col_data[j]) ? 1 : 0;
});
runtime::parallel_for(
0, std::max(rowlen, collen), 1, [=](int64_t b, int64_t e) {
int64_t i = (row_stride == 0) ? 0 : b;
int64_t j = (col_stride == 0) ? 0 : b;
for (int64_t k = b; i < e && j < e;
i += row_stride, j += col_stride, ++k)
rst_data[k] =
CSRIsNonZero<XPU, IdType>(csr, row_data[i], col_data[j]) ? 1 : 0;
});
return rst;
}
......@@ -73,7 +78,7 @@ bool CSRHasDuplicate(CSRMatrix csr) {
const IdType* indices_data = static_cast<IdType*>(csr.indices->data);
for (IdType src = 0; src < csr.num_rows; ++src) {
std::unordered_set<IdType> hashmap;
for (IdType eid = indptr_data[src]; eid < indptr_data[src+1]; ++eid) {
for (IdType eid = indptr_data[src]; eid < indptr_data[src + 1]; ++eid) {
const IdType dst = indices_data[eid];
if (hashmap.count(dst)) {
return true;
......@@ -117,7 +122,7 @@ NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) {
template NDArray CSRGetRowNNZ<kDGLCPU, int32_t>(CSRMatrix, NDArray);
template NDArray CSRGetRowNNZ<kDGLCPU, int64_t>(CSRMatrix, NDArray);
///////////////////////////// CSRGetRowColumnIndices /////////////////////////////
/////////////////////////// CSRGetRowColumnIndices /////////////////////////////
template <DGLDeviceType XPU, typename IdType>
NDArray CSRGetRowColumnIndices(CSRMatrix csr, int64_t row) {
......@@ -140,7 +145,8 @@ NDArray CSRGetRowData(CSRMatrix csr, int64_t row) {
if (CSRHasData(csr))
return csr.data.CreateView({len}, csr.data->dtype, offset);
else
return aten::Range(offset, offset + len, csr.indptr->dtype.bits, csr.indptr->ctx);
return aten::Range(
offset, offset + len, csr.indptr->dtype.bits, csr.indptr->ctx);
}
template NDArray CSRGetRowData<kDGLCPU, int32_t>(CSRMatrix, int64_t);
......@@ -150,12 +156,12 @@ template NDArray CSRGetRowData<kDGLCPU, int64_t>(CSRMatrix, int64_t);
///////////////////////////// CSRGetDataAndIndices /////////////////////////////
template <DGLDeviceType XPU, typename IdType>
void CollectDataIndicesFromSorted(const IdType *indices_data, const IdType *data,
const IdType start, const IdType end, const IdType col,
std::vector<IdType> *col_vec,
std::vector<IdType> *ret_vec) {
const IdType *start_ptr = indices_data + start;
const IdType *end_ptr = indices_data + end;
void CollectDataIndicesFromSorted(
const IdType* indices_data, const IdType* data, const IdType start,
const IdType end, const IdType col, std::vector<IdType>* col_vec,
std::vector<IdType>* ret_vec) {
const IdType* start_ptr = indices_data + start;
const IdType* end_ptr = indices_data + end;
auto it = std::lower_bound(start_ptr, end_ptr, col);
// This might be a multi-graph. We need to collect all of the matched
// columns.
......@@ -173,13 +179,15 @@ void CollectDataIndicesFromSorted(const IdType *indices_data, const IdType *data
}
template <DGLDeviceType XPU, typename IdType>
std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray rows, NDArray cols) {
// TODO(minjie): more efficient implementation for matrix without duplicate entries
std::vector<NDArray> CSRGetDataAndIndices(
CSRMatrix csr, NDArray rows, NDArray cols) {
// TODO(minjie): more efficient implementation for matrix without duplicate
// entries
const int64_t rowlen = rows->shape[0];
const int64_t collen = cols->shape[0];
CHECK((rowlen == collen) || (rowlen == 1) || (collen == 1))
<< "Invalid row and col id array.";
<< "Invalid row and col id array.";
const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1;
const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1;
......@@ -188,40 +196,43 @@ std::vector<NDArray> CSRGetDataAndIndices(CSRMatrix csr, NDArray rows, NDArray c
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const IdType* indices_data = static_cast<IdType*>(csr.indices->data);
const IdType* data = CSRHasData(csr)? static_cast<IdType*>(csr.data->data) : nullptr;
const IdType* data =
CSRHasData(csr) ? static_cast<IdType*>(csr.data->data) : nullptr;
std::vector<IdType> ret_rows, ret_cols;
std::vector<IdType> ret_data;
for (int64_t i = 0, j = 0; i < rowlen && j < collen; i += row_stride, j += col_stride) {
for (int64_t i = 0, j = 0; i < rowlen && j < collen;
i += row_stride, j += col_stride) {
const IdType row_id = row_data[i], col_id = col_data[j];
CHECK(row_id >= 0 && row_id < csr.num_rows) << "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < csr.num_cols) << "Invalid col index: " << col_id;
CHECK(row_id >= 0 && row_id < csr.num_rows)
<< "Invalid row index: " << row_id;
CHECK(col_id >= 0 && col_id < csr.num_cols)
<< "Invalid col index: " << col_id;
if (csr.sorted) {
// Here we collect col indices and data.
CollectDataIndicesFromSorted<XPU, IdType>(indices_data, data,
indptr_data[row_id],
indptr_data[row_id + 1],
col_id, &ret_cols,
&ret_data);
CollectDataIndicesFromSorted<XPU, IdType>(
indices_data, data, indptr_data[row_id], indptr_data[row_id + 1],
col_id, &ret_cols, &ret_data);
// We need to add row Ids.
while (ret_rows.size() < ret_data.size()) {
ret_rows.push_back(row_id);
}
} else {
for (IdType i = indptr_data[row_id]; i < indptr_data[row_id+1]; ++i) {
for (IdType i = indptr_data[row_id]; i < indptr_data[row_id + 1]; ++i) {
if (indices_data[i] == col_id) {
ret_rows.push_back(row_id);
ret_cols.push_back(col_id);
ret_data.push_back(data? data[i] : i);
ret_data.push_back(data ? data[i] : i);
}
}
}
}
return {NDArray::FromVector(ret_rows, csr.indptr->ctx),
NDArray::FromVector(ret_cols, csr.indptr->ctx),
NDArray::FromVector(ret_data, csr.data->ctx)};
return {
NDArray::FromVector(ret_rows, csr.indptr->ctx),
NDArray::FromVector(ret_cols, csr.indptr->ctx),
NDArray::FromVector(ret_data, csr.data->ctx)};
}
template std::vector<NDArray> CSRGetDataAndIndices<kDGLCPU, int32_t>(
......@@ -240,9 +251,12 @@ CSRMatrix CSRTranspose(CSRMatrix csr) {
const int64_t nnz = csr.indices->shape[0];
const IdType* Ap = static_cast<IdType*>(csr.indptr->data);
const IdType* Aj = static_cast<IdType*>(csr.indices->data);
const IdType* Ax = CSRHasData(csr)? static_cast<IdType*>(csr.data->data) : nullptr;
NDArray ret_indptr = NDArray::Empty({M + 1}, csr.indptr->dtype, csr.indptr->ctx);
NDArray ret_indices = NDArray::Empty({nnz}, csr.indices->dtype, csr.indices->ctx);
const IdType* Ax =
CSRHasData(csr) ? static_cast<IdType*>(csr.data->data) : nullptr;
NDArray ret_indptr =
NDArray::Empty({M + 1}, csr.indptr->dtype, csr.indptr->ctx);
NDArray ret_indices =
NDArray::Empty({nnz}, csr.indices->dtype, csr.indices->ctx);
NDArray ret_data = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
IdType* Bp = static_cast<IdType*>(ret_indptr->data);
IdType* Bi = static_cast<IdType*>(ret_indices->data);
......@@ -263,10 +277,10 @@ CSRMatrix CSRTranspose(CSRMatrix csr) {
Bp[M] = nnz;
for (int64_t i = 0; i < N; ++i) {
for (IdType j = Ap[i]; j < Ap[i+1]; ++j) {
for (IdType j = Ap[i]; j < Ap[i + 1]; ++j) {
const IdType dst = Aj[j];
Bi[Bp[dst]] = i;
Bx[Bp[dst]] = Ax? Ax[j] : j;
Bx[Bp[dst]] = Ax ? Ax[j] : j;
Bp[dst]++;
}
}
......@@ -278,7 +292,8 @@ CSRMatrix CSRTranspose(CSRMatrix csr) {
last = temp;
}
return CSRMatrix{csr.num_cols, csr.num_rows, ret_indptr, ret_indices, ret_data};
return CSRMatrix{
csr.num_cols, csr.num_rows, ret_indptr, ret_indices, ret_data};
}
template CSRMatrix CSRTranspose<kDGLCPU, int32_t>(CSRMatrix csr);
......@@ -293,14 +308,13 @@ COOMatrix CSRToCOO(CSRMatrix csr) {
IdType* ret_row_data = static_cast<IdType*>(ret_row->data);
parallel_for(0, csr.indptr->shape[0] - 1, 10000, [=](int64_t b, int64_t e) {
for (auto i = b; i < e; ++i) {
std::fill(ret_row_data + indptr_data[i],
ret_row_data + indptr_data[i + 1],
i);
std::fill(
ret_row_data + indptr_data[i], ret_row_data + indptr_data[i + 1], i);
}
});
return COOMatrix(csr.num_rows, csr.num_cols,
ret_row, csr.indices, csr.data,
true, csr.sorted);
return COOMatrix(
csr.num_rows, csr.num_cols, ret_row, csr.indices, csr.data, true,
csr.sorted);
}
template COOMatrix CSRToCOO<kDGLCPU, int32_t>(CSRMatrix csr);
......@@ -315,7 +329,8 @@ COOMatrix CSRToCOODataAsOrder(CSRMatrix csr) {
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const IdType* indices_data = static_cast<IdType*>(csr.indices->data);
// data array should have the same type as the indices arrays
const IdType* data = CSRHasData(csr) ? static_cast<IdType*>(csr.data->data) : nullptr;
const IdType* data =
CSRHasData(csr) ? static_cast<IdType*>(csr.data->data) : nullptr;
NDArray ret_row = NDArray::Empty({nnz}, csr.indices->dtype, csr.indices->ctx);
NDArray ret_col = NDArray::Empty({nnz}, csr.indices->dtype, csr.indices->ctx);
IdType* ret_row_data = static_cast<IdType*>(ret_row->data);
......@@ -343,7 +358,8 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
const IdType* indptr = static_cast<IdType*>(csr.indptr->data);
const int64_t num_rows = end - start;
const int64_t nnz = indptr[end] - indptr[start];
IdArray ret_indptr = IdArray::Empty({num_rows + 1}, csr.indptr->dtype, csr.indices->ctx);
IdArray ret_indptr =
IdArray::Empty({num_rows + 1}, csr.indptr->dtype, csr.indices->ctx);
IdType* r_indptr = static_cast<IdType*>(ret_indptr->data);
for (int64_t i = start; i < end + 1; ++i) {
r_indptr[i - start] = indptr[i] - indptr[start];
......@@ -353,13 +369,13 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) {
{nnz}, csr.indices->dtype, indptr[start] * sizeof(IdType));
IdArray ret_data;
if (CSRHasData(csr))
ret_data = csr.data.CreateView({nnz}, csr.data->dtype, indptr[start] * sizeof(IdType));
ret_data = csr.data.CreateView(
{nnz}, csr.data->dtype, indptr[start] * sizeof(IdType));
else
ret_data = aten::Range(indptr[start], indptr[end],
csr.indptr->dtype.bits, csr.indptr->ctx);
return CSRMatrix(num_rows, csr.num_cols,
ret_indptr, ret_indices, ret_data,
csr.sorted);
ret_data = aten::Range(
indptr[start], indptr[end], csr.indptr->dtype.bits, csr.indptr->ctx);
return CSRMatrix(
num_rows, csr.num_cols, ret_indptr, ret_indices, ret_data, csr.sorted);
}
template CSRMatrix CSRSliceRows<kDGLCPU, int32_t>(CSRMatrix, int64_t, int64_t);
......@@ -370,7 +386,8 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
CHECK_SAME_DTYPE(csr.indices, rows);
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const IdType* indices_data = static_cast<IdType*>(csr.indices->data);
const IdType* data = CSRHasData(csr)? static_cast<IdType*>(csr.data->data) : nullptr;
const IdType* data =
CSRHasData(csr) ? static_cast<IdType*>(csr.data->data) : nullptr;
const auto len = rows->shape[0];
const IdType* rows_data = static_cast<IdType*>(rows->data);
int64_t nnz = 0;
......@@ -389,28 +406,28 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
bool err = false;
std::stringstream err_msg_stream;
// Perform two-round parallel prefix sum using OpenMP
#pragma omp parallel
// Perform two-round parallel prefix sum using OpenMP
#pragma omp parallel
{
int64_t tid = omp_get_thread_num();
int64_t num_threads = omp_get_num_threads();
#pragma omp single
#pragma omp single
{
sums.resize(num_threads + 1);
sums[0] = 0;
sums.resize(num_threads + 1);
sums[0] = 0;
}
int64_t sum = 0;
// First round of parallel prefix sum. All threads perform local prefix sums.
#pragma omp for schedule(static) nowait
// First round of parallel prefix sum. All threads perform local prefix sums.
#pragma omp for schedule(static) nowait
for (int64_t i = 0; i < len; ++i) {
int64_t rid = rows_data[i];
if (rid >= csr.num_rows) {
if (!err_flag.test_and_set()) {
err_msg_stream << "expect row ID " << rid << " to be less than number of rows "
<< csr.num_rows;
err_msg_stream << "expect row ID " << rid
<< " to be less than number of rows " << csr.num_rows;
err = true;
}
} else {
......@@ -419,20 +436,18 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
}
}
sums[tid + 1] = sum;
#pragma omp barrier
#pragma omp barrier
#pragma omp single
#pragma omp single
{
for (int64_t i = 1; i < num_threads; ++i)
sums[i] += sums[i - 1];
for (int64_t i = 1; i < num_threads; ++i) sums[i] += sums[i - 1];
}
int64_t offset = sums[tid];
// Second round of parallel prefix sum. Update the local prefix sums.
#pragma omp for schedule(static)
for (int64_t i = 0; i < len; ++i)
ret_indptr_data[i + 1] += offset;
// Second round of parallel prefix sum. Update the local prefix sums.
#pragma omp for schedule(static)
for (int64_t i = 0; i < len; ++i) ret_indptr_data[i + 1] += offset;
}
if (err) {
LOG(FATAL) << err_msg_stream.str();
......@@ -454,26 +469,30 @@ CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) {
for (auto i = b; i < e; ++i) {
const IdType rid = rows_data[i];
// note: zero is allowed
std::copy(indices_data + indptr_data[rid], indices_data + indptr_data[rid + 1],
ret_indices_data + ret_indptr_data[i]);
std::copy(
indices_data + indptr_data[rid], indices_data + indptr_data[rid + 1],
ret_indices_data + ret_indptr_data[i]);
if (data)
std::copy(data + indptr_data[rid], data + indptr_data[rid + 1],
ret_data + ret_indptr_data[i]);
std::copy(
data + indptr_data[rid], data + indptr_data[rid + 1],
ret_data + ret_indptr_data[i]);
else
std::iota(ret_data + ret_indptr_data[i], ret_data + ret_indptr_data[i + 1],
indptr_data[rid]);
std::iota(
ret_data + ret_indptr_data[i], ret_data + ret_indptr_data[i + 1],
indptr_data[rid]);
}
});
return ret;
}
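CSRSliceRows fills ret_indptr with a two-round parallel prefix sum: each thread first prefix-sums the row lengths in its statically scheduled chunk, one thread then prefix-sums the per-thread totals, and a second statically scheduled loop (which OpenMP maps to the same chunks) adds each thread's offset. A standalone sketch of that pattern (illustrative helper, not the DGL routine):

#include <omp.h>

#include <cstdint>
#include <vector>

// Two-round parallel inclusive prefix sum: round 1, every thread prefix-sums
// its statically scheduled chunk; one thread then prefix-sums the per-thread
// totals; round 2, every thread adds the total of all preceding chunks to its
// chunk.  Both loops use the same static schedule, so OpenMP assigns each
// thread the same iterations in both rounds.
void ParallelPrefixSum(std::vector<int64_t>* vals) {
  const int64_t n = static_cast<int64_t>(vals->size());
  std::vector<int64_t> sums;
  #pragma omp parallel
  {
    const int tid = omp_get_thread_num();
    const int nthr = omp_get_num_threads();
    #pragma omp single
    {
      sums.assign(nthr + 1, 0);
    }
    int64_t local = 0;
    #pragma omp for schedule(static) nowait
    for (int64_t i = 0; i < n; ++i) {  // round 1: local prefix sums
      local += (*vals)[i];
      (*vals)[i] = local;
    }
    sums[tid + 1] = local;
    #pragma omp barrier
    #pragma omp single
    {
      for (int i = 1; i <= nthr; ++i) sums[i] += sums[i - 1];
    }
    #pragma omp for schedule(static)
    for (int64_t i = 0; i < n; ++i)  // round 2: add preceding chunk totals
      (*vals)[i] += sums[tid];
  }
}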
template CSRMatrix CSRSliceRows<kDGLCPU, int32_t>(CSRMatrix , NDArray);
template CSRMatrix CSRSliceRows<kDGLCPU, int64_t>(CSRMatrix , NDArray);
template CSRMatrix CSRSliceRows<kDGLCPU, int32_t>(CSRMatrix, NDArray);
template CSRMatrix CSRSliceRows<kDGLCPU, int64_t>(CSRMatrix, NDArray);
///////////////////////////// CSRSliceMatrix /////////////////////////////
template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) {
CSRMatrix CSRSliceMatrix(
CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) {
IdHashMap<IdType> hashmap(cols);
const int64_t new_nrows = rows->shape[0];
const int64_t new_ncols = cols->shape[0];
......@@ -482,7 +501,8 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
const IdType* indptr_data = static_cast<IdType*>(csr.indptr->data);
const IdType* indices_data = static_cast<IdType*>(csr.indices->data);
const IdType* data = has_data? static_cast<IdType*>(csr.data->data) : nullptr;
const IdType* data =
has_data ? static_cast<IdType*>(csr.data->data) : nullptr;
std::vector<IdType> sub_indptr, sub_indices;
std::vector<IdType> sub_data;
......@@ -498,7 +518,7 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
if (newj != kInvalidId) {
++sub_indptr[i];
sub_indices.push_back(newj);
sub_data.push_back(has_data? data[p] : p);
sub_data.push_back(has_data ? data[p] : p);
}
}
}
......@@ -512,13 +532,13 @@ CSRMatrix CSRSliceMatrix(CSRMatrix csr, runtime::NDArray rows, runtime::NDArray
sub_indptr[new_nrows] = sub_indices.size();
const int64_t nnz = sub_data.size();
NDArray sub_data_arr = NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
NDArray sub_data_arr =
NDArray::Empty({nnz}, csr.indptr->dtype, csr.indptr->ctx);
IdType* ptr = static_cast<IdType*>(sub_data_arr->data);
std::copy(sub_data.begin(), sub_data.end(), ptr);
return CSRMatrix{new_nrows, new_ncols,
NDArray::FromVector(sub_indptr, csr.indptr->ctx),
NDArray::FromVector(sub_indices, csr.indptr->ctx),
sub_data_arr};
return CSRMatrix{
new_nrows, new_ncols, NDArray::FromVector(sub_indptr, csr.indptr->ctx),
NDArray::FromVector(sub_indices, csr.indptr->ctx), sub_data_arr};
}
template CSRMatrix CSRSliceMatrix<kDGLCPU, int32_t>(
......@@ -529,8 +549,9 @@ template CSRMatrix CSRSliceMatrix<kDGLCPU, int64_t>(
///////////////////////////// CSRReorder /////////////////////////////
template <DGLDeviceType XPU, typename IdType>
CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
runtime::NDArray new_col_id_arr) {
CSRMatrix CSRReorder(
CSRMatrix csr, runtime::NDArray new_row_id_arr,
runtime::NDArray new_col_id_arr) {
CHECK_SAME_DTYPE(csr.indices, new_row_id_arr);
CHECK_SAME_DTYPE(csr.indices, new_col_id_arr);
......@@ -543,21 +564,25 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
int64_t nnz = csr.indices->shape[0];
CHECK_EQ(nnz, in_indptr[num_rows]);
CHECK_EQ(num_rows, new_row_id_arr->shape[0])
<< "The new row Id array needs to be the same as the number of rows of CSR";
<< "The new row Id array needs to be the same as the number of rows of "
"CSR";
CHECK_EQ(num_cols, new_col_id_arr->shape[0])
<< "The new col Id array needs to be the same as the number of cols of CSR";
<< "The new col Id array needs to be the same as the number of cols of "
"CSR";
// New row/col Ids.
const IdType* new_row_ids = static_cast<IdType*>(new_row_id_arr->data);
const IdType* new_col_ids = static_cast<IdType*>(new_col_id_arr->data);
// Output CSR
NDArray out_indptr_arr = NDArray::Empty({num_rows + 1}, csr.indptr->dtype, csr.indptr->ctx);
NDArray out_indices_arr = NDArray::Empty({nnz}, csr.indices->dtype, csr.indices->ctx);
NDArray out_indptr_arr =
NDArray::Empty({num_rows + 1}, csr.indptr->dtype, csr.indptr->ctx);
NDArray out_indices_arr =
NDArray::Empty({nnz}, csr.indices->dtype, csr.indices->ctx);
NDArray out_data_arr = NDArray::Empty({nnz}, csr.data->dtype, csr.data->ctx);
IdType *out_indptr = static_cast<IdType*>(out_indptr_arr->data);
IdType *out_indices = static_cast<IdType*>(out_indices_arr->data);
IdType *out_data = static_cast<IdType*>(out_data_arr->data);
IdType* out_indptr = static_cast<IdType*>(out_indptr_arr->data);
IdType* out_indices = static_cast<IdType*>(out_indices_arr->data);
IdType* out_data = static_cast<IdType*>(out_data_arr->data);
// Compute the length of rows for the new matrix.
std::vector<IdType> new_row_lens(num_rows, -1);
......@@ -579,12 +604,12 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
// Here I iterate rows in the order of the old matrix.
parallel_for(0, num_rows, [=](size_t b, size_t e) {
for (auto i = b; i < e; ++i) {
const IdType *in_row = in_indices + in_indptr[i];
const IdType *in_row_data = in_data + in_indptr[i];
const IdType* in_row = in_indices + in_indptr[i];
const IdType* in_row_data = in_data + in_indptr[i];
int64_t new_row_id = new_row_ids[i];
IdType *out_row = out_indices + out_indptr[new_row_id];
IdType *out_row_data = out_data + out_indptr[new_row_id];
IdType* out_row = out_indices + out_indptr[new_row_id];
IdType* out_row_data = out_data + out_indptr[new_row_id];
int64_t row_len = new_row_lens[new_row_id];
// Here I iterate col indices in a row in the order of the old matrix.
......@@ -595,14 +620,14 @@ CSRMatrix CSRReorder(CSRMatrix csr, runtime::NDArray new_row_id_arr,
// TODO(zhengda) maybe we should sort the column indices.
}
});
return CSRMatrix(num_rows, num_cols,
out_indptr_arr, out_indices_arr, out_data_arr);
return CSRMatrix(
num_rows, num_cols, out_indptr_arr, out_indices_arr, out_data_arr);
}
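The reordering above is a scatter under two permutations: each old row lands at new_row_ids[i] and every column id is relabelled through new_col_ids. A compact sketch of the same idea on a plain CSR, assuming both arrays are permutations (illustrative names, not the DGL implementation):

#include <cstdint>
#include <numeric>
#include <vector>

struct SimpleCsr {
  int64_t num_rows = 0, num_cols = 0;
  std::vector<int64_t> indptr, indices, data;
};

SimpleCsr ReorderCsr(const SimpleCsr& csr,
                     const std::vector<int64_t>& new_row_ids,
                     const std::vector<int64_t>& new_col_ids) {
  SimpleCsr out;
  out.num_rows = csr.num_rows;
  out.num_cols = csr.num_cols;
  out.indptr.assign(csr.num_rows + 1, 0);
  // The new row inherits the length of the old row mapped onto it.
  for (int64_t i = 0; i < csr.num_rows; ++i)
    out.indptr[new_row_ids[i] + 1] = csr.indptr[i + 1] - csr.indptr[i];
  std::partial_sum(out.indptr.begin(), out.indptr.end(), out.indptr.begin());
  out.indices.resize(csr.indices.size());
  out.data.resize(csr.indices.size());
  for (int64_t i = 0; i < csr.num_rows; ++i) {
    int64_t dst = out.indptr[new_row_ids[i]];
    for (int64_t p = csr.indptr[i]; p < csr.indptr[i + 1]; ++p, ++dst) {
      out.indices[dst] = new_col_ids[csr.indices[p]];  // relabel the column
      out.data[dst] = csr.data.empty() ? p : csr.data[p];
    }
  }
  return out;
}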
template CSRMatrix CSRReorder<kDGLCPU, int64_t>(CSRMatrix csr, runtime::NDArray new_row_ids,
runtime::NDArray new_col_ids);
template CSRMatrix CSRReorder<kDGLCPU, int32_t>(CSRMatrix csr, runtime::NDArray new_row_ids,
runtime::NDArray new_col_ids);
template CSRMatrix CSRReorder<kDGLCPU, int64_t>(
CSRMatrix csr, runtime::NDArray new_row_ids, runtime::NDArray new_col_ids);
template CSRMatrix CSRReorder<kDGLCPU, int32_t>(
CSRMatrix csr, runtime::NDArray new_row_ids, runtime::NDArray new_col_ids);
} // namespace impl
} // namespace aten
......
......@@ -8,14 +8,15 @@
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <dgl/runtime/parallel_for.h>
#include <dgl/runtime/config.h>
#include <dgl/runtime/parallel_for.h>
#include <math.h>
#include <algorithm>
#include <limits>
#include <memory>
#include <algorithm>
#include <vector>
#include "spmm_binary_ops.h"
#if !defined(_WIN32)
#ifdef USE_AVX
......@@ -44,8 +45,9 @@ namespace cpu {
* JIT'ed kernel.
*/
template <typename IdType, typename DType, typename Op>
void SpMMSumCsrXbyak(dgl::ElemWiseAddUpdate<Op>* cpu_spec, const BcastOff& bcast,
const CSRMatrix& csr, const DType* X, const DType* W, DType* O) {
void SpMMSumCsrXbyak(
dgl::ElemWiseAddUpdate<Op>* cpu_spec, const BcastOff& bcast,
const CSRMatrix& csr, const DType* X, const DType* W, DType* O) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = csr.indptr.Ptr<IdType>();
const IdType* indices = csr.indices.Ptr<IdType>();
......@@ -79,8 +81,9 @@ void SpMMSumCsrXbyak(dgl::ElemWiseAddUpdate<Op>* cpu_spec, const BcastOff& bcast
* for the computation of different nodes.
*/
template <typename IdType, typename DType, typename Op>
void SpMMSumCsrNaive(const BcastOff& bcast, const CSRMatrix& csr, const DType* X,
const DType* W, DType* O) {
void SpMMSumCsrNaive(
const BcastOff& bcast, const CSRMatrix& csr, const DType* X, const DType* W,
DType* O) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = csr.indptr.Ptr<IdType>();
const IdType* indices = csr.indices.Ptr<IdType>();
......@@ -97,9 +100,9 @@ void SpMMSumCsrNaive(const BcastOff& bcast, const CSRMatrix& csr, const DType* X
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
out_off[k] += Op::Call(lhs_off, rhs_off);
}
}
......@@ -118,8 +121,9 @@ void SpMMSumCsrNaive(const BcastOff& bcast, const CSRMatrix& csr, const DType* X
* for the computation of different nodes.
*/
template <typename IdType, typename DType, typename Op>
void SpMMSumCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
NDArray efeat, NDArray out) {
void SpMMSumCsr(
const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat, NDArray efeat,
NDArray out) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = csr.indptr.Ptr<IdType>();
const IdType* indices = csr.indices.Ptr<IdType>();
......@@ -135,17 +139,15 @@ void SpMMSumCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
CHECK_NOTNULL(X);
}
if (Op::use_rhs) {
if (has_idx)
CHECK_NOTNULL(edges);
if (has_idx) CHECK_NOTNULL(edges);
CHECK_NOTNULL(W);
}
#if !defined(_WIN32)
#ifdef USE_AVX
#ifdef USE_LIBXSMM
const bool no_libxsmm =
bcast.use_bcast ||
std::is_same<DType, double>::value ||
!dgl::runtime::Config::Global()->IsLibxsmmAvailable();
const bool no_libxsmm = bcast.use_bcast ||
std::is_same<DType, double>::value ||
!dgl::runtime::Config::Global()->IsLibxsmmAvailable();
if (!no_libxsmm) {
SpMMSumCsrLibxsmm<IdType, DType, Op>(bcast, csr, ufeat, efeat, out);
} else {
......@@ -156,14 +158,14 @@ void SpMMSumCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
(dgl::IntelKernel<>::IsEnabled()) ? new ElemWiseUpd() : nullptr);
/* Distribute the kernel among OMP threads */
ElemWiseUpd* cpu_spec = (asm_kernel_ptr && asm_kernel_ptr->applicable())
? asm_kernel_ptr.get()
: nullptr;
? asm_kernel_ptr.get()
: nullptr;
if (cpu_spec && dim > 16 && !bcast.use_bcast) {
SpMMSumCsrXbyak<IdType, DType, Op>(cpu_spec, bcast, csr, X, W, O);
} else {
#endif // USE_AVX
#endif // _WIN32
SpMMSumCsrNaive<IdType, DType, Op>(bcast, csr, X, W, O);
SpMMSumCsrNaive<IdType, DType, Op>(bcast, csr, X, W, O);
#if !defined(_WIN32)
#ifdef USE_AVX
}
......@@ -186,8 +188,9 @@ void SpMMSumCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
* we use atomic operators in the reduction phase.
*/
template <typename IdType, typename DType, typename Op>
void SpMMSumCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
NDArray efeat, NDArray out) {
void SpMMSumCoo(
const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat, NDArray efeat,
NDArray out) {
const bool has_idx = !IsNullArray(coo.data);
const IdType* row = coo.row.Ptr<IdType>();
const IdType* col = coo.col.Ptr<IdType>();
......@@ -210,9 +213,9 @@ void SpMMSumCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + rid * lhs_dim + lhs_add : nullptr;
Op::use_lhs ? X + rid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
const DType val = Op::Call(lhs_off, rhs_off);
if (val != 0) {
#pragma omp atomic
......@@ -232,21 +235,24 @@ void SpMMSumCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
* \param argu Arg-Min/Max on source nodes, which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max
* reducer. \param arge Arg-Min/Max on edges. which refers the source node
* indices correspond to the minimum/maximum values of reduction result on
* reducer.
* \param arge Arg-Min/Max on edges. which refers the source node indices
correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max
* reducer. \note It uses node parallel strategy, different threads are
* responsible for the computation of different nodes. \note The result will
* contain infinity for zero-degree nodes.
* reducer.
* \note It uses node parallel strategy, different threads are responsible for
* the computation of different nodes.
* \note The result will contain infinity for zero-degree nodes.
*/
template <typename IdType, typename DType, typename Op, typename Cmp>
void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
NDArray efeat, NDArray out, NDArray argu, NDArray arge) {
void SpMMCmpCsr(
const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat, NDArray efeat,
NDArray out, NDArray argu, NDArray arge) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = static_cast<IdType*>(csr.indptr->data);
const IdType* indices = static_cast<IdType*>(csr.indices->data);
const IdType* edges =
has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
const DType* X = Op::use_lhs ? static_cast<DType*>(ufeat->data) : nullptr;
const DType* W = Op::use_rhs ? static_cast<DType*>(efeat->data) : nullptr;
const int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len,
......@@ -262,8 +268,7 @@ void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
CHECK_NOTNULL(argX);
}
if (Op::use_rhs) {
if (has_idx)
CHECK_NOTNULL(edges);
if (has_idx) CHECK_NOTNULL(edges);
CHECK_NOTNULL(W);
CHECK_NOTNULL(argW);
}
......@@ -271,12 +276,12 @@ void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
#ifdef USE_AVX
#ifdef USE_LIBXSMM
const bool no_libxsmm =
bcast.use_bcast ||
std::is_same<DType, double>::value ||
!dgl::runtime::Config::Global()->IsLibxsmmAvailable();
const bool no_libxsmm = bcast.use_bcast ||
std::is_same<DType, double>::value ||
!dgl::runtime::Config::Global()->IsLibxsmmAvailable();
if (!no_libxsmm) {
SpMMCmpCsrLibxsmm<IdType, DType, Op, Cmp>(bcast, csr, ufeat, efeat, out, argu, arge);
SpMMCmpCsrLibxsmm<IdType, DType, Op, Cmp>(
bcast, csr, ufeat, efeat, out, argu, arge);
} else {
#endif // USE_LIBXSMM
#endif // USE_AVX
......@@ -295,9 +300,9 @@ void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
const DType val = Op::Call(lhs_off, rhs_off);
if (Cmp::Call(out_off[k], val)) {
out_off[k] = val;
......@@ -328,29 +333,31 @@ void SpMMCmpCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max
* reducer.
* \param arge Arg-Min/Max on edges. which refers the source node
* indices correspond to the minimum/maximum values of reduction result on
* \param arge Arg-Min/Max on edges. which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max
* reducer.
* \param argu_ntype Node type of the arg-Min/Max on source nodes, which refers the
* source node types correspond to the minimum/maximum values of reduction result
* on destination nodes. It's useful in computing gradients of Min/Max reducer.
* \param arge_etype Edge-type of the arg-Min/Max on edges. which refers the source
* node indices correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max reducer.
* \param argu_ntype Node type of the arg-Min/Max on source nodes, which refers
* the source node types correspond to the minimum/maximum values of
* reduction result on destination nodes. It's useful in computing
* gradients of Min/Max reducer.
* \param arge_etype Edge-type of the arg-Min/Max on edges. which refers the
* source node indices correspond to the minimum/maximum values of
* reduction result on destination nodes. It's useful in computing
* gradients of Min/Max reducer.
* \param src_type Node type of the source nodes of an etype
* \param etype Edge type
*/
template <typename IdType, typename DType, typename Op, typename Cmp>
void SpMMCmpCsrHetero(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
NDArray efeat, NDArray out, NDArray argu, NDArray arge,
NDArray argu_ntype, NDArray arge_etype,
const int ntype, const int etype) {
void SpMMCmpCsrHetero(
const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat, NDArray efeat,
NDArray out, NDArray argu, NDArray arge, NDArray argu_ntype,
NDArray arge_etype, const int ntype, const int etype) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = static_cast<IdType*>(csr.indptr->data);
const IdType* indices = static_cast<IdType*>(csr.indices->data);
const IdType* edges =
has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
const DType* X = Op::use_lhs ? static_cast<DType*>(ufeat->data) : nullptr;
const DType* W = Op::use_rhs ? static_cast<DType*>(efeat->data) : nullptr;
const int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len,
......@@ -358,8 +365,10 @@ void SpMMCmpCsrHetero(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat
DType* O = static_cast<DType*>(out->data);
IdType* argX = Op::use_lhs ? static_cast<IdType*>(argu->data) : nullptr;
IdType* argW = Op::use_rhs ? static_cast<IdType*>(arge->data) : nullptr;
IdType* argX_ntype = Op::use_lhs ? static_cast<IdType*>(argu_ntype->data) : nullptr;
IdType* argW_etype = Op::use_rhs ? static_cast<IdType*>(arge_etype->data) : nullptr;
IdType* argX_ntype =
Op::use_lhs ? static_cast<IdType*>(argu_ntype->data) : nullptr;
IdType* argW_etype =
Op::use_rhs ? static_cast<IdType*>(arge_etype->data) : nullptr;
CHECK_NOTNULL(indptr);
CHECK_NOTNULL(O);
if (Op::use_lhs) {
......@@ -368,8 +377,7 @@ void SpMMCmpCsrHetero(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat
CHECK_NOTNULL(argX);
}
if (Op::use_rhs) {
if (has_idx)
CHECK_NOTNULL(edges);
if (has_idx) CHECK_NOTNULL(edges);
CHECK_NOTNULL(W);
CHECK_NOTNULL(argW);
}
......@@ -389,9 +397,9 @@ void SpMMCmpCsrHetero(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
Op::use_lhs ? X + cid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
const DType val = Op::Call(lhs_off, rhs_off);
if (Cmp::Call(out_off[k], val)) {
out_off[k] = val;
......@@ -410,7 +418,6 @@ void SpMMCmpCsrHetero(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat
});
}
/*!
* \brief CPU kernel of SpMM-Min/Max on Coo format.
* \param bcast Broadcast information.
......@@ -421,22 +428,25 @@ void SpMMCmpCsrHetero(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat
* \param argu Arg-Min/Max on source nodes, which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max
* reducer. \param arge Arg-Min/Max on edges. which refers the source node
* indices correspond to the minimum/maximum values of reduction result on
* reducer.
* \param arge Arg-Min/Max on edges. which refers the source node indices
* correspond to the minimum/maximum values of reduction result on
* destination nodes. It's useful in computing gradients of Min/Max
* reducer. \note it uses node parallel strategy, different threads are
* responsible for the computation of different nodes. To avoid possible data
* hazard, we use atomic operators in the reduction phase. \note The result will
* contain infinity for zero-degree nodes.
* reducer.
* \note it uses node parallel strategy, different threads are responsible for
* the computation of different nodes. To avoid possible data hazard, we
* use atomic operators in the reduction phase.
* \note The result will contain infinity for zero-degree nodes.
*/
template <typename IdType, typename DType, typename Op, typename Cmp>
void SpMMCmpCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
NDArray efeat, NDArray out, NDArray argu, NDArray arge) {
void SpMMCmpCoo(
const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat, NDArray efeat,
NDArray out, NDArray argu, NDArray arge) {
const bool has_idx = !IsNullArray(coo.data);
const IdType* row = static_cast<IdType*>(coo.row->data);
const IdType* col = static_cast<IdType*>(coo.col->data);
const IdType* edges =
has_idx ? static_cast<IdType*>(coo.data->data) : nullptr;
has_idx ? static_cast<IdType*>(coo.data->data) : nullptr;
const DType* X = Op::use_lhs ? static_cast<DType*>(ufeat->data) : nullptr;
const DType* W = Op::use_rhs ? static_cast<DType*>(efeat->data) : nullptr;
const int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len,
......@@ -460,9 +470,9 @@ void SpMMCmpCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
const int64_t lhs_add = bcast.use_bcast ? bcast.lhs_offset[k] : k;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* lhs_off =
Op::use_lhs ? X + rid * lhs_dim + lhs_add : nullptr;
Op::use_lhs ? X + rid * lhs_dim + lhs_add : nullptr;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
const DType val = Op::Call(lhs_off, rhs_off);
#pragma omp critical
if (Cmp::Call(out_off[k], val)) {
......@@ -474,7 +484,6 @@ void SpMMCmpCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
}
}
/*!
* \brief CPU kernel of Edge_softmax_csr_forward on Csr format.
* \param bcast Broadcast information.
......@@ -484,28 +493,29 @@ void SpMMCmpCoo(const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat,
* \param out The result of edge_softmax_forward.
*/
template <typename IdType, typename DType, typename Op>
void Edge_softmax_csr_forward(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
NDArray efeat, NDArray out) {
void Edge_softmax_csr_forward(
const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat, NDArray efeat,
NDArray out) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = static_cast<IdType*>(csr.indptr->data);
const IdType* edges =
has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
const DType* W = Op::use_rhs ? static_cast<DType*>(efeat->data) : nullptr;
const int64_t dim = bcast.out_len, rhs_dim = bcast.rhs_len;
runtime::parallel_for(0, csr.num_rows, [&](size_t b, size_t e) {
for (auto rid = b; rid < e; ++rid) {
const IdType row_start = indptr[rid], row_end = indptr[rid + 1];
std::vector<DType> data_e(row_end-row_start, 0);
std::vector<IdType> num(row_end-row_start, 0);
std::vector<DType> data_e(row_end - row_start, 0);
std::vector<IdType> num(row_end - row_start, 0);
for (int64_t k = 0; k < dim; ++k) {
DType max_v = -std::numeric_limits<DType>::infinity();
for (IdType j = row_start; j < row_end; ++j) {
const IdType eid = has_idx ? edges[j] : j;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* rhs_off =
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
data_e[j-row_start] = *rhs_off;
num[j-row_start] = eid*rhs_dim+rhs_add;
Op::use_rhs ? W + eid * rhs_dim + rhs_add : nullptr;
data_e[j - row_start] = *rhs_off;
num[j - row_start] = eid * rhs_dim + rhs_add;
max_v = std::max<DType>(max_v, (*rhs_off));
}
DType exp_sum = 0;
......@@ -514,15 +524,14 @@ void Edge_softmax_csr_forward(const BcastOff& bcast, const CSRMatrix& csr, NDArr
element = std::exp(element);
exp_sum += element;
}
for (int i=0; i < row_end-row_start; i++) {
out.Ptr<DType>()[num[i]] = data_e[i]/exp_sum;
for (int i = 0; i < row_end - row_start; i++) {
out.Ptr<DType>()[num[i]] = data_e[i] / exp_sum;
}
}
}
});
}
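The kernel above computes, for each destination row, a softmax over the scores of its incoming edges, tracking the row maximum for numerical stability. A scalar sketch of the same per-row computation, assuming a single feature channel (illustrative, not the DGL API):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

// In-place softmax over the edge scores of one CSR row.
// `row_start`/`row_end` delimit the row inside `scores` (one value per edge).
void EdgeSoftmaxRow(std::vector<double>* scores, int64_t row_start,
                    int64_t row_end) {
  if (row_start >= row_end) return;  // zero-degree row: nothing to normalize
  double max_v = -std::numeric_limits<double>::infinity();
  for (int64_t j = row_start; j < row_end; ++j)
    max_v = std::max(max_v, (*scores)[j]);
  double exp_sum = 0;
  for (int64_t j = row_start; j < row_end; ++j) {
    (*scores)[j] = std::exp((*scores)[j] - max_v);  // stable exponentiation
    exp_sum += (*scores)[j];
  }
  for (int64_t j = row_start; j < row_end; ++j) (*scores)[j] /= exp_sum;
}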
/*!
* \brief CPU kernel of Edge_softmax_csr_backward on Csr format.
* \param bcast Broadcast information.
......@@ -532,12 +541,13 @@ void Edge_softmax_csr_forward(const BcastOff& bcast, const CSRMatrix& csr, NDArr
* \param back_out The result of edge_softmax_backward.
*/
template <typename IdType, typename DType, typename Op>
void Edge_softmax_csr_backward(const BcastOff& bcast, const CSRMatrix& csr, NDArray out,
NDArray sds, NDArray back_out) {
void Edge_softmax_csr_backward(
const BcastOff& bcast, const CSRMatrix& csr, NDArray out, NDArray sds,
NDArray back_out) {
const bool has_idx = !IsNullArray(csr.data);
const IdType* indptr = static_cast<IdType*>(csr.indptr->data);
const IdType* edges =
has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
has_idx ? static_cast<IdType*>(csr.data->data) : nullptr;
const DType* W_out = Op::use_rhs ? static_cast<DType*>(out->data) : nullptr;
const DType* W_sds = Op::use_rhs ? static_cast<DType*>(sds->data) : nullptr;
const int64_t dim = bcast.out_len, rhs_dim = bcast.rhs_len;
......@@ -550,17 +560,18 @@ void Edge_softmax_csr_backward(const BcastOff& bcast, const CSRMatrix& csr, NDAr
const IdType eid = has_idx ? edges[j] : j;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* rhs_off_sds =
Op::use_rhs ? W_sds + eid * rhs_dim + rhs_add : nullptr;
Op::use_rhs ? W_sds + eid * rhs_dim + rhs_add : nullptr;
sum_sds += (*rhs_off_sds);
}
for (IdType j = row_start; j< row_end; ++j) {
for (IdType j = row_start; j < row_end; ++j) {
const IdType eid = has_idx ? edges[j] : j;
const int64_t rhs_add = bcast.use_bcast ? bcast.rhs_offset[k] : k;
const DType* rhs_off_out =
Op::use_rhs ? W_out + eid * rhs_dim + rhs_add : nullptr;
Op::use_rhs ? W_out + eid * rhs_dim + rhs_add : nullptr;
const DType* rhs_off_sds =
Op::use_rhs ? W_sds + eid * rhs_dim + rhs_add : nullptr;
back_out.Ptr<DType>()[eid*rhs_dim+rhs_add] = (*rhs_off_sds) - sum_sds*(*rhs_off_out);
Op::use_rhs ? W_sds + eid * rhs_dim + rhs_add : nullptr;
back_out.Ptr<DType>()[eid * rhs_dim + rhs_add] =
(*rhs_off_sds) - sum_sds * (*rhs_off_out);
}
}
}
......
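The Edge_softmax_csr_backward kernel that closes the file above writes, per edge, back_out = sds - sum_sds * out. Assuming, as the names suggest, that out holds the forward softmax sigma of a row and sds its elementwise product with the upstream gradient g, this is the usual softmax Jacobian-vector product; in LaTeX, with notation introduced here rather than taken from the source:

\frac{\partial L}{\partial z_j}
  = \sigma_j\, g_j - \sigma_j \sum_k \sigma_k\, g_k,
\qquad
\sigma_j = \frac{e^{z_j}}{\sum_k e^{z_k}}, \quad
g_j = \frac{\partial L}{\partial \sigma_j}.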
......@@ -13,13 +13,14 @@
#include <dgl/array.h>
#include <dgl/bcast.h>
#include <dmlc/logging.h>
#include <algorithm>
#if !defined(_WIN32)
#ifdef USE_AVX
#ifdef USE_LIBXSMM
#include <unistd.h>
#include <libxsmm.h>
#include <unistd.h>
#ifdef DEBUG
#include <x86intrin.h>
#endif // DEBUG
......@@ -53,8 +54,10 @@ int32_t GetLLCSize() {
* are assigned to OMP threads.
* \param csr The Csr matrix.
* \param block_csr_array The array containing csr matrices of all blocks.
* \param num_M_blocks Number of blocks to create along the rows of adjacency matrix.
* \param num_K_blocks Number of blocks to create along the columns of adjacency matrix.
* \param num_M_blocks Number of blocks to create along the rows of adjacency
* matrix.
* \param num_K_blocks Number of blocks to create along the columns of adjacency
* matrix.
* \param M_block_size block size along the rows of adjacency matrix.
* \param K_block_size block size along the columns of adjacency matrix.
* \param use_lhs Whether to use lhs.
......@@ -62,38 +65,30 @@ int32_t GetLLCSize() {
*/
template <typename IdType>
inline void SpMMCreateBlocks(
const CSRMatrix& csr,
CSRMatrixInternal<IdType, IdType> *block_csr_array,
IdType num_M_blocks,
IdType num_K_blocks,
IdType M_block_size,
IdType K_block_size,
bool use_lhs, bool use_rhs) {
const CSRMatrix &csr, CSRMatrixInternal<IdType, IdType> *block_csr_array,
IdType num_M_blocks, IdType num_K_blocks, IdType M_block_size,
IdType K_block_size, bool use_lhs, bool use_rhs) {
const IdType M = csr.num_rows;
const IdType K = csr.num_cols;
IdType* indptr = csr.indptr.Ptr<IdType>();
IdType* indices = csr.indices.Ptr<IdType>();
IdType* edges = csr.data.Ptr<IdType>();
IdType *indptr = csr.indptr.Ptr<IdType>();
IdType *indices = csr.indices.Ptr<IdType>();
IdType *edges = csr.data.Ptr<IdType>();
CHECK_NOTNULL(indptr);
if (use_lhs)
CHECK_NOTNULL(indices);
if (use_rhs)
CHECK_NOTNULL(edges);
if (use_lhs) CHECK_NOTNULL(indices);
if (use_rhs) CHECK_NOTNULL(edges);
if (num_K_blocks > 1) {
IdType *indptr_block_buf = reinterpret_cast<IdType *>(aligned_alloc(64,
(M_block_size + 1) * num_M_blocks *
num_K_blocks * sizeof(IdType)));
IdType *indices_block_buf = reinterpret_cast<IdType *>(aligned_alloc(64,
indptr[M] * sizeof(IdType)));
IdType *edges_block_buf = reinterpret_cast<IdType *>(aligned_alloc(64,
indptr[M] * sizeof(IdType)));
IdType *indptr_block_buf = reinterpret_cast<IdType *>(aligned_alloc(
64, (M_block_size + 1) * num_M_blocks * num_K_blocks * sizeof(IdType)));
IdType *indices_block_buf = reinterpret_cast<IdType *>(
aligned_alloc(64, indptr[M] * sizeof(IdType)));
IdType *edges_block_buf = reinterpret_cast<IdType *>(
aligned_alloc(64, indptr[M] * sizeof(IdType)));
#pragma omp parallel
{
IdType *my_cur_col_id = reinterpret_cast<IdType *>(aligned_alloc(64, 2 * M_block_size *
sizeof(IdType)));
IdType *my_cur_col_id = reinterpret_cast<IdType *>(
aligned_alloc(64, 2 * M_block_size * sizeof(IdType)));
#pragma omp for
for (IdType m = 0; m < num_M_blocks; m++) {
......@@ -103,10 +98,8 @@ inline void SpMMCreateBlocks(
IdType cur_indices_id = 0;
IdType *my_indices_block_buf, *my_edges_block_buf;
if (use_lhs)
my_indices_block_buf = indices_block_buf + indptr[M_start];
if (use_rhs)
my_edges_block_buf = edges_block_buf + indptr[M_start];
if (use_lhs) my_indices_block_buf = indices_block_buf + indptr[M_start];
if (use_rhs) my_edges_block_buf = edges_block_buf + indptr[M_start];
for (IdType i = M_start; i < M_end; i++) {
my_cur_col_id[(i - M_start) * 2] = indptr[i];
......@@ -119,16 +112,15 @@ inline void SpMMCreateBlocks(
cur_csr.num_rows = M_end - M_start;
cur_csr.num_cols = K_end - K_start;
// Create csr_ij
IdType *cur_csr_indptr = indptr_block_buf + (m * num_K_blocks + k) * (M_block_size + 1);
IdType *cur_csr_indptr =
indptr_block_buf + (m * num_K_blocks + k) * (M_block_size + 1);
IdType *cur_csr_indices = nullptr, *cur_csr_edges = nullptr;
if (use_lhs)
cur_csr_indices = my_indices_block_buf + cur_indices_id;
if (use_rhs)
cur_csr_edges = my_edges_block_buf + cur_indices_id;
if (use_lhs) cur_csr_indices = my_indices_block_buf + cur_indices_id;
if (use_rhs) cur_csr_edges = my_edges_block_buf + cur_indices_id;
IdType cur_nnz = 0;
for (IdType i = M_start; i < M_end; i++) {
const IdType row_start = my_cur_col_id[(i - M_start) * 2];
const IdType row_end = my_cur_col_id[(i - M_start) * 2 + 1];
const IdType row_end = my_cur_col_id[(i - M_start) * 2 + 1];
cur_csr_indptr[i - M_start] = cur_nnz;
IdType eid;
for (eid = row_start; eid < row_end; eid++) {
......@@ -138,10 +130,8 @@ inline void SpMMCreateBlocks(
break;
}
CHECK_LT(cur_indices_id + cur_nnz, nnz);
if (use_lhs)
cur_csr_indices[cur_nnz] = src;
if (use_rhs)
cur_csr_edges[cur_nnz] = edge;
if (use_lhs) cur_csr_indices[cur_nnz] = src;
if (use_rhs) cur_csr_edges[cur_nnz] = edge;
cur_nnz++;
}
my_cur_col_id[(i - M_start) * 2] = eid;
......@@ -149,10 +139,8 @@ inline void SpMMCreateBlocks(
cur_csr_indptr[cur_csr.num_rows] = cur_nnz;
cur_indices_id += cur_nnz;
cur_csr.indptr = cur_csr_indptr;
if (use_lhs)
cur_csr.indices = cur_csr_indices;
if (use_rhs)
cur_csr.data = cur_csr_edges;
if (use_lhs) cur_csr.indices = cur_csr_indices;
if (use_rhs) cur_csr.data = cur_csr_edges;
block_csr_array[m * num_K_blocks + k] = cur_csr;
}
CHECK_EQ(nnz, cur_indices_id);
......@@ -199,9 +187,7 @@ inline void SpMMCreateBlocks(
*/
template <typename IdType, typename DType, typename Op>
inline libxsmm_meltwfunction_opreduce_vecs_idx SpMMCreateLibxsmmKernel(
bool has_idx,
IdType N,
libxsmm_meltw_opreduce_vecs_flags redop_flag,
bool has_idx, IdType N, libxsmm_meltw_opreduce_vecs_flags redop_flag,
bool is_cmp) {
int _ld = N;
libxsmm_meltw_opreduce_vecs_flags opredop_flags;
......@@ -220,48 +206,61 @@ inline libxsmm_meltwfunction_opreduce_vecs_idx SpMMCreateLibxsmmKernel(
opredop_flags = LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OP_COPY;
}
// Second, set which of lhs or rhs is considered first and second operand.
// This is needed since libxsmm assumes that the copy operation always copies the first operand.
// So, if we need to copy rhs, we need to set that as the first operand.
// For rhs, we also set whether to use implicit indices or provided indices.
// This is needed since libxsmm assumes that the copy operation always copies
// the first operand. So, if we need to copy rhs, we need to set that as the
// first operand. For rhs, we also set whether to use implicit indices or
// provided indices.
// TODO(Steve): fix this long line in a separate PR.
if (std::is_same<Op, op::CopyLhs<DType>>::value) {
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OPORDER_VECIDX_VECIN);
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OPORDER_VECIDX_VECIN);
} else if (std::is_same<Op, op::CopyRhs<DType>>::value) {
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OPORDER_VECIN_VECIDX);
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OPORDER_VECIN_VECIDX);
if (!has_idx) {
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_IMPLICIT_INDEXED_VECIDX);
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_IMPLICIT_INDEXED_VECIDX);
}
} else {
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OPORDER_VECIDX_VECIN);
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_OPORDER_VECIDX_VECIN);
if (has_idx) {
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_INDEXED_VEC);
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_INDEXED_VEC);
} else {
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_IMPLICIT_INDEXED_VEC);
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_IMPLICIT_INDEXED_VEC);
}
}
// Third, we set the Redop in the opredop_flags
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags | redop_flag);
// Fourth, in case of Cmp Redop, set whether to record argmax/argmin for lhs/rhs
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags | redop_flag);
// Fourth, in case of Cmp Redop, set whether to record argmax/argmin for
// lhs/rhs
if (is_cmp) {
if (Op::use_lhs) {
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_RECORD_ARGOP_OFF_VEC_0);
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_RECORD_ARGOP_OFF_VEC_0);
}
if (Op::use_rhs) {
opredop_flags = (libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_RECORD_ARGOP_OFF_VEC_1);
opredop_flags =
(libxsmm_meltw_opreduce_vecs_flags)(opredop_flags |
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_RECORD_ARGOP_OFF_VEC_1);
}
}
libxsmm_meltwfunction_opreduce_vecs_idx kernel = nullptr;
if (std::is_same<DType, float>::value) {
kernel = libxsmm_dispatch_meltw_opreduce_vecs_idx(
N, &_ld, &_ld, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32,
(sizeof(IdType) == 8) ? LIBXSMM_DATATYPE_I64 : LIBXSMM_DATATYPE_I32, opredop_flags);
N, &_ld, &_ld, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32,
(sizeof(IdType) == 8) ? LIBXSMM_DATATYPE_I64 : LIBXSMM_DATATYPE_I32,
opredop_flags);
}
if (kernel == nullptr) {
LOG(FATAL) << "Failed to generate libxsmm kernel for the SpMM operation."
......@@ -278,32 +277,34 @@ inline libxsmm_meltwfunction_opreduce_vecs_idx SpMMCreateLibxsmmKernel(
* \param C The result feature on destination nodes.
* \param has_idx For the edge features, are there indices available.
* \param N Feature size.
* \param num_M_blocks Number of blocks to create along the rows of adjacency matrix.
* \param num_K_blocks Number of blocks to create along the columns of adjacency matrix.
* \param num_M_blocks Number of blocks to create along the rows of adjacency
* matrix.
* \param num_K_blocks Number of blocks to create along the columns of adjacency
* matrix.
* \param M_block_size block size along the rows of adjacency matrix.
* \param kernel The libxsmm kernel.
*/
template <typename IdType, typename DType>
inline void SpMMBlockwiseOpSum(
CSRMatrixInternal<IdType, IdType> *block_csr_array,
const DType *B, const DType *E, DType *C, bool has_idx, IdType N,
IdType num_M_blocks, IdType num_K_blocks, IdType M_block_size,
CSRMatrixInternal<IdType, IdType> *block_csr_array, const DType *B,
const DType *E, DType *C, bool has_idx, IdType N, IdType num_M_blocks,
IdType num_K_blocks, IdType M_block_size,
libxsmm_meltwfunction_opreduce_vecs_idx kernel) {
DType (*in_matrix1)[N] = (DType (*)[N])B;
DType (*in_matrix2)[N] = (DType (*)[N])E;
DType (*output)[N] = (DType (*)[N])C;
DType(*in_matrix1)[N] = (DType(*)[N])B;
DType(*in_matrix2)[N] = (DType(*)[N])E;
DType(*output)[N] = (DType(*)[N])C;
#pragma omp parallel
{
for (IdType k = 0; k < num_K_blocks; k++) {
#pragma omp for schedule(dynamic)
for (IdType m = 0; m < num_M_blocks; m++) {
CSRMatrixInternal<IdType, IdType> cur_csr = block_csr_array[m * num_K_blocks + k];
CSRMatrixInternal<IdType, IdType> cur_csr =
block_csr_array[m * num_K_blocks + k];
const IdType M_start = m * M_block_size;
for (IdType i = 0; i < cur_csr.num_rows; i++) {
const IdType row_start = cur_csr.indptr[i];
const IdType row_end = cur_csr.indptr[i + 1];
const IdType row_end = cur_csr.indptr[i + 1];
const IdType dst = i + M_start;
libxsmm_meltw_opreduce_vecs_idx_param params;
......@@ -335,36 +336,37 @@ inline void SpMMBlockwiseOpSum(
* \param argE Arg-Min/Max on edges.
* \param has_idx For the edge features, are there indices available.
* \param N Feature size.
* \param num_M_blocks Number of blocks to create along the rows of adjacency matrix.
* \param num_K_blocks Number of blocks to create along the columns of adjacency matrix.
* \param num_M_blocks Number of blocks to create along the rows of adjacency
* matrix.
* \param num_K_blocks Number of blocks to create along the columns of adjacency
* matrix.
* \param M_block_size block size along the rows of adjacency matrix.
* \param kernel The libxsmm kernel.
*/
template <typename IdType, typename DType, typename Op, typename Cmp>
inline void SpMMBlockwiseOpCmp(
CSRMatrixInternal<IdType, IdType> *block_csr_array,
const DType *B, const DType *E, DType *C, IdType *argB, IdType *argE,
bool has_idx, IdType N,
IdType num_M_blocks, IdType num_K_blocks, IdType M_block_size,
CSRMatrixInternal<IdType, IdType> *block_csr_array, const DType *B,
const DType *E, DType *C, IdType *argB, IdType *argE, bool has_idx,
IdType N, IdType num_M_blocks, IdType num_K_blocks, IdType M_block_size,
libxsmm_meltwfunction_opreduce_vecs_idx kernel) {
DType (*in_matrix1)[N] = (DType (*)[N])B;
DType (*in_matrix2)[N] = (DType (*)[N])E;
DType (*output)[N] = (DType (*)[N])C;
IdType (*out_matrix1)[N] = (IdType (*)[N])argB;
IdType (*out_matrix2)[N] = (IdType (*)[N])argE;
DType(*in_matrix1)[N] = (DType(*)[N])B;
DType(*in_matrix2)[N] = (DType(*)[N])E;
DType(*output)[N] = (DType(*)[N])C;
IdType(*out_matrix1)[N] = (IdType(*)[N])argB;
IdType(*out_matrix2)[N] = (IdType(*)[N])argE;
#pragma omp parallel
{
for (IdType k = 0; k < num_K_blocks; k++) {
#pragma omp for schedule(dynamic)
for (IdType m = 0; m < num_M_blocks; m++) {
CSRMatrixInternal<IdType, IdType> cur_csr = block_csr_array[m * num_K_blocks + k];
CSRMatrixInternal<IdType, IdType> cur_csr =
block_csr_array[m * num_K_blocks + k];
const IdType M_start = m * M_block_size;
for (IdType i = 0; i < cur_csr.num_rows; i++) {
const IdType row_start = cur_csr.indptr[i];
const IdType row_end = cur_csr.indptr[i + 1];
const IdType row_end = cur_csr.indptr[i + 1];
const IdType dst = i + M_start;
libxsmm_meltw_opreduce_vecs_idx_param params;
......@@ -391,23 +393,21 @@ inline void SpMMBlockwiseOpCmp(
/*!
* \brief Free the tiled CSR matrix data.
* \param block_csr_array The array containing csr matrices of all blocks.
* \param num_M_blocks Number of blocks to create along the rows of adjacency matrix.
* \param num_K_blocks Number of blocks to create along the columns of adjacency matrix.
* \param num_M_blocks Number of blocks to create along the rows of adjacency
* matrix.
* \param num_K_blocks Number of blocks to create along the columns of adjacency
* matrix.
* \param use_lhs Whether to use lhs.
* \param use_rhs Whether to use rhs.
*/
template <typename IdType>
inline void SpMMFreeBlocks(
CSRMatrixInternal<IdType, IdType> *block_csr_array,
IdType num_M_blocks, IdType num_K_blocks,
bool use_lhs, bool use_rhs) {
CSRMatrixInternal<IdType, IdType> *block_csr_array, IdType num_M_blocks,
IdType num_K_blocks, bool use_lhs, bool use_rhs) {
if (num_K_blocks > 1) {
free(block_csr_array[0].indptr);
if (use_lhs)
free(block_csr_array[0].indices);
if (use_rhs)
free(block_csr_array[0].data);
if (use_lhs) free(block_csr_array[0].indices);
if (use_rhs) free(block_csr_array[0].data);
}
free(block_csr_array);
}
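To make the tile layout explicit: the adjacency matrix is covered by num_M_blocks x num_K_blocks sub-CSRs, stored flat so that the tile holding rows starting at m * M_block_size and columns starting at k * K_block_size sits at index m * num_K_blocks + k, exactly as indexed in the code above. A tiny bookkeeping sketch with illustrative helper names:

#include <cstdint>

// Ceiling division: how many tiles of `block_size` cover `total` rows/columns.
inline int64_t NumBlocks(int64_t total, int64_t block_size) {
  return (total + block_size - 1) / block_size;
}

// Flat index of tile (m, k), row-major over the column-block dimension.
inline int64_t BlockIndex(int64_t m, int64_t k, int64_t num_K_blocks) {
  return m * num_K_blocks + k;
}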
......@@ -425,12 +425,8 @@ inline void SpMMFreeBlocks(
*/
template <typename IdType, typename DType, typename Op, typename Redop>
void SpMMRedopCsrOpt(
const BcastOff& bcast,
const CSRMatrix& csr,
NDArray ufeat, NDArray efeat,
NDArray out,
NDArray argu, NDArray arge) {
const BcastOff &bcast, const CSRMatrix &csr, NDArray ufeat, NDArray efeat,
NDArray out, NDArray argu, NDArray arge) {
int32_t llc_size = GetLLCSize();
#ifdef DEBUG
......@@ -440,11 +436,12 @@ void SpMMRedopCsrOpt(
const bool has_idx = !IsNullArray(csr.data);
DType* C = out.Ptr<DType>();
const DType* B = ufeat.Ptr<DType>();
const DType* E = efeat.Ptr<DType>();
DType *C = out.Ptr<DType>();
const DType *B = ufeat.Ptr<DType>();
const DType *E = efeat.Ptr<DType>();
IdType *argB, *argE;
if (std::is_same<Redop, op::Max<DType>>::value || std::is_same<Redop, op::Min<DType>>::value) {
if (std::is_same<Redop, op::Max<DType>>::value ||
std::is_same<Redop, op::Min<DType>>::value) {
argB = argu.Ptr<IdType>();
argE = arge.Ptr<IdType>();
}
......@@ -453,7 +450,7 @@ void SpMMRedopCsrOpt(
const IdType M = csr.num_rows;
const IdType N = bcast.out_len;
const IdType K = csr.num_cols;
const IdType* indptr = csr.indptr.Ptr<IdType>();
const IdType *indptr = csr.indptr.Ptr<IdType>();
CHECK_NOTNULL(indptr);
const IdType total_nnz = indptr[M];
if (M <= 0 || K <= 0 || N <= 0 || total_nnz <= 0) return;
......@@ -461,8 +458,9 @@ void SpMMRedopCsrOpt(
const double avg_degree = total_nnz * 1.0 / M;
const double nnz_prob = avg_degree / K;
IdType K_block_size = std::min((int64_t)K, (int64_t)(llc_size / (N * sizeof(DType) *
nnz_prob * BLOCKING_HEURISTIC_PARAM)));
IdType K_block_size = std::min(
(int64_t)K,
(int64_t)(llc_size / (N * sizeof(DType) * nnz_prob * BLOCKING_HEURISTIC_PARAM)));
IdType M_block_size = M / (nthreads * NUM_BLOCKS_PER_THREAD);
if (M_block_size == 0) M_block_size = 1;
if (K_block_size == 0) K_block_size = 1;
......@@ -471,8 +469,9 @@ void SpMMRedopCsrOpt(
IdType num_K_blocks = (K + K_block_size - 1) / K_block_size;
CSRMatrixInternal<IdType, IdType> *block_csr_array =
(CSRMatrixInternal<IdType, IdType> *)aligned_alloc(64,
sizeof(CSRMatrixInternal<IdType, IdType>) * num_M_blocks * num_K_blocks);
(CSRMatrixInternal<IdType, IdType> *)aligned_alloc(
64, sizeof(CSRMatrixInternal<IdType, IdType>) * num_M_blocks *
num_K_blocks);
#ifdef DEBUG
endTick = __rdtsc();
......@@ -489,14 +488,17 @@ void SpMMRedopCsrOpt(
LOG(INFO) << "total_nnz = " << total_nnz << ", avg_degree = " << avg_degree;
LOG(INFO) << "has_idx = " << has_idx;
LOG(INFO) << "nnz_prob = " << nnz_prob;
LOG(INFO) << "K_block_size = " << K_block_size << ", M_block_size = " << M_block_size;
LOG(INFO) << "num_K_blocks = " << num_K_blocks << ", num_M_blocks = " << num_M_blocks;
LOG(INFO) << "K_block_size = " << K_block_size
<< ", M_block_size = " << M_block_size;
LOG(INFO) << "num_K_blocks = " << num_K_blocks
<< ", num_M_blocks = " << num_M_blocks;
LOG(INFO) << "stage0 ticks = " << (endTick - startTick);
startTick = __rdtsc();
#endif // DEBUG
SpMMCreateBlocks(csr, block_csr_array, num_M_blocks, num_K_blocks, M_block_size, K_block_size,
Op::use_lhs, Op::use_rhs);
SpMMCreateBlocks(
csr, block_csr_array, num_M_blocks, num_K_blocks, M_block_size,
K_block_size, Op::use_lhs, Op::use_rhs);
#ifdef DEBUG
endTick = __rdtsc();
......@@ -506,17 +508,14 @@ void SpMMRedopCsrOpt(
libxsmm_meltwfunction_opreduce_vecs_idx kernel = nullptr;
if (std::is_same<Redop, op::Max<DType>>::value) {
kernel = SpMMCreateLibxsmmKernel<IdType, DType, Op>(has_idx, N,
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_MAX,
true);
kernel = SpMMCreateLibxsmmKernel<IdType, DType, Op>(
has_idx, N, LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_MAX, true);
} else if (std::is_same<Redop, op::Min<DType>>::value) {
kernel = SpMMCreateLibxsmmKernel<IdType, DType, Op>(has_idx, N,
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_MIN,
true);
kernel = SpMMCreateLibxsmmKernel<IdType, DType, Op>(
has_idx, N, LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_MIN, true);
} else if (std::is_same<Redop, op::Add<DType>>::value) {
kernel = SpMMCreateLibxsmmKernel<IdType, DType, Op>(has_idx, N,
LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_SUM,
false);
kernel = SpMMCreateLibxsmmKernel<IdType, DType, Op>(
has_idx, N, LIBXSMM_MELTW_FLAG_OPREDUCE_VECS_REDOP_SUM, false);
}
#ifdef DEBUG
......@@ -525,12 +524,15 @@ void SpMMRedopCsrOpt(
startTick = __rdtsc();
#endif // DEBUG
if (std::is_same<Redop, op::Max<DType>>::value || std::is_same<Redop, op::Min<DType>>::value) {
SpMMBlockwiseOpCmp<IdType, DType, Op, Redop>(block_csr_array, B, E, C, argB, argE, has_idx, N,
num_M_blocks, num_K_blocks, M_block_size, kernel);
if (std::is_same<Redop, op::Max<DType>>::value ||
std::is_same<Redop, op::Min<DType>>::value) {
SpMMBlockwiseOpCmp<IdType, DType, Op, Redop>(
block_csr_array, B, E, C, argB, argE, has_idx, N, num_M_blocks,
num_K_blocks, M_block_size, kernel);
} else {
SpMMBlockwiseOpSum(block_csr_array, B, E, C, has_idx, N, num_M_blocks, num_K_blocks,
M_block_size, kernel);
SpMMBlockwiseOpSum(
block_csr_array, B, E, C, has_idx, N, num_M_blocks, num_K_blocks,
M_block_size, kernel);
}
#ifdef DEBUG
......@@ -539,7 +541,8 @@ void SpMMRedopCsrOpt(
startTick = __rdtsc();
#endif // DEBUG
SpMMFreeBlocks(block_csr_array, num_M_blocks, num_K_blocks, Op::use_lhs, Op::use_rhs);
SpMMFreeBlocks(
block_csr_array, num_M_blocks, num_K_blocks, Op::use_lhs, Op::use_rhs);
#ifdef DEBUG
endTick = __rdtsc();
......@@ -557,10 +560,12 @@ void SpMMRedopCsrOpt(
* \note it uses libxsmm, blocking and dynamic thread scheduling.
*/
template <typename IdType, typename DType, typename Op>
void SpMMSumCsrLibxsmm(const BcastOff& bcast, const CSRMatrix& csr,
NDArray ufeat, NDArray efeat, NDArray out) {
void SpMMSumCsrLibxsmm(
const BcastOff &bcast, const CSRMatrix &csr, NDArray ufeat, NDArray efeat,
NDArray out) {
NDArray dummy;
SpMMRedopCsrOpt<IdType, DType, Op, op::Add<DType>>(bcast, csr, ufeat, efeat, out, dummy, dummy);
SpMMRedopCsrOpt<IdType, DType, Op, op::Add<DType>>(
bcast, csr, ufeat, efeat, out, dummy, dummy);
}
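SpMMSumCsrLibxsmm above forwards to SpMMRedopCsrOpt, which sizes its column blocks from the last-level cache as shown earlier in this file (K_block_size = min(K, llc_size / (N * sizeof(DType) * nnz_prob * BLOCKING_HEURISTIC_PARAM))). A self-contained sketch of that arithmetic; every number below, including the heuristic parameter, is hypothetical:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Mirrors the visible heuristic: nnz_prob estimates how likely a (row, column)
// pair carries an edge, and the column block shrinks as features get wider.
int64_t PickKBlockSize(int64_t K, int64_t M, int64_t total_nnz, int64_t N,
                       int64_t llc_size, double heuristic_param,
                       size_t dtype_bytes) {
  const double avg_degree = total_nnz * 1.0 / M;
  const double nnz_prob = avg_degree / K;
  int64_t k_block = std::min(
      K, (int64_t)(llc_size / (N * dtype_bytes * nnz_prob * heuristic_param)));
  return k_block == 0 ? 1 : k_block;
}

int main() {
  const int64_t K = 10000;
  const int64_t kbs = PickKBlockSize(K, /*M=*/10000, /*total_nnz=*/5000000,
                                     /*N=*/256, /*llc_size=*/1 << 20,
                                     /*heuristic_param=*/10.0, sizeof(float));
  // With these made-up numbers: K_block_size = 2048, num_K_blocks = 5.
  std::printf("K_block_size = %lld, num_K_blocks = %lld\n", (long long)kbs,
              (long long)((K + kbs - 1) / kbs));
  return 0;
}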
/*!
......@@ -575,9 +580,11 @@ void SpMMSumCsrLibxsmm(const BcastOff& bcast, const CSRMatrix& csr,
* \note it uses libxsmm, blocking and dynamic thread scheduling.
*/
template <typename IdType, typename DType, typename Op, typename Cmp>
void SpMMCmpCsrLibxsmm(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
NDArray efeat, NDArray out, NDArray argu, NDArray arge) {
SpMMRedopCsrOpt<IdType, DType, Op, Cmp>(bcast, csr, ufeat, efeat, out, argu, arge);
void SpMMCmpCsrLibxsmm(
const BcastOff &bcast, const CSRMatrix &csr, NDArray ufeat, NDArray efeat,
NDArray out, NDArray argu, NDArray arge) {
SpMMRedopCsrOpt<IdType, DType, Op, Cmp>(
bcast, csr, ufeat, efeat, out, argu, arge);
}
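These wrappers specialize the naive node-parallel kernels from spmm.h (shown partially earlier in this diff). For orientation, the sum reduction they accelerate amounts to the per-row accumulation below, written here for the copy-from-source operator and a flat float feature of width dim (a rough sketch, not the DGL code path):

#include <cstdint>
#include <vector>

// out must be pre-sized to num_rows * dim and zero-initialized.
void SpmmSumCsrSketch(const std::vector<int64_t>& indptr,
                      const std::vector<int64_t>& indices,
                      const std::vector<float>& u_feat,  // num_cols * dim
                      std::vector<float>* out, int64_t dim) {
  const int64_t num_rows = (int64_t)indptr.size() - 1;
  for (int64_t rid = 0; rid < num_rows; ++rid) {  // DGL parallelizes this loop
    float* out_off = out->data() + rid * dim;
    for (int64_t p = indptr[rid]; p < indptr[rid + 1]; ++p) {
      const int64_t cid = indices[p];
      for (int64_t k = 0; k < dim; ++k) {
        out_off[k] += u_feat[cid * dim + k];  // Op = CopyLhs, Redop = Add
      }
    }
  }
}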
} // namespace cpu
......
......@@ -4,58 +4,49 @@
* \brief Graph traversal implementation
*/
#include "./traversal.h"
#include <dgl/graph_traversal.h>
#include <algorithm>
#include <queue>
#include "./traversal.h"
namespace dgl {
namespace aten {
namespace impl {
namespace {
// A utility view class to wrap a vector into a queue.
template<typename DType>
template <typename DType>
struct VectorQueueWrapper {
std::vector<DType>* vec;
size_t head = 0;
explicit VectorQueueWrapper(std::vector<DType>* vec): vec(vec) {}
explicit VectorQueueWrapper(std::vector<DType>* vec) : vec(vec) {}
void push(const DType& elem) {
vec->push_back(elem);
}
void push(const DType& elem) { vec->push_back(elem); }
DType top() const {
return vec->operator[](head);
}
DType top() const { return vec->operator[](head); }
void pop() {
++head;
}
void pop() { ++head; }
bool empty() const {
return head == vec->size();
}
bool empty() const { return head == vec->size(); }
size_t size() const {
return vec->size() - head;
}
size_t size() const { return vec->size() - head; }
};
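A small usage sketch of the wrapper above: because pop() only advances a head index, everything ever pushed stays in the backing vector, so the same vector serves as the BFS queue and as the visitation-order trace that the frontier functions return (illustrative code, not the DGL API):

#include <cstdint>
#include <cstdio>
#include <vector>

// Same idea as VectorQueueWrapper: a FIFO view over a growing vector.
template <typename T>
struct VecQueue {
  std::vector<T>* vec;
  size_t head = 0;
  explicit VecQueue(std::vector<T>* v) : vec(v) {}
  void push(const T& x) { vec->push_back(x); }
  T top() const { return (*vec)[head]; }
  void pop() { ++head; }
  bool empty() const { return head == vec->size(); }
};

int main() {
  std::vector<int64_t> trace;
  VecQueue<int64_t> q(&trace);
  q.push(0);
  q.push(3);
  q.push(5);
  while (!q.empty()) {
    const int64_t u = q.top();  // a BFS would expand u here
    q.pop();
    (void)u;
  }
  // The full visitation order is still available in `trace`.
  for (int64_t v : trace) std::printf("%lld ", (long long)v);
  std::printf("\n");
  return 0;
}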
// Internal function to merge multiple traversal traces into one ndarray.
// It is similar to zip the vectors together.
template<typename DType>
IdArray MergeMultipleTraversals(
const std::vector<std::vector<DType>>& traces) {
template <typename DType>
IdArray MergeMultipleTraversals(const std::vector<std::vector<DType>>& traces) {
int64_t max_len = 0, total_len = 0;
for (size_t i = 0; i < traces.size(); ++i) {
const int64_t tracelen = traces[i].size();
max_len = std::max(max_len, tracelen);
total_len += traces[i].size();
}
IdArray ret = IdArray::Empty({total_len},
DGLDataType{kDGLInt, sizeof(DType) * 8, 1},
DGLContext{kDGLCPU, 0});
IdArray ret = IdArray::Empty(
{total_len}, DGLDataType{kDGLInt, sizeof(DType) * 8, 1},
DGLContext{kDGLCPU, 0});
DType* ret_data = static_cast<DType*>(ret->data);
for (int64_t i = 0; i < max_len; ++i) {
for (size_t j = 0; j < traces.size(); ++j) {
......@@ -71,15 +62,15 @@ IdArray MergeMultipleTraversals(
// Internal function to compute sections if multiple traversal traces
// are merged into one ndarray.
template<typename DType>
IdArray ComputeMergedSections(
const std::vector<std::vector<DType>>& traces) {
template <typename DType>
IdArray ComputeMergedSections(const std::vector<std::vector<DType>>& traces) {
int64_t max_len = 0;
for (size_t i = 0; i < traces.size(); ++i) {
const int64_t tracelen = traces[i].size();
max_len = std::max(max_len, tracelen);
}
IdArray ret = IdArray::Empty({max_len}, DGLDataType{kDGLInt, 64, 1}, DGLContext{kDGLCPU, 0});
IdArray ret = IdArray::Empty(
{max_len}, DGLDataType{kDGLInt, 64, 1}, DGLContext{kDGLCPU, 0});
int64_t* ret_data = static_cast<int64_t*>(ret->data);
for (int64_t i = 0; i < max_len; ++i) {
int64_t sec_len = 0;
......@@ -101,13 +92,13 @@ Frontiers BFSNodesFrontiers(const CSRMatrix& csr, IdArray source) {
std::vector<IdType> ids;
std::vector<int64_t> sections;
VectorQueueWrapper<IdType> queue(&ids);
auto visit = [&] (const int64_t v) { };
auto make_frontier = [&] () {
if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
auto visit = [&](const int64_t v) {};
auto make_frontier = [&]() {
if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
BFSTraverseNodes<IdType>(csr, source, &queue, visit, make_frontier);
Frontiers front;
......@@ -116,8 +107,10 @@ Frontiers BFSNodesFrontiers(const CSRMatrix& csr, IdArray source) {
return front;
}
template Frontiers BFSNodesFrontiers<kDGLCPU, int32_t>(const CSRMatrix&, IdArray);
template Frontiers BFSNodesFrontiers<kDGLCPU, int64_t>(const CSRMatrix&, IdArray);
template Frontiers BFSNodesFrontiers<kDGLCPU, int32_t>(
const CSRMatrix&, IdArray);
template Frontiers BFSNodesFrontiers<kDGLCPU, int64_t>(
const CSRMatrix&, IdArray);
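For readers skimming the diff, this is the shape of the result BFSNodesFrontiers assembles: the flat visitation order plus the size of each BFS level. A stand-alone sketch over a plain CSR with a single source node, illustrative names only:

#include <cstdint>
#include <vector>

struct FrontiersSketch {
  std::vector<int64_t> ids;       // all visited nodes, level by level
  std::vector<int64_t> sections;  // length of each level
};

FrontiersSketch BfsNodeFrontiers(const std::vector<int64_t>& indptr,
                                 const std::vector<int64_t>& indices,
                                 int64_t source) {
  const int64_t n = (int64_t)indptr.size() - 1;
  std::vector<bool> visited(n, false);
  FrontiersSketch out;
  out.ids.push_back(source);
  visited[source] = true;
  size_t level_begin = 0;
  while (level_begin < out.ids.size()) {
    const size_t level_end = out.ids.size();
    out.sections.push_back((int64_t)(level_end - level_begin));
    for (size_t i = level_begin; i < level_end; ++i) {
      const int64_t u = out.ids[i];
      for (int64_t p = indptr[u]; p < indptr[u + 1]; ++p) {
        const int64_t v = indices[p];
        if (!visited[v]) {
          visited[v] = true;
          out.ids.push_back(v);  // enqueue into the next level
        }
      }
    }
    level_begin = level_end;
  }
  return out;
}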
template <DGLDeviceType XPU, typename IdType>
Frontiers BFSEdgesFrontiers(const CSRMatrix& csr, IdArray source) {
......@@ -126,16 +119,16 @@ Frontiers BFSEdgesFrontiers(const CSRMatrix& csr, IdArray source) {
// NOTE: std::queue has no top() method.
std::vector<IdType> nodes;
VectorQueueWrapper<IdType> queue(&nodes);
auto visit = [&] (const IdType e) { ids.push_back(e); };
auto visit = [&](const IdType e) { ids.push_back(e); };
bool first_frontier = true;
auto make_frontier = [&] {
if (first_frontier) {
first_frontier = false; // do not push the first section when doing edges
} else if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
if (first_frontier) {
first_frontier = false; // do not push the first section when doing edges
} else if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
BFSTraverseEdges<IdType>(csr, source, &queue, visit, make_frontier);
Frontiers front;
......@@ -144,21 +137,23 @@ Frontiers BFSEdgesFrontiers(const CSRMatrix& csr, IdArray source) {
return front;
}
template Frontiers BFSEdgesFrontiers<kDGLCPU, int32_t>(const CSRMatrix&, IdArray);
template Frontiers BFSEdgesFrontiers<kDGLCPU, int64_t>(const CSRMatrix&, IdArray);
template Frontiers BFSEdgesFrontiers<kDGLCPU, int32_t>(
const CSRMatrix&, IdArray);
template Frontiers BFSEdgesFrontiers<kDGLCPU, int64_t>(
const CSRMatrix&, IdArray);
template <DGLDeviceType XPU, typename IdType>
Frontiers TopologicalNodesFrontiers(const CSRMatrix& csr) {
std::vector<IdType> ids;
std::vector<int64_t> sections;
VectorQueueWrapper<IdType> queue(&ids);
auto visit = [&] (const uint64_t v) { };
auto make_frontier = [&] () {
if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
auto visit = [&](const uint64_t v) {};
auto make_frontier = [&]() {
if (!queue.empty()) {
// do not push zero-length frontier
sections.push_back(queue.size());
}
};
TopologicalNodes<IdType>(csr, &queue, visit, make_frontier);
Frontiers front;
......@@ -167,8 +162,10 @@ Frontiers TopologicalNodesFrontiers(const CSRMatrix& csr) {
return front;
}
template Frontiers TopologicalNodesFrontiers<kDGLCPU, int32_t>(const CSRMatrix&);
template Frontiers TopologicalNodesFrontiers<kDGLCPU, int64_t>(const CSRMatrix&);
template Frontiers TopologicalNodesFrontiers<kDGLCPU, int32_t>(
const CSRMatrix&);
template Frontiers TopologicalNodesFrontiers<kDGLCPU, int64_t>(
const CSRMatrix&);
template <DGLDeviceType XPU, typename IdType>
Frontiers DGLDFSEdges(const CSRMatrix& csr, IdArray source) {
......@@ -177,7 +174,7 @@ Frontiers DGLDFSEdges(const CSRMatrix& csr, IdArray source) {
std::vector<std::vector<IdType>> edges(len);
for (int64_t i = 0; i < len; ++i) {
auto visit = [&] (IdType e, int tag) { edges[i].push_back(e); };
auto visit = [&](IdType e, int tag) { edges[i].push_back(e); };
DFSLabeledEdges<IdType>(csr, src_data[i], false, false, visit);
}
......@@ -191,11 +188,9 @@ template Frontiers DGLDFSEdges<kDGLCPU, int32_t>(const CSRMatrix&, IdArray);
template Frontiers DGLDFSEdges<kDGLCPU, int64_t>(const CSRMatrix&, IdArray);
template <DGLDeviceType XPU, typename IdType>
Frontiers DGLDFSLabeledEdges(const CSRMatrix& csr,
IdArray source,
const bool has_reverse_edge,
const bool has_nontree_edge,
const bool return_labels) {
Frontiers DGLDFSLabeledEdges(
const CSRMatrix& csr, IdArray source, const bool has_reverse_edge,
const bool has_nontree_edge, const bool return_labels) {
const int64_t len = source->shape[0];
const IdType* src_data = static_cast<IdType*>(source->data);
std::vector<std::vector<IdType>> edges(len);
......@@ -206,14 +201,14 @@ Frontiers DGLDFSLabeledEdges(const CSRMatrix& csr,
}
for (int64_t i = 0; i < len; ++i) {
auto visit = [&] (IdType e, int64_t tag) {
auto visit = [&](IdType e, int64_t tag) {
edges[i].push_back(e);
if (return_labels) {
tags[i].push_back(tag);
}
};
DFSLabeledEdges<IdType>(csr, src_data[i],
has_reverse_edge, has_nontree_edge, visit);
DFSLabeledEdges<IdType>(
csr, src_data[i], has_reverse_edge, has_nontree_edge, visit);
}
Frontiers front;
......@@ -226,16 +221,10 @@ Frontiers DGLDFSLabeledEdges(const CSRMatrix& csr,
return front;
}
template Frontiers DGLDFSLabeledEdges<kDGLCPU, int32_t>(const CSRMatrix&,
IdArray,
const bool,
const bool,
const bool);
template Frontiers DGLDFSLabeledEdges<kDGLCPU, int64_t>(const CSRMatrix&,
IdArray,
const bool,
const bool,
const bool);
template Frontiers DGLDFSLabeledEdges<kDGLCPU, int32_t>(
const CSRMatrix&, IdArray, const bool, const bool, const bool);
template Frontiers DGLDFSLabeledEdges<kDGLCPU, int64_t>(
const CSRMatrix&, IdArray, const bool, const bool, const bool);
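DGLDFSLabeledEdges above collects, per source, the out-edges in DFS order together with an optional tag (FORWARD, REVERSE, NONTREE; see the notes in traversal.h below). A rough stand-alone sketch of one such traversal, always reporting all three tags and using the CSR position as the edge id (the real code honours csr.data and the has_* flags):

#include <cstdint>
#include <vector>

enum Tag { kForward = 0, kReverse = 1, kNontree = 2 };

// Iterative DFS from `source`: FORWARD when the edge discovers a new node,
// REVERSE when that tree edge is left again while backtracking, NONTREE when
// the head was already visited.
void DfsLabeledEdgesSketch(const std::vector<int64_t>& indptr,
                           const std::vector<int64_t>& indices, int64_t source,
                           void (*visit)(int64_t edge_id, Tag tag)) {
  const int64_t n = (int64_t)indptr.size() - 1;
  std::vector<bool> visited(n, false);
  struct Frame { int64_t u, next, in_edge; };  // node, next edge offset, entering edge
  std::vector<Frame> stack;
  stack.push_back({source, indptr[source], -1});
  visited[source] = true;
  while (!stack.empty()) {
    Frame& f = stack.back();
    if (f.next == indptr[f.u + 1]) {  // all out-edges of u handled: backtrack
      if (f.in_edge >= 0) visit(f.in_edge, kReverse);
      stack.pop_back();
      continue;
    }
    const int64_t e = f.next++;
    const int64_t v = indices[e];
    if (!visited[v]) {
      visited[v] = true;
      visit(e, kForward);
      stack.push_back({v, indptr[v], e});
    } else {
      visit(e, kNontree);
    }
  }
}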
} // namespace impl
} // namespace aten
......
......@@ -3,15 +3,16 @@
* \file array/cpu/traversal.h
* \brief Graph traversal routines.
*
* Traversal routines generate frontiers. Frontiers can be node frontiers or edge
* frontiers depending on the traversal function. Each frontier is a
* list of nodes/edges (specified by their ids). An optional tag can be specified
* for each node/edge (represented by an int value).
* Traversal routines generate frontiers. Frontiers can be node frontiers or
* edge frontiers depending on the traversal function. Each frontier is a list
* of nodes/edges (specified by their ids). An optional tag can be specified for
* each node/edge (represented by an int value).
*/
#ifndef DGL_ARRAY_CPU_TRAVERSAL_H_
#define DGL_ARRAY_CPU_TRAVERSAL_H_
#include <dgl/graph_interface.h>
#include <stack>
#include <tuple>
#include <vector>
......@@ -43,16 +44,16 @@ namespace impl {
* \param reversed If true, BFS follows the in-edge direction
* \param queue The queue used to do bfs.
* \param visit The function to call when a node is visited.
 * \param make_frontier The function to indicate that a new frontier can be made;
 * \param make_frontier The function to indicate that a new frontier can be
 *        made;
*/
template<typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void BFSTraverseNodes(const CSRMatrix& csr,
IdArray source,
Queue* queue,
VisitFn visit,
FrontierFn make_frontier) {
template <
typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void BFSTraverseNodes(
const CSRMatrix &csr, IdArray source, Queue *queue, VisitFn visit,
FrontierFn make_frontier) {
const int64_t len = source->shape[0];
const IdType *src_data = static_cast<IdType*>(source->data);
const IdType *src_data = static_cast<IdType *>(source->data);
const IdType *indptr_data = static_cast<IdType *>(csr.indptr->data);
const IdType *indices_data = static_cast<IdType *>(csr.indices->data);
......@@ -71,7 +72,7 @@ void BFSTraverseNodes(const CSRMatrix& csr,
for (size_t i = 0; i < size; ++i) {
const IdType u = queue->top();
queue->pop();
for (auto idx = indptr_data[u]; idx < indptr_data[u+1]; ++idx) {
for (auto idx = indptr_data[u]; idx < indptr_data[u + 1]; ++idx) {
auto v = indices_data[idx];
if (!visited[v]) {
visited[v] = true;
......@@ -109,16 +110,16 @@ void BFSTraverseNodes(const CSRMatrix& csr,
* \param queue The queue used to do bfs.
* \param visit The function to call when a node is visited.
* The argument would be edge ID.
* \param make_frontier The function to indicate that a new frontier can be made;
* \param make_frontier The function to indicate that a new frontier can be
* made;
*/
template<typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void BFSTraverseEdges(const CSRMatrix& csr,
IdArray source,
Queue* queue,
VisitFn visit,
FrontierFn make_frontier) {
template <
typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void BFSTraverseEdges(
const CSRMatrix &csr, IdArray source, Queue *queue, VisitFn visit,
FrontierFn make_frontier) {
const int64_t len = source->shape[0];
const IdType* src_data = static_cast<IdType*>(source->data);
const IdType *src_data = static_cast<IdType *>(source->data);
const IdType *indptr_data = static_cast<IdType *>(csr.indptr->data);
const IdType *indices_data = static_cast<IdType *>(csr.indices->data);
......@@ -138,7 +139,7 @@ void BFSTraverseEdges(const CSRMatrix& csr,
for (size_t i = 0; i < size; ++i) {
const IdType u = queue->top();
queue->pop();
for (auto idx = indptr_data[u]; idx < indptr_data[u+1]; ++idx) {
for (auto idx = indptr_data[u]; idx < indptr_data[u + 1]; ++idx) {
auto e = eid_data ? eid_data[idx] : idx;
const IdType v = indices_data[idx];
if (!visited[v]) {
......@@ -174,13 +175,14 @@ void BFSTraverseEdges(const CSRMatrix& csr,
* \param reversed If true, follows the in-edge direction
* \param queue The queue used to do bfs.
* \param visit The function to call when a node is visited.
* \param make_frontier The function to indicate that a new frontier can be made;
* \param make_frontier The function to indicate that a new frontier can be
* made;
*/
template<typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void TopologicalNodes(const CSRMatrix& csr,
Queue* queue,
VisitFn visit,
FrontierFn make_frontier) {
template <
typename IdType, typename Queue, typename VisitFn, typename FrontierFn>
void TopologicalNodes(
const CSRMatrix &csr, Queue *queue, VisitFn visit,
FrontierFn make_frontier) {
int64_t num_visited_nodes = 0;
const IdType *indptr_data = static_cast<IdType *>(csr.indptr->data);
const IdType *indices_data = static_cast<IdType *>(csr.indices->data);
......@@ -206,7 +208,7 @@ void TopologicalNodes(const CSRMatrix& csr,
for (size_t i = 0; i < size; ++i) {
const IdType u = queue->top();
queue->pop();
for (auto idx = indptr_data[u]; idx < indptr_data[u+1]; ++idx) {
for (auto idx = indptr_data[u]; idx < indptr_data[u + 1]; ++idx) {
const IdType v = indices_data[idx];
if (--(degrees[v]) == 0) {
visit(v);
......@@ -219,7 +221,8 @@ void TopologicalNodes(const CSRMatrix& csr,
}
if (num_visited_nodes != num_nodes) {
LOG(FATAL) << "Error in topological traversal: loop detected in the given graph.";
LOG(FATAL)
<< "Error in topological traversal: loop detected in the given graph.";
}
}
......@@ -236,32 +239,29 @@ enum DFSEdgeTag {
* FORWARD(0), REVERSE(1), NONTREE(2)
*
* A FORWARD edge is one in which `u` has been visited but `v` has not.
* A REVERSE edge is one in which both `u` and `v` have been visited and the edge
* is in the DFS tree.
* A NONTREE edge is one in which both `u` and `v` have been visited but the edge
* is NOT in the DFS tree.
* A REVERSE edge is one in which both `u` and `v` have been visited and the
* edge is in the DFS tree. A NONTREE edge is one in which both `u` and `v` have
* been visited but the edge is NOT in the DFS tree.
*
* \param source Source node.
* \param reversed If true, DFS follows the in-edge direction
* \param has_reverse_edge If true, REVERSE edges are included
* \param has_nontree_edge If true, NONTREE edges are included
* \param visit The function to call when an edge is visited; the edge id and its
* tag will be given as the arguments.
* \param visit The function to call when an edge is visited; the edge id and
* its tag will be given as the arguments.
*/
template<typename IdType, typename VisitFn>
void DFSLabeledEdges(const CSRMatrix& csr,
IdType source,
bool has_reverse_edge,
bool has_nontree_edge,
VisitFn visit) {
template <typename IdType, typename VisitFn>
void DFSLabeledEdges(
const CSRMatrix &csr, IdType source, bool has_reverse_edge,
bool has_nontree_edge, VisitFn visit) {
const int64_t num_nodes = csr.num_rows;
CHECK_GE(num_nodes, source) << "source " << source <<
" is out of range [0," << num_nodes << "]";
CHECK_GE(num_nodes, source)
<< "source " << source << " is out of range [0," << num_nodes << "]";
const IdType *indptr_data = static_cast<IdType *>(csr.indptr->data);
const IdType *indices_data = static_cast<IdType *>(csr.indices->data);
const IdType *eid_data = static_cast<IdType *>(csr.data->data);
if (indptr_data[source+1]-indptr_data[source] == 0) {
if (indptr_data[source + 1] - indptr_data[source] == 0) {
// no out-going edges from the source node
return;
}
......@@ -278,7 +278,8 @@ void DFSLabeledEdges(const CSRMatrix& csr,
while (!stack.empty()) {
std::tie(u, i, on_tree) = stack.top();
const IdType v = indices_data[indptr_data[u] + i];
const IdType uv = eid_data ? eid_data[indptr_data[u] + i] : indptr_data[u] + i;
const IdType uv =
eid_data ? eid_data[indptr_data[u] + i] : indptr_data[u] + i;
if (visited[v]) {
if (!on_tree && has_nontree_edge) {
visit(uv, kNonTree);
......@@ -288,7 +289,7 @@ void DFSLabeledEdges(const CSRMatrix& csr,
stack.pop();
// find next one.
if (indptr_data[u] + i < indptr_data[u + 1] - 1) {
stack.push(std::make_tuple(u, i+1, false));
stack.push(std::make_tuple(u, i + 1, false));
}
} else {
visited[v] = true;
......
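// Illustrative usage sketch (not part of this commit): a hypothetical visit
// callback for DFSLabeledEdges that buckets edge ids by their tag. kNonTree is
// the value used above; kForward and kReverse are assumed to be the DFSEdgeTag
// values for FORWARD(0) and REVERSE(1) edges.
//
//   std::vector<IdType> forward, reverse, nontree;
//   auto visit = [&](IdType eid, int tag) {
//     if (tag == kForward) forward.push_back(eid);
//     else if (tag == kReverse) reverse.push_back(eid);
//     else nontree.push_back(eid);  // kNonTree
//   };
//   DFSLabeledEdges<IdType>(csr, source, /*has_reverse_edge=*/true,
//                           /*has_nontree_edge=*/true, visit);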
......@@ -4,9 +4,10 @@
* \brief Array cumsum GPU implementation
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./dgl_cub.cuh"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
......@@ -17,7 +18,8 @@ template <DGLDeviceType XPU, typename IdType>
IdArray CumSum(IdArray array, bool prepend_zero) {
const int64_t len = array.NumElements();
if (len == 0)
return !prepend_zero ? array : aten::Full(0, 1, array->dtype.bits, array->ctx);
return !prepend_zero ? array
: aten::Full(0, 1, array->dtype.bits, array->ctx);
auto device = runtime::DeviceAPI::Get(array->ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
......
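// Worked example (illustrative, not part of this commit): for input [1, 2, 3],
// CumSum with prepend_zero == true returns [0, 1, 3, 6] (length len + 1) and
// with prepend_zero == false returns [1, 3, 6]; the early return above keeps
// the same convention when the input array is empty.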
......@@ -5,9 +5,10 @@
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./dgl_cub.cuh"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
......@@ -16,14 +17,11 @@ namespace impl {
template <typename IdType>
struct IsNonZeroIndex {
explicit IsNonZeroIndex(const IdType * array) : array_(array) {
}
explicit IsNonZeroIndex(const IdType* array) : array_(array) {}
__device__ bool operator() (const int64_t index) {
return array_[index] != 0;
}
__device__ bool operator()(const int64_t index) { return array_[index] != 0; }
const IdType * array_;
const IdType* array_;
};
template <DGLDeviceType XPU, typename IdType>
......@@ -36,22 +34,23 @@ IdArray NonZero(IdArray array) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
const IdType * const in_data = static_cast<const IdType*>(array->data);
int64_t * const out_data = static_cast<int64_t*>(ret->data);
const IdType* const in_data = static_cast<const IdType*>(array->data);
int64_t* const out_data = static_cast<int64_t*>(ret->data);
IsNonZeroIndex<IdType> comp(in_data);
cub::CountingInputIterator<int64_t> counter(0);
// room for cub to output on GPU
int64_t * d_num_nonzeros = static_cast<int64_t*>(
device->AllocWorkspace(ctx, sizeof(int64_t)));
int64_t* d_num_nonzeros =
static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
size_t temp_size = 0;
CUDA_CALL(cub::DeviceSelect::If(nullptr, temp_size, counter, out_data,
d_num_nonzeros, len, comp, stream));
void * temp = device->AllocWorkspace(ctx, temp_size);
CUDA_CALL(cub::DeviceSelect::If(temp, temp_size, counter, out_data,
d_num_nonzeros, len, comp, stream));
CUDA_CALL(cub::DeviceSelect::If(
nullptr, temp_size, counter, out_data, d_num_nonzeros, len, comp,
stream));
void* temp = device->AllocWorkspace(ctx, temp_size);
CUDA_CALL(cub::DeviceSelect::If(
temp, temp_size, counter, out_data, d_num_nonzeros, len, comp, stream));
device->FreeWorkspace(ctx, temp);
// copy number of selected elements from GPU to CPU
......
......@@ -4,9 +4,10 @@
* \brief Array sort GPU implementation
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./dgl_cub.cuh"
#include "./utils.h"
namespace dgl {
using runtime::NDArray;
......@@ -29,26 +30,30 @@ std::pair<IdArray, IdArray> Sort(IdArray array, int num_bits) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
if (num_bits == 0) {
num_bits = sizeof(IdType)*8;
num_bits = sizeof(IdType) * 8;
}
// Allocate workspace
size_t workspace_size = 0;
CUDA_CALL(cub::DeviceRadixSort::SortPairs(nullptr, workspace_size,
keys_in, keys_out, values_in, values_out, nitems, 0, num_bits, stream));
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
nullptr, workspace_size, keys_in, keys_out, values_in, values_out, nitems,
0, num_bits, stream));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
// Compute
CUDA_CALL(cub::DeviceRadixSort::SortPairs(workspace, workspace_size,
keys_in, keys_out, values_in, values_out, nitems, 0, num_bits, stream));
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
workspace, workspace_size, keys_in, keys_out, values_in, values_out,
nitems, 0, num_bits, stream));
device->FreeWorkspace(ctx, workspace);
return std::make_pair(sorted_array, sorted_idx);
}
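// Minimal sketch (not part of this commit) of the two-pass CUB convention used
// above: the first SortPairs call with a null temp-storage pointer only fills
// in the required workspace size, the second call performs the sort. The d_*
// pointers and n are hypothetical device buffers and length from the caller.
template <typename KeyT, typename ValueT>
void _RadixSortPairsSketch(
    const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_vals_in,
    ValueT* d_vals_out, int n, cudaStream_t stream) {
  size_t temp_bytes = 0;
  // Pass 1: size query only (temp storage pointer is nullptr).
  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
      nullptr, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out, n, 0,
      sizeof(KeyT) * 8, stream));
  void* d_temp = nullptr;
  CUDA_CALL(cudaMalloc(&d_temp, temp_bytes));
  // Pass 2: the actual sort, using the allocated workspace.
  CUDA_CALL(cub::DeviceRadixSort::SortPairs(
      d_temp, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out, n, 0,
      sizeof(KeyT) * 8, stream));
  CUDA_CALL(cudaFree(d_temp));
}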
template std::pair<IdArray, IdArray> Sort<kDGLCUDA, int32_t>(IdArray, int num_bits);
template std::pair<IdArray, IdArray> Sort<kDGLCUDA, int64_t>(IdArray, int num_bits);
template std::pair<IdArray, IdArray> Sort<kDGLCUDA, int32_t>(
IdArray, int num_bits);
template std::pair<IdArray, IdArray> Sort<kDGLCUDA, int64_t>(
IdArray, int num_bits);
} // namespace impl
} // namespace aten
......
......@@ -4,6 +4,7 @@
* \brief COO2CSR
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
......@@ -46,18 +47,15 @@ CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo) {
if (!COOHasData(coo))
coo.data = aten::Range(0, nnz, coo.row->dtype.bits, coo.row->ctx);
NDArray indptr = aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits);
NDArray indptr =
aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits);
int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
CUSPARSE_CALL(cusparseXcoo2csr(
thr_entry->cusparse_handle,
coo.row.Ptr<int32_t>(),
nnz,
coo.num_rows,
indptr_ptr,
CUSPARSE_INDEX_BASE_ZERO));
return CSRMatrix(coo.num_rows, coo.num_cols,
indptr, coo.col, coo.data, col_sorted);
thr_entry->cusparse_handle, coo.row.Ptr<int32_t>(), nnz, coo.num_rows,
indptr_ptr, CUSPARSE_INDEX_BASE_ZERO));
return CSRMatrix(
coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted);
}
/*!
......@@ -77,9 +75,8 @@ CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo) {
*/
template <typename IdType>
__global__ void _SortedSearchKernelUpperBound(
const IdType* hay, int64_t hay_size,
const IdType* needles, int64_t num_needles,
IdType* pos) {
const IdType* hay, int64_t hay_size, const IdType* needles,
int64_t num_needles, IdType* pos) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < num_needles) {
......@@ -123,14 +120,12 @@ CSRMatrix COOToCSR<kDGLCUDA, int64_t>(COOMatrix coo) {
const int nt = cuda::FindNumThreads(coo.num_rows);
const int nb = (coo.num_rows + nt - 1) / nt;
IdArray indptr = Full(0, coo.num_rows + 1, nbits, ctx);
CUDA_KERNEL_CALL(_SortedSearchKernelUpperBound,
nb, nt, 0, stream,
coo.row.Ptr<int64_t>(), nnz,
rowids.Ptr<int64_t>(), coo.num_rows,
indptr.Ptr<int64_t>() + 1);
return CSRMatrix(coo.num_rows, coo.num_cols,
indptr, coo.col, coo.data, col_sorted);
CUDA_KERNEL_CALL(
_SortedSearchKernelUpperBound, nb, nt, 0, stream, coo.row.Ptr<int64_t>(),
nnz, rowids.Ptr<int64_t>(), coo.num_rows, indptr.Ptr<int64_t>() + 1);
return CSRMatrix(
coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted);
}
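// Worked example (illustrative, not part of this commit): with row-sorted
// coo.row == [0, 0, 1, 3] and num_rows == 4, the upper-bound search above
// writes [2, 3, 3, 4] into indptr[1..4]; together with the leading zero from
// Full() this yields indptr == [0, 2, 3, 3, 4], i.e. row i owns the nonzero
// range [indptr[i], indptr[i + 1]).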
template CSRMatrix COOToCSR<kDGLCUDA, int32_t>(COOMatrix coo);
......
......@@ -4,8 +4,9 @@
* \brief Sort COO index
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "../../c_api_common.h"
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
namespace dgl {
......@@ -18,21 +19,20 @@ namespace impl {
///////////////////////////// COOSort_ /////////////////////////////
/**
* @brief Encode row and column IDs into a single scalar per edge.
*
* @tparam IdType The type to encode as.
* @param row The row (src) IDs per edge.
* @param col The column (dst) IDs per edge.
* @param nnz The number of edges.
* @param col_bits The number of bits used to encode the destination. The row
* information is packed into the remaining bits.
* @param key The encoded edges (output).
*/
* @brief Encode row and column IDs into a single scalar per edge.
*
* @tparam IdType The type to encode as.
* @param row The row (src) IDs per edge.
* @param col The column (dst) IDs per edge.
* @param nnz The number of edges.
* @param col_bits The number of bits used to encode the destination. The row
* information is packed into the remaining bits.
* @param key The encoded edges (output).
*/
template <typename IdType>
__global__ void _COOEncodeEdgesKernel(
const IdType* const row, const IdType* const col,
const int64_t nnz, const int col_bits, IdType * const key) {
const IdType* const row, const IdType* const col, const int64_t nnz,
const int col_bits, IdType* const key) {
int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
if (tx < nnz) {
......@@ -41,20 +41,19 @@ __global__ void _COOEncodeEdgesKernel(
}
/**
* @brief Decode row and column IDs from the encoded edges.
*
* @tparam IdType The type the edges are encoded as.
* @param key The encoded edges.
* @param nnz The number of edges.
* @param col_bits The number of bits used to store the column/dst ID.
* @param row The row (src) IDs per edge (output).
* @param col The col (dst) IDs per edge (output).
*/
* @brief Decode row and column IDs from the encoded edges.
*
* @tparam IdType The type the edges are encoded as.
* @param key The encoded edges.
* @param nnz The number of edges.
* @param col_bits The number of bits used to store the column/dst ID.
* @param row The row (src) IDs per edge (output).
* @param col The col (dst) IDs per edge (output).
*/
template <typename IdType>
__global__ void _COODecodeEdgesKernel(
const IdType* const key, const int64_t nnz, const int col_bits,
IdType * const row, IdType * const col) {
IdType* const row, IdType* const col) {
int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
if (tx < nnz) {
......@@ -64,9 +63,7 @@ __global__ void _COODecodeEdgesKernel(
}
}
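// Minimal host-side sketch (not part of this commit) of the packing scheme the
// two kernels above operate on, assuming the natural layout described in their
// docs: the column id occupies the low col_bits bits and the row id the
// remaining high bits of a single sort key.
template <typename IdType>
inline IdType _EncodeEdgeSketch(IdType row, IdType col, int col_bits) {
  return (row << col_bits) | col;  // row in high bits, col in low bits
}
template <typename IdType>
inline void _DecodeEdgeSketch(
    IdType key, int col_bits, IdType* row, IdType* col) {
  *col = key & ((static_cast<IdType>(1) << col_bits) - 1);  // low bits
  *row = key >> col_bits;                                    // high bits
}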
template<typename T>
template <typename T>
int _NumberOfBits(const T& range) {
if (range <= 1) {
// ranges of 0 or 1 require no bits to store
......@@ -74,12 +71,12 @@ int _NumberOfBits(const T& range) {
}
int bits = 1;
while (bits < static_cast<int>(sizeof(T)*8) && (1 << bits) < range) {
while (bits < static_cast<int>(sizeof(T) * 8) && (1 << bits) < range) {
++bits;
}
CHECK_EQ((range-1) >> bits, 0);
CHECK_NE((range-1) >> (bits-1), 0);
CHECK_EQ((range - 1) >> bits, 0);
CHECK_NE((range - 1) >> (bits - 1), 0);
return bits;
}
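// Worked example (illustrative, not part of this commit): a hypothetical
// sanity check of _NumberOfBits. Ids in [0, range) need ceil(log2(range))
// bits, so e.g. 5 possible column ids (0..4) need 3 bits and 9 ids need 4.
inline void _NumberOfBitsExampleCheck() {
  CHECK_EQ(_NumberOfBits<int64_t>(2), 1);  // ids 0..1 fit in one bit
  CHECK_EQ(_NumberOfBits<int64_t>(5), 3);  // ids 0..4 fit in three bits
  CHECK_EQ(_NumberOfBits<int64_t>(8), 3);  // ids 0..7 fit in three bits
  CHECK_EQ(_NumberOfBits<int64_t>(9), 4);  // ids 0..8 need four bits
}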
......@@ -95,20 +92,20 @@ void COOSort_(COOMatrix* coo, bool sort_column) {
const int num_bits = row_bits + col_bits;
const int nt = 256;
const int nb = (nnz+nt-1)/nt;
CHECK(static_cast<int64_t>(nb)*nt >= nnz);
const int nb = (nnz + nt - 1) / nt;
CHECK(static_cast<int64_t>(nb) * nt >= nnz);
IdArray pos = aten::NewIdArray(nnz, coo->row->ctx, coo->row->dtype.bits);
CUDA_KERNEL_CALL(_COOEncodeEdgesKernel, nb, nt, 0, stream,
coo->row.Ptr<IdType>(), coo->col.Ptr<IdType>(),
nnz, col_bits, pos.Ptr<IdType>());
CUDA_KERNEL_CALL(
_COOEncodeEdgesKernel, nb, nt, 0, stream, coo->row.Ptr<IdType>(),
coo->col.Ptr<IdType>(), nnz, col_bits, pos.Ptr<IdType>());
auto sorted = Sort(pos, num_bits);
CUDA_KERNEL_CALL(_COODecodeEdgesKernel, nb, nt, 0, stream,
sorted.first.Ptr<IdType>(), nnz, col_bits,
coo->row.Ptr<IdType>(), coo->col.Ptr<IdType>());
CUDA_KERNEL_CALL(
_COODecodeEdgesKernel, nb, nt, 0, stream, sorted.first.Ptr<IdType>(),
nnz, col_bits, coo->row.Ptr<IdType>(), coo->col.Ptr<IdType>());
if (aten::COOHasData(*coo))
coo->data = IndexSelect(coo->data, sorted.second);
......@@ -138,8 +135,8 @@ template void COOSort_<kDGLCUDA, int64_t>(COOMatrix* coo, bool sort_column);
template <typename IdType>
__global__ void _COOIsSortedKernel(
const IdType* row, const IdType* col,
int64_t nnz, int8_t* row_sorted, int8_t* col_sorted) {
const IdType* row, const IdType* col, int64_t nnz, int8_t* row_sorted,
int8_t* col_sorted) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < nnz) {
......@@ -148,8 +145,8 @@ __global__ void _COOIsSortedKernel(
col_sorted[0] = 1;
} else {
row_sorted[tx] = static_cast<int8_t>(row[tx - 1] <= row[tx]);
col_sorted[tx] = static_cast<int8_t>(
row[tx - 1] < row[tx] || col[tx - 1] <= col[tx]);
col_sorted[tx] =
static_cast<int8_t>(row[tx - 1] < row[tx] || col[tx - 1] <= col[tx]);
}
tx += stride_x;
}
......@@ -161,18 +158,19 @@ std::pair<bool, bool> COOIsSorted(COOMatrix coo) {
const auto& ctx = coo.row->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream();
auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of 2*nnz bytes. It wastes a little bit of memory but should
// be fine.
// We allocate a workspace of 2*nnz bytes. It wastes a little bit of memory
// but should be fine.
int8_t* row_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
int8_t* col_flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, nnz));
const int nt = cuda::FindNumThreads(nnz);
const int nb = (nnz + nt - 1) / nt;
CUDA_KERNEL_CALL(_COOIsSortedKernel, nb, nt, 0, stream,
coo.row.Ptr<IdType>(), coo.col.Ptr<IdType>(),
nnz, row_flags, col_flags);
CUDA_KERNEL_CALL(
_COOIsSortedKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
coo.col.Ptr<IdType>(), nnz, row_flags, col_flags);
const bool row_sorted = cuda::AllTrue(row_flags, nnz, ctx);
const bool col_sorted = row_sorted? cuda::AllTrue(col_flags, nnz, ctx) : false;
const bool col_sorted =
row_sorted ? cuda::AllTrue(col_flags, nnz, ctx) : false;
device->FreeWorkspace(ctx, row_flags);
device->FreeWorkspace(ctx, col_flags);
......
......@@ -4,6 +4,7 @@
* \brief CSR2COO
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
......@@ -32,20 +33,16 @@ COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) {
NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data;
const int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
NDArray row = aten::NewIdArray(indices->shape[0], indptr->ctx, indptr->dtype.bits);
NDArray row =
aten::NewIdArray(indices->shape[0], indptr->ctx, indptr->dtype.bits);
int32_t* row_ptr = static_cast<int32_t*>(row->data);
CUSPARSE_CALL(cusparseXcsr2coo(
thr_entry->cusparse_handle,
indptr_ptr,
indices->shape[0],
csr.num_rows,
row_ptr,
CUSPARSE_INDEX_BASE_ZERO));
return COOMatrix(csr.num_rows, csr.num_cols,
row, indices, data,
true, csr.sorted);
thr_entry->cusparse_handle, indptr_ptr, indices->shape[0], csr.num_rows,
row_ptr, CUSPARSE_INDEX_BASE_ZERO));
return COOMatrix(
csr.num_rows, csr.num_cols, row, indices, data, true, csr.sorted);
}
/*!
......@@ -65,8 +62,8 @@ COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr) {
*/
template <typename DType, typename IdType>
__global__ void _RepeatKernel(
const DType* val, const IdType* pos,
DType* out, int64_t n_row, int64_t length) {
const DType* val, const IdType* pos, DType* out, int64_t n_row,
int64_t length) {
IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < length) {
......@@ -88,15 +85,13 @@ COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
const int nt = 256;
const int nb = (nnz + nt - 1) / nt;
CUDA_KERNEL_CALL(_RepeatKernel,
nb, nt, 0, stream,
rowids.Ptr<int64_t>(),
csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>(),
csr.num_rows, nnz);
return COOMatrix(csr.num_rows, csr.num_cols,
ret_row, csr.indices, csr.data,
true, csr.sorted);
CUDA_KERNEL_CALL(
_RepeatKernel, nb, nt, 0, stream, rowids.Ptr<int64_t>(),
csr.indptr.Ptr<int64_t>(), ret_row.Ptr<int64_t>(), csr.num_rows, nnz);
return COOMatrix(
csr.num_rows, csr.num_cols, ret_row, csr.indices, csr.data, true,
csr.sorted);
}
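// Worked example (illustrative, not part of this commit): for
// csr.indptr == [0, 2, 3, 3, 4] the repeat above emits ret_row == [0, 0, 1, 3],
// i.e. row id i is written once per nonzero in [indptr[i], indptr[i + 1]),
// which is exactly the inverse of the upper-bound construction in coo2csr.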
template COOMatrix CSRToCOO<kDGLCUDA, int32_t>(CSRMatrix csr);
......@@ -111,8 +106,7 @@ COOMatrix CSRToCOODataAsOrder(CSRMatrix csr) {
template <>
COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
COOMatrix coo = CSRToCOO<kDGLCUDA, int32_t>(csr);
if (aten::IsNullArray(coo.data))
return coo;
if (aten::IsNullArray(coo.data)) return coo;
auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
auto device = runtime::DeviceAPI::Get(coo.row->ctx);
......@@ -130,21 +124,12 @@ COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
size_t workspace_size = 0;
CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt(
thr_entry->cusparse_handle,
coo.num_rows, coo.num_cols,
row->shape[0],
data_ptr,
row_ptr,
&workspace_size));
thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
data_ptr, row_ptr, &workspace_size));
void* workspace = device->AllocWorkspace(row->ctx, workspace_size);
CUSPARSE_CALL(cusparseXcoosortByRow(
thr_entry->cusparse_handle,
coo.num_rows, coo.num_cols,
row->shape[0],
data_ptr,
row_ptr,
col_ptr,
workspace));
thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0],
data_ptr, row_ptr, col_ptr, workspace));
device->FreeWorkspace(row->ctx, workspace);
// The row and column field have already been reordered according
......@@ -158,8 +143,7 @@ COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int32_t>(CSRMatrix csr) {
template <>
COOMatrix CSRToCOODataAsOrder<kDGLCUDA, int64_t>(CSRMatrix csr) {
COOMatrix coo = CSRToCOO<kDGLCUDA, int64_t>(csr);
if (aten::IsNullArray(coo.data))
return coo;
if (aten::IsNullArray(coo.data)) return coo;
const auto& sorted = Sort(coo.data);
coo.row = IndexSelect(coo.row, sorted.second);
......
......@@ -4,9 +4,10 @@
* \brief Sort CSR index
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./dgl_cub.cuh"
#include "./utils.h"
namespace dgl {
......@@ -20,8 +21,8 @@ namespace impl {
*/
template <typename IdType>
__global__ void _SegmentIsSorted(
const IdType* indptr, const IdType* indices,
int64_t num_rows, int8_t* flags) {
const IdType* indptr, const IdType* indices, int64_t num_rows,
int8_t* flags) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < num_rows) {
......@@ -39,15 +40,15 @@ bool CSRIsSorted(CSRMatrix csr) {
const auto& ctx = csr.indptr->ctx;
cudaStream_t stream = runtime::getCurrentCUDAStream();
auto device = runtime::DeviceAPI::Get(ctx);
// We allocate a workspace of num_rows bytes. It wastes a little bit of memory but should
// be fine.
int8_t* flags = static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
// We allocate a workspace of num_rows bytes. It wastes a little bit of
// memory but should be fine.
int8_t* flags =
static_cast<int8_t*>(device->AllocWorkspace(ctx, csr.num_rows));
const int nt = cuda::FindNumThreads(csr.num_rows);
const int nb = (csr.num_rows + nt - 1) / nt;
CUDA_KERNEL_CALL(_SegmentIsSorted,
nb, nt, 0, stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
csr.num_rows, flags);
CUDA_KERNEL_CALL(
_SegmentIsSorted, nb, nt, 0, stream, csr.indptr.Ptr<IdType>(),
csr.indices.Ptr<IdType>(), csr.num_rows, flags);
bool ret = cuda::AllTrue(flags, csr.num_rows, ctx);
device->FreeWorkspace(ctx, flags);
return ret;
......@@ -82,10 +83,8 @@ void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
size_t workspace_size = 0;
CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt(
thr_entry->cusparse_handle,
csr->num_rows, csr->num_cols, nnz,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(),
&workspace_size));
thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(), &workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
cusparseMatDescr_t descr;
......@@ -93,11 +92,8 @@ void CSRSort_<kDGLCUDA, int32_t>(CSRMatrix* csr) {
CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
CUSPARSE_CALL(cusparseXcsrsort(
thr_entry->cusparse_handle,
csr->num_rows, csr->num_cols, nnz,
descr,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(),
data.Ptr<int32_t>(),
thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz, descr,
indptr.Ptr<int32_t>(), indices.Ptr<int32_t>(), data.Ptr<int32_t>(),
workspace));
csr->sorted = true;
......@@ -115,8 +111,7 @@ void CSRSort_<kDGLCUDA, int64_t>(CSRMatrix* csr) {
const auto& ctx = csr->indptr->ctx;
const int64_t nnz = csr->indices->shape[0];
const auto nbits = csr->indptr->dtype.bits;
if (!aten::CSRHasData(*csr))
csr->data = aten::Range(0, nnz, nbits, ctx);
if (!aten::CSRHasData(*csr)) csr->data = aten::Range(0, nnz, nbits, ctx);
IdArray new_indices = csr->indices.Clone();
IdArray new_data = csr->data.Clone();
......@@ -129,15 +124,15 @@ void CSRSort_<kDGLCUDA, int64_t>(CSRMatrix* csr) {
// Allocate workspace
size_t workspace_size = 0;
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(nullptr, workspace_size,
key_in, key_out, value_in, value_out,
nnz, csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t)*8, stream));
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(
nullptr, workspace_size, key_in, key_out, value_in, value_out, nnz,
csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
// Compute
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(workspace, workspace_size,
key_in, key_out, value_in, value_out,
nnz, csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t)*8, stream));
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs(
workspace, workspace_size, key_in, key_out, value_in, value_out, nnz,
csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream));
csr->sorted = true;
csr->indices = new_indices;
......
......@@ -4,6 +4,7 @@
* \brief CSR transpose (convert to CSC)
*/
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
namespace dgl {
......@@ -33,14 +34,13 @@ CSRMatrix CSRTranspose<kDGLCUDA, int32_t>(CSRMatrix csr) {
const int64_t nnz = indices->shape[0];
const auto& ctx = indptr->ctx;
const auto bits = indptr->dtype.bits;
if (aten::IsNullArray(data))
data = aten::Range(0, nnz, bits, ctx);
if (aten::IsNullArray(data)) data = aten::Range(0, nnz, bits, ctx);
const int32_t* indptr_ptr = static_cast<int32_t*>(indptr->data);
const int32_t* indices_ptr = static_cast<int32_t*>(indices->data);
const void* data_ptr = data->data;
// (BarclayII) csr2csc doesn't seem to clear the content of cscColPtr if nnz == 0.
// We need to do it ourselves.
// (BarclayII) csr2csc doesn't seem to clear the content of cscColPtr if nnz
// == 0. We need to do it ourselves.
NDArray t_indptr = aten::Full(0, csr.num_cols + 1, bits, ctx);
NDArray t_indices = aten::NewIdArray(nnz, ctx, bits);
NDArray t_data = aten::NewIdArray(nnz, ctx, bits);
......@@ -53,40 +53,29 @@ CSRMatrix CSRTranspose<kDGLCUDA, int32_t>(CSRMatrix csr) {
// workspace
size_t workspace_size;
CUSPARSE_CALL(cusparseCsr2cscEx2_bufferSize(
thr_entry->cusparse_handle,
csr.num_rows, csr.num_cols, nnz,
data_ptr, indptr_ptr, indices_ptr,
t_data_ptr, t_indptr_ptr, t_indices_ptr,
CUDA_R_32F,
CUSPARSE_ACTION_NUMERIC,
CUSPARSE_INDEX_BASE_ZERO,
thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr,
indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr,
CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference
&workspace_size));
void* workspace = device->AllocWorkspace(ctx, workspace_size);
CUSPARSE_CALL(cusparseCsr2cscEx2(
thr_entry->cusparse_handle,
csr.num_rows, csr.num_cols, nnz,
data_ptr, indptr_ptr, indices_ptr,
t_data_ptr, t_indptr_ptr, t_indices_ptr,
CUDA_R_32F,
CUSPARSE_ACTION_NUMERIC,
CUSPARSE_INDEX_BASE_ZERO,
thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr,
indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr,
CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference
workspace));
device->FreeWorkspace(ctx, workspace);
#else
CUSPARSE_CALL(cusparseScsr2csc(
thr_entry->cusparse_handle,
csr.num_rows, csr.num_cols, nnz,
thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz,
static_cast<const float*>(data_ptr), indptr_ptr, indices_ptr,
static_cast<float*>(t_data_ptr), t_indices_ptr, t_indptr_ptr,
CUSPARSE_ACTION_NUMERIC,
CUSPARSE_INDEX_BASE_ZERO));
CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO));
#endif
return CSRMatrix(csr.num_cols, csr.num_rows,
t_indptr, t_indices, t_data,
false);
return CSRMatrix(
csr.num_cols, csr.num_rows, t_indptr, t_indices, t_data, false);
}
template <>
......
......@@ -7,8 +7,8 @@
#include <dgl/runtime/device_api.h>
#include "../../runtime/cuda/cuda_common.h"
#include "../filter.h"
#include "../../runtime/cuda/cuda_hashtable.cuh"
#include "../filter.h"
#include "./dgl_cub.cuh"
using namespace dgl::runtime::cuda;
......@@ -20,35 +20,29 @@ namespace {
cudaStream_t cudaStream = runtime::getCurrentCUDAStream();
template<typename IdType, bool include>
template <typename IdType, bool include>
__global__ void _IsInKernel(
DeviceOrderedHashTable<IdType> table,
const IdType * const array,
const int64_t size,
IdType * const mark) {
const int64_t idx = threadIdx.x + blockDim.x*blockIdx.x;
DeviceOrderedHashTable<IdType> table, const IdType* const array,
const int64_t size, IdType* const mark) {
const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < size) {
mark[idx] = table.Contains(array[idx]) ^ (!include);
}
}
template<typename IdType>
template <typename IdType>
__global__ void _InsertKernel(
const IdType * const prefix,
const int64_t size,
IdType * const result) {
const int64_t idx = threadIdx.x + blockDim.x*blockIdx.x;
const IdType* const prefix, const int64_t size, IdType* const result) {
const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x;
if (idx < size) {
if (prefix[idx] != prefix[idx+1]) {
if (prefix[idx] != prefix[idx + 1]) {
result[prefix[idx]] = idx;
}
}
}
template<typename IdType, bool include>
IdArray _PerformFilter(
const OrderedHashTable<IdType>& table,
IdArray test) {
template <typename IdType, bool include>
IdArray _PerformFilter(const OrderedHashTable<IdType>& table, IdArray test) {
const auto& ctx = test->ctx;
auto device = runtime::DeviceAPI::Get(ctx);
const int64_t size = test->shape[0];
......@@ -60,22 +54,20 @@ IdArray _PerformFilter(
// we need two arrays: 1) to act as a prefix sum
// for the number of entries that will be inserted, and
// 2) to collect the included items.
IdType * prefix = static_cast<IdType*>(
device->AllocWorkspace(ctx, sizeof(IdType)*(size+1)));
IdType* prefix = static_cast<IdType*>(
device->AllocWorkspace(ctx, sizeof(IdType) * (size + 1)));
// will resize down later
IdArray result = aten::NewIdArray(size, ctx, sizeof(IdType)*8);
IdArray result = aten::NewIdArray(size, ctx, sizeof(IdType) * 8);
// mark each index based on its existence in the hashtable
{
const dim3 block(256);
const dim3 grid((size+block.x-1)/block.x);
const dim3 grid((size + block.x - 1) / block.x);
CUDA_KERNEL_CALL((_IsInKernel<IdType, include>),
grid, block, 0, cudaStream,
table.DeviceHandle(),
static_cast<const IdType*>(test->data),
size,
CUDA_KERNEL_CALL(
(_IsInKernel<IdType, include>), grid, block, 0, cudaStream,
table.DeviceHandle(), static_cast<const IdType*>(test->data), size,
prefix);
}
......@@ -83,40 +75,28 @@ IdArray _PerformFilter(
{
size_t workspace_bytes;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
nullptr,
workspace_bytes,
static_cast<IdType*>(nullptr),
static_cast<IdType*>(nullptr),
size+1, cudaStream));
void * workspace = device->AllocWorkspace(ctx, workspace_bytes);
nullptr, workspace_bytes, static_cast<IdType*>(nullptr),
static_cast<IdType*>(nullptr), size + 1, cudaStream));
void* workspace = device->AllocWorkspace(ctx, workspace_bytes);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
workspace,
workspace_bytes,
prefix,
prefix,
size+1, cudaStream));
workspace, workspace_bytes, prefix, prefix, size + 1, cudaStream));
device->FreeWorkspace(ctx, workspace);
}
// copy the number of unique items using the current internal stream;
IdType num_unique;
device->CopyDataFromTo(prefix+size, 0,
&num_unique, 0,
sizeof(num_unique),
ctx,
DGLContext{kDGLCPU, 0},
test->dtype);
device->CopyDataFromTo(
prefix + size, 0, &num_unique, 0, sizeof(num_unique), ctx,
DGLContext{kDGLCPU, 0}, test->dtype);
// insert items into set
{
const dim3 block(256);
const dim3 grid((size+block.x-1)/block.x);
const dim3 grid((size + block.x - 1) / block.x);
CUDA_KERNEL_CALL(_InsertKernel,
grid, block, 0, cudaStream,
prefix,
size,
CUDA_KERNEL_CALL(
_InsertKernel, grid, block, 0, cudaStream, prefix, size,
static_cast<IdType*>(result->data));
}
device->FreeWorkspace(ctx, prefix);
......@@ -124,16 +104,13 @@ IdArray _PerformFilter(
return result.CreateView({num_unique}, result->dtype);
}
template<typename IdType>
template <typename IdType>
class CudaFilterSet : public Filter {
public:
explicit CudaFilterSet(IdArray array) :
table_(array->shape[0], array->ctx, cudaStream) {
explicit CudaFilterSet(IdArray array)
: table_(array->shape[0], array->ctx, cudaStream) {
table_.FillWithUnique(
static_cast<const IdType*>(array->data),
array->shape[0],
cudaStream);
static_cast<const IdType*>(array->data), array->shape[0], cudaStream);
}
IdArray find_included_indices(IdArray test) override {
......@@ -150,7 +127,7 @@ class CudaFilterSet : public Filter {
} // namespace
template<DGLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
FilterRef CreateSetFilter(IdArray set) {
return FilterRef(std::make_shared<CudaFilterSet<IdType>>(set));
}
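// Illustrative usage sketch (not part of this commit): exercising the
// anonymous-namespace CudaFilterSet above directly. `set_ids` and `test_ids`
// are hypothetical int64 IdArrays already resident on the GPU; the result
// holds the positions of `test_ids` entries that are members of `set_ids`.
inline IdArray _SetFilterUsageSketch(IdArray set_ids, IdArray test_ids) {
  CudaFilterSet<int64_t> filter(set_ids);         // hash the set once
  return filter.find_included_indices(test_ids);  // positions of the hits
}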
......
/*!
* Copyright (c) 2021 by Contributors
* \file cuda_common.h
* \brief Wrapper to place cub in dgl namespace.
* \brief Wrapper to place cub in dgl namespace.
*/
#ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_
......
/**
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file array/gpu/disjoint_union.cu
* \brief Disjoint union GPU implementation.
*/
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* \file array/gpu/disjoint_union.cu
* \brief Disjoint union GPU implementation.
*/
#include <dgl/runtime/parallel_for.h>
#include <dgl/array.h>
#include <vector>
#include <dgl/runtime/parallel_for.h>
#include <tuple>
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
......@@ -31,8 +33,8 @@ namespace impl {
template <typename IdType>
__global__ void _DisjointUnionKernel(
IdType** arrs, IdType* prefix, IdType* offset, IdType* out,
int64_t n_arrs, int n_elms) {
IdType** arrs, IdType* prefix, IdType* offset, IdType* out, int64_t n_arrs,
int n_elms) {
IdType tx = static_cast<IdType>(blockIdx.x) * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
while (tx < n_elms) {
......@@ -48,7 +50,8 @@ __global__ void _DisjointUnionKernel(
}
template <DGLDeviceType XPU, typename IdType>
std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(const std::vector<COOMatrix>& coos) {
std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(
const std::vector<COOMatrix>& coos) {
IdType n = coos.size(), nbits = coos[0].row->dtype.bits;
IdArray n_rows = NewIdArray(n, CPU, nbits);
IdArray n_cols = NewIdArray(n, CPU, nbits);
......@@ -58,7 +61,7 @@ std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(const std::vector<COOMa
IdType* n_cols_data = n_cols.Ptr<IdType>();
IdType* n_elms_data = n_elms.Ptr<IdType>();
dgl::runtime::parallel_for(0, coos.size(), [&](IdType b, IdType e){
dgl::runtime::parallel_for(0, coos.size(), [&](IdType b, IdType e) {
for (IdType i = b; i < e; ++i) {
n_rows_data[i] = coos[i].num_rows;
n_cols_data[i] = coos[i].num_cols;
......@@ -66,30 +69,30 @@ std::tuple<IdArray, IdArray, IdArray> _ComputePrefixSums(const std::vector<COOMa
}
});
return std::make_tuple(CumSum(n_rows.CopyTo(coos[0].row->ctx), true),
CumSum(n_cols.CopyTo(coos[0].row->ctx), true),
CumSum(n_elms.CopyTo(coos[0].row->ctx), true));
return std::make_tuple(
CumSum(n_rows.CopyTo(coos[0].row->ctx), true),
CumSum(n_cols.CopyTo(coos[0].row->ctx), true),
CumSum(n_elms.CopyTo(coos[0].row->ctx), true));
}
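// Worked example (illustrative, not part of this commit): for two input COOs
// with (num_rows, num_cols, nnz) of (3, 4, 5) and (2, 2, 3), the three arrays
// returned above are [0, 3, 5], [0, 4, 6] and [0, 5, 8]; entry i is the
// row/col/edge offset added to graph i's ids when the union is materialized.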
template <DGLDeviceType XPU, typename IdType>
void _Merge(IdType** arrs, IdType* prefix, IdType* offset, IdType* out,
int64_t n_arrs, int n_elms,
DGLContext ctx, DGLDataType dtype, cudaStream_t stream) {
void _Merge(
IdType** arrs, IdType* prefix, IdType* offset, IdType* out, int64_t n_arrs,
int n_elms, DGLContext ctx, DGLDataType dtype, cudaStream_t stream) {
auto device = runtime::DeviceAPI::Get(ctx);
int nt = 256;
int nb = (n_elms + nt - 1) / nt;
IdType** arrs_dev = static_cast<IdType**>(
device->AllocWorkspace(ctx, n_arrs*sizeof(IdType*)));
device->AllocWorkspace(ctx, n_arrs * sizeof(IdType*)));
device->CopyDataFromTo(
arrs, 0, arrs_dev, 0, sizeof(IdType*)*n_arrs,
DGLContext{kDGLCPU, 0}, ctx, dtype);
arrs, 0, arrs_dev, 0, sizeof(IdType*) * n_arrs, DGLContext{kDGLCPU, 0},
ctx, dtype);
CUDA_KERNEL_CALL(_DisjointUnionKernel,
nb, nt, 0, stream,
arrs_dev, prefix, offset,
out, n_arrs, n_elms);
CUDA_KERNEL_CALL(
_DisjointUnionKernel, nb, nt, 0, stream, arrs_dev, prefix, offset, out,
n_arrs, n_elms);
device->FreeWorkspace(ctx, arrs_dev);
}
......@@ -132,52 +135,50 @@ COOMatrix DisjointUnionCoo(const std::vector<COOMatrix>& coos) {
IdType n_elements = 0;
device->CopyDataFromTo(
&prefix_elm[coos.size()], 0, &n_elements, 0,
sizeof(IdType), coos[0].row->ctx, DGLContext{kDGLCPU, 0},
coos[0].row->dtype);
&prefix_elm[coos.size()], 0, &n_elements, 0, sizeof(IdType),
coos[0].row->ctx, DGLContext{kDGLCPU, 0}, coos[0].row->dtype);
device->CopyDataFromTo(
&prefix_src[coos.size()], 0, &src_offset, 0,
sizeof(IdType), coos[0].row->ctx, DGLContext{kDGLCPU, 0},
coos[0].row->dtype);
&prefix_src[coos.size()], 0, &src_offset, 0, sizeof(IdType),
coos[0].row->ctx, DGLContext{kDGLCPU, 0}, coos[0].row->dtype);
device->CopyDataFromTo(
&prefix_dst[coos.size()], 0, &dst_offset, 0,
sizeof(IdType), coos[0].row->ctx, DGLContext{kDGLCPU, 0},
coos[0].row->dtype);
&prefix_dst[coos.size()], 0, &dst_offset, 0, sizeof(IdType),
coos[0].row->ctx, DGLContext{kDGLCPU, 0}, coos[0].row->dtype);
// Union src array
IdArray result_src = NewIdArray(
n_elements, coos[0].row->ctx, coos[0].row->dtype.bits);
_Merge<XPU, IdType>(rows.get(), prefix_src, prefix_elm, result_src.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
IdArray result_src =
NewIdArray(n_elements, coos[0].row->ctx, coos[0].row->dtype.bits);
_Merge<XPU, IdType>(
rows.get(), prefix_src, prefix_elm, result_src.Ptr<IdType>(), coos.size(),
n_elements, ctx, dtype, stream);
// Union dst array
IdArray result_dst = NewIdArray(
n_elements, coos[0].col->ctx, coos[0].col->dtype.bits);
_Merge<XPU, IdType>(cols.get(), prefix_dst, prefix_elm, result_dst.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
IdArray result_dst =
NewIdArray(n_elements, coos[0].col->ctx, coos[0].col->dtype.bits);
_Merge<XPU, IdType>(
cols.get(), prefix_dst, prefix_elm, result_dst.Ptr<IdType>(), coos.size(),
n_elements, ctx, dtype, stream);
// Union data array if exists and fetch number of elements
IdArray result_dat = NullArray();
if (has_data) {
result_dat = NewIdArray(
n_elements, coos[0].row->ctx, coos[0].row->dtype.bits);
_Merge<XPU, IdType>(data.get(), prefix_elm, prefix_elm, result_dat.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
result_dat =
NewIdArray(n_elements, coos[0].row->ctx, coos[0].row->dtype.bits);
_Merge<XPU, IdType>(
data.get(), prefix_elm, prefix_elm, result_dat.Ptr<IdType>(),
coos.size(), n_elements, ctx, dtype, stream);
}
return COOMatrix(
src_offset, dst_offset,
result_src,
result_dst,
result_dat,
row_sorted,
col_sorted);
src_offset, dst_offset, result_src, result_dst, result_dat, row_sorted,
col_sorted);
}
template COOMatrix DisjointUnionCoo<kDGLCUDA, int32_t>(const std::vector<COOMatrix>& coos);
template COOMatrix DisjointUnionCoo<kDGLCUDA, int64_t>(const std::vector<COOMatrix>& coos);
template COOMatrix DisjointUnionCoo<kDGLCUDA, int32_t>(
const std::vector<COOMatrix>& coos);
template COOMatrix DisjointUnionCoo<kDGLCUDA, int64_t>(
const std::vector<COOMatrix>& coos);
} // namespace impl
} // namespace aten
......