Unverified commit 33a2d9e1, authored by Hongzhi (Steve), Chen and committed by GitHub

[Misc] clang-format auto fix. (#4812)



* [Misc] clang-format auto fix.

* manual
Co-authored-by: Steve <ubuntu@ip-172-31-34-29.ap-northeast-1.compute.internal>
parent 360e6cb4
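The diff below is the result of running clang-format over the runtime, scheduler, tensoradapter, and test sources; it only re-wraps long lines, reorders includes, and normalizes spacing. As a quick orientation, here is a short illustrative C++ snippet laid out the way the reformatted code below is: break after the opening parenthesis, pack parameters onto continuation lines indented four spaces, and (assuming an 80-column limit) keep every line within it. RunWithCallback is a hypothetical helper used only to show the style, not DGL code.

#include <functional>

// Hypothetical helper, formatted in the wrapping style this commit enforces.
int RunWithCallback(
    int num_workers, std::function<void(int)> worker_callback,
    bool exclude_worker0) {
  for (int i = exclude_worker0 ? 1 : 0; i < num_workers; ++i) {
    worker_callback(i);
  }
  return num_workers;
}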
......@@ -5,8 +5,9 @@
*/
#include <dgl/runtime/threading_backend.h>
#include <dmlc/logging.h>
#include <thread>
#include <algorithm>
#include <thread>
#if defined(__linux__) || defined(__ANDROID__)
#include <fstream>
#else
......@@ -21,12 +22,12 @@ namespace threading {
class ThreadGroup::Impl {
public:
Impl(int num_workers,
std::function<void(int)> worker_callback,
bool exclude_worker0)
Impl(
int num_workers, std::function<void(int)> worker_callback,
bool exclude_worker0)
: num_workers_(num_workers) {
CHECK_GE(num_workers, 1)
<< "Requested a non-positive number of worker threads.";
<< "Requested a non-positive number of worker threads.";
for (int i = exclude_worker0; i < num_workers_; ++i) {
threads_.emplace_back([worker_callback, i] { worker_callback(i); });
}
......@@ -35,7 +36,7 @@ class ThreadGroup::Impl {
~Impl() { Join(); }
void Join() {
for (auto& t : threads_) {
for (auto &t : threads_) {
if (t.joinable()) t.join();
}
}
......@@ -64,11 +65,11 @@ class ThreadGroup::Impl {
if (val == nullptr || atoi(val) == 1) {
// Do not set affinity if there are more workers than found cores
if (sorted_order_.size() >= static_cast<unsigned int>(num_workers_)) {
SetAffinity(exclude_worker0, mode == kLittle);
SetAffinity(exclude_worker0, mode == kLittle);
} else {
LOG(WARNING)
<< "The thread affinity cannot be set when the number of workers"
<< "is larger than the number of available cores in the system.";
<< "The thread affinity cannot be set when the number of workers"
<< "is larger than the number of available cores in the system.";
}
}
return num_workers_used;
......@@ -82,15 +83,14 @@ class ThreadGroup::Impl {
#if defined(__ANDROID__)
#ifndef CPU_SET
#define CPU_SETSIZE 1024
#define __NCPUBITS (8 * sizeof (uint64_t))
#define __NCPUBITS (8 * sizeof(uint64_t))
typedef struct {
uint64_t __bits[CPU_SETSIZE / __NCPUBITS];
} cpu_set_t;
#define CPU_SET(cpu, cpusetp) \
((cpusetp)->__bits[(cpu)/__NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS)))
#define CPU_ZERO(cpusetp) \
memset((cpusetp), 0, sizeof(cpu_set_t))
((cpusetp)->__bits[(cpu) / __NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS)))
#define CPU_ZERO(cpusetp) memset((cpusetp), 0, sizeof(cpu_set_t))
#endif
#endif
#if defined(__linux__) || defined(__ANDROID__)
......@@ -99,7 +99,8 @@ class ThreadGroup::Impl {
for (unsigned i = 0; i < threads_.size(); ++i) {
unsigned core_id;
if (reverse) {
core_id = sorted_order_[sorted_order_.size() - (i + exclude_worker0) - 1];
core_id =
sorted_order_[sorted_order_.size() - (i + exclude_worker0) - 1];
} else {
core_id = sorted_order_[i + exclude_worker0];
}
......@@ -107,10 +108,11 @@ class ThreadGroup::Impl {
CPU_ZERO(&cpuset);
CPU_SET(core_id, &cpuset);
#if defined(__ANDROID__)
sched_setaffinity(threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset);
sched_setaffinity(
threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset);
#else
pthread_setaffinity_np(threads_[i].native_handle(),
sizeof(cpu_set_t), &cpuset);
pthread_setaffinity_np(
threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset);
#endif
}
if (exclude_worker0) { // bind the master thread to core 0
......@@ -122,11 +124,9 @@ class ThreadGroup::Impl {
CPU_SET(sorted_order_[0], &cpuset);
}
#if defined(__ANDROID__)
sched_setaffinity(pthread_self(),
sizeof(cpu_set_t), &cpuset);
sched_setaffinity(pthread_self(), sizeof(cpu_set_t), &cpuset);
#else
pthread_setaffinity_np(pthread_self(),
sizeof(cpu_set_t), &cpuset);
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
#endif
}
#endif
......@@ -134,27 +134,28 @@ class ThreadGroup::Impl {
void InitSortedOrder() {
unsigned int threads = std::thread::hardware_concurrency();
std::vector<std::pair <unsigned int, int64_t> > max_freqs;
std::vector<std::pair<unsigned int, int64_t> > max_freqs;
for (unsigned int i = 0; i < threads; ++i) {
int64_t cur_freq = 0;
#if defined(__linux__) || defined(__ANDROID__)
std::ostringstream filepath;
filepath << "/sys/devices/system/cpu/cpu" << i << "/cpufreq/cpuinfo_max_freq";
std::ifstream ifs(filepath.str());
if (!ifs.fail()) {
if (!(ifs >> cur_freq)) {
cur_freq = -1;
}
ifs.close();
#if defined(__linux__) || defined(__ANDROID__)
std::ostringstream filepath;
filepath << "/sys/devices/system/cpu/cpu" << i
<< "/cpufreq/cpuinfo_max_freq";
std::ifstream ifs(filepath.str());
if (!ifs.fail()) {
if (!(ifs >> cur_freq)) {
cur_freq = -1;
}
#endif
ifs.close();
}
#endif
max_freqs.push_back(std::make_pair(i, cur_freq));
}
auto fcmpbyfreq = [] (const std::pair<unsigned int, int64_t> &a,
const std::pair<unsigned int, int64_t> &b) {
return a.second == b.second ? a.first < b.first : a.second > b.second;
auto fcmpbyfreq = [](const std::pair<unsigned int, int64_t> &a,
const std::pair<unsigned int, int64_t> &b) {
return a.second == b.second ? a.first < b.first : a.second > b.second;
};
std::sort(max_freqs.begin(), max_freqs.end(), fcmpbyfreq);
int64_t big_freq = max_freqs.begin()->second;
......@@ -180,20 +181,20 @@ class ThreadGroup::Impl {
int little_count_ = 0;
};
ThreadGroup::ThreadGroup(int num_workers,
std::function<void(int)> worker_callback,
bool exclude_worker0)
: impl_(new ThreadGroup::Impl(num_workers, worker_callback, exclude_worker0)) {}
ThreadGroup::ThreadGroup(
int num_workers, std::function<void(int)> worker_callback,
bool exclude_worker0)
: impl_(new ThreadGroup::Impl(
num_workers, worker_callback, exclude_worker0)) {}
ThreadGroup::~ThreadGroup() { delete impl_; }
void ThreadGroup::Join() { impl_->Join(); }
int ThreadGroup::Configure(AffinityMode mode, int nthreads, bool exclude_worker0) {
int ThreadGroup::Configure(
AffinityMode mode, int nthreads, bool exclude_worker0) {
return impl_->Configure(mode, nthreads, exclude_worker0);
}
void YieldThread() {
std::this_thread::yield();
}
void YieldThread() { std::this_thread::yield(); }
int MaxConcurrency() {
int max_concurrency = 1;
......@@ -212,7 +213,6 @@ int MaxConcurrency() {
return std::max(max_concurrency, 1);
}
} // namespace threading
} // namespace runtime
} // namespace dgl
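The SetAffinity path above pins each worker thread (and optionally the master thread) to a core with a cpu_set_t and pthread_setaffinity_np. A minimal Linux-only sketch of that pattern, independent of the ThreadGroup class; it requires _GNU_SOURCE (defined by default when compiling with g++), the core id is caller-chosen, and error handling is reduced to a log line.

#include <pthread.h>
#include <sched.h>

#include <cstdio>

// Pin the calling thread to a single core; returns the pthread error code.
int PinSelfToCore(unsigned core_id) {
  cpu_set_t cpuset;
  CPU_ZERO(&cpuset);
  CPU_SET(core_id, &cpuset);
  const int rc =
      pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
  if (rc != 0) {
    std::fprintf(stderr, "pthread_setaffinity_np failed with code %d\n", rc);
  }
  return rc;
}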
......@@ -4,16 +4,14 @@
* \brief DGL util functions
*/
#include <dmlc/omp.h>
#include <dgl/aten/coo.h>
#include <dgl/packed_func_ext.h>
#include <dmlc/omp.h>
#include <utility>
#include "../c_api_common.h"
#include "../array/array_op.h"
#include "../c_api_common.h"
using namespace dgl::runtime;
using namespace dgl::aten::impl;
......@@ -21,33 +19,33 @@ using namespace dgl::aten::impl;
namespace dgl {
DGL_REGISTER_GLOBAL("utils.internal._CAPI_DGLSetOMPThreads")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
int num_threads = args[0];
omp_set_num_threads(num_threads);
});
.set_body([](DGLArgs args, DGLRetValue* rv) {
int num_threads = args[0];
omp_set_num_threads(num_threads);
});
DGL_REGISTER_GLOBAL("utils.internal._CAPI_DGLGetOMPThreads")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
*rv = omp_get_max_threads();
});
.set_body([](DGLArgs args, DGLRetValue* rv) {
*rv = omp_get_max_threads();
});
DGL_REGISTER_GLOBAL("utils.checks._CAPI_DGLCOOIsSorted")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
IdArray src = args[0];
IdArray dst = args[1];
int64_t num_src = args[2];
int64_t num_dst = args[3];
bool row_sorted, col_sorted;
std::tie(row_sorted, col_sorted) = COOIsSorted(
aten::COOMatrix(num_src, num_dst, src, dst));
// make sure col_sorted is only true when row_sorted is true
assert(!(!row_sorted && col_sorted));
// 0 for unsorted, 1 for row sorted, 2 for row and col sorted
int64_t sorted_status = row_sorted + col_sorted;
*rv = sorted_status;
});
.set_body([](DGLArgs args, DGLRetValue* rv) {
IdArray src = args[0];
IdArray dst = args[1];
int64_t num_src = args[2];
int64_t num_dst = args[3];
bool row_sorted, col_sorted;
std::tie(row_sorted, col_sorted) =
COOIsSorted(aten::COOMatrix(num_src, num_dst, src, dst));
// make sure col_sorted is only true when row_sorted is true
assert(!(!row_sorted && col_sorted));
// 0 for unsorted, 1 for row sorted, 2 for row and col sorted
int64_t sorted_status = row_sorted + col_sorted;
*rv = sorted_status;
});
} // namespace dgl
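The _CAPI_DGLCOOIsSorted body above encodes sortedness as a single integer: 0 means unsorted, 1 means row-sorted only, 2 means row- and column-sorted. A standalone restatement of that encoding; EncodeSortedStatus is a hypothetical helper, not part of the DGL C API.

#include <cassert>
#include <cstdint>

int64_t EncodeSortedStatus(bool row_sorted, bool col_sorted) {
  // Column-sortedness is only meaningful when the rows are already sorted.
  assert(!(!row_sorted && col_sorted));
  return static_cast<int64_t>(row_sorted) + static_cast<int64_t>(col_sorted);
}

// EncodeSortedStatus(false, false) == 0, EncodeSortedStatus(true, false) == 1,
// EncodeSortedStatus(true, true) == 2.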
......@@ -4,23 +4,23 @@
* \brief Operations on partition implemented in CUDA.
*/
#ifndef DGL_RUNTIME_WORKSPACE_H_
#define DGL_RUNTIME_WORKSPACE_H_
#include <dgl/runtime/device_api.h>
#include <cassert>
namespace dgl {
namespace runtime {
template<typename T>
template <typename T>
class Workspace {
public:
Workspace(DeviceAPI* device, DGLContext ctx, const size_t size) :
device_(device),
ctx_(ctx),
ptr_(static_cast<T*>(device_->AllocWorkspace(ctx_, sizeof(T)*size))) {
Workspace(DeviceAPI* device, DGLContext ctx, const size_t size)
: device_(device),
ctx_(ctx),
ptr_(static_cast<T*>(device_->AllocWorkspace(ctx_, sizeof(T) * size))) {
}
~Workspace() {
......@@ -29,16 +29,14 @@ class Workspace {
}
}
operator bool() const {
return ptr_ != nullptr;
}
operator bool() const { return ptr_ != nullptr; }
T * get() {
T* get() {
assert(*this);
return ptr_;
}
T const * get() const {
T const* get() const {
assert(*this);
return ptr_;
}
......@@ -52,17 +50,16 @@ class Workspace {
private:
DeviceAPI* device_;
DGLContext ctx_;
T * ptr_;
T* ptr_;
};
template<>
template <>
class Workspace<void> {
public:
Workspace(DeviceAPI* device, DGLContext ctx, const size_t size) :
device_(device),
ctx_(ctx),
ptr_(static_cast<void*>(device_->AllocWorkspace(ctx_, size))) {
}
Workspace(DeviceAPI* device, DGLContext ctx, const size_t size)
: device_(device),
ctx_(ctx),
ptr_(static_cast<void*>(device_->AllocWorkspace(ctx_, size))) {}
~Workspace() {
if (*this) {
......@@ -70,16 +67,14 @@ class Workspace<void> {
}
}
operator bool() const {
return ptr_ != nullptr;
}
operator bool() const { return ptr_ != nullptr; }
void * get() {
void* get() {
assert(*this);
return ptr_;
}
void const * get() const {
void const* get() const {
assert(*this);
return ptr_;
}
......@@ -93,7 +88,7 @@ class Workspace<void> {
private:
DeviceAPI* device_;
DGLContext ctx_;
void * ptr_;
void* ptr_;
};
} // namespace runtime
......
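Workspace<T> above is an RAII guard over DeviceAPI::AllocWorkspace/FreeWorkspace: allocate in the constructor, free in the destructor, and expose a checked get(). A self-contained sketch of the same idiom over plain malloc/free; ScratchBuffer is an illustrative name, not a DGL type, and it deliberately omits the device and context handling.

#include <cassert>
#include <cstdlib>

template <typename T>
class ScratchBuffer {
 public:
  explicit ScratchBuffer(size_t size)
      : ptr_(static_cast<T*>(std::malloc(sizeof(T) * size))) {}
  ~ScratchBuffer() {
    if (ptr_ != nullptr) std::free(ptr_);
  }
  ScratchBuffer(const ScratchBuffer&) = delete;
  ScratchBuffer& operator=(const ScratchBuffer&) = delete;

  operator bool() const { return ptr_ != nullptr; }

  T* get() {
    assert(ptr_ != nullptr);
    return ptr_;
  }

 private:
  T* ptr_;
};

// Usage: ScratchBuffer<float> buf(1024); if (buf) { float* data = buf.get(); }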
......@@ -4,6 +4,7 @@
* \brief Workspace pool utility.
*/
#include "workspace_pool.h"
#include <memory>
namespace dgl {
......@@ -26,7 +27,8 @@ class WorkspacePool::Pool {
// allocate from pool
void* Alloc(DGLContext ctx, DeviceAPI* device, size_t nbytes) {
// Allocate align to page.
nbytes = (nbytes + (kWorkspacePageSize - 1)) / kWorkspacePageSize * kWorkspacePageSize;
nbytes = (nbytes + (kWorkspacePageSize - 1)) / kWorkspacePageSize *
kWorkspacePageSize;
if (nbytes == 0) nbytes = kWorkspacePageSize;
Entry e;
DGLDataType type;
......@@ -39,7 +41,8 @@ class WorkspacePool::Pool {
if (e.size < nbytes) {
// resize the page
device->FreeDataSpace(ctx, e.data);
e.data = device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type);
e.data =
device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type);
e.size = nbytes;
}
} else if (free_list_.size() == 1) {
......@@ -49,7 +52,8 @@ class WorkspacePool::Pool {
if (free_list_.back().size >= nbytes) {
// find smallest fit
auto it = free_list_.end() - 2;
for (; it->size >= nbytes; --it) {}
for (; it->size >= nbytes; --it) {
}
e = *(it + 1);
free_list_.erase(it + 1);
} else {
......@@ -57,7 +61,8 @@ class WorkspacePool::Pool {
e = free_list_.back();
free_list_.pop_back();
device->FreeDataSpace(ctx, e.data);
e.data = device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type);
e.data =
device->AllocDataSpace(ctx, nbytes, kTempAllocaAlignment, type);
e.size = nbytes;
}
}
......@@ -73,7 +78,8 @@ class WorkspacePool::Pool {
allocated_.pop_back();
} else {
int index = static_cast<int>(allocated_.size()) - 2;
for (; index > 0 && allocated_[index].data != data; --index) {}
for (; index > 0 && allocated_[index].data != data; --index) {
}
CHECK_GT(index, 0) << "trying to free things that have not been allocated";
e = allocated_[index];
allocated_.erase(allocated_.begin() + index);
......@@ -113,16 +119,16 @@ class WorkspacePool::Pool {
std::vector<Entry> allocated_;
};
WorkspacePool::WorkspacePool(DGLDeviceType device_type, std::shared_ptr<DeviceAPI> device)
: device_type_(device_type), device_(device) {
}
WorkspacePool::WorkspacePool(
DGLDeviceType device_type, std::shared_ptr<DeviceAPI> device)
: device_type_(device_type), device_(device) {}
WorkspacePool::~WorkspacePool() {
/*
Comment out the destruct of WorkspacePool, due to Segmentation fault with MXNet
Since this will be only called at the termination of process,
not manually wiping out should not cause problems.
*/
/**
* Comment out the destruct of WorkspacePool, due to Segmentation fault with
* MXNet Since this will be only called at the termination of process, not
* manually wiping out should not cause problems.
*/
// for (size_t i = 0; i < array_.size(); ++i) {
// if (array_[i] != nullptr) {
// DGLContext ctx;
......@@ -145,8 +151,9 @@ void* WorkspacePool::AllocWorkspace(DGLContext ctx, size_t size) {
}
void WorkspacePool::FreeWorkspace(DGLContext ctx, void* ptr) {
CHECK(static_cast<size_t>(ctx.device_id) < array_.size() &&
array_[ctx.device_id] != nullptr);
CHECK(
static_cast<size_t>(ctx.device_id) < array_.size() &&
array_[ctx.device_id] != nullptr);
array_[ctx.device_id]->Free(ptr);
}
......
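Pool::Alloc above rounds every request up to a whole number of workspace pages before touching the free list. A worked example of that arithmetic; the concrete page size here (4096 bytes) is an assumption for illustration, while the real value comes from kWorkspacePageSize in DGL.

#include <cstddef>

constexpr size_t kAssumedPageSize = 4096;

constexpr size_t RoundUpToPage(size_t nbytes) {
  return nbytes == 0
             ? kAssumedPageSize
             : (nbytes + (kAssumedPageSize - 1)) / kAssumedPageSize *
                   kAssumedPageSize;
}

static_assert(RoundUpToPage(1) == 4096, "a 1-byte request occupies one page");
static_assert(RoundUpToPage(4096) == 4096, "exact multiples are unchanged");
static_assert(RoundUpToPage(4097) == 8192, "one byte over rolls to two pages");
static_assert(RoundUpToPage(0) == 4096, "zero-byte requests still get a page");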
......@@ -7,8 +7,9 @@
#define DGL_RUNTIME_WORKSPACE_POOL_H_
#include <dgl/runtime/device_api.h>
#include <vector>
#include <memory>
#include <vector>
namespace dgl {
namespace runtime {
......
......@@ -4,6 +4,7 @@
* \brief DGL Scheduler implementation
*/
#include <dgl/scheduler.h>
#include <unordered_map>
#include <vector>
......@@ -11,100 +12,97 @@ namespace dgl {
namespace sched {
template <class IdType>
std::vector<IdArray> DegreeBucketing(const IdArray& msg_ids, const IdArray& vids,
const IdArray& recv_ids) {
auto n_msgs = msg_ids->shape[0];
const IdType* vid_data = static_cast<IdType*>(vids->data);
const IdType* msg_id_data = static_cast<IdType*>(msg_ids->data);
const IdType* recv_id_data = static_cast<IdType*>(recv_ids->data);
// in edge: dst->msgs
std::unordered_map<IdType, std::vector<IdType>> in_edges;
for (IdType i = 0; i < n_msgs; ++i) {
in_edges[vid_data[i]].push_back(msg_id_data[i]);
}
std::vector<IdArray> DegreeBucketing(
const IdArray& msg_ids, const IdArray& vids, const IdArray& recv_ids) {
auto n_msgs = msg_ids->shape[0];
// bkt: deg->dsts
std::unordered_map<IdType, std::vector<IdType>> bkt;
for (const auto& it : in_edges) {
bkt[it.second.size()].push_back(it.first);
}
const IdType* vid_data = static_cast<IdType*>(vids->data);
const IdType* msg_id_data = static_cast<IdType*>(msg_ids->data);
const IdType* recv_id_data = static_cast<IdType*>(recv_ids->data);
std::unordered_set<IdType> zero_deg_nodes;
for (IdType i = 0; i < recv_ids->shape[0]; ++i) {
if (in_edges.find(recv_id_data[i]) == in_edges.end()) {
zero_deg_nodes.insert(recv_id_data[i]);
}
}
auto n_zero_deg = zero_deg_nodes.size();
// calc output size
IdType n_deg = bkt.size();
IdType n_dst = in_edges.size();
IdType n_mid_sec = bkt.size(); // zero deg won't affect message size
if (n_zero_deg > 0) {
n_deg += 1;
n_dst += n_zero_deg;
// in edge: dst->msgs
std::unordered_map<IdType, std::vector<IdType>> in_edges;
for (IdType i = 0; i < n_msgs; ++i) {
in_edges[vid_data[i]].push_back(msg_id_data[i]);
}
// bkt: deg->dsts
std::unordered_map<IdType, std::vector<IdType>> bkt;
for (const auto& it : in_edges) {
bkt[it.second.size()].push_back(it.first);
}
std::unordered_set<IdType> zero_deg_nodes;
for (IdType i = 0; i < recv_ids->shape[0]; ++i) {
if (in_edges.find(recv_id_data[i]) == in_edges.end()) {
zero_deg_nodes.insert(recv_id_data[i]);
}
}
auto n_zero_deg = zero_deg_nodes.size();
// calc output size
IdType n_deg = bkt.size();
IdType n_dst = in_edges.size();
IdType n_mid_sec = bkt.size(); // zero deg won't affect message size
if (n_zero_deg > 0) {
n_deg += 1;
n_dst += n_zero_deg;
}
// initialize output
IdArray degs = IdArray::Empty({n_deg}, vids->dtype, vids->ctx);
IdArray nids = IdArray::Empty({n_dst}, vids->dtype, vids->ctx);
IdArray nid_section = IdArray::Empty({n_deg}, vids->dtype, vids->ctx);
IdArray mids = IdArray::Empty({n_msgs}, vids->dtype, vids->ctx);
IdArray mid_section = IdArray::Empty({n_mid_sec}, vids->dtype, vids->ctx);
IdType* deg_ptr = static_cast<IdType*>(degs->data);
IdType* nid_ptr = static_cast<IdType*>(nids->data);
IdType* nsec_ptr = static_cast<IdType*>(nid_section->data);
IdType* mid_ptr = static_cast<IdType*>(mids->data);
IdType* msec_ptr = static_cast<IdType*>(mid_section->data);
// fill in bucketing ordering
for (const auto& it : bkt) { // for each bucket
const IdType deg = it.first;
const IdType bucket_size = it.second.size();
*deg_ptr++ = deg;
*nsec_ptr++ = bucket_size;
*msec_ptr++ = deg * bucket_size;
for (const auto dst : it.second) { // for each dst in this bucket
*nid_ptr++ = dst;
for (const auto mid : in_edges[dst]) { // for each in edge of dst
*mid_ptr++ = mid;
}
}
// initialize output
IdArray degs = IdArray::Empty({n_deg}, vids->dtype, vids->ctx);
IdArray nids = IdArray::Empty({n_dst}, vids->dtype, vids->ctx);
IdArray nid_section = IdArray::Empty({n_deg}, vids->dtype, vids->ctx);
IdArray mids = IdArray::Empty({n_msgs}, vids->dtype, vids->ctx);
IdArray mid_section = IdArray::Empty({n_mid_sec}, vids->dtype, vids->ctx);
IdType* deg_ptr = static_cast<IdType*>(degs->data);
IdType* nid_ptr = static_cast<IdType*>(nids->data);
IdType* nsec_ptr = static_cast<IdType*>(nid_section->data);
IdType* mid_ptr = static_cast<IdType*>(mids->data);
IdType* msec_ptr = static_cast<IdType*>(mid_section->data);
// fill in bucketing ordering
for (const auto& it : bkt) { // for each bucket
const IdType deg = it.first;
const IdType bucket_size = it.second.size();
*deg_ptr++ = deg;
*nsec_ptr++ = bucket_size;
*msec_ptr++ = deg * bucket_size;
for (const auto dst : it.second) { // for each dst in this bucket
*nid_ptr++ = dst;
for (const auto mid : in_edges[dst]) { // for each in edge of dst
*mid_ptr++ = mid;
}
}
}
if (n_zero_deg > 0) {
*deg_ptr = 0;
*nsec_ptr = n_zero_deg;
for (const auto dst : zero_deg_nodes) {
*nid_ptr++ = dst;
}
if (n_zero_deg > 0) {
*deg_ptr = 0;
*nsec_ptr = n_zero_deg;
for (const auto dst : zero_deg_nodes) {
*nid_ptr++ = dst;
}
}
std::vector<IdArray> ret;
ret.push_back(std::move(degs));
ret.push_back(std::move(nids));
ret.push_back(std::move(nid_section));
ret.push_back(std::move(mids));
ret.push_back(std::move(mid_section));
std::vector<IdArray> ret;
ret.push_back(std::move(degs));
ret.push_back(std::move(nids));
ret.push_back(std::move(nid_section));
ret.push_back(std::move(mids));
ret.push_back(std::move(mid_section));
return ret;
return ret;
}
template std::vector<IdArray> DegreeBucketing<int32_t>(const IdArray& msg_ids,
const IdArray& vids,
const IdArray& recv_ids);
template std::vector<IdArray> DegreeBucketing<int32_t>(
const IdArray& msg_ids, const IdArray& vids, const IdArray& recv_ids);
template std::vector<IdArray> DegreeBucketing<int64_t>(const IdArray& msg_ids,
const IdArray& vids,
const IdArray& recv_ids);
template std::vector<IdArray> DegreeBucketing<int64_t>(
const IdArray& msg_ids, const IdArray& vids, const IdArray& recv_ids);
template <class IdType>
std::vector<IdArray> GroupEdgeByNodeDegree(const IdArray& uids,
const IdArray& vids,
const IdArray& eids) {
std::vector<IdArray> GroupEdgeByNodeDegree(
const IdArray& uids, const IdArray& vids, const IdArray& eids) {
auto n_edge = eids->shape[0];
const IdType* eid_data = static_cast<IdType*>(eids->data);
const IdType* uid_data = static_cast<IdType*>(uids->data);
......
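DegreeBucketing above groups destination nodes by in-degree and emits flat arrays (degrees, node ids, node section sizes, message ids, message section sizes), with a trailing zero-degree bucket for receivers that got no messages. A self-contained sketch of the core grouping step using STL containers instead of IdArray; BucketByDegree is illustrative only and skips the section arrays and zero-degree handling.

#include <cstdint>
#include <map>
#include <unordered_map>
#include <vector>

std::map<int64_t, std::vector<int64_t>> BucketByDegree(
    const std::vector<int64_t>& msg_dst) {
  // dst -> list of incoming message indices
  std::unordered_map<int64_t, std::vector<int64_t>> in_edges;
  for (int64_t i = 0; i < static_cast<int64_t>(msg_dst.size()); ++i) {
    in_edges[msg_dst[i]].push_back(i);
  }
  // degree -> list of destinations with that in-degree
  std::map<int64_t, std::vector<int64_t>> buckets;
  for (const auto& kv : in_edges) {
    buckets[static_cast<int64_t>(kv.second.size())].push_back(kv.first);
  }
  return buckets;
}

// Example: msg_dst = {0, 0, 1, 2, 2, 2} yields buckets {1: [1], 2: [0], 3: [2]}:
// node 1 receives one message, node 0 receives two, node 2 receives three.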
......@@ -6,8 +6,9 @@
#include <dgl/array.h>
#include <dgl/graph.h>
#include <dgl/scheduler.h>
#include "../c_api_common.h"
#include "../array/cpu/array_utils.h"
#include "../c_api_common.h"
using dgl::runtime::DGLArgs;
using dgl::runtime::DGLRetValue;
......@@ -15,30 +16,32 @@ using dgl::runtime::NDArray;
namespace dgl {
DGL_REGISTER_GLOBAL("_deprecate.runtime.degree_bucketing._CAPI_DGLDegreeBucketing")
.set_body([](DGLArgs args, DGLRetValue* rv) {
const IdArray msg_ids = args[0];
const IdArray vids = args[1];
const IdArray nids = args[2];
CHECK_SAME_DTYPE(msg_ids, vids);
CHECK_SAME_DTYPE(msg_ids, nids);
ATEN_ID_TYPE_SWITCH(msg_ids->dtype, IdType, {
*rv = ConvertNDArrayVectorToPackedFunc(
sched::DegreeBucketing<IdType>(msg_ids, vids, nids));
DGL_REGISTER_GLOBAL(
"_deprecate.runtime.degree_bucketing._CAPI_DGLDegreeBucketing")
.set_body([](DGLArgs args, DGLRetValue* rv) {
const IdArray msg_ids = args[0];
const IdArray vids = args[1];
const IdArray nids = args[2];
CHECK_SAME_DTYPE(msg_ids, vids);
CHECK_SAME_DTYPE(msg_ids, nids);
ATEN_ID_TYPE_SWITCH(msg_ids->dtype, IdType, {
*rv = ConvertNDArrayVectorToPackedFunc(
sched::DegreeBucketing<IdType>(msg_ids, vids, nids));
});
});
});
DGL_REGISTER_GLOBAL("_deprecate.runtime.degree_bucketing._CAPI_DGLGroupEdgeByNodeDegree")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
const IdArray uids = args[0];
const IdArray vids = args[1];
const IdArray eids = args[2];
CHECK_SAME_DTYPE(uids, vids);
CHECK_SAME_DTYPE(uids, eids);
ATEN_ID_TYPE_SWITCH(uids->dtype, IdType, {
*rv = ConvertNDArrayVectorToPackedFunc(
sched::GroupEdgeByNodeDegree<IdType>(uids, vids, eids));
DGL_REGISTER_GLOBAL(
"_deprecate.runtime.degree_bucketing._CAPI_DGLGroupEdgeByNodeDegree")
.set_body([](DGLArgs args, DGLRetValue* rv) {
const IdArray uids = args[0];
const IdArray vids = args[1];
const IdArray eids = args[2];
CHECK_SAME_DTYPE(uids, vids);
CHECK_SAME_DTYPE(uids, eids);
ATEN_ID_TYPE_SWITCH(uids->dtype, IdType, {
*rv = ConvertNDArrayVectorToPackedFunc(
sched::GroupEdgeByNodeDegree<IdType>(uids, vids, eids));
});
});
});
} // namespace dgl
......@@ -3,8 +3,8 @@
* \file tensoradapter.h
* \brief Header file for functions exposed by the adapter library.
*
* Functions in this library must be exported with extern "C" so that DGL can locate
* them with dlsym(3) (or GetProcAddress on Windows).
* Functions in this library must be exported with extern "C" so that DGL can
* locate them with dlsym(3) (or GetProcAddress on Windows).
*/
#ifndef TENSORADAPTER_H_
......@@ -66,7 +66,6 @@ cudaStream_t CUDACurrentStream();
*/
void RecordStream(void* ptr, cudaStream_t stream, int device_id);
#endif // DGL_USE_CUDA
}
}; // namespace tensoradapter
......
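The header comment above explains that the adapter symbols are exported with extern "C" so DGL can resolve them with dlsym(3) (or GetProcAddress on Windows). A minimal POSIX sketch of that lookup pattern (link with -ldl); the library name and the alloc signature are assumptions for illustration, not the exact names DGL uses at runtime.

#include <dlfcn.h>

#include <cstddef>
#include <cstdio>

using RawAllocFn = void* (*)(std::size_t);

int main() {
  void* handle = dlopen("libtensoradapter_pytorch.so", RTLD_LAZY);
  if (handle == nullptr) {
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }
  // extern "C" keeps the exported name unmangled, so a plain string works.
  auto raw_alloc = reinterpret_cast<RawAllocFn>(dlsym(handle, "CPURawAlloc"));
  if (raw_alloc == nullptr) {
    std::fprintf(stderr, "dlsym failed: %s\n", dlerror());
  }
  dlclose(handle);
  return 0;
}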
......@@ -4,11 +4,10 @@
* \brief Implementation of PyTorch adapter library.
*/
#include <tensoradapter_exports.h>
#include <c10/core/CPUAllocator.h>
#include <tensoradapter_exports.h>
#ifdef DGL_USE_CUDA
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda_runtime.h>
......@@ -29,8 +28,7 @@ TA_EXPORTS void CPURawDelete(void* ptr) {
#ifdef DGL_USE_CUDA
TA_EXPORTS void* CUDARawAlloc(size_t nbytes, cudaStream_t stream) {
at::globalContext().lazyInitCUDA();
return c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(
nbytes, stream);
return c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(nbytes, stream);
}
TA_EXPORTS void CUDARawDelete(void* ptr) {
......@@ -43,22 +41,21 @@ TA_EXPORTS cudaStream_t CUDACurrentStream() {
TA_EXPORTS void RecordStream(void* ptr, cudaStream_t stream, int device_id) {
c10::DataPtr data_ptr{
ptr, ptr, &c10::cuda::CUDACachingAllocator::raw_delete,
c10::Device(c10::DeviceType::CUDA, device_id)};
ptr, ptr, &c10::cuda::CUDACachingAllocator::raw_delete,
c10::Device(c10::DeviceType::CUDA, device_id)};
c10::cuda::CUDACachingAllocator::recordStream(
data_ptr,
// getStreamFromExternal doesn't exist before PyTorch 1.10, just copy it here
c10::cuda::CUDAStream(
c10::cuda::CUDAStream::UNCHECKED,
c10::Stream(
c10::Stream::UNSAFE,
c10::Device(c10::DeviceType::CUDA, device_id),
reinterpret_cast<int64_t>(stream)))
);
data_ptr,
// getStreamFromExternal doesn't exist before PyTorch 1.10, just copy it
// here
c10::cuda::CUDAStream(
c10::cuda::CUDAStream::UNCHECKED,
c10::Stream(
c10::Stream::UNSAFE,
c10::Device(c10::DeviceType::CUDA, device_id),
reinterpret_cast<int64_t>(stream))));
data_ptr.release_context();
}
#endif // DGL_USE_CUDA
};
}; // namespace tensoradapter
......@@ -22,9 +22,7 @@ inline int32_t* PI32(dgl::runtime::NDArray nd) {
return static_cast<int32_t*>(nd->data);
}
inline int64_t Len(dgl::runtime::NDArray nd) {
return nd->shape[0];
}
inline int64_t Len(dgl::runtime::NDArray nd) { return nd->shape[0]; }
template <typename T>
inline bool ArrayEQ(dgl::runtime::NDArray a1, dgl::runtime::NDArray a2) {
......@@ -35,8 +33,7 @@ inline bool ArrayEQ(dgl::runtime::NDArray a1, dgl::runtime::NDArray a2) {
if (a1.NumElements() == 0) return true;
int64_t num = 1;
for (int i = 0; i < a1->ndim; ++i) {
if (a1->shape[i] != a2->shape[i])
return false;
if (a1->shape[i] != a2->shape[i]) return false;
num *= a1->shape[i];
}
if (a1->ctx != a2->ctx) return false;
......@@ -50,11 +47,9 @@ inline bool ArrayEQ(dgl::runtime::NDArray a1, dgl::runtime::NDArray a2) {
template <typename T>
inline bool IsInArray(dgl::runtime::NDArray a, T x) {
if (!a.defined() || a->shape[0] == 0)
return false;
if (!a.defined() || a->shape[0] == 0) return false;
for (int64_t i = 0; i < a->shape[0]; ++i) {
if (x == static_cast<T*>(a->data)[i])
return true;
if (x == static_cast<T*>(a->data)[i]) return true;
}
return false;
}
......
......@@ -3,10 +3,10 @@
* \file graph_index_test.cc
* \brief Test GraphIndex
*/
#include <gtest/gtest.h>
#include <dgl/graph.h>
#include <gtest/gtest.h>
TEST(GraphTest, TestNumVertices){
TEST(GraphTest, TestNumVertices) {
dgl::Graph g;
g.AddVertices(10);
ASSERT_EQ(g.NumVertices(), 10);
......
......@@ -4,15 +4,16 @@
* \brief Message queue for DGL distributed training.
*/
#include <gtest/gtest.h>
#include <string>
#include <thread>
#include <vector>
#include "../src/rpc/network/msg_queue.h"
using std::string;
using dgl::network::Message;
using dgl::network::MessageQueue;
using std::string;
TEST(MessageQueueTest, AddRemove) {
MessageQueue queue(5, 1); // size:5, num_of_producer:1
......@@ -50,7 +51,7 @@ TEST(MessageQueueTest, AddRemove) {
// msg 9
std::string str_9("666666");
Message msg_9 = {const_cast<char*>(str_9.data()), 6};
EXPECT_EQ(queue.Add(msg_9), MSG_GT_SIZE); // exceed queue size
EXPECT_EQ(queue.Add(msg_9), MSG_GT_SIZE); // exceed queue size
// msg 10
std::string str_10("55555");
Message msg_10 = {const_cast<char*>(str_10.data()), 5};
......@@ -92,7 +93,7 @@ TEST(MessageQueueTest, MultiThread) {
for (int i = 0; i < kNumOfProducer; ++i) {
thread_pool.push_back(new std::thread(start_add, &queue, i));
}
for (int i = 0; i < kNumOfProducer*kNumOfMessage; ++i) {
for (int i = 0; i < kNumOfProducer * kNumOfMessage; ++i) {
Message msg;
EXPECT_EQ(queue.Remove(&msg), REMOVE_SUCCESS);
EXPECT_EQ(string(msg.data, msg.size), string("apple"));
......
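The test above drives a bounded, multi-producer MessageQueue and drains it from the main thread. A self-contained sketch of the same producer/consumer pattern using only standard-library primitives; BoundedQueue is illustrative and is not DGL's MessageQueue API (which reports Add/Remove status codes rather than blocking unconditionally).

#include <condition_variable>
#include <mutex>
#include <queue>
#include <string>
#include <thread>
#include <vector>

class BoundedQueue {
 public:
  explicit BoundedQueue(size_t capacity) : capacity_(capacity) {}

  void Push(std::string msg) {
    std::unique_lock<std::mutex> lock(mu_);
    not_full_.wait(lock, [&] { return q_.size() < capacity_; });
    q_.push(std::move(msg));
    not_empty_.notify_one();
  }

  std::string Pop() {
    std::unique_lock<std::mutex> lock(mu_);
    not_empty_.wait(lock, [&] { return !q_.empty(); });
    std::string msg = std::move(q_.front());
    q_.pop();
    not_full_.notify_one();
    return msg;
  }

 private:
  const size_t capacity_;
  std::mutex mu_;
  std::condition_variable not_full_, not_empty_;
  std::queue<std::string> q_;
};

int main() {
  BoundedQueue queue(5);  // capacity 5, as in the test fixture
  std::vector<std::thread> producers;
  for (int i = 0; i < 2; ++i) {
    producers.emplace_back([&queue] {
      for (int n = 0; n < 10; ++n) queue.Push("apple");
    });
  }
  for (int n = 0; n < 2 * 10; ++n) queue.Pop();
  for (auto& t : producers) t.join();
  return 0;
}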
......@@ -3,26 +3,28 @@
* \file socket_communicator_test.cc
* \brief Test SocketCommunicator
*/
#include "../src/rpc/network/socket_communicator.h"
#include <gtest/gtest.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <chrono>
#include <fstream>
#include <streambuf>
#include <string>
#include <thread>
#include <vector>
#include <fstream>
#include <streambuf>
#include <chrono>
#include <stdlib.h>
#include <time.h>
#include "../src/rpc/network/msg_queue.h"
#include "../src/rpc/network/socket_communicator.h"
using std::string;
using dgl::network::SocketSender;
using dgl::network::SocketReceiver;
using dgl::network::Message;
using dgl::network::DefaultMessageDeleter;
using dgl::network::Message;
using dgl::network::SocketReceiver;
using dgl::network::SocketSender;
const int64_t kQueueSize = 500 * 1024;
const int kThreadNum = 2;
......@@ -35,10 +37,7 @@ const int kNumReceiver = 3;
const int kNumMessage = 10;
const char* ip_addr[] = {
"tcp://127.0.0.1:50091",
"tcp://127.0.0.1:50092",
"tcp://127.0.0.1:50093"
};
"tcp://127.0.0.1:50091", "tcp://127.0.0.1:50092", "tcp://127.0.0.1:50093"};
static void start_client();
static void start_server(int id);
......@@ -70,7 +69,7 @@ TEST(SocketCommunicatorTest, SendAndRecvTimeout) {
sender.ConnectReceiver(ip_addr[0], 0);
sender.ConnectReceiverFinalize(kMaxTryTimes);
for (int i = 0; i < 2; ++i) {
char *str_data = new char[9];
char* str_data = new char[9];
memcpy(str_data, "123456789", 9);
Message msg = {str_data, 9};
msg.deallocator = DefaultMessageDeleter;
......@@ -144,7 +143,7 @@ void start_server(int id) {
msg.deallocator(&msg);
}
}
for (int n = 0; n < kNumSender*kNumMessage; ++n) {
for (int n = 0; n < kNumSender * kNumMessage; ++n) {
Message msg;
int recv_id;
EXPECT_EQ(receiver.Recv(&msg, &recv_id), REMOVE_SUCCESS);
......@@ -169,9 +168,7 @@ TEST(SocketCommunicatorTest, TCPSocketBind) {
#pragma comment(lib, "ws2_32.lib")
void sleep(int seconds) {
Sleep(seconds * 1000);
}
void sleep(int seconds) { Sleep(seconds * 1000); }
static void start_client();
static bool start_server();
......@@ -181,9 +178,7 @@ DWORD WINAPI _ClientThreadFunc(LPVOID param) {
return 0;
}
DWORD WINAPI _ServerThreadFunc(LPVOID param) {
return start_server() ? 1 : 0;
}
DWORD WINAPI _ServerThreadFunc(LPVOID param) { return start_server() ? 1 : 0; }
TEST(SocketCommunicatorTest, SendAndRecv) {
HANDLE hThreads[2];
......@@ -191,7 +186,7 @@ TEST(SocketCommunicatorTest, SendAndRecv) {
DWORD retcode, exitcode;
srand((unsigned)time(NULL));
int port = (rand() % (5000-3000+1))+ 3000;
int port = (rand() % (5000 - 3000 + 1)) + 3000;
std::string ip_addr = "tcp://127.0.0.1:" + std::to_string(port);
std::ofstream out("addr.txt");
out << ip_addr;
......@@ -199,9 +194,11 @@ TEST(SocketCommunicatorTest, SendAndRecv) {
ASSERT_EQ(::WSAStartup(MAKEWORD(2, 2), &wsaData), 0);
hThreads[0] = ::CreateThread(NULL, 0, _ClientThreadFunc, NULL, 0, NULL); // client
hThreads[0] =
::CreateThread(NULL, 0, _ClientThreadFunc, NULL, 0, NULL); // client
ASSERT_TRUE(hThreads[0] != NULL);
hThreads[1] = ::CreateThread(NULL, 0, _ServerThreadFunc, NULL, 0, NULL); // server
hThreads[1] =
::CreateThread(NULL, 0, _ServerThreadFunc, NULL, 0, NULL); // server
ASSERT_TRUE(hThreads[1] != NULL);
retcode = ::WaitForMultipleObjects(2, hThreads, TRUE, INFINITE);
......@@ -218,8 +215,8 @@ TEST(SocketCommunicatorTest, SendAndRecv) {
static void start_client() {
std::ifstream t("addr.txt");
std::string ip_addr((std::istreambuf_iterator<char>(t)),
std::istreambuf_iterator<char>());
std::string ip_addr(
(std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());
t.close();
SocketSender sender(kQueueSize, kThreadNum);
sender.ConnectReceiver(ip_addr.c_str(), 0);
......@@ -235,8 +232,8 @@ static void start_client() {
static bool start_server() {
sleep(5);
std::ifstream t("addr.txt");
std::string ip_addr((std::istreambuf_iterator<char>(t)),
std::istreambuf_iterator<char>());
std::string ip_addr(
(std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());
t.close();
SocketReceiver receiver(kQueueSize, kThreadNum);
receiver.Wait(ip_addr.c_str(), 1);
......
......@@ -4,15 +4,16 @@
* \brief Test String Common
*/
#include <gtest/gtest.h>
#include <string>
#include <vector>
#include "../src/rpc/network/common.h"
using dgl::network::SplitStringUsing;
using dgl::network::StringPrintf;
using dgl::network::SStringPrintf;
using dgl::network::StringAppendF;
using dgl::network::StringPrintf;
TEST(SplitStringTest, SplitStringUsingCompoundDelim) {
std::string full(" apple \torange ");
......
#include <gtest/gtest.h>
#include <dgl/array.h>
#include <dgl/kernel.h>
#include <gtest/gtest.h>
#include "../../src/array/cpu/array_utils.h" // PairHash
#include "./common.h"
......@@ -22,7 +23,8 @@ std::unordered_map<std::pair<IdType, IdType>, DType, aten::PairHash> COOToMap(
for (int64_t i = 0; i < coo.row->shape[0]; ++i) {
IdType irow = aten::IndexSelect<IdType>(coo.row, i);
IdType icol = aten::IndexSelect<IdType>(coo.col, i);
IdType ieid = aten::COOHasData(coo) ? aten::IndexSelect<IdType>(coo.data, i) : i;
IdType ieid =
aten::COOHasData(coo) ? aten::IndexSelect<IdType>(coo.data, i) : i;
DType idata = aten::IndexSelect<DType>(weights, ieid);
map.insert({{irow, icol}, idata});
}
......@@ -31,22 +33,16 @@ std::unordered_map<std::pair<IdType, IdType>, DType, aten::PairHash> COOToMap(
template <typename IdType, typename DType>
bool CSRIsClose(
aten::CSRMatrix A,
aten::CSRMatrix B,
NDArray A_weights,
NDArray B_weights,
DType rtol,
DType atol) {
aten::CSRMatrix A, aten::CSRMatrix B, NDArray A_weights, NDArray B_weights,
DType rtol, DType atol) {
auto Amap = COOToMap<IdType, DType>(CSRToCOO(A, false), A_weights);
auto Bmap = COOToMap<IdType, DType>(CSRToCOO(B, false), B_weights);
if (Amap.size() != Bmap.size())
return false;
if (Amap.size() != Bmap.size()) return false;
for (auto itA : Amap) {
auto itB = Bmap.find(itA.first);
if (itB == Bmap.end())
return false;
if (itB == Bmap.end()) return false;
if (fabs(itA.second - itB->second) >= rtol * fabs(itA.second) + atol)
return false;
}
......@@ -62,8 +58,7 @@ std::pair<aten::CSRMatrix, NDArray> CSR_A(DGLContext ctx = CTX) {
// [0. , 0. , 0. , 0. , 0.2]])
// (0.+ indicates that the entry exists but the value is 0.)
auto csr = aten::CSRMatrix(
4, 5,
NDArray::FromVector(std::vector<IdType>({0, 2, 4, 7, 8}), ctx),
4, 5, NDArray::FromVector(std::vector<IdType>({0, 2, 4, 7, 8}), ctx),
NDArray::FromVector(std::vector<IdType>({2, 3, 2, 3, 0, 1, 3, 4}), ctx),
NDArray::FromVector(std::vector<IdType>({1, 0, 2, 3, 4, 5, 6, 7}), ctx));
auto weights = NDArray::FromVector(
......@@ -80,11 +75,13 @@ std::pair<aten::CSRMatrix, NDArray> CSR_B(DGLContext ctx = CTX) {
// [0.2, 0.4, 0. , 0. , 0. , 0. ]])
// (0.+ indicates that the entry exists but the value is 0.)
auto csr = aten::CSRMatrix(
5, 6,
NDArray::FromVector(std::vector<IdType>({0, 3, 4, 6, 10, 12}), ctx),
NDArray::FromVector(std::vector<IdType>({1, 3, 5, 5, 0, 5, 0, 1, 2, 3, 0, 1}), ctx));
5, 6, NDArray::FromVector(std::vector<IdType>({0, 3, 4, 6, 10, 12}), ctx),
NDArray::FromVector(
std::vector<IdType>({1, 3, 5, 5, 0, 5, 0, 1, 2, 3, 0, 1}), ctx));
auto weights = NDArray::FromVector(
std::vector<DType>({0.9, 0.6, 0.3, 0.4, 0.0, 0.9, 0.8, 0.2, 0.3, 0.2, 0.2, 0.4}), ctx);
std::vector<DType>(
{0.9, 0.6, 0.3, 0.4, 0.0, 0.9, 0.8, 0.2, 0.3, 0.2, 0.2, 0.4}),
ctx);
return {csr, weights};
}
......@@ -95,11 +92,10 @@ std::pair<aten::CSRMatrix, NDArray> CSR_C(DGLContext ctx = CTX) {
// [0. , 0.2, 0. , 0.9, 0.2],
// [0. , 1. , 0. , 0.7, 0. ]])
auto csr = aten::CSRMatrix(
4, 5,
NDArray::FromVector(std::vector<IdType>({0, 1, 3, 6, 8}), ctx),
4, 5, NDArray::FromVector(std::vector<IdType>({0, 1, 3, 6, 8}), ctx),
NDArray::FromVector(std::vector<IdType>({3, 3, 4, 1, 3, 4, 1, 3}), ctx));
auto weights = NDArray::FromVector(
std::vector<DType>({0.2, 0.5, 0.4, 0.2, 0.9, 0.2, 1. , 0.7}), ctx);
std::vector<DType>({0.2, 0.5, 0.4, 0.2, 0.9, 0.2, 1., 0.7}), ctx);
return {csr, weights};
}
......@@ -111,31 +107,36 @@ std::pair<aten::CSRMatrix, NDArray> CSR_A_mm_B(DGLContext ctx = CTX) {
// [0.04, 0.08, 0. , 0. , 0. , 0. ]])
// (0.+ indicates that the entry exists but the value is 0.)
auto csr = aten::CSRMatrix(
4, 6,
NDArray::FromVector(std::vector<IdType>({0, 5, 10, 15, 17}), ctx),
NDArray::FromVector(std::vector<IdType>(
{0, 1, 2, 3, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 5, 0, 1}), ctx));
4, 6, NDArray::FromVector(std::vector<IdType>({0, 5, 10, 15, 17}), ctx),
NDArray::FromVector(
std::vector<IdType>(
{0, 1, 2, 3, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 5, 0, 1}),
ctx));
auto weights = NDArray::FromVector(
std::vector<DType>({
0.56, 0.14, 0.21, 0.14, 0.9 , 0. , 0. , 0. , 0. , 0.45, 0.16, 0.4 , 0.06, 0.28, 0.4 ,
0.04, 0.08}), ctx);
std::vector<DType>(
{0.56, 0.14, 0.21, 0.14, 0.9, 0., 0., 0., 0., 0.45, 0.16, 0.4, 0.06,
0.28, 0.4, 0.04, 0.08}),
ctx);
return {csr, weights};
}
template <typename IdType, typename DType>
std::pair<aten::CSRMatrix, NDArray> CSR_A_plus_C(DGLContext ctx = CTX) {
auto csr = aten::CSRMatrix(
4, 5,
NDArray::FromVector(std::vector<IdType>({0, 2, 5, 9, 12}), ctx),
NDArray::FromVector(std::vector<IdType>({2, 3, 2, 3, 4, 0, 1, 3, 4, 1, 3, 4}), ctx));
4, 5, NDArray::FromVector(std::vector<IdType>({0, 2, 5, 9, 12}), ctx),
NDArray::FromVector(
std::vector<IdType>({2, 3, 2, 3, 4, 0, 1, 3, 4, 1, 3, 4}), ctx));
auto weights = NDArray::FromVector(
std::vector<DType>({1. , 0.9, 0.5, 0.5, 0.4, 0.4, 0.9, 1.1, 0.2, 1. , 0.7, 0.2}), ctx);
std::vector<DType>(
{1., 0.9, 0.5, 0.5, 0.4, 0.4, 0.9, 1.1, 0.2, 1., 0.7, 0.2}),
ctx);
return {csr, weights};
}
template <typename DType>
NDArray CSR_A_mask_C(DGLContext ctx = CTX) {
return NDArray::FromVector(std::vector<DType>({0.7, 0.0, 0.0, 0.7, 0.2, 0.0, 0.0, 0.0}), ctx);
return NDArray::FromVector(
std::vector<DType>({0.7, 0.0, 0.0, 0.7, 0.2, 0.0, 0.0, 0.0}), ctx);
}
template <typename IdType, typename DType>
......@@ -144,7 +145,8 @@ void _TestCsrmm(DGLContext ctx = CTX) {
auto B = CSR_B<IdType, DType>(ctx);
auto A_mm_B = aten::CSRMM(A.first, A.second, B.first, B.second);
auto A_mm_B2 = CSR_A_mm_B<IdType, DType>(ctx);
bool result = CSRIsClose<IdType, DType>(A_mm_B.first, A_mm_B2.first, A_mm_B.second, A_mm_B2.second, 1e-4, 1e-4);
bool result = CSRIsClose<IdType, DType>(
A_mm_B.first, A_mm_B2.first, A_mm_B.second, A_mm_B2.second, 1e-4, 1e-4);
ASSERT_TRUE(result);
}
......@@ -155,7 +157,8 @@ void _TestCsrsum(DGLContext ctx = CTX) {
auto A_plus_C = aten::CSRSum({A.first, C.first}, {A.second, C.second});
auto A_plus_C2 = CSR_A_plus_C<IdType, DType>(ctx);
bool result = CSRIsClose<IdType, DType>(
A_plus_C.first, A_plus_C2.first, A_plus_C.second, A_plus_C2.second, 1e-4, 1e-4);
A_plus_C.first, A_plus_C2.first, A_plus_C.second, A_plus_C2.second, 1e-4,
1e-4);
ASSERT_TRUE(result);
}
......@@ -164,7 +167,8 @@ void _TestCsrmask(DGLContext ctx = CTX) {
auto A = CSR_A<IdType, DType>(ctx);
auto C = CSR_C<IdType, DType>(ctx);
auto C_coo = CSRToCOO(C.first, false);
auto A_mask_C = aten::CSRGetData<DType>(A.first, C_coo.row, C_coo.col, A.second, 0);
auto A_mask_C =
aten::CSRGetData<DType>(A.first, C_coo.row, C_coo.col, A.second, 0);
auto A_mask_C2 = CSR_A_mask_C<DType>(ctx);
ASSERT_TRUE(ArrayEQ<DType>(A_mask_C, A_mask_C2));
}
......
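CSRIsClose above compares two weighted sparse matrices entry by entry, accepting a pair (a, b) when |a - b| < rtol * |a| + atol. A tiny standalone restatement of that tolerance test; IsClose is a helper written here for illustration, not part of the test suite.

#include <cassert>
#include <cmath>

bool IsClose(double a, double b, double rtol, double atol) {
  return std::fabs(a - b) < rtol * std::fabs(a) + atol;
}

int main() {
  assert(IsClose(0.56, 0.56004, 1e-4, 1e-4));  // within 1e-4 * 0.56 + 1e-4
  assert(!IsClose(0.56, 0.57, 1e-4, 1e-4));    // off by 0.01, rejected
  return 0;
}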
#include <gtest/gtest.h>
#include "../../src/partition/ndarray_partition.h"
#include "./common.h"
using namespace dgl;
using namespace dgl::partition;
template<DGLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
void _TestRemainder_GeneratePermutation() {
const int64_t size = 160000;
const int num_parts = 7;
NDArrayPartitionRef part = CreatePartitionRemainderBased(
size, num_parts);
NDArrayPartitionRef part = CreatePartitionRemainderBased(size, num_parts);
IdArray idxs = aten::Range(0, size/10, sizeof(IdType)*8,
DGLContext{XPU, 0});
IdArray idxs =
aten::Range(0, size / 10, sizeof(IdType) * 8, DGLContext{XPU, 0});
std::pair<IdArray, IdArray> result = part->GeneratePermutation(idxs);
......@@ -22,41 +21,40 @@ void _TestRemainder_GeneratePermutation() {
IdArray perm = result.first.CopyTo(DGLContext{kDGLCPU, 0});
ASSERT_TRUE(perm.Ptr<IdType>() != nullptr);
ASSERT_EQ(perm->shape[0], idxs->shape[0]);
const IdType * const perm_cpu = static_cast<const IdType*>(perm->data);
const IdType* const perm_cpu = static_cast<const IdType*>(perm->data);
// second part of result should be the counts
IdArray counts = result.second.CopyTo(DGLContext{kDGLCPU, 0});
ASSERT_TRUE(counts.Ptr<int64_t>() != nullptr);
ASSERT_EQ(counts->shape[0], num_parts);
const int64_t * const counts_cpu = static_cast<const int64_t*>(counts->data);
const int64_t* const counts_cpu = static_cast<const int64_t*>(counts->data);
std::vector<int64_t> prefix(num_parts+1, 0);
std::vector<int64_t> prefix(num_parts + 1, 0);
for (int p = 0; p < num_parts; ++p) {
prefix[p+1] = prefix[p] + counts_cpu[p];
prefix[p + 1] = prefix[p] + counts_cpu[p];
}
ASSERT_EQ(prefix.back(), idxs->shape[0]);
// copy original indexes to cpu
idxs = idxs.CopyTo(DGLContext{kDGLCPU, 0});
const IdType * const idxs_cpu = static_cast<const IdType*>(idxs->data);
const IdType* const idxs_cpu = static_cast<const IdType*>(idxs->data);
for (int p = 0; p < num_parts; ++p) {
for (int64_t i = prefix[p]; i < prefix[p+1]; ++i) {
for (int64_t i = prefix[p]; i < prefix[p + 1]; ++i) {
EXPECT_EQ(idxs_cpu[perm_cpu[i]] % num_parts, p);
}
}
}
template<DGLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
void _TestRemainder_MapToX() {
const int64_t size = 160000;
const int num_parts = 7;
NDArrayPartitionRef part = CreatePartitionRemainderBased(
size, num_parts);
NDArrayPartitionRef part = CreatePartitionRemainderBased(size, num_parts);
for (int part_id = 0; part_id < num_parts; ++part_id) {
IdArray local = aten::Range(0, part->PartSize(part_id), sizeof(IdType)*8,
DGLContext{XPU, 0});
IdArray local = aten::Range(
0, part->PartSize(part_id), sizeof(IdType) * 8, DGLContext{XPU, 0});
IdArray global = part->MapToGlobal(local, part_id);
IdArray act_local = part->MapToLocal(global).CopyTo(CPU);
......@@ -64,8 +62,9 @@ void _TestRemainder_MapToX() {
ASSERT_EQ(global->shape[0], local->shape[0]);
global = global.CopyTo(CPU);
for (size_t i = 0; i < global->shape[0]; ++i) {
EXPECT_EQ(Ptr<IdType>(global)[i] % num_parts, part_id) << "i=" << i <<
", num_parts=" << num_parts << ", part_id=" << part_id;
EXPECT_EQ(Ptr<IdType>(global)[i] % num_parts, part_id)
<< "i=" << i << ", num_parts=" << num_parts
<< ", part_id=" << part_id;
}
// the remapped local indices should match the original
......@@ -77,7 +76,6 @@ void _TestRemainder_MapToX() {
}
}
TEST(PartitionTest, TestRemainderPartition) {
#ifdef DGL_USE_CUDA
_TestRemainder_GeneratePermutation<kDGLCUDA, int32_t>();
......@@ -89,15 +87,10 @@ TEST(PartitionTest, TestRemainderPartition) {
// CPU is not implemented
}
template<typename INDEX, typename RANGE>
int _FindPart(
const INDEX idx,
const RANGE * const range,
const int num_parts)
{
template <typename INDEX, typename RANGE>
int _FindPart(const INDEX idx, const RANGE* const range, const int num_parts) {
for (int i = 0; i < num_parts; ++i) {
if (range[i+1] > idx) {
if (range[i + 1] > idx) {
return i;
}
}
......@@ -105,21 +98,21 @@ int _FindPart(
return -1;
}
template<DGLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
void _TestRange_GeneratePermutation() {
const int64_t size = 160000;
const int num_parts = 7;
IdArray range = aten::NewIdArray(num_parts+1, DGLContext{kDGLCPU, 0},
sizeof(IdType)*8);
IdArray range = aten::NewIdArray(
num_parts + 1, DGLContext{kDGLCPU, 0}, sizeof(IdType) * 8);
for (int i = 0; i < num_parts; ++i) {
range.Ptr<IdType>()[i] = (size/num_parts)*i;
range.Ptr<IdType>()[i] = (size / num_parts) * i;
}
range.Ptr<IdType>()[num_parts] = size;
NDArrayPartitionRef part = CreatePartitionRangeBased(
size, num_parts, range.CopyTo(DGLContext{XPU, 0}));
size, num_parts, range.CopyTo(DGLContext{XPU, 0}));
IdArray idxs = aten::Range(0, size/10, sizeof(IdType)*8,
DGLContext{XPU, 0});
IdArray idxs =
aten::Range(0, size / 10, sizeof(IdType) * 8, DGLContext{XPU, 0});
std::pair<IdArray, IdArray> result = part->GeneratePermutation(idxs);
......@@ -127,55 +120,59 @@ void _TestRange_GeneratePermutation() {
IdArray perm = result.first.CopyTo(DGLContext{kDGLCPU, 0});
ASSERT_TRUE(perm.Ptr<IdType>() != nullptr);
ASSERT_EQ(perm->shape[0], idxs->shape[0]);
const IdType * const perm_cpu = static_cast<const IdType*>(perm->data);
const IdType* const perm_cpu = static_cast<const IdType*>(perm->data);
// second part of result should be the counts
IdArray counts = result.second.CopyTo(DGLContext{kDGLCPU, 0});
ASSERT_TRUE(counts.Ptr<int64_t>() != nullptr);
ASSERT_EQ(counts->shape[0], num_parts);
const int64_t * const counts_cpu = static_cast<const int64_t*>(counts->data);
const int64_t* const counts_cpu = static_cast<const int64_t*>(counts->data);
std::vector<int64_t> prefix(num_parts+1, 0);
std::vector<int64_t> prefix(num_parts + 1, 0);
for (int p = 0; p < num_parts; ++p) {
prefix[p+1] = prefix[p] + counts_cpu[p];
prefix[p + 1] = prefix[p] + counts_cpu[p];
}
ASSERT_EQ(prefix.back(), idxs->shape[0]);
// copy original indexes to cpu
idxs = idxs.CopyTo(DGLContext{kDGLCPU, 0});
const IdType * const idxs_cpu = static_cast<const IdType*>(idxs->data);
const IdType* const idxs_cpu = static_cast<const IdType*>(idxs->data);
for (int p = 0; p < num_parts; ++p) {
for (int64_t i = prefix[p]; i < prefix[p+1]; ++i) {
EXPECT_EQ(_FindPart(idxs_cpu[perm_cpu[i]], range.Ptr<IdType>(), num_parts), p);
for (int64_t i = prefix[p]; i < prefix[p + 1]; ++i) {
EXPECT_EQ(
_FindPart(idxs_cpu[perm_cpu[i]], range.Ptr<IdType>(), num_parts), p);
}
}
}
template<DGLDeviceType XPU, typename IdType>
template <DGLDeviceType XPU, typename IdType>
void _TestRange_MapToX() {
const int64_t size = 160000;
const int num_parts = 7;
IdArray range = aten::NewIdArray(num_parts+1, DGLContext{kDGLCPU, 0},
sizeof(IdType)*8);
IdArray range = aten::NewIdArray(
num_parts + 1, DGLContext{kDGLCPU, 0}, sizeof(IdType) * 8);
for (int i = 0; i < num_parts; ++i) {
Ptr<IdType>(range)[i] = (size/num_parts)*i;
Ptr<IdType>(range)[i] = (size / num_parts) * i;
}
range.Ptr<IdType>()[num_parts] = size;
NDArrayPartitionRef part = CreatePartitionRangeBased(
size, num_parts, range.CopyTo(DGLContext{XPU, 0}));
size, num_parts, range.CopyTo(DGLContext{XPU, 0}));
for (int part_id = 0; part_id < num_parts; ++part_id) {
IdArray local = aten::Range(0, part->PartSize(part_id), sizeof(IdType)*8,
DGLContext{XPU, 0});
IdArray local = aten::Range(
0, part->PartSize(part_id), sizeof(IdType) * 8, DGLContext{XPU, 0});
IdArray global = part->MapToGlobal(local, part_id);
IdArray act_local = part->MapToLocal(global).CopyTo(CPU);
ASSERT_EQ(global->shape[0], local->shape[0]);
global = global.CopyTo(CPU);
for (size_t i = 0; i < global->shape[0]; ++i) {
EXPECT_EQ(_FindPart(Ptr<IdType>(global)[i], Ptr<IdType>(range), num_parts), part_id) << "i=" << i <<
", num_parts=" << num_parts << ", part_id=" << part_id << ", shape=" << global->shape[0];
EXPECT_EQ(
_FindPart(Ptr<IdType>(global)[i], Ptr<IdType>(range), num_parts),
part_id)
<< "i=" << i << ", num_parts=" << num_parts << ", part_id=" << part_id
<< ", shape=" << global->shape[0];
}
// the remapped local indices should match the original
......
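The remainder-partition tests above assert two properties: a global id belongs to partition (global % num_parts), and MapToLocal(MapToGlobal(local, p)) round-trips back to local. A small sketch of one mapping that satisfies both checks; this is only a plausible arithmetic consistent with the assertions, and DGL's actual implementation may differ in detail.

#include <cassert>
#include <cstdint>

constexpr int kNumParts = 7;  // matches the test fixture above

int64_t MapToGlobal(int64_t local, int part_id) {
  return local * kNumParts + part_id;
}

int64_t MapToLocal(int64_t global) { return global / kNumParts; }

int main() {
  for (int part_id = 0; part_id < kNumParts; ++part_id) {
    for (int64_t local = 0; local < 100; ++local) {
      const int64_t global = MapToGlobal(local, part_id);
      assert(global % kNumParts == part_id);  // partition assignment
      assert(MapToLocal(global) == local);    // round trip
    }
  }
  return 0;
}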
......@@ -5,6 +5,7 @@
#include <dmlc/memory_io.h>
#include <dmlc/parameter.h>
#include <gtest/gtest.h>
#include <cstring>
#include <iostream>
#include <sstream>
......@@ -70,12 +71,12 @@ TYPED_TEST(SmartPtrTest, Vector_Test1) {
std::vector<Pair> copy_myclasses;
static_cast<dmlc::Stream *>(&ofs)->Read<std::vector<Pair>>(&copy_myclasses);
EXPECT_TRUE(std::equal(myclasses.begin(), myclasses.end(),
copy_myclasses.begin(),
[](const Pair &left, const Pair &right) {
return (left.second->data_ == right.second->data_) &&
(left.first == right.first);
}));
EXPECT_TRUE(std::equal(
myclasses.begin(), myclasses.end(), copy_myclasses.begin(),
[](const Pair &left, const Pair &right) {
return (left.second->data_ == right.second->data_) &&
(left.first == right.first);
}));
}
TYPED_TEST(SmartPtrTest, Vector_Test2) {
......@@ -95,9 +96,9 @@ TYPED_TEST(SmartPtrTest, Vector_Test2) {
static_cast<dmlc::Stream *>(&ofs)->Read<std::vector<SmartPtr>>(
&copy_myclasses);
EXPECT_TRUE(std::equal(myclasses.begin(), myclasses.end(),
copy_myclasses.begin(),
[](const SmartPtr &left, const SmartPtr &right) {
return left->data_ == right->data_;
}));
EXPECT_TRUE(std::equal(
myclasses.begin(), myclasses.end(), copy_myclasses.begin(),
[](const SmartPtr &left, const SmartPtr &right) {
return left->data_ == right->data_;
}));
}
#include <gtest/gtest.h>
#include <dmlc/omp.h>
#include <dgl/array.h>
#include <dmlc/omp.h>
#include <gtest/gtest.h>
#include "./common.h"
using namespace dgl;
......@@ -17,9 +18,12 @@ aten::CSRMatrix CSR1(DGLContext ctx = CTX) {
// data: [0, 2, 3, 1, 4]
return aten::CSRMatrix(
4, 5,
aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 5, 5}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 2, 0, 2, 3}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 4, 1}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(
std::vector<IDX>({0, 2, 3, 5, 5}), sizeof(IDX) * 8, ctx),
aten::VecToIdArray(
std::vector<IDX>({1, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx),
aten::VecToIdArray(
std::vector<IDX>({0, 2, 3, 4, 1}), sizeof(IDX) * 8, ctx),
false);
}
......@@ -33,9 +37,12 @@ aten::CSRMatrix CSR2(DGLContext ctx = CTX) {
// data: [0, 2, 5, 3, 1, 4]
return aten::CSRMatrix(
4, 5,
aten::VecToIdArray(std::vector<IDX>({0, 3, 4, 6, 6}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0, 2, 3}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({0, 2, 5, 3, 1, 4}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(
std::vector<IDX>({0, 3, 4, 6, 6}), sizeof(IDX) * 8, ctx),
aten::VecToIdArray(
std::vector<IDX>({1, 2, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx),
aten::VecToIdArray(
std::vector<IDX>({0, 2, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx),
false);
}
......@@ -50,8 +57,10 @@ aten::COOMatrix COO1(DGLContext ctx = CTX) {
// col : [1, 2, 2, 0, 3]
return aten::COOMatrix(
4, 5,
aten::VecToIdArray(std::vector<IDX>({0, 2, 0, 1, 2}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0, 3}), sizeof(IDX)*8, ctx));
aten::VecToIdArray(
std::vector<IDX>({0, 2, 0, 1, 2}), sizeof(IDX) * 8, ctx),
aten::VecToIdArray(
std::vector<IDX>({1, 2, 2, 0, 3}), sizeof(IDX) * 8, ctx));
}
template <typename IDX>
......@@ -66,8 +75,10 @@ aten::COOMatrix COO2(DGLContext ctx = CTX) {
// col : [1, 2, 2, 0, 3, 2]
return aten::COOMatrix(
4, 5,
aten::VecToIdArray(std::vector<IDX>({0, 2, 0, 1, 2, 0}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0, 3, 2}), sizeof(IDX)*8, ctx));
aten::VecToIdArray(
std::vector<IDX>({0, 2, 0, 1, 2, 0}), sizeof(IDX) * 8, ctx),
aten::VecToIdArray(
std::vector<IDX>({1, 2, 2, 0, 3, 2}), sizeof(IDX) * 8, ctx));
}
template <typename IDX>
......@@ -78,9 +89,12 @@ aten::CSRMatrix SR_CSR3(DGLContext ctx) {
// [0, 0, 0, 0, 0]]
return aten::CSRMatrix(
4, 5,
aten::VecToIdArray(std::vector<IDX>({0, 3, 4, 6, 6}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({2, 1, 2, 0, 2, 3}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({0, 2, 5, 3, 1, 4}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(
std::vector<IDX>({0, 3, 4, 6, 6}), sizeof(IDX) * 8, ctx),
aten::VecToIdArray(
std::vector<IDX>({2, 1, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx),
aten::VecToIdArray(
std::vector<IDX>({0, 2, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx),
false);
}
......@@ -92,9 +106,12 @@ aten::CSRMatrix SRC_CSR3(DGLContext ctx) {
// [0, 0, 0, 0, 0]]
return aten::CSRMatrix(
4, 5,
aten::VecToIdArray(std::vector<IDX>({0, 3, 4, 6, 6}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 2, 2, 0, 2, 3}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({2, 0, 5, 3, 1, 4}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(
std::vector<IDX>({0, 3, 4, 6, 6}), sizeof(IDX) * 8, ctx),
aten::VecToIdArray(
std::vector<IDX>({1, 2, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx),
aten::VecToIdArray(
std::vector<IDX>({2, 0, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx),
false);
}
......@@ -109,8 +126,10 @@ aten::COOMatrix COO3(DGLContext ctx) {
// col : [2, 2, 1, 0, 3, 2]
return aten::COOMatrix(
4, 5,
aten::VecToIdArray(std::vector<IDX>({0, 2, 0, 1, 2, 0}), sizeof(IDX)*8, ctx),
aten::VecToIdArray(std::vector<IDX>({2, 2, 1, 0, 3, 2}), sizeof(IDX)*8, ctx));
aten::VecToIdArray(
std::vector<IDX>({0, 2, 0, 1, 2, 0}), sizeof(IDX) * 8, ctx),
aten::VecToIdArray(
std::vector<IDX>({2, 2, 1, 0, 3, 2}), sizeof(IDX) * 8, ctx));
}
struct SparseCOOCSR {
......@@ -119,11 +138,12 @@ struct SparseCOOCSR {
static constexpr uint64_t NUM_NZ = 5;
template <typename IDX>
static aten::COOMatrix COOSparse(const DGLContext &ctx = CTX) {
return aten::COOMatrix(NUM_ROWS, NUM_COLS,
aten::VecToIdArray(std::vector<IDX>({0, 1, 2, 3, 4}),
sizeof(IDX) * 8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 2, 3, 4, 5}),
sizeof(IDX) * 8, ctx));
return aten::COOMatrix(
NUM_ROWS, NUM_COLS,
aten::VecToIdArray(
std::vector<IDX>({0, 1, 2, 3, 4}), sizeof(IDX) * 8, ctx),
aten::VecToIdArray(
std::vector<IDX>({1, 2, 3, 4, 5}), sizeof(IDX) * 8, ctx));
}
template <typename IDX>
......@@ -133,18 +153,19 @@ struct SparseCOOCSR {
indptr[i + 1] = static_cast<IDX>(i + 1);
}
indptr[0] = 0;
return aten::CSRMatrix(NUM_ROWS, NUM_COLS,
aten::VecToIdArray(indptr, sizeof(IDX) * 8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 2, 3, 4, 5}),
sizeof(IDX) * 8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 1, 1, 1, 1}),
sizeof(IDX) * 8, ctx),
false);
return aten::CSRMatrix(
NUM_ROWS, NUM_COLS, aten::VecToIdArray(indptr, sizeof(IDX) * 8, ctx),
aten::VecToIdArray(
std::vector<IDX>({1, 2, 3, 4, 5}), sizeof(IDX) * 8, ctx),
aten::VecToIdArray(
std::vector<IDX>({1, 1, 1, 1, 1}), sizeof(IDX) * 8, ctx),
false);
}
};
bool isSparseCOO(const int64_t &num_threads, const int64_t &num_nodes,
const int64_t &num_edges) {
bool isSparseCOO(
const int64_t &num_threads, const int64_t &num_nodes,
const int64_t &num_edges) {
// refer to COOToCSR<>() in ~dgl/src/array/cpu/spmat_op_impl_coo for details.
return num_threads * num_nodes > 4 * num_edges;
}
......@@ -157,12 +178,13 @@ aten::COOMatrix RowSorted_NullData_COO(DGLContext ctx = CTX) {
// [0, 0, 0, 0, 0]]
// row : [0, 0, 1, 2, 2]
// col : [1, 2, 0, 2, 3]
return aten::COOMatrix(4, 5,
aten::VecToIdArray(std::vector<IDX>({0, 0, 1, 2, 2}),
sizeof(IDX) * 8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 2, 0, 2, 3}),
sizeof(IDX) * 8, ctx),
aten::NullArray(), true, false);
return aten::COOMatrix(
4, 5,
aten::VecToIdArray(
std::vector<IDX>({0, 0, 1, 2, 2}), sizeof(IDX) * 8, ctx),
aten::VecToIdArray(
std::vector<IDX>({1, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx),
aten::NullArray(), true, false);
}
template <typename IDX>
......@@ -172,14 +194,15 @@ aten::CSRMatrix RowSorted_NullData_CSR(DGLContext ctx = CTX) {
// [0, 0, 1, 1, 0],
// [0, 0, 0, 0, 0]]
// data: [0, 1, 2, 3, 4]
return aten::CSRMatrix(4, 5,
aten::VecToIdArray(std::vector<IDX>({0, 2, 3, 5, 5}),
sizeof(IDX) * 8, ctx),
aten::VecToIdArray(std::vector<IDX>({1, 2, 0, 2, 3}),
sizeof(IDX) * 8, ctx),
aten::VecToIdArray(std::vector<IDX>({0, 1, 2, 3, 4}),
sizeof(IDX) * 8, ctx),
false);
return aten::CSRMatrix(
4, 5,
aten::VecToIdArray(
std::vector<IDX>({0, 2, 3, 5, 5}), sizeof(IDX) * 8, ctx),
aten::VecToIdArray(
std::vector<IDX>({1, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx),
aten::VecToIdArray(
std::vector<IDX>({0, 1, 2, 3, 4}), sizeof(IDX) * 8, ctx),
false);
}
} // namespace
......@@ -299,7 +322,7 @@ TEST(SpmatTest, TestCOOHasDuplicate) {
template <typename IDX>
void _TestCOOSort(DGLContext ctx) {
auto coo = COO3<IDX>(ctx);
auto sr_coo = COOSort(coo, false);
ASSERT_EQ(coo.num_rows, sr_coo.num_rows);
ASSERT_EQ(coo.num_cols, sr_coo.num_cols);
......@@ -348,11 +371,11 @@ void _TestCOOSort(DGLContext ctx) {
// row : [0, 0, 0, 1, 2, 2]
// col : [1, 2, 2, 0, 2, 3]
auto sort_row = aten::VecToIdArray(
std::vector<IDX>({0, 0, 0, 1, 2, 2}), sizeof(IDX)*8, ctx);
std::vector<IDX>({0, 0, 0, 1, 2, 2}), sizeof(IDX) * 8, ctx);
auto sort_col = aten::VecToIdArray(
std::vector<IDX>({1, 2, 2, 0, 2, 3}), sizeof(IDX)*8, ctx);
std::vector<IDX>({1, 2, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx);
auto sort_col_data = aten::VecToIdArray(
std::vector<IDX>({2, 0, 5, 3, 1, 4}), sizeof(IDX)*8, ctx);
std::vector<IDX>({2, 0, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(sr_coo.row, sort_row));
ASSERT_TRUE(ArrayEQ<IDX>(src_coo.row, sort_row));
......@@ -372,10 +395,10 @@ TEST(SpmatTest, COOSort) {
template <typename IDX>
void _TestCOOReorder() {
auto coo = COO2<IDX>();
auto new_row = aten::VecToIdArray(
std::vector<IDX>({2, 0, 3, 1}), sizeof(IDX)*8, CTX);
auto new_row =
aten::VecToIdArray(std::vector<IDX>({2, 0, 3, 1}), sizeof(IDX) * 8, CTX);
auto new_col = aten::VecToIdArray(
std::vector<IDX>({2, 0, 4, 3, 1}), sizeof(IDX)*8, CTX);
std::vector<IDX>({2, 0, 4, 3, 1}), sizeof(IDX) * 8, CTX);
auto new_coo = COOReorder(coo, new_row, new_col);
ASSERT_EQ(new_coo.num_rows, coo.num_rows);
ASSERT_EQ(new_coo.num_cols, coo.num_cols);
......@@ -391,54 +414,60 @@ void _TestCOOGetData(DGLContext ctx) {
auto coo = COO2<IDX>(ctx);
// test get all data
auto x = aten::COOGetAllData(coo, 0, 0);
auto tx = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX)*8, ctx);
auto tx = aten::VecToIdArray(std::vector<IDX>({}), sizeof(IDX) * 8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
x = aten::COOGetAllData(coo, 0, 2);
tx = aten::VecToIdArray(std::vector<IDX>({2, 5}), sizeof(IDX)*8, ctx);
tx = aten::VecToIdArray(std::vector<IDX>({2, 5}), sizeof(IDX) * 8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
// test get data
auto r = aten::VecToIdArray(std::vector<IDX>({0, 0, 0}), sizeof(IDX)*8, ctx);
auto c = aten::VecToIdArray(std::vector<IDX>({0, 1, 2}), sizeof(IDX)*8, ctx);
auto r =
aten::VecToIdArray(std::vector<IDX>({0, 0, 0}), sizeof(IDX) * 8, ctx);
auto c =
aten::VecToIdArray(std::vector<IDX>({0, 1, 2}), sizeof(IDX) * 8, ctx);
x = aten::COOGetData(coo, r, c);
tx = aten::VecToIdArray(std::vector<IDX>({-1, 0, 2}), sizeof(IDX)*8, ctx);
tx = aten::VecToIdArray(std::vector<IDX>({-1, 0, 2}), sizeof(IDX) * 8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
// test get data on sorted
coo = aten::COOSort(coo);
r = aten::VecToIdArray(std::vector<IDX>({0, 0, 0}), sizeof(IDX)*8, ctx);
c = aten::VecToIdArray(std::vector<IDX>({0, 1, 2}), sizeof(IDX)*8, ctx);
coo = aten::COOSort(coo);
r = aten::VecToIdArray(std::vector<IDX>({0, 0, 0}), sizeof(IDX) * 8, ctx);
c = aten::VecToIdArray(std::vector<IDX>({0, 1, 2}), sizeof(IDX) * 8, ctx);
x = aten::COOGetData(coo, r, c);
tx = aten::VecToIdArray(std::vector<IDX>({-1, 0, 2}), sizeof(IDX)*8, ctx);
tx = aten::VecToIdArray(std::vector<IDX>({-1, 0, 2}), sizeof(IDX) * 8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
// test get data w/ broadcasting
r = aten::VecToIdArray(std::vector<IDX>({0}), sizeof(IDX)*8, ctx);
c = aten::VecToIdArray(std::vector<IDX>({0, 1, 2}), sizeof(IDX)*8, ctx);
r = aten::VecToIdArray(std::vector<IDX>({0}), sizeof(IDX) * 8, ctx);
c = aten::VecToIdArray(std::vector<IDX>({0, 1, 2}), sizeof(IDX) * 8, ctx);
x = aten::COOGetData(coo, r, c);
tx = aten::VecToIdArray(std::vector<IDX>({-1, 0, 2}), sizeof(IDX)*8, ctx);
tx = aten::VecToIdArray(std::vector<IDX>({-1, 0, 2}), sizeof(IDX) * 8, ctx);
ASSERT_TRUE(ArrayEQ<IDX>(x, tx));
}
TEST(SpmatTest, COOGetData) {
_TestCOOGetData<int32_t>(CPU);
_TestCOOGetData<int64_t>(CPU);
//#ifdef DGL_USE_CUDA
//#ifdef DGL_USE_CUDA
//_TestCOOGetData<int32_t>(GPU);
//_TestCOOGetData<int64_t>(GPU);
//#endif
//#endif
}
template <typename IDX>
void _TestCOOGetDataAndIndices() {
auto csr = COO2<IDX>();
auto r = aten::VecToIdArray(std::vector<IDX>({0, 0, 0}), sizeof(IDX)*8, CTX);
auto c = aten::VecToIdArray(std::vector<IDX>({0, 1, 2}), sizeof(IDX)*8, CTX);
auto r =
aten::VecToIdArray(std::vector<IDX>({0, 0, 0}), sizeof(IDX) * 8, CTX);
auto c =
aten::VecToIdArray(std::vector<IDX>({0, 1, 2}), sizeof(IDX) * 8, CTX);
auto x = aten::COOGetDataAndIndices(csr, r, c);
auto tr = aten::VecToIdArray(std::vector<IDX>({0, 0, 0}), sizeof(IDX)*8, CTX);
auto tc = aten::VecToIdArray(std::vector<IDX>({1, 2, 2}), sizeof(IDX)*8, CTX);
auto td = aten::VecToIdArray(std::vector<IDX>({0, 2, 5}), sizeof(IDX)*8, CTX);
auto tr =
aten::VecToIdArray(std::vector<IDX>({0, 0, 0}), sizeof(IDX) * 8, CTX);
auto tc =
aten::VecToIdArray(std::vector<IDX>({1, 2, 2}), sizeof(IDX) * 8, CTX);
auto td =
aten::VecToIdArray(std::vector<IDX>({0, 2, 5}), sizeof(IDX) * 8, CTX);
ASSERT_TRUE(ArrayEQ<IDX>(x[0], tr));
ASSERT_TRUE(ArrayEQ<IDX>(x[1], tc));
ASSERT_TRUE(ArrayEQ<IDX>(x[2], td));
......
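The CSR fixtures above encode each matrix as (indptr, indices, data): row r owns the column entries indices[indptr[r] .. indptr[r+1]). A short sketch that expands an indptr array back into per-entry row ids; ExpandCsrRows is written here for illustration and is not part of the test code. Expanding CSR1's indptr {0, 2, 3, 5, 5} yields rows {0, 0, 1, 2, 2}, the same nonzero pattern as the COO1 fixture (up to ordering).

#include <cstdint>
#include <vector>

std::vector<int64_t> ExpandCsrRows(const std::vector<int64_t>& indptr) {
  std::vector<int64_t> rows;
  for (size_t r = 0; r + 1 < indptr.size(); ++r) {
    for (int64_t k = indptr[r]; k < indptr[r + 1]; ++k) {
      rows.push_back(static_cast<int64_t>(r));
    }
  }
  return rows;
}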