Unverified commit 8ae50c42 authored by Hongzhi (Steve), Chen and committed by GitHub

[Misc] clang-format auto fix. (#4804)



* [Misc] clang-format auto fix.

* manual

* manual

* manual

* manual

* todo

* fix
Co-authored-by: Steve <ubuntu@ip-172-31-34-29.ap-northeast-1.compute.internal>
parent 81831111
......@@ -6,10 +6,10 @@
#ifndef DGL_ARRAY_CUDA_GE_SPMM_CUH_
#define DGL_ARRAY_CUDA_GE_SPMM_CUH_
#include "macro.cuh"
#include "atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "atomic.cuh"
#include "macro.cuh"
namespace dgl {
......@@ -23,23 +23,19 @@ namespace cuda {
* \note GE-SpMM: https://arxiv.org/pdf/2007.03179.pdf
* The grid dimension x and y are reordered for better performance.
*/
template <typename Idx, typename DType,
typename BinaryOp>
template <typename Idx, typename DType, typename BinaryOp>
__global__ void GESpMMKernel(
const DType* __restrict__ ufeat,
const DType* __restrict__ efeat,
DType* __restrict__ out,
const Idx* __restrict__ indptr,
const Idx* __restrict__ indices,
const int64_t num_rows, const int64_t num_cols,
const int64_t feat_len) {
const Idx rid = blockIdx.x * blockDim.y + threadIdx.y; // over vertices dimension
const Idx fid = (blockIdx.y * 64) + threadIdx.x; // over feature dimension
const DType* __restrict__ ufeat, const DType* __restrict__ efeat,
DType* __restrict__ out, const Idx* __restrict__ indptr,
const Idx* __restrict__ indices, const int64_t num_rows,
const int64_t num_cols, const int64_t feat_len) {
const Idx rid =
blockIdx.x * blockDim.y + threadIdx.y; // over vertices dimension
const Idx fid = (blockIdx.y * 64) + threadIdx.x; // over feature dimension
if (rid < num_rows && fid < feat_len) {
const Idx low = __ldg(indptr + rid), high = __ldg(indptr + rid + 1);
DType accum_0 = 0.,
accum_1 = 0.;
DType accum_0 = 0., accum_1 = 0.;
if (blockIdx.y != gridDim.y - 1) { // fid + 32 < feat_len
for (Idx left = low; left < high; left += 32) {
......@@ -109,24 +105,21 @@ __global__ void GESpMMKernel(
}
out[feat_len * rid + fid] = accum_0;
if (fid + 32 < feat_len)
out[feat_len * rid + fid + 32] = accum_1;
if (fid + 32 < feat_len) out[feat_len * rid + fid + 32] = accum_1;
}
}
}
}
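For readers skimming the kernel, the index arithmetic assigns rid = blockIdx.x * blockDim.y + threadIdx.y to a vertex and fid = blockIdx.y * 64 + threadIdx.x to a feature column, and each thread also covers fid + 32, so every block along y owns a 64-wide feature tile. A small host-side sketch of that feature mapping (the launch shape below is illustrative, not taken from this diff):

// Illustrative host-side sketch of the GE-SpMM feature-tile mapping; the
// blockDim.x = 32 and feat_len = 96 values are assumptions for demonstration.
#include <cstdio>

int main() {
  const int blockDimX = 32;                   // threads along the feature tile
  const int feat_len = 96;
  const int gridDimY = (feat_len + 63) / 64;  // one block along y per 64 features
  for (int by = 0; by < gridDimY; ++by) {
    for (int tx = 0; tx < blockDimX; ++tx) {
      const int fid = by * 64 + tx;           // first feature of this thread
      if (fid < feat_len) {
        const bool has_second = (fid + 32) < feat_len;  // second one: fid + 32
        std::printf("block y=%d thread x=%d -> fid %d%s\n", by, tx, fid,
                    has_second ? " and fid+32" : "");
      }
    }
  }
  return 0;
}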
template <typename Idx, typename DType,
typename BinaryOp>
template <typename Idx, typename DType, typename BinaryOp>
void GESpMMCsr(
const CSRMatrix& csr,
NDArray ufeat, NDArray efeat,
NDArray out, int64_t feat_len) {
const Idx *indptr = csr.indptr.Ptr<Idx>();
const Idx *indices = csr.indices.Ptr<Idx>();
const DType *ufeat_data = ufeat.Ptr<DType>();
const DType *efeat_data = efeat.Ptr<DType>();
DType *out_data = out.Ptr<DType>();
const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
int64_t feat_len) {
const Idx* indptr = csr.indptr.Ptr<Idx>();
const Idx* indices = csr.indices.Ptr<Idx>();
const DType* ufeat_data = ufeat.Ptr<DType>();
const DType* efeat_data = efeat.Ptr<DType>();
DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
......@@ -138,12 +131,10 @@ void GESpMMCsr(
const dim3 nthrs(ntx, nty);
const int sh_mem_size = 0;
CUDA_KERNEL_CALL((GESpMMKernel<Idx, DType, BinaryOp>),
nblks, nthrs, sh_mem_size, stream,
ufeat_data, efeat_data, out_data,
indptr, indices,
csr.num_rows, csr.num_cols,
feat_len);
CUDA_KERNEL_CALL(
(GESpMMKernel<Idx, DType, BinaryOp>), nblks, nthrs, sh_mem_size, stream,
ufeat_data, efeat_data, out_data, indptr, indices, csr.num_rows,
csr.num_cols, feat_len);
}
} // namespace cuda
......
......@@ -8,44 +8,46 @@
///////////////////////// Dispatchers //////////////////////////
/* Macro used for switching between broadcasting and non-broadcasting kernels.
* It also copies the auxiliary information for calculating broadcasting offsets
* to GPU.
*/
#define BCAST_IDX_CTX_SWITCH(BCAST, EDGE_MAP, CTX, LHS_OFF, RHS_OFF, ...) do { \
const BcastOff &info = (BCAST); \
if (!info.use_bcast) { \
constexpr bool UseBcast = false; \
if ((EDGE_MAP)) { \
constexpr bool UseIdx = true; \
{ __VA_ARGS__ } \
} else { \
constexpr bool UseIdx = false; \
{ __VA_ARGS__ } \
} \
} else { \
constexpr bool UseBcast = true; \
const DGLContext ctx = (CTX); \
const auto device = runtime::DeviceAPI::Get(ctx); \
(LHS_OFF) = static_cast<int64_t*>( \
device->AllocWorkspace(ctx, sizeof(int64_t) * info.lhs_offset.size())); \
CUDA_CALL(cudaMemcpy((LHS_OFF), &info.lhs_offset[0], \
sizeof(int64_t) * info.lhs_offset.size(), cudaMemcpyHostToDevice)); \
(RHS_OFF) = static_cast<int64_t*>( \
device->AllocWorkspace(ctx, sizeof(int64_t) * info.rhs_offset.size())); \
CUDA_CALL(cudaMemcpy((RHS_OFF), &info.rhs_offset[0], \
sizeof(int64_t) * info.rhs_offset.size(), cudaMemcpyHostToDevice)); \
if ((EDGE_MAP)) { \
constexpr bool UseIdx = true; \
{ __VA_ARGS__ } \
} else { \
constexpr bool UseIdx = false; \
{ __VA_ARGS__ } \
} \
device->FreeWorkspace(ctx, (LHS_OFF)); \
device->FreeWorkspace(ctx, (RHS_OFF)); \
} \
} while (0)
#define BCAST_IDX_CTX_SWITCH(BCAST, EDGE_MAP, CTX, LHS_OFF, RHS_OFF, ...) \
do { \
const BcastOff &info = (BCAST); \
if (!info.use_bcast) { \
constexpr bool UseBcast = false; \
if ((EDGE_MAP)) { \
constexpr bool UseIdx = true; \
{ __VA_ARGS__ } \
} else { \
constexpr bool UseIdx = false; \
{ __VA_ARGS__ } \
} \
} else { \
constexpr bool UseBcast = true; \
const DGLContext ctx = (CTX); \
const auto device = runtime::DeviceAPI::Get(ctx); \
(LHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
ctx, sizeof(int64_t) * info.lhs_offset.size())); \
CUDA_CALL(cudaMemcpy( \
(LHS_OFF), &info.lhs_offset[0], \
sizeof(int64_t) * info.lhs_offset.size(), cudaMemcpyHostToDevice)); \
(RHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
ctx, sizeof(int64_t) * info.rhs_offset.size())); \
CUDA_CALL(cudaMemcpy( \
(RHS_OFF), &info.rhs_offset[0], \
sizeof(int64_t) * info.rhs_offset.size(), cudaMemcpyHostToDevice)); \
if ((EDGE_MAP)) { \
constexpr bool UseIdx = true; \
{ __VA_ARGS__ } \
} else { \
constexpr bool UseIdx = false; \
{ __VA_ARGS__ } \
} \
device->FreeWorkspace(ctx, (LHS_OFF)); \
device->FreeWorkspace(ctx, (RHS_OFF)); \
} \
} while (0)
#endif // DGL_ARRAY_CUDA_MACRO_CUH_
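The macro's purpose is to turn two runtime conditions — info.use_bcast and whether an edge mapping exists — into the compile-time constants UseBcast and UseIdx, so the code passed through __VA_ARGS__ can hand them to a kernel template and each combination gets its own instantiation. A stripped-down, host-only analog of that pattern (hypothetical names; the device workspace allocation and the cudaMemcpy of the offset arrays are omitted):

// Host-only sketch of the constexpr-flag dispatch pattern; FLAG_SWITCH and
// Body are made-up names, not part of DGL.
#include <cstdio>

#define FLAG_SWITCH(COND, ...)        \
  do {                                \
    if ((COND)) {                     \
      constexpr bool UseFlag = true;  \
      { __VA_ARGS__ }                 \
    } else {                          \
      constexpr bool UseFlag = false; \
      { __VA_ARGS__ }                 \
    }                                 \
  } while (0)

template <bool UseFlag>
void Body() {
  std::printf("instantiated with UseFlag=%d\n", static_cast<int>(UseFlag));
}

int main() {
  const bool runtime_condition = true;  // plays the role of info.use_bcast
  FLAG_SWITCH(runtime_condition, { Body<UseFlag>(); });
  return 0;
}

The real macro nests two such switches (broadcast × edge index) and, in the broadcast branch, additionally stages lhs_offset / rhs_offset in device workspace before running the body, then frees the workspace afterwards.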
......@@ -4,14 +4,14 @@
* \brief rowwise sampling
*/
#include <dgl/random.h>
#include <curand_kernel.h>
#include <dgl/array.h>
#include <dgl/array_iterator.h>
#include <curand_kernel.h>
#include <dgl/random.h>
#include "../../runtime/cuda/cuda_common.h"
#include "./dgl_cub.cuh"
#include "./utils.h"
#include "../../runtime/cuda/cuda_common.h"
using namespace dgl::runtime;
......@@ -23,20 +23,15 @@ namespace {
template <typename IdType>
__global__ void _GlobalUniformNegativeSamplingKernel(
const IdType* __restrict__ indptr,
const IdType* __restrict__ indices,
IdType* __restrict__ row,
IdType* __restrict__ col,
int64_t num_row,
int64_t num_col,
int64_t num_samples,
int num_trials,
bool exclude_self_loops,
int32_t random_seed) {
const IdType* __restrict__ indptr, const IdType* __restrict__ indices,
IdType* __restrict__ row, IdType* __restrict__ col, int64_t num_row,
int64_t num_col, int64_t num_samples, int num_trials,
bool exclude_self_loops, int32_t random_seed) {
int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
const int stride_x = gridDim.x * blockDim.x;
curandStatePhilox4_32_10_t rng; // this allows generating 4 32-bit ints at a time
curandStatePhilox4_32_10_t
rng; // this allows generating 4 32-bit ints at a time
curand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
while (tx < num_samples) {
......@@ -50,8 +45,7 @@ __global__ void _GlobalUniformNegativeSamplingKernel(
int64_t u = static_cast<int64_t>(((y_lo << 32L) | z) % num_row);
int64_t v = static_cast<int64_t>(((y_hi << 32L) | w) % num_col);
if (exclude_self_loops && (u == v))
continue;
if (exclude_self_loops && (u == v)) continue;
// binary search of v among indptr[u:u+1]
int64_t b = indptr[u], e = indptr[u + 1] - 1;
......@@ -81,48 +75,47 @@ __global__ void _GlobalUniformNegativeSamplingKernel(
template <typename DType>
struct IsNotMinusOne {
__device__ __forceinline__ bool operator() (const std::pair<DType, DType>& a) {
__device__ __forceinline__ bool operator()(const std::pair<DType, DType>& a) {
return a.first != -1;
}
};
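A sampled pair (u, v) is only kept when v is absent from u's neighbour list; the "binary search of v among indptr[u:u+1]" step in the kernel above performs that check, which presumes the column indices inside each row are sorted. A CPU sketch of the same membership test on a toy CSR:

// Illustrative host-side membership test mirroring the kernel's binary search;
// the toy CSR below is made up.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

bool HasEdge(const std::vector<int64_t>& indptr,
             const std::vector<int64_t>& indices, int64_t u, int64_t v) {
  const auto begin = indices.begin() + indptr[u];
  const auto end = indices.begin() + indptr[u + 1];
  return std::binary_search(begin, end, v);  // assumes sorted columns per row
}

int main() {
  // Row 0 -> {1, 3}, row 1 -> {0}, row 2 -> {} (empty).
  const std::vector<int64_t> indptr = {0, 2, 3, 3};
  const std::vector<int64_t> indices = {1, 3, 0};
  std::printf("%d %d\n", static_cast<int>(HasEdge(indptr, indices, 0, 3)),
              static_cast<int>(HasEdge(indptr, indices, 2, 1)));  // prints 1 0
  return 0;
}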
/*!
* \brief Sort ordered pairs in ascending order, using \a tmp_major and \a tmp_minor as
* temporary buffers, each with \a n elements.
* \brief Sort ordered pairs in ascending order, using \a tmp_major and \a
* tmp_minor as temporary buffers, each with \a n elements.
*/
template <typename IdType>
void SortOrderedPairs(
runtime::DeviceAPI* device,
DGLContext ctx,
IdType* major,
IdType* minor,
IdType* tmp_major,
IdType* tmp_minor,
int64_t n,
cudaStream_t stream) {
runtime::DeviceAPI* device, DGLContext ctx, IdType* major, IdType* minor,
IdType* tmp_major, IdType* tmp_minor, int64_t n, cudaStream_t stream) {
// Sort ordered pairs in lexicographical order by two radix sorts since
// cub's radix sorts are stable.
// We need a 2*n auxiliary storage to store the results from the first radix sort.
// We need a 2*n auxiliary storage to store the results from the first radix
// sort.
size_t s1 = 0, s2 = 0;
void* tmp1 = nullptr;
void* tmp2 = nullptr;
// Radix sort by minor key first, reorder the major key in the progress.
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, stream));
tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
stream));
tmp1 = device->AllocWorkspace(ctx, s1);
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, stream));
tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
stream));
// Radix sort by major key next.
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, stream));
tmp2 = (s2 > s1) ? device->AllocWorkspace(ctx, s2) : tmp1; // reuse buffer if s2 <= s1
tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
stream));
tmp2 = (s2 > s1) ? device->AllocWorkspace(ctx, s2)
: tmp1; // reuse buffer if s2 <= s1
CUDA_CALL(cub::DeviceRadixSort::SortPairs(
tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, stream));
tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
stream));
if (tmp1 != tmp2)
device->FreeWorkspace(ctx, tmp2);
if (tmp1 != tmp2) device->FreeWorkspace(ctx, tmp2);
device->FreeWorkspace(ctx, tmp1);
}
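The two SortPairs rounds above are a least-significant-key sort of (major, minor) pairs: sort by the minor key first, then by the major key, and stability guarantees that pairs with equal majors remain ordered by their minors. The same idea on the CPU with std::stable_sort (illustrative only; the device code keeps everything in the cub buffers):

// Host-side analog of the two-pass stable pair sort; data is made up.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> major = {2, 1, 2, 1};
  std::vector<int> minor = {3, 9, 1, 4};
  std::vector<int> order = {0, 1, 2, 3};  // positions into the two arrays

  // Pass 1: stable sort by the minor key.
  std::stable_sort(order.begin(), order.end(),
                   [&](int a, int b) { return minor[a] < minor[b]; });
  // Pass 2: stable sort by the major key; ties keep their minor order.
  std::stable_sort(order.begin(), order.end(),
                   [&](int a, int b) { return major[a] < major[b]; });

  for (int i : order) std::printf("(%d, %d) ", major[i], minor[i]);
  std::printf("\n");  // (1, 4) (1, 9) (2, 1) (2, 3): lexicographic order
  return 0;
}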
......@@ -130,17 +123,14 @@ void SortOrderedPairs(
template <DGLDeviceType XPU, typename IdType>
std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
const CSRMatrix& csr,
int64_t num_samples,
int num_trials,
bool exclude_self_loops,
bool replace,
double redundancy) {
const CSRMatrix& csr, int64_t num_samples, int num_trials,
bool exclude_self_loops, bool replace, double redundancy) {
auto ctx = csr.indptr->ctx;
auto dtype = csr.indptr->dtype;
const int64_t num_row = csr.num_rows;
const int64_t num_col = csr.num_cols;
const int64_t num_actual_samples = static_cast<int64_t>(num_samples * (1 + redundancy));
const int64_t num_actual_samples =
static_cast<int64_t>(num_samples * (1 + redundancy));
IdArray row = Full<IdType>(-1, num_actual_samples, ctx);
IdArray col = Full<IdType>(-1, num_actual_samples, ctx);
IdArray out_row = IdArray::Empty({num_actual_samples}, dtype, ctx);
......@@ -156,22 +146,25 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
std::pair<IdArray, IdArray> result;
int64_t num_out;
CUDA_KERNEL_CALL(_GlobalUniformNegativeSamplingKernel,
nb, nt, 0, stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
row_data, col_data, num_row, num_col, num_actual_samples, num_trials,
exclude_self_loops, RandomEngine::ThreadLocal()->RandInt32());
CUDA_KERNEL_CALL(
_GlobalUniformNegativeSamplingKernel, nb, nt, 0, stream,
csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), row_data, col_data,
num_row, num_col, num_actual_samples, num_trials, exclude_self_loops,
RandomEngine::ThreadLocal()->RandInt32());
size_t tmp_size = 0;
int64_t* num_out_cuda = static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
int64_t* num_out_cuda =
static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
IsNotMinusOne<IdType> op;
PairIterator<IdType> begin(row_data, col_data);
PairIterator<IdType> out_begin(out_row_data, out_col_data);
CUDA_CALL(cub::DeviceSelect::If(
nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, stream));
nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
stream));
void* tmp = device->AllocWorkspace(ctx, tmp_size);
CUDA_CALL(cub::DeviceSelect::If(
tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, stream));
tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
stream));
num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
if (!replace) {
......@@ -182,28 +175,33 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
PairIterator<IdType> unique_begin(unique_row_data, unique_col_data);
SortOrderedPairs(
device, ctx, out_row_data, out_col_data, unique_row_data, unique_col_data,
num_out, stream);
device, ctx, out_row_data, out_col_data, unique_row_data,
unique_col_data, num_out, stream);
size_t tmp_size_unique = 0;
void* tmp_unique = nullptr;
CUDA_CALL(cub::DeviceSelect::Unique(
nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda, num_out, stream));
tmp_unique = (tmp_size_unique > tmp_size) ?
device->AllocWorkspace(ctx, tmp_size_unique) :
tmp; // reuse buffer
nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
num_out, stream));
tmp_unique = (tmp_size_unique > tmp_size)
? device->AllocWorkspace(ctx, tmp_size_unique)
: tmp; // reuse buffer
CUDA_CALL(cub::DeviceSelect::Unique(
tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda, num_out, stream));
tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
num_out, stream));
num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
num_out = std::min(num_samples, num_out);
result = {unique_row.CreateView({num_out}, dtype), unique_col.CreateView({num_out}, dtype)};
result = {
unique_row.CreateView({num_out}, dtype),
unique_col.CreateView({num_out}, dtype)};
if (tmp_unique != tmp)
device->FreeWorkspace(ctx, tmp_unique);
if (tmp_unique != tmp) device->FreeWorkspace(ctx, tmp_unique);
} else {
num_out = std::min(num_samples, num_out);
result = {out_row.CreateView({num_out}, dtype), out_col.CreateView({num_out}, dtype)};
result = {
out_row.CreateView({num_out}, dtype),
out_col.CreateView({num_out}, dtype)};
}
device->FreeWorkspace(ctx, tmp);
......@@ -211,10 +209,10 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
return result;
}
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<kDGLCUDA, int32_t>(
const CSRMatrix&, int64_t, int, bool, bool, double);
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<kDGLCUDA, int64_t>(
const CSRMatrix&, int64_t, int, bool, bool, double);
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<
kDGLCUDA, int32_t>(const CSRMatrix&, int64_t, int, bool, bool, double);
template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<
kDGLCUDA, int64_t>(const CSRMatrix&, int64_t, int, bool, bool, double);
}; // namespace impl
}; // namespace aten
......
......@@ -4,15 +4,15 @@
* \brief uniform rowwise sampling
*/
#include <curand_kernel.h>
#include <dgl/random.h>
#include <dgl/runtime/device_api.h>
#include <curand_kernel.h>
#include <numeric>
#include "./dgl_cub.cuh"
#include "../../array/cuda/atomic.cuh"
#include "../../runtime/cuda/cuda_common.h"
#include "./dgl_cub.cuh"
using namespace dgl::aten::cuda;
......@@ -25,29 +25,28 @@ namespace {
constexpr int BLOCK_SIZE = 128;
/**
* @brief Compute the size of each row in the sampled CSR, without replacement.
*
* @tparam IdType The type of node and edge indexes.
* @param num_picks The number of non-zero entries to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The index where each row's edges start.
* @param out_deg The size of each row in the sampled matrix, as indexed by
* `in_rows` (output).
*/
template<typename IdType>
* @brief Compute the size of each row in the sampled CSR, without replacement.
*
* @tparam IdType The type of node and edge indexes.
* @param num_picks The number of non-zero entries to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The index where each row's edges start.
* @param out_deg The size of each row in the sampled matrix, as indexed by
* `in_rows` (output).
*/
template <typename IdType>
__global__ void _CSRRowWiseSampleDegreeKernel(
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
IdType * const out_deg) {
const int64_t num_picks, const int64_t num_rows,
const IdType* const in_rows, const IdType* const in_ptr,
IdType* const out_deg) {
const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
if (tIdx < num_rows) {
const int in_row = in_rows[tIdx];
const int out_row = tIdx;
out_deg[out_row] = min(static_cast<IdType>(num_picks), in_ptr[in_row + 1] - in_ptr[in_row]);
out_deg[out_row] = min(
static_cast<IdType>(num_picks), in_ptr[in_row + 1] - in_ptr[in_row]);
if (out_row == num_rows - 1) {
// make the prefixsum work
......@@ -57,23 +56,21 @@ __global__ void _CSRRowWiseSampleDegreeKernel(
}
/**
* @brief Compute the size of each row in the sampled CSR, with replacement.
*
* @tparam IdType The type of node and edge indexes.
* @param num_picks The number of non-zero entries to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The index where each row's edges start.
* @param out_deg The size of each row in the sampled matrix, as indexed by
* `in_rows` (output).
*/
template<typename IdType>
* @brief Compute the size of each row in the sampled CSR, with replacement.
*
* @tparam IdType The type of node and edge indexes.
* @param num_picks The number of non-zero entries to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The index where each row's edges start.
* @param out_deg The size of each row in the sampled matrix, as indexed by
* `in_rows` (output).
*/
template <typename IdType>
__global__ void _CSRRowWiseSampleDegreeReplaceKernel(
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
IdType * const out_deg) {
const int64_t num_picks, const int64_t num_rows,
const IdType* const in_rows, const IdType* const in_ptr,
IdType* const out_deg) {
const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
if (tIdx < num_rows) {
......@@ -94,41 +91,36 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
}
/**
* @brief Perform row-wise uniform sampling on a CSR matrix,
* and generate a COO matrix, without replacement.
*
* @tparam IdType The ID type used for matrices.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The indptr array of the input CSR.
* @param in_index The indices array of the input CSR.
* @param data The data array of the input CSR.
* @param out_ptr The offset to write each row to in the output COO.
* @param out_rows The rows of the output COO (output).
* @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output).
*/
template<typename IdType, int TILE_SIZE>
* @brief Perform row-wise uniform sampling on a CSR matrix,
* and generate a COO matrix, without replacement.
*
* @tparam IdType The ID type used for matrices.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The indptr array of the input CSR.
* @param in_index The indices array of the input CSR.
* @param data The data array of the input CSR.
* @param out_ptr The offset to write each row to in the output COO.
* @param out_rows The rows of the output COO (output).
* @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output).
*/
template <typename IdType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleUniformKernel(
const uint64_t rand_seed,
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
const IdType * const in_index,
const IdType * const data,
const IdType * const out_ptr,
IdType * const out_rows,
IdType * const out_cols,
IdType * const out_idxs) {
const uint64_t rand_seed, const int64_t num_picks, const int64_t num_rows,
const IdType* const in_rows, const IdType* const in_ptr,
const IdType* const in_index, const IdType* const data,
const IdType* const out_ptr, IdType* const out_rows, IdType* const out_cols,
IdType* const out_idxs) {
// we assign one warp per row
assert(blockDim.x == BLOCK_SIZE);
int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
......@@ -177,41 +169,36 @@ __global__ void _CSRRowWiseSampleUniformKernel(
}
/**
* @brief Perform row-wise uniform sampling on a CSR matrix,
* and generate a COO matrix, with replacement.
*
* @tparam IdType The ID type used for matrices.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The indptr array of the input CSR.
* @param in_index The indices array of the input CSR.
* @param data The data array of the input CSR.
* @param out_ptr The offset to write each row to in the output COO.
* @param out_rows The rows of the output COO (output).
* @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output).
*/
template<typename IdType, int TILE_SIZE>
* @brief Perform row-wise uniform sampling on a CSR matrix,
* and generate a COO matrix, with replacement.
*
* @tparam IdType The ID type used for matrices.
* @tparam TILE_SIZE The number of rows covered by each threadblock.
* @param rand_seed The random seed to use.
* @param num_picks The number of non-zeros to pick per row.
* @param num_rows The number of rows to pick.
* @param in_rows The set of rows to pick.
* @param in_ptr The indptr array of the input CSR.
* @param in_index The indices array of the input CSR.
* @param data The data array of the input CSR.
* @param out_ptr The offset to write each row to in the output COO.
* @param out_rows The rows of the output COO (output).
* @param out_cols The columns of the output COO (output).
* @param out_idxs The data array of the output COO (output).
*/
template <typename IdType, int TILE_SIZE>
__global__ void _CSRRowWiseSampleUniformReplaceKernel(
const uint64_t rand_seed,
const int64_t num_picks,
const int64_t num_rows,
const IdType * const in_rows,
const IdType * const in_ptr,
const IdType * const in_index,
const IdType * const data,
const IdType * const out_ptr,
IdType * const out_rows,
IdType * const out_cols,
IdType * const out_idxs) {
const uint64_t rand_seed, const int64_t num_picks, const int64_t num_rows,
const IdType* const in_rows, const IdType* const in_ptr,
const IdType* const in_index, const IdType* const data,
const IdType* const out_ptr, IdType* const out_rows, IdType* const out_cols,
IdType* const out_idxs) {
// we assign one warp per row
assert(blockDim.x == BLOCK_SIZE);
int64_t out_row = blockIdx.x * TILE_SIZE;
const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
const int64_t last_row =
min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
curandStatePhilox4_32_10_t rng;
curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
......@@ -229,7 +216,8 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
const int64_t out_idx = out_row_start + idx;
out_rows[out_idx] = row;
out_cols[out_idx] = in_index[in_row_start + edge];
out_idxs[out_idx] = data ? data[in_row_start + edge] : in_row_start + edge;
out_idxs[out_idx] =
data ? data[in_row_start + edge] : in_row_start + edge;
}
}
out_row += 1;
......@@ -248,11 +236,14 @@ COOMatrix _CSRRowWiseSamplingUniform(
cudaStream_t stream = runtime::getCurrentCUDAStream();
const int64_t num_rows = rows->shape[0];
const IdType * const slice_rows = static_cast<const IdType*>(rows->data);
IdArray picked_row = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
IdArray picked_col = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
IdArray picked_idx = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
IdArray picked_row =
NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
IdArray picked_col =
NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
IdArray picked_idx =
NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
IdType* const out_rows = static_cast<IdType*>(picked_row->data);
IdType* const out_cols = static_cast<IdType*>(picked_col->data);
IdType* const out_idxs = static_cast<IdType*>(picked_idx->data);
......@@ -261,65 +252,52 @@ COOMatrix _CSRRowWiseSamplingUniform(
const IdType* in_cols = mat.indices.Ptr<IdType>();
const IdType* data = CSRHasData(mat) ? mat.data.Ptr<IdType>() : nullptr;
if (mat.is_pinned) {
CUDA_CALL(cudaHostGetDevicePointer(
&in_ptr, mat.indptr.Ptr<IdType>(), 0));
CUDA_CALL(cudaHostGetDevicePointer(
&in_cols, mat.indices.Ptr<IdType>(), 0));
CUDA_CALL(cudaHostGetDevicePointer(&in_ptr, mat.indptr.Ptr<IdType>(), 0));
CUDA_CALL(cudaHostGetDevicePointer(&in_cols, mat.indices.Ptr<IdType>(), 0));
if (CSRHasData(mat)) {
CUDA_CALL(cudaHostGetDevicePointer(
&data, mat.data.Ptr<IdType>(), 0));
CUDA_CALL(cudaHostGetDevicePointer(&data, mat.data.Ptr<IdType>(), 0));
}
}
// compute degree
IdType * out_deg = static_cast<IdType*>(
IdType* out_deg = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
if (replace) {
const dim3 block(512);
const dim3 grid((num_rows + block.x - 1) / block.x);
CUDA_KERNEL_CALL(
_CSRRowWiseSampleDegreeReplaceKernel,
grid, block, 0, stream,
num_picks, num_rows, slice_rows, in_ptr, out_deg);
_CSRRowWiseSampleDegreeReplaceKernel, grid, block, 0, stream, num_picks,
num_rows, slice_rows, in_ptr, out_deg);
} else {
const dim3 block(512);
const dim3 grid((num_rows + block.x - 1) / block.x);
CUDA_KERNEL_CALL(
_CSRRowWiseSampleDegreeKernel,
grid, block, 0, stream,
num_picks, num_rows, slice_rows, in_ptr, out_deg);
_CSRRowWiseSampleDegreeKernel, grid, block, 0, stream, num_picks,
num_rows, slice_rows, in_ptr, out_deg);
}
// fill out_ptr
IdType * out_ptr = static_cast<IdType*>(
IdType* out_ptr = static_cast<IdType*>(
device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
size_t prefix_temp_size = 0;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, prefix_temp_size,
out_deg,
out_ptr,
num_rows+1,
stream));
void * prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(prefix_temp, prefix_temp_size,
out_deg,
out_ptr,
num_rows+1,
stream));
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
device->FreeWorkspace(ctx, prefix_temp);
device->FreeWorkspace(ctx, out_deg);
cudaEvent_t copyEvent;
CUDA_CALL(cudaEventCreate(&copyEvent));
// TODO(dlasalle): use pinned memory to overlap with the actual sampling, and wait on
// a cudaevent
// TODO(dlasalle): use pinned memory to overlap with the actual sampling, and
// wait on a cudaevent
IdType new_len;
// copy using the internal current stream
device->CopyDataFromTo(out_ptr, num_rows * sizeof(new_len), &new_len, 0,
sizeof(new_len),
ctx,
DGLContext{kDGLCPU, 0},
mat.indptr->dtype);
device->CopyDataFromTo(
out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx,
DGLContext{kDGLCPU, 0}, mat.indptr->dtype);
CUDA_CALL(cudaEventRecord(copyEvent, stream));
const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
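To recap the bookkeeping above: out_deg holds one sampled-row size per requested row plus a trailing zero, ExclusiveSum turns it into out_ptr (the per-row write offsets into the output COO), and its last element — copied back into new_len — is the total number of picked entries. The same arithmetic on the host with made-up degrees:

// Host-side sketch of the degree -> offsets step; the degree values are
// invented for illustration.
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // Per-row sample counts, e.g. min(num_picks, row degree); the trailing 0
  // mirrors the extra slot written by the degree kernel.
  std::vector<int> out_deg = {2, 5, 0, 3, 0};
  std::vector<int> out_ptr(out_deg.size());
  std::exclusive_scan(out_deg.begin(), out_deg.end(), out_ptr.begin(), 0);
  for (int v : out_ptr) std::printf("%d ", v);
  std::printf("\n");  // 0 2 7 7 10 -- the last value is the total output size
  return 0;
}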
......@@ -331,36 +309,16 @@ COOMatrix _CSRRowWiseSamplingUniform(
const dim3 block(BLOCK_SIZE);
const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
CUDA_KERNEL_CALL(
(_CSRRowWiseSampleUniformReplaceKernel<IdType, TILE_SIZE>),
grid, block, 0, stream,
random_seed,
num_picks,
num_rows,
slice_rows,
in_ptr,
in_cols,
data,
out_ptr,
out_rows,
out_cols,
out_idxs);
(_CSRRowWiseSampleUniformReplaceKernel<IdType, TILE_SIZE>), grid, block,
0, stream, random_seed, num_picks, num_rows, slice_rows, in_ptr,
in_cols, data, out_ptr, out_rows, out_cols, out_idxs);
} else { // without replacement
const dim3 block(BLOCK_SIZE);
const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
CUDA_KERNEL_CALL(
(_CSRRowWiseSampleUniformKernel<IdType, TILE_SIZE>),
grid, block, 0, stream,
random_seed,
num_picks,
num_rows,
slice_rows,
in_ptr,
in_cols,
data,
out_ptr,
out_rows,
out_cols,
out_idxs);
(_CSRRowWiseSampleUniformKernel<IdType, TILE_SIZE>), grid, block, 0,
stream, random_seed, num_picks, num_rows, slice_rows, in_ptr, in_cols,
data, out_ptr, out_rows, out_cols, out_idxs);
}
device->FreeWorkspace(ctx, out_ptr);
......@@ -372,8 +330,8 @@ COOMatrix _CSRRowWiseSamplingUniform(
picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
picked_idx = picked_idx.CreateView({new_len}, picked_idx->dtype);
return COOMatrix(mat.num_rows, mat.num_cols, picked_row,
picked_col, picked_idx);
return COOMatrix(
mat.num_rows, mat.num_cols, picked_row, picked_col, picked_idx);
}
template <DGLDeviceType XPU, typename IdType>
......@@ -383,9 +341,11 @@ COOMatrix CSRRowWiseSamplingUniform(
// Basically this is UnitGraph::InEdges().
COOMatrix coo = CSRToCOO(CSRSliceRows(mat, rows), false);
IdArray sliced_rows = IndexSelect(rows, coo.row);
return COOMatrix(mat.num_rows, mat.num_cols, sliced_rows, coo.col, coo.data);
return COOMatrix(
mat.num_rows, mat.num_cols, sliced_rows, coo.col, coo.data);
} else {
return _CSRRowWiseSamplingUniform<XPU, IdType>(mat, rows, num_picks, replace);
return _CSRRowWiseSamplingUniform<XPU, IdType>(
mat, rows, num_picks, replace);
}
}
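Functionally, the non-replacement path keeps min(num_picks, degree) incident edges of each requested row, chosen uniformly. A compact CPU reference of that behaviour built on std::sample (names and shapes are illustrative; the GPU kernels use per-thread curand Philox state rather than std::mt19937):

// Host-side reference of per-row uniform sampling without replacement.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <iterator>
#include <numeric>
#include <random>
#include <vector>

std::vector<int64_t> SampleRow(const std::vector<int64_t>& indptr, int64_t r,
                               int64_t num_picks, std::mt19937& rng) {
  std::vector<int64_t> positions(indptr[r + 1] - indptr[r]);
  std::iota(positions.begin(), positions.end(), indptr[r]);  // edge positions
  std::vector<int64_t> picked;
  std::sample(positions.begin(), positions.end(), std::back_inserter(picked),
              num_picks, rng);
  return picked;  // size is min(num_picks, row degree)
}

int main() {
  const std::vector<int64_t> indptr = {0, 4, 6};  // row 0: 4 edges, row 1: 2
  std::mt19937 rng(0);
  for (int64_t p : SampleRow(indptr, 0, 3, rng))
    std::printf("%lld ", static_cast<long long>(p));
  std::printf("\n");
  return 0;
}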
......
......@@ -8,9 +8,10 @@
#include <string>
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./atomic.cuh"
#include "./utils.h"
namespace dgl {
......@@ -24,11 +25,9 @@ namespace cuda {
* \note each thread block is responsible for aggregation on a row
* in the result tensor.
*/
template <typename IdType, typename DType,
typename ReduceOp>
template <typename IdType, typename DType, typename ReduceOp>
__global__ void SegmentReduceKernel(
const DType* feat, const IdType* offsets,
DType* out, IdType* arg,
const DType* feat, const IdType* offsets, DType* out, IdType* arg,
int64_t n, int64_t dim) {
for (int row = blockIdx.x; row < n; row += gridDim.x) {
int col = blockIdx.y * blockDim.x + threadIdx.x;
......@@ -39,8 +38,7 @@ __global__ void SegmentReduceKernel(
ReduceOp::Call(&local_accum, &local_arg, feat[i * dim + col], i);
}
out[row * dim + col] = local_accum;
if (ReduceOp::require_arg)
arg[row * dim + col] = local_arg;
if (ReduceOp::require_arg) arg[row * dim + col] = local_arg;
col += gridDim.y * blockDim.x;
}
}
......@@ -53,8 +51,7 @@ __global__ void SegmentReduceKernel(
*/
template <typename IdType, typename DType>
__global__ void ScatterAddKernel(
const DType *feat, const IdType *idx, DType *out,
int64_t n, int64_t dim) {
const DType* feat, const IdType* idx, DType* out, int64_t n, int64_t dim) {
for (int row = blockIdx.x; row < n; row += gridDim.x) {
const int write_row = idx[row];
int col = blockIdx.y * blockDim.x + threadIdx.x;
......@@ -73,7 +70,7 @@ __global__ void ScatterAddKernel(
template <typename IdType, typename DType>
__global__ void UpdateGradMinMaxHeteroKernel(
const DType *feat, const IdType *idx, const IdType *idx_type, DType *out,
const DType* feat, const IdType* idx, const IdType* idx_type, DType* out,
int64_t n, int64_t dim, int type) {
unsigned int tId = threadIdx.x;
unsigned int laneId = tId & 31;
......@@ -100,8 +97,7 @@ __global__ void UpdateGradMinMaxHeteroKernel(
*/
template <typename IdType, typename DType>
__global__ void BackwardSegmentCmpKernel(
const DType *feat, const IdType *arg, DType *out,
int64_t n, int64_t dim) {
const DType* feat, const IdType* arg, DType* out, int64_t n, int64_t dim) {
for (int row = blockIdx.x; row < n; row += gridDim.x) {
int col = blockIdx.y * blockDim.x + threadIdx.x;
while (col < dim) {
......@@ -122,11 +118,7 @@ __global__ void BackwardSegmentCmpKernel(
* \param arg An auxiliary tensor storing ArgMax/Min information,
*/
template <typename IdType, typename DType, typename ReduceOp>
void SegmentReduce(
NDArray feat,
NDArray offsets,
NDArray out,
NDArray arg) {
void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
const DType* feat_data = feat.Ptr<DType>();
const IdType* offsets_data = offsets.Ptr<IdType>();
DType* out_data = out.Ptr<DType>();
......@@ -135,8 +127,7 @@ void SegmentReduce(
cudaStream_t stream = runtime::getCurrentCUDAStream();
int64_t n = out->shape[0];
int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
const int nbx = FindNumBlocks<'x'>(n);
const int ntx = FindNumThreads(dim);
......@@ -145,10 +136,9 @@ void SegmentReduce(
const dim3 nblks(nbx, nby);
const dim3 nthrs(ntx, nty);
// TODO(zihao): try cub's DeviceSegmentedReduce and compare the performance.
CUDA_KERNEL_CALL((SegmentReduceKernel<IdType, DType, ReduceOp>),
nblks, nthrs, 0, stream,
feat_data, offsets_data, out_data, arg_data,
n, dim);
CUDA_KERNEL_CALL(
(SegmentReduceKernel<IdType, DType, ReduceOp>), nblks, nthrs, 0, stream,
feat_data, offsets_data, out_data, arg_data, n, dim);
}
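Concretely, SegmentReduceKernel collapses each slice feat[offsets[i] : offsets[i+1], :] into one row of out, recording argmin/argmax positions in arg when the reducer requires them. A host-side sketch of the sum case on toy shapes:

// Host-side sketch of segment reduction (sum); shapes and values are made up.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t dim = 2;
  const std::vector<int64_t> offsets = {0, 2, 3};       // two segments
  const std::vector<double> feat = {1, 2, 3, 4, 5, 6};  // 3 x dim, row-major
  std::vector<double> out((offsets.size() - 1) * dim, 0.0);

  for (size_t row = 0; row + 1 < offsets.size(); ++row)
    for (int64_t col = 0; col < dim; ++col)
      for (int64_t i = offsets[row]; i < offsets[row + 1]; ++i)
        out[row * dim + col] += feat[i * dim + col];

  std::printf("%g %g | %g %g\n", out[0], out[1], out[2], out[3]);
  // 4 6 | 5 6 : segment 0 sums rows 0-1, segment 1 is just row 2.
  return 0;
}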
/*!
......@@ -159,19 +149,15 @@ void SegmentReduce(
* \param out The output tensor.
*/
template <typename IdType, typename DType>
void ScatterAdd(
NDArray feat,
NDArray idx,
NDArray out) {
void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
const DType* feat_data = feat.Ptr<DType>();
const IdType* idx_data = idx.Ptr<IdType>();
DType *out_data = out.Ptr<DType>();
DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
int64_t n = feat->shape[0];
int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
const int nbx = FindNumBlocks<'x'>(n);
const int ntx = FindNumThreads(dim);
......@@ -179,10 +165,9 @@ void ScatterAdd(
const int nty = 1;
const dim3 nblks(nbx, nby);
const dim3 nthrs(ntx, nty);
CUDA_KERNEL_CALL((ScatterAddKernel<IdType, DType>),
nblks, nthrs, 0, stream,
feat_data, idx_data, out_data,
n, dim);
CUDA_KERNEL_CALL(
(ScatterAddKernel<IdType, DType>), nblks, nthrs, 0, stream, feat_data,
idx_data, out_data, n, dim);
}
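ScatterAddKernel goes the other way: row i of feat is accumulated into row idx[i] of out, and because several input rows can target the same output row the device version has to add atomically (note the atomic.cuh include at the top of this file). The serial equivalent on toy data:

// Host-side sketch of scatter-add; indices and values are invented.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t dim = 2;
  const std::vector<int64_t> idx = {1, 0, 1};           // output row per input
  const std::vector<double> feat = {1, 1, 2, 2, 3, 3};  // 3 x dim
  std::vector<double> out(2 * dim, 0.0);                // 2 output rows

  for (size_t row = 0; row < idx.size(); ++row)
    for (int64_t col = 0; col < dim; ++col)
      out[idx[row] * dim + col] += feat[row * dim + col];  // atomic on the GPU

  std::printf("%g %g | %g %g\n", out[0], out[1], out[2], out[3]);
  // 2 2 | 4 4 : input rows 0 and 2 both land in output row 1.
  return 0;
}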
/*!
......@@ -195,24 +180,26 @@ void ScatterAdd(
* \param list_out List of the output tensors.
*/
template <typename IdType, typename DType>
void UpdateGradMinMax_hetero(const HeteroGraphPtr& graph,
const std::string& op,
const std::vector<NDArray>& list_feat,
const std::vector<NDArray>& list_idx,
const std::vector<NDArray>& list_idx_types,
std::vector<NDArray>* list_out) {
void UpdateGradMinMax_hetero(
const HeteroGraphPtr& graph, const std::string& op,
const std::vector<NDArray>& list_feat, const std::vector<NDArray>& list_idx,
const std::vector<NDArray>& list_idx_types,
std::vector<NDArray>* list_out) {
cudaStream_t stream = runtime::getCurrentCUDAStream();
if (op == "copy_lhs" || op == "copy_rhs") {
std::vector<std::vector<dgl_id_t>> src_dst_ntypes(graph->NumVertexTypes(),
std::vector<dgl_id_t>());
std::vector<std::vector<dgl_id_t>> src_dst_ntypes(
graph->NumVertexTypes(), std::vector<dgl_id_t>());
for (dgl_type_t etype = 0; etype < graph->NumEdgeTypes(); ++etype) {
auto pair = graph->meta_graph()->FindEdge(etype);
const dgl_id_t dst_ntype = pair.first; // graph is reversed
const dgl_id_t src_ntype = pair.second;
auto same_src_dst_ntype = std::find(std::begin(src_dst_ntypes[dst_ntype]),
std::end(src_dst_ntypes[dst_ntype]), src_ntype);
// if op is "copy_lhs", relation type with same src and dst node type will be updated once
if (op == "copy_lhs" && same_src_dst_ntype != std::end(src_dst_ntypes[dst_ntype]))
auto same_src_dst_ntype = std::find(
std::begin(src_dst_ntypes[dst_ntype]),
std::end(src_dst_ntypes[dst_ntype]), src_ntype);
// if op is "copy_lhs", relation type with same src and dst node type will
// be updated once
if (op == "copy_lhs" &&
same_src_dst_ntype != std::end(src_dst_ntypes[dst_ntype]))
continue;
src_dst_ntypes[dst_ntype].push_back(src_ntype);
const DType* feat_data = list_feat[dst_ntype].Ptr<DType>();
......@@ -229,35 +216,31 @@ void UpdateGradMinMax_hetero(const HeteroGraphPtr& graph,
const int nbx = FindNumBlocks<'x'>((n * th_per_row + ntx - 1) / ntx);
const dim3 nblks(nbx);
const dim3 nthrs(ntx);
CUDA_KERNEL_CALL((UpdateGradMinMaxHeteroKernel<IdType, DType>),
nblks, nthrs, 0, stream,
feat_data, idx_data, idx_type_data,
out_data, n, dim, type);
CUDA_KERNEL_CALL(
(UpdateGradMinMaxHeteroKernel<IdType, DType>), nblks, nthrs, 0,
stream, feat_data, idx_data, idx_type_data, out_data, n, dim, type);
}
}
}
/*!
* \brief CUDA implementation of backward phase of Segment Reduce with Min/Max reducer.
* \note math equation: out[arg[i, k], k] = feat[i, k]
* \param feat The input tensor.
* \brief CUDA implementation of backward phase of Segment Reduce with Min/Max
* reducer.
* \note math equation: out[arg[i, k], k] = feat[i, k] \param feat The input
* tensor.
* \param arg The ArgMin/Max information, used for indexing.
* \param out The output tensor.
*/
template <typename IdType, typename DType>
void BackwardSegmentCmp(
NDArray feat,
NDArray arg,
NDArray out) {
void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
const DType* feat_data = feat.Ptr<DType>();
const IdType* arg_data = arg.Ptr<IdType>();
DType *out_data = out.Ptr<DType>();
DType* out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
int64_t n = feat->shape[0];
int64_t dim = 1;
for (int i = 1; i < out->ndim; ++i)
dim *= out->shape[i];
for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];
const int nbx = FindNumBlocks<'x'>(n);
const int ntx = FindNumThreads(dim);
......@@ -265,10 +248,9 @@ void BackwardSegmentCmp(
const int nty = 1;
const dim3 nblks(nbx, nby);
const dim3 nthrs(ntx, nty);
CUDA_KERNEL_CALL((BackwardSegmentCmpKernel<IdType, DType>),
nblks, nthrs, 0, stream,
feat_data, arg_data, out_data,
n, dim);
CUDA_KERNEL_CALL(
(BackwardSegmentCmpKernel<IdType, DType>), nblks, nthrs, 0, stream,
feat_data, arg_data, out_data, n, dim);
}
} // namespace cuda
......
......@@ -4,12 +4,14 @@
* \brief COO operator GPU implementation
*/
#include <dgl/array.h>
#include <vector>
#include <unordered_set>
#include <numeric>
#include <unordered_set>
#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./atomic.cuh"
#include "./utils.h"
namespace dgl {
......@@ -19,9 +21,8 @@ using namespace cuda;
namespace aten {
namespace impl {
template <typename IdType>
__device__ void _warpReduce(volatile IdType *sdata, IdType tid) {
__device__ void _warpReduce(volatile IdType* sdata, IdType tid) {
sdata[tid] += sdata[tid + 32];
sdata[tid] += sdata[tid + 16];
sdata[tid] += sdata[tid + 8];
......@@ -32,10 +33,8 @@ __device__ void _warpReduce(volatile IdType *sdata, IdType tid) {
template <typename IdType>
__global__ void _COOGetRowNNZKernel(
const IdType* __restrict__ row_indices,
IdType* __restrict__ glb_cnt,
const int64_t row_query,
IdType nnz) {
const IdType* __restrict__ row_indices, IdType* __restrict__ glb_cnt,
const int64_t row_query, IdType nnz) {
__shared__ IdType local_cnt[1024];
IdType tx = threadIdx.x;
IdType bx = blockIdx.x;
......@@ -80,10 +79,9 @@ int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt);
NDArray rst = NDArray::Empty({1}, coo.row->dtype, coo.row->ctx);
_Fill(rst.Ptr<IdType>(), 1, IdType(0));
CUDA_KERNEL_CALL(_COOGetRowNNZKernel,
nb, nt, 0, stream,
coo.row.Ptr<IdType>(), rst.Ptr<IdType>(),
row, nnz);
CUDA_KERNEL_CALL(
_COOGetRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
rst.Ptr<IdType>(), row, nnz);
rst = rst.CopyTo(DGLContext{kDGLCPU, 0});
return *rst.Ptr<IdType>();
}
......@@ -93,8 +91,7 @@ template int64_t COOGetRowNNZ<kDGLCUDA, int64_t>(COOMatrix, int64_t);
template <typename IdType>
__global__ void _COOGetAllRowNNZKernel(
const IdType* __restrict__ row_indices,
IdType* __restrict__ glb_cnts,
const IdType* __restrict__ row_indices, IdType* __restrict__ glb_cnts,
IdType nnz) {
IdType eid = blockIdx.x * blockDim.x + threadIdx.x;
while (eid < nnz) {
......@@ -118,20 +115,18 @@ NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt);
NDArray rst = NDArray::Empty({1}, coo.row->dtype, coo.row->ctx);
_Fill(rst.Ptr<IdType>(), 1, IdType(0));
CUDA_KERNEL_CALL(_COOGetRowNNZKernel,
nb, nt, 0, stream,
coo.row.Ptr<IdType>(), rst.Ptr<IdType>(),
row, nnz);
CUDA_KERNEL_CALL(
_COOGetRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
rst.Ptr<IdType>(), row, nnz);
return rst;
} else {
IdType nt = 1024;
IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt);
NDArray in_degrees = NDArray::Empty({num_rows}, rows->dtype, rows->ctx);
_Fill(in_degrees.Ptr<IdType>(), num_rows, IdType(0));
CUDA_KERNEL_CALL(_COOGetAllRowNNZKernel,
nb, nt, 0, stream,
coo.row.Ptr<IdType>(), in_degrees.Ptr<IdType>(),
nnz);
CUDA_KERNEL_CALL(
_COOGetAllRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
in_degrees.Ptr<IdType>(), nnz);
return IndexSelect(in_degrees, rows);
}
}
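For a single queried row, COOGetRowNNZ boils down to counting how many entries of coo.row equal that row id; the kernel reaches the same number in parallel by accumulating per-block partial counts (see the shared local_cnt buffer and the _warpReduce helper above) into glb_cnt. The serial equivalent:

// Host-side equivalent of counting a row's nonzeros in COO; data is made up.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<int64_t> coo_row = {0, 2, 2, 1, 2, 0};  // row per nonzero
  const int64_t row_query = 2;
  const int64_t nnz_in_row =
      std::count(coo_row.begin(), coo_row.end(), row_query);
  std::printf("%lld\n", static_cast<long long>(nnz_in_row));  // prints 3
  return 0;
}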
......
......@@ -14,31 +14,28 @@ namespace aten {
namespace impl {
/* This is a cross-device access version of IndexSelectMultiKernel.
* Since the memory access over PCIe is more sensitive to the
* data access alignment (cacheline), we need a separate version here.
*/
* Since the memory access over PCIe is more sensitive to the
* data access alignment (cacheline), we need a separate version here.
*/
template <typename DType, typename IdType>
__global__ void IndexSelectMultiKernelAligned(
const DType* const array,
const int64_t num_feat,
const IdType* const index,
const int64_t length,
const int64_t arr_len,
DType* const out) {
int64_t out_row = blockIdx.x*blockDim.y+threadIdx.y;
const DType* const array, const int64_t num_feat, const IdType* const index,
const int64_t length, const int64_t arr_len, DType* const out) {
int64_t out_row = blockIdx.x * blockDim.y + threadIdx.y;
const int64_t stride = blockDim.y*gridDim.x;
const int64_t stride = blockDim.y * gridDim.x;
while (out_row < length) {
int64_t col = threadIdx.x;
const int64_t in_row = index[out_row];
assert(in_row >= 0 && in_row < arr_len);
const int64_t idx_offset =
((uint64_t)(&array[in_row*num_feat]) % CACHE_LINE_SIZE) / sizeof(DType);
((uint64_t)(&array[in_row * num_feat]) % CACHE_LINE_SIZE) /
sizeof(DType);
col = col - idx_offset;
while (col < num_feat) {
if (col >= 0)
out[out_row*num_feat+col] = array[in_row*num_feat+col];
out[out_row * num_feat + col] = array[in_row * num_feat + col];
col += blockDim.x;
}
out_row += stride;
......
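The idx_offset computed in the kernel above measures, in elements, how far array[in_row * num_feat] sits past a cache-line boundary; starting each thread at col = threadIdx.x - idx_offset (and skipping negative columns) makes the warp's consecutive loads begin exactly on a cache line, which is the point of the comment about alignment-sensitive PCIe access. A small host calculation of that shift (the cache-line size, element type and address below are assumptions):

// Host-side illustration of the cache-line offset math; all constants here
// are placeholders, not DGL's actual values.
#include <cstdint>
#include <cstdio>

int main() {
  constexpr std::uint64_t kCacheLineSize = 128;  // assumed line size in bytes
  using DType = float;                           // assumed element type
  // Pretend the selected row starts 12 bytes past a 128-byte boundary.
  const std::uint64_t row_addr = 0x1000 + 3 * sizeof(DType);
  const std::int64_t idx_offset = (row_addr % kCacheLineSize) / sizeof(DType);
  // With blockDim.x threads, the first pass covers cols -3 .. blockDim.x - 4,
  // whose addresses start exactly at the preceding cache-line boundary; the
  // negative columns are simply skipped by the `if (col >= 0)` guard.
  std::printf("idx_offset = %lld\n", static_cast<long long>(idx_offset));  // 3
  return 0;
}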
......@@ -4,12 +4,11 @@
* \brief Object for selecting items in a set, or selecting items not in a set.
*/
#ifndef DGL_ARRAY_FILTER_H_
#define DGL_ARRAY_FILTER_H_
#include <dgl/runtime/object.h>
#include <dgl/array.h>
#include <dgl/runtime/object.h>
namespace dgl {
namespace array {
......@@ -28,8 +27,7 @@ class Filter : public runtime::Object {
* @return The indices of the items from `test` that are selected by
* this filter.
*/
virtual IdArray find_included_indices(
IdArray test) = 0;
virtual IdArray find_included_indices(IdArray test) = 0;
/**
* @brief From the test set of items, get the indices of those which are
......@@ -40,8 +38,7 @@ class Filter : public runtime::Object {
* @return The indices of the items from `test` that are not selected by this
* filter.
*/
virtual IdArray find_excluded_indices(
IdArray test) = 0;
virtual IdArray find_excluded_indices(IdArray test) = 0;
};
DGL_DEFINE_OBJECT_REF(FilterRef, Filter);
......@@ -50,4 +47,3 @@ DGL_DEFINE_OBJECT_REF(FilterRef, Filter);
} // namespace dgl
#endif // DGL_ARRAY_FILTER_H_
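The two virtuals express complementary queries: given a test array, return the positions whose values the filter accepts, or the positions it rejects. A standalone CPU analog of that contract (a hypothetical class, with plain std containers standing in for IdArray):

// Hypothetical host-side analog of the Filter contract; SetFilter is not a
// DGL class.
#include <cstdint>
#include <cstdio>
#include <unordered_set>
#include <vector>

class SetFilter {
 public:
  explicit SetFilter(const std::vector<int64_t>& items)
      : set_(items.begin(), items.end()) {}

  // Positions i with test[i] in the set (cf. find_included_indices).
  std::vector<int64_t> FindIncludedIndices(const std::vector<int64_t>& test) {
    return Select(test, /*included=*/true);
  }
  // Positions i with test[i] not in the set (cf. find_excluded_indices).
  std::vector<int64_t> FindExcludedIndices(const std::vector<int64_t>& test) {
    return Select(test, /*included=*/false);
  }

 private:
  std::vector<int64_t> Select(const std::vector<int64_t>& test, bool included) {
    std::vector<int64_t> out;
    for (int64_t i = 0; i < static_cast<int64_t>(test.size()); ++i)
      if ((set_.count(test[i]) > 0) == included) out.push_back(i);
    return out;
  }

  std::unordered_set<int64_t> set_;
};

int main() {
  SetFilter filter({2, 5, 7});
  for (int64_t i : filter.FindIncludedIndices({5, 1, 7, 3}))
    std::printf("%lld ", static_cast<long long>(i));  // prints 0 2
  std::printf("\n");
  return 0;
}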
......@@ -7,6 +7,7 @@
#define DGL_ARRAY_UVM_ARRAY_OP_H_
#include <dgl/array.h>
#include <utility>
namespace dgl {
......