Unverified Commit 8ae50c42 authored by Hongzhi (Steve), Chen, committed by GitHub

[Misc] clang-format auto fix. (#4804)



* [Misc] clang-format auto fix.

* manual

* manual

* manual

* manual

* todo

* fix
Co-authored-by: Steve <ubuntu@ip-172-31-34-29.ap-northeast-1.compute.internal>
parent 81831111
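
The diffs below are purely mechanical: clang-format sorts includes within each block, packs template-parameter and argument lists up to the column limit, and joins short single-statement bodies onto one line. As a rough, hypothetical illustration of the style the files converge to (this function is not from the commit; it only mimics the formatting conventions):

#include <cstdint>
#include <vector>

// Template parameters packed onto one line instead of one per line;
// function arguments wrapped with a 4-space continuation indent.
template <typename Idx, typename DType>
DType SumRow(
    const std::vector<DType>& feat, const std::vector<Idx>& indptr,
    int64_t row) {
  DType accum = 0;
  // Single-statement loop bodies are joined onto one line.
  for (Idx i = indptr[row]; i < indptr[row + 1]; ++i) accum += feat[i];
  return accum;
}

int main() {
  std::vector<int64_t> indptr = {0, 2, 5};
  std::vector<float> feat = {1, 2, 3, 4, 5};
  return SumRow<int64_t, float>(feat, indptr, 1) == 12.f ? 0 : 1;
}
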
@@ -6,10 +6,10 @@
 #ifndef DGL_ARRAY_CUDA_GE_SPMM_CUH_
 #define DGL_ARRAY_CUDA_GE_SPMM_CUH_

-#include "macro.cuh"
-#include "atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"
 #include "./utils.h"
+#include "atomic.cuh"
+#include "macro.cuh"

 namespace dgl {
@@ -23,23 +23,19 @@ namespace cuda {
  * \note GE-SpMM: https://arxiv.org/pdf/2007.03179.pdf
  * The grid dimension x and y are reordered for better performance.
  */
-template <typename Idx, typename DType,
-          typename BinaryOp>
+template <typename Idx, typename DType, typename BinaryOp>
 __global__ void GESpMMKernel(
-    const DType* __restrict__ ufeat,
-    const DType* __restrict__ efeat,
-    DType* __restrict__ out,
-    const Idx* __restrict__ indptr,
-    const Idx* __restrict__ indices,
-    const int64_t num_rows, const int64_t num_cols,
-    const int64_t feat_len) {
-  const Idx rid = blockIdx.x * blockDim.y + threadIdx.y;  // over vertices dimension
+    const DType* __restrict__ ufeat, const DType* __restrict__ efeat,
+    DType* __restrict__ out, const Idx* __restrict__ indptr,
+    const Idx* __restrict__ indices, const int64_t num_rows,
+    const int64_t num_cols, const int64_t feat_len) {
+  const Idx rid =
+      blockIdx.x * blockDim.y + threadIdx.y;  // over vertices dimension
   const Idx fid = (blockIdx.y * 64) + threadIdx.x;  // over feature dimension
   if (rid < num_rows && fid < feat_len) {
     const Idx low = __ldg(indptr + rid), high = __ldg(indptr + rid + 1);
-    DType accum_0 = 0.,
-          accum_1 = 0.;
+    DType accum_0 = 0., accum_1 = 0.;
     if (blockIdx.y != gridDim.y - 1) {  // fid + 32 < feat_len
       for (Idx left = low; left < high; left += 32) {
@@ -109,24 +105,21 @@ __global__ void GESpMMKernel(
       }
       out[feat_len * rid + fid] = accum_0;
-      if (fid + 32 < feat_len)
-        out[feat_len * rid + fid + 32] = accum_1;
+      if (fid + 32 < feat_len) out[feat_len * rid + fid + 32] = accum_1;
     }
   }
 }
 }

-template <typename Idx, typename DType,
-          typename BinaryOp>
+template <typename Idx, typename DType, typename BinaryOp>
 void GESpMMCsr(
-    const CSRMatrix& csr,
-    NDArray ufeat, NDArray efeat,
-    NDArray out, int64_t feat_len) {
-  const Idx *indptr = csr.indptr.Ptr<Idx>();
-  const Idx *indices = csr.indices.Ptr<Idx>();
-  const DType *ufeat_data = ufeat.Ptr<DType>();
-  const DType *efeat_data = efeat.Ptr<DType>();
-  DType *out_data = out.Ptr<DType>();
+    const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out,
+    int64_t feat_len) {
+  const Idx* indptr = csr.indptr.Ptr<Idx>();
+  const Idx* indices = csr.indices.Ptr<Idx>();
+  const DType* ufeat_data = ufeat.Ptr<DType>();
+  const DType* efeat_data = efeat.Ptr<DType>();
+  DType* out_data = out.Ptr<DType>();
   cudaStream_t stream = runtime::getCurrentCUDAStream();
@@ -138,12 +131,10 @@ void GESpMMCsr(
   const dim3 nthrs(ntx, nty);
   const int sh_mem_size = 0;
-  CUDA_KERNEL_CALL((GESpMMKernel<Idx, DType, BinaryOp>),
-                   nblks, nthrs, sh_mem_size, stream,
-                   ufeat_data, efeat_data, out_data,
-                   indptr, indices,
-                   csr.num_rows, csr.num_cols,
-                   feat_len);
+  CUDA_KERNEL_CALL(
+      (GESpMMKernel<Idx, DType, BinaryOp>), nblks, nthrs, sh_mem_size, stream,
+      ufeat_data, efeat_data, out_data, indptr, indices, csr.num_rows,
+      csr.num_cols, feat_len);
 }

 }  // namespace cuda

@@ -8,12 +8,12 @@
 ///////////////////////// Dispatchers //////////////////////////

 /* Macro used for switching between broadcasting and non-broadcasting kernels.
  * It also copies the auxiliary information for calculating broadcasting offsets
  * to GPU.
  */
-#define BCAST_IDX_CTX_SWITCH(BCAST, EDGE_MAP, CTX, LHS_OFF, RHS_OFF, ...) do { \
+#define BCAST_IDX_CTX_SWITCH(BCAST, EDGE_MAP, CTX, LHS_OFF, RHS_OFF, ...) \
+  do { \
   const BcastOff &info = (BCAST); \
   if (!info.use_bcast) { \
     constexpr bool UseBcast = false; \
@@ -28,13 +28,15 @@
     constexpr bool UseBcast = true; \
     const DGLContext ctx = (CTX); \
     const auto device = runtime::DeviceAPI::Get(ctx); \
-    (LHS_OFF) = static_cast<int64_t*>( \
-      device->AllocWorkspace(ctx, sizeof(int64_t) * info.lhs_offset.size())); \
-    CUDA_CALL(cudaMemcpy((LHS_OFF), &info.lhs_offset[0], \
+    (LHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
+        ctx, sizeof(int64_t) * info.lhs_offset.size())); \
+    CUDA_CALL(cudaMemcpy( \
+        (LHS_OFF), &info.lhs_offset[0], \
       sizeof(int64_t) * info.lhs_offset.size(), cudaMemcpyHostToDevice)); \
-    (RHS_OFF) = static_cast<int64_t*>( \
-      device->AllocWorkspace(ctx, sizeof(int64_t) * info.rhs_offset.size())); \
-    CUDA_CALL(cudaMemcpy((RHS_OFF), &info.rhs_offset[0], \
+    (RHS_OFF) = static_cast<int64_t *>(device->AllocWorkspace( \
+        ctx, sizeof(int64_t) * info.rhs_offset.size())); \
+    CUDA_CALL(cudaMemcpy( \
+        (RHS_OFF), &info.rhs_offset[0], \
       sizeof(int64_t) * info.rhs_offset.size(), cudaMemcpyHostToDevice)); \
     if ((EDGE_MAP)) { \
       constexpr bool UseIdx = true; \
@@ -46,6 +48,6 @@
     device->FreeWorkspace(ctx, (LHS_OFF)); \
     device->FreeWorkspace(ctx, (RHS_OFF)); \
   } \
  } while (0)

 #endif  // DGL_ARRAY_CUDA_MACRO_CUH_
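
The macro body stays wrapped in do { ... } while (0); the reformat above only moves `do {` onto its own continuation line. For readers unfamiliar with the idiom, a small stand-alone sketch with a hypothetical macro shows why the wrapper matters: it turns a multi-statement macro into a single statement that behaves correctly under an un-braced if/else.

#include <cstdio>

// Without the do/while wrapper, the two statements would break apart
// when the macro is used as the body of an un-braced if/else.
#define LOG_AND_COUNT(msg, counter) \
  do {                              \
    std::printf("%s\n", (msg));     \
    ++(counter);                    \
  } while (0)

int main() {
  int errors = 0;
  bool failed = true;
  if (failed)
    LOG_AND_COUNT("failure", errors);  // expands to a single statement
  else
    std::printf("ok\n");
  return errors;
}
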
@@ -4,14 +4,14 @@
  * \brief rowwise sampling
  */

-#include <dgl/random.h>
+#include <curand_kernel.h>
 #include <dgl/array.h>
 #include <dgl/array_iterator.h>
-#include <curand_kernel.h>
+#include <dgl/random.h>

+#include "../../runtime/cuda/cuda_common.h"
 #include "./dgl_cub.cuh"
 #include "./utils.h"
-#include "../../runtime/cuda/cuda_common.h"

 using namespace dgl::runtime;
@@ -23,20 +23,15 @@ namespace {
 template <typename IdType>
 __global__ void _GlobalUniformNegativeSamplingKernel(
-    const IdType* __restrict__ indptr,
-    const IdType* __restrict__ indices,
-    IdType* __restrict__ row,
-    IdType* __restrict__ col,
-    int64_t num_row,
-    int64_t num_col,
-    int64_t num_samples,
-    int num_trials,
-    bool exclude_self_loops,
-    int32_t random_seed) {
+    const IdType* __restrict__ indptr, const IdType* __restrict__ indices,
+    IdType* __restrict__ row, IdType* __restrict__ col, int64_t num_row,
+    int64_t num_col, int64_t num_samples, int num_trials,
+    bool exclude_self_loops, int32_t random_seed) {
   int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
   const int stride_x = gridDim.x * blockDim.x;
-  curandStatePhilox4_32_10_t rng;  // this allows generating 4 32-bit ints at a time
+  curandStatePhilox4_32_10_t
+      rng;  // this allows generating 4 32-bit ints at a time
   curand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
   while (tx < num_samples) {
@@ -50,8 +45,7 @@ __global__ void _GlobalUniformNegativeSamplingKernel(
       int64_t u = static_cast<int64_t>(((y_lo << 32L) | z) % num_row);
       int64_t v = static_cast<int64_t>(((y_hi << 32L) | w) % num_col);
-      if (exclude_self_loops && (u == v))
-        continue;
+      if (exclude_self_loops && (u == v)) continue;
       // binary search of v among indptr[u:u+1]
       int64_t b = indptr[u], e = indptr[u + 1] - 1;
@@ -81,48 +75,47 @@ __global__ void _GlobalUniformNegativeSamplingKernel(
 template <typename DType>
 struct IsNotMinusOne {
-  __device__ __forceinline__ bool operator() (const std::pair<DType, DType>& a) {
+  __device__ __forceinline__ bool operator()(const std::pair<DType, DType>& a) {
     return a.first != -1;
   }
 };

 /*!
- * \brief Sort ordered pairs in ascending order, using \a tmp_major and \a tmp_minor as
- * temporary buffers, each with \a n elements.
+ * \brief Sort ordered pairs in ascending order, using \a tmp_major and \a
+ * tmp_minor as temporary buffers, each with \a n elements.
  */
 template <typename IdType>
 void SortOrderedPairs(
-    runtime::DeviceAPI* device,
-    DGLContext ctx,
-    IdType* major,
-    IdType* minor,
-    IdType* tmp_major,
-    IdType* tmp_minor,
-    int64_t n,
-    cudaStream_t stream) {
+    runtime::DeviceAPI* device, DGLContext ctx, IdType* major, IdType* minor,
+    IdType* tmp_major, IdType* tmp_minor, int64_t n, cudaStream_t stream) {
   // Sort ordered pairs in lexicographical order by two radix sorts since
   // cub's radix sorts are stable.
-  // We need a 2*n auxiliary storage to store the results form the first radix sort.
+  // We need a 2*n auxiliary storage to store the results form the first radix
+  // sort.
   size_t s1 = 0, s2 = 0;
   void* tmp1 = nullptr;
   void* tmp2 = nullptr;
   // Radix sort by minor key first, reorder the major key in the progress.
   CUDA_CALL(cub::DeviceRadixSort::SortPairs(
-      tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, stream));
+      tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
+      stream));
   tmp1 = device->AllocWorkspace(ctx, s1);
   CUDA_CALL(cub::DeviceRadixSort::SortPairs(
-      tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, stream));
+      tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8,
+      stream));
   // Radix sort by major key next.
   CUDA_CALL(cub::DeviceRadixSort::SortPairs(
-      tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, stream));
-  tmp2 = (s2 > s1) ? device->AllocWorkspace(ctx, s2) : tmp1;  // reuse buffer if s2 <= s1
+      tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
+      stream));
+  tmp2 = (s2 > s1) ? device->AllocWorkspace(ctx, s2)
+                   : tmp1;  // reuse buffer if s2 <= s1
   CUDA_CALL(cub::DeviceRadixSort::SortPairs(
-      tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, stream));
+      tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8,
+      stream));
-  if (tmp1 != tmp2)
-    device->FreeWorkspace(ctx, tmp2);
+  if (tmp1 != tmp2) device->FreeWorkspace(ctx, tmp2);
   device->FreeWorkspace(ctx, tmp1);
 }
@@ -130,17 +123,14 @@ void SortOrderedPairs(
 template <DGLDeviceType XPU, typename IdType>
 std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
-    const CSRMatrix& csr,
-    int64_t num_samples,
-    int num_trials,
-    bool exclude_self_loops,
-    bool replace,
-    double redundancy) {
+    const CSRMatrix& csr, int64_t num_samples, int num_trials,
+    bool exclude_self_loops, bool replace, double redundancy) {
   auto ctx = csr.indptr->ctx;
   auto dtype = csr.indptr->dtype;
   const int64_t num_row = csr.num_rows;
   const int64_t num_col = csr.num_cols;
-  const int64_t num_actual_samples = static_cast<int64_t>(num_samples * (1 + redundancy));
+  const int64_t num_actual_samples =
+      static_cast<int64_t>(num_samples * (1 + redundancy));
   IdArray row = Full<IdType>(-1, num_actual_samples, ctx);
   IdArray col = Full<IdType>(-1, num_actual_samples, ctx);
   IdArray out_row = IdArray::Empty({num_actual_samples}, dtype, ctx);
@@ -156,22 +146,25 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
   std::pair<IdArray, IdArray> result;
   int64_t num_out;
-  CUDA_KERNEL_CALL(_GlobalUniformNegativeSamplingKernel,
-      nb, nt, 0, stream,
-      csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(),
-      row_data, col_data, num_row, num_col, num_actual_samples, num_trials,
-      exclude_self_loops, RandomEngine::ThreadLocal()->RandInt32());
+  CUDA_KERNEL_CALL(
+      _GlobalUniformNegativeSamplingKernel, nb, nt, 0, stream,
+      csr.indptr.Ptr<IdType>(), csr.indices.Ptr<IdType>(), row_data, col_data,
+      num_row, num_col, num_actual_samples, num_trials, exclude_self_loops,
+      RandomEngine::ThreadLocal()->RandInt32());
   size_t tmp_size = 0;
-  int64_t* num_out_cuda = static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
+  int64_t* num_out_cuda =
+      static_cast<int64_t*>(device->AllocWorkspace(ctx, sizeof(int64_t)));
   IsNotMinusOne<IdType> op;
   PairIterator<IdType> begin(row_data, col_data);
   PairIterator<IdType> out_begin(out_row_data, out_col_data);
   CUDA_CALL(cub::DeviceSelect::If(
-      nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, stream));
+      nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
+      stream));
   void* tmp = device->AllocWorkspace(ctx, tmp_size);
   CUDA_CALL(cub::DeviceSelect::If(
-      tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, stream));
+      tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op,
+      stream));
   num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
   if (!replace) {
@@ -182,28 +175,33 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
     PairIterator<IdType> unique_begin(unique_row_data, unique_col_data);
     SortOrderedPairs(
-        device, ctx, out_row_data, out_col_data, unique_row_data, unique_col_data,
-        num_out, stream);
+        device, ctx, out_row_data, out_col_data, unique_row_data,
+        unique_col_data, num_out, stream);
     size_t tmp_size_unique = 0;
     void* tmp_unique = nullptr;
     CUDA_CALL(cub::DeviceSelect::Unique(
-        nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda, num_out, stream));
-    tmp_unique = (tmp_size_unique > tmp_size) ?
-        device->AllocWorkspace(ctx, tmp_size_unique) :
-        tmp;  // reuse buffer
+        nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
+        num_out, stream));
+    tmp_unique = (tmp_size_unique > tmp_size)
+                     ? device->AllocWorkspace(ctx, tmp_size_unique)
+                     : tmp;  // reuse buffer
     CUDA_CALL(cub::DeviceSelect::Unique(
-        tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda, num_out, stream));
+        tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda,
+        num_out, stream));
     num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda);
     num_out = std::min(num_samples, num_out);
-    result = {unique_row.CreateView({num_out}, dtype), unique_col.CreateView({num_out}, dtype)};
+    result = {
+        unique_row.CreateView({num_out}, dtype),
+        unique_col.CreateView({num_out}, dtype)};
-    if (tmp_unique != tmp)
-      device->FreeWorkspace(ctx, tmp_unique);
+    if (tmp_unique != tmp) device->FreeWorkspace(ctx, tmp_unique);
   } else {
     num_out = std::min(num_samples, num_out);
-    result = {out_row.CreateView({num_out}, dtype), out_col.CreateView({num_out}, dtype)};
+    result = {
+        out_row.CreateView({num_out}, dtype),
+        out_col.CreateView({num_out}, dtype)};
   }
   device->FreeWorkspace(ctx, tmp);
@@ -211,10 +209,10 @@ std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling(
   return result;
 }

-template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<kDGLCUDA, int32_t>(
-    const CSRMatrix&, int64_t, int, bool, bool, double);
-template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<kDGLCUDA, int64_t>(
-    const CSRMatrix&, int64_t, int, bool, bool, double);
+template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<
+    kDGLCUDA, int32_t>(const CSRMatrix&, int64_t, int, bool, bool, double);
+template std::pair<IdArray, IdArray> CSRGlobalUniformNegativeSampling<
+    kDGLCUDA, int64_t>(const CSRMatrix&, int64_t, int, bool, bool, double);

 };  // namespace impl
 };  // namespace aten

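SortOrderedPairs in the hunks above leans on the stability of cub::DeviceRadixSort::SortPairs: sorting by the minor key first and then sorting stably by the major key yields lexicographic order on the (major, minor) pairs. A host-side sketch of the same idea using std::stable_sort (illustration only, not the CUDA path):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int64_t> major = {2, 1, 2, 1};
  std::vector<int64_t> minor = {3, 9, 1, 4};
  std::vector<size_t> perm = {0, 1, 2, 3};
  // Pass 1: sort by the minor key.
  std::stable_sort(perm.begin(), perm.end(),
                   [&](size_t a, size_t b) { return minor[a] < minor[b]; });
  // Pass 2: stable sort by the major key; ties keep the minor-key order.
  std::stable_sort(perm.begin(), perm.end(),
                   [&](size_t a, size_t b) { return major[a] < major[b]; });
  for (size_t p : perm)
    std::printf("(%lld, %lld)\n", static_cast<long long>(major[p]),
                static_cast<long long>(minor[p]));
  // Prints (1, 4) (1, 9) (2, 1) (2, 3): lexicographic order.
}
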
@@ -4,15 +4,15 @@
  * \brief uniform rowwise sampling
  */

+#include <curand_kernel.h>
 #include <dgl/random.h>
 #include <dgl/runtime/device_api.h>
-#include <curand_kernel.h>

 #include <numeric>

-#include "./dgl_cub.cuh"
 #include "../../array/cuda/atomic.cuh"
 #include "../../runtime/cuda/cuda_common.h"
+#include "./dgl_cub.cuh"

 using namespace dgl::aten::cuda;
@@ -25,29 +25,28 @@ namespace {
 constexpr int BLOCK_SIZE = 128;

 /**
  * @brief Compute the size of each row in the sampled CSR, without replacement.
  *
  * @tparam IdType The type of node and edge indexes.
  * @param num_picks The number of non-zero entries to pick per row.
  * @param num_rows The number of rows to pick.
  * @param in_rows The set of rows to pick.
  * @param in_ptr The index where each row's edges start.
  * @param out_deg The size of each row in the sampled matrix, as indexed by
  * `in_rows` (output).
  */
-template<typename IdType>
+template <typename IdType>
 __global__ void _CSRRowWiseSampleDegreeKernel(
-    const int64_t num_picks,
-    const int64_t num_rows,
-    const IdType * const in_rows,
-    const IdType * const in_ptr,
-    IdType * const out_deg) {
+    const int64_t num_picks, const int64_t num_rows,
+    const IdType* const in_rows, const IdType* const in_ptr,
+    IdType* const out_deg) {
   const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
   if (tIdx < num_rows) {
     const int in_row = in_rows[tIdx];
     const int out_row = tIdx;
-    out_deg[out_row] = min(static_cast<IdType>(num_picks), in_ptr[in_row + 1] - in_ptr[in_row]);
+    out_deg[out_row] = min(
+        static_cast<IdType>(num_picks), in_ptr[in_row + 1] - in_ptr[in_row]);
     if (out_row == num_rows - 1) {
       // make the prefixsum work
@@ -57,23 +56,21 @@ __global__ void _CSRRowWiseSampleDegreeKernel(
 }

 /**
  * @brief Compute the size of each row in the sampled CSR, with replacement.
  *
  * @tparam IdType The type of node and edge indexes.
  * @param num_picks The number of non-zero entries to pick per row.
  * @param num_rows The number of rows to pick.
  * @param in_rows The set of rows to pick.
  * @param in_ptr The index where each row's edges start.
  * @param out_deg The size of each row in the sampled matrix, as indexed by
  * `in_rows` (output).
  */
-template<typename IdType>
+template <typename IdType>
 __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
-    const int64_t num_picks,
-    const int64_t num_rows,
-    const IdType * const in_rows,
-    const IdType * const in_ptr,
-    IdType * const out_deg) {
+    const int64_t num_picks, const int64_t num_rows,
+    const IdType* const in_rows, const IdType* const in_ptr,
+    IdType* const out_deg) {
   const int tIdx = threadIdx.x + blockIdx.x * blockDim.x;
   if (tIdx < num_rows) {
@@ -94,41 +91,36 @@ __global__ void _CSRRowWiseSampleDegreeReplaceKernel(
 }

 /**
  * @brief Perform row-wise uniform sampling on a CSR matrix,
  * and generate a COO matrix, without replacement.
  *
  * @tparam IdType The ID type used for matrices.
  * @tparam TILE_SIZE The number of rows covered by each threadblock.
  * @param rand_seed The random seed to use.
  * @param num_picks The number of non-zeros to pick per row.
  * @param num_rows The number of rows to pick.
  * @param in_rows The set of rows to pick.
  * @param in_ptr The indptr array of the input CSR.
  * @param in_index The indices array of the input CSR.
  * @param data The data array of the input CSR.
  * @param out_ptr The offset to write each row to in the output COO.
  * @param out_rows The rows of the output COO (output).
  * @param out_cols The columns of the output COO (output).
  * @param out_idxs The data array of the output COO (output).
  */
-template<typename IdType, int TILE_SIZE>
+template <typename IdType, int TILE_SIZE>
 __global__ void _CSRRowWiseSampleUniformKernel(
-    const uint64_t rand_seed,
-    const int64_t num_picks,
-    const int64_t num_rows,
-    const IdType * const in_rows,
-    const IdType * const in_ptr,
-    const IdType * const in_index,
-    const IdType * const data,
-    const IdType * const out_ptr,
-    IdType * const out_rows,
-    IdType * const out_cols,
-    IdType * const out_idxs) {
+    const uint64_t rand_seed, const int64_t num_picks, const int64_t num_rows,
+    const IdType* const in_rows, const IdType* const in_ptr,
+    const IdType* const in_index, const IdType* const data,
+    const IdType* const out_ptr, IdType* const out_rows, IdType* const out_cols,
+    IdType* const out_idxs) {
   // we assign one warp per row
   assert(blockDim.x == BLOCK_SIZE);
   int64_t out_row = blockIdx.x * TILE_SIZE;
-  const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
+  const int64_t last_row =
+      min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
   curandStatePhilox4_32_10_t rng;
   curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
@@ -177,41 +169,36 @@ __global__ void _CSRRowWiseSampleUniformKernel(
 }

 /**
  * @brief Perform row-wise uniform sampling on a CSR matrix,
  * and generate a COO matrix, with replacement.
  *
  * @tparam IdType The ID type used for matrices.
  * @tparam TILE_SIZE The number of rows covered by each threadblock.
  * @param rand_seed The random seed to use.
  * @param num_picks The number of non-zeros to pick per row.
  * @param num_rows The number of rows to pick.
  * @param in_rows The set of rows to pick.
  * @param in_ptr The indptr array of the input CSR.
  * @param in_index The indices array of the input CSR.
  * @param data The data array of the input CSR.
  * @param out_ptr The offset to write each row to in the output COO.
  * @param out_rows The rows of the output COO (output).
  * @param out_cols The columns of the output COO (output).
  * @param out_idxs The data array of the output COO (output).
  */
-template<typename IdType, int TILE_SIZE>
+template <typename IdType, int TILE_SIZE>
 __global__ void _CSRRowWiseSampleUniformReplaceKernel(
-    const uint64_t rand_seed,
-    const int64_t num_picks,
-    const int64_t num_rows,
-    const IdType * const in_rows,
-    const IdType * const in_ptr,
-    const IdType * const in_index,
-    const IdType * const data,
-    const IdType * const out_ptr,
-    IdType * const out_rows,
-    IdType * const out_cols,
-    IdType * const out_idxs) {
+    const uint64_t rand_seed, const int64_t num_picks, const int64_t num_rows,
+    const IdType* const in_rows, const IdType* const in_ptr,
+    const IdType* const in_index, const IdType* const data,
+    const IdType* const out_ptr, IdType* const out_rows, IdType* const out_cols,
+    IdType* const out_idxs) {
   // we assign one warp per row
   assert(blockDim.x == BLOCK_SIZE);
   int64_t out_row = blockIdx.x * TILE_SIZE;
-  const int64_t last_row = min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
+  const int64_t last_row =
+      min(static_cast<int64_t>(blockIdx.x + 1) * TILE_SIZE, num_rows);
   curandStatePhilox4_32_10_t rng;
   curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng);
@@ -229,7 +216,8 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel(
         const int64_t out_idx = out_row_start + idx;
         out_rows[out_idx] = row;
         out_cols[out_idx] = in_index[in_row_start + edge];
-        out_idxs[out_idx] = data ? data[in_row_start + edge] : in_row_start + edge;
+        out_idxs[out_idx] =
+            data ? data[in_row_start + edge] : in_row_start + edge;
       }
     }
     out_row += 1;
@@ -248,11 +236,14 @@ COOMatrix _CSRRowWiseSamplingUniform(
   cudaStream_t stream = runtime::getCurrentCUDAStream();
   const int64_t num_rows = rows->shape[0];
-  const IdType * const slice_rows = static_cast<const IdType*>(rows->data);
+  const IdType* const slice_rows = static_cast<const IdType*>(rows->data);
-  IdArray picked_row = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
-  IdArray picked_col = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
-  IdArray picked_idx = NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
+  IdArray picked_row =
+      NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
+  IdArray picked_col =
+      NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
+  IdArray picked_idx =
+      NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8);
   IdType* const out_rows = static_cast<IdType*>(picked_row->data);
   IdType* const out_cols = static_cast<IdType*>(picked_col->data);
   IdType* const out_idxs = static_cast<IdType*>(picked_idx->data);
@@ -261,65 +252,52 @@ COOMatrix _CSRRowWiseSamplingUniform(
   const IdType* in_cols = mat.indices.Ptr<IdType>();
   const IdType* data = CSRHasData(mat) ? mat.data.Ptr<IdType>() : nullptr;
   if (mat.is_pinned) {
-    CUDA_CALL(cudaHostGetDevicePointer(
-        &in_ptr, mat.indptr.Ptr<IdType>(), 0));
-    CUDA_CALL(cudaHostGetDevicePointer(
-        &in_cols, mat.indices.Ptr<IdType>(), 0));
+    CUDA_CALL(cudaHostGetDevicePointer(&in_ptr, mat.indptr.Ptr<IdType>(), 0));
+    CUDA_CALL(cudaHostGetDevicePointer(&in_cols, mat.indices.Ptr<IdType>(), 0));
     if (CSRHasData(mat)) {
-      CUDA_CALL(cudaHostGetDevicePointer(
-          &data, mat.data.Ptr<IdType>(), 0));
+      CUDA_CALL(cudaHostGetDevicePointer(&data, mat.data.Ptr<IdType>(), 0));
     }
   }

   // compute degree
-  IdType * out_deg = static_cast<IdType*>(
+  IdType* out_deg = static_cast<IdType*>(
       device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
   if (replace) {
     const dim3 block(512);
     const dim3 grid((num_rows + block.x - 1) / block.x);
     CUDA_KERNEL_CALL(
-        _CSRRowWiseSampleDegreeReplaceKernel,
-        grid, block, 0, stream,
-        num_picks, num_rows, slice_rows, in_ptr, out_deg);
+        _CSRRowWiseSampleDegreeReplaceKernel, grid, block, 0, stream, num_picks,
+        num_rows, slice_rows, in_ptr, out_deg);
   } else {
     const dim3 block(512);
     const dim3 grid((num_rows + block.x - 1) / block.x);
     CUDA_KERNEL_CALL(
-        _CSRRowWiseSampleDegreeKernel,
-        grid, block, 0, stream,
-        num_picks, num_rows, slice_rows, in_ptr, out_deg);
+        _CSRRowWiseSampleDegreeKernel, grid, block, 0, stream, num_picks,
+        num_rows, slice_rows, in_ptr, out_deg);
   }

   // fill out_ptr
-  IdType * out_ptr = static_cast<IdType*>(
+  IdType* out_ptr = static_cast<IdType*>(
       device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType)));
   size_t prefix_temp_size = 0;
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(nullptr, prefix_temp_size,
-      out_deg,
-      out_ptr,
-      num_rows+1,
-      stream));
-  void * prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
-  CUDA_CALL(cub::DeviceScan::ExclusiveSum(prefix_temp, prefix_temp_size,
-      out_deg,
-      out_ptr,
-      num_rows+1,
-      stream));
+  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+      nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
+  void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size);
+  CUDA_CALL(cub::DeviceScan::ExclusiveSum(
+      prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream));
   device->FreeWorkspace(ctx, prefix_temp);
   device->FreeWorkspace(ctx, out_deg);

   cudaEvent_t copyEvent;
   CUDA_CALL(cudaEventCreate(&copyEvent));
-  // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and wait on
-  // a cudaevent
+  // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and
+  // wait on a cudaevent
   IdType new_len;
   // copy using the internal current stream
-  device->CopyDataFromTo(out_ptr, num_rows * sizeof(new_len), &new_len, 0,
-      sizeof(new_len),
-      ctx,
-      DGLContext{kDGLCPU, 0},
-      mat.indptr->dtype);
+  device->CopyDataFromTo(
+      out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx,
+      DGLContext{kDGLCPU, 0}, mat.indptr->dtype);
   CUDA_CALL(cudaEventRecord(copyEvent, stream));

   const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000);
@@ -331,36 +309,16 @@ COOMatrix _CSRRowWiseSamplingUniform(
     const dim3 block(BLOCK_SIZE);
     const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
     CUDA_KERNEL_CALL(
-        (_CSRRowWiseSampleUniformReplaceKernel<IdType, TILE_SIZE>),
-        grid, block, 0, stream,
-        random_seed,
-        num_picks,
-        num_rows,
-        slice_rows,
-        in_ptr,
-        in_cols,
-        data,
-        out_ptr,
-        out_rows,
-        out_cols,
-        out_idxs);
+        (_CSRRowWiseSampleUniformReplaceKernel<IdType, TILE_SIZE>), grid, block,
+        0, stream, random_seed, num_picks, num_rows, slice_rows, in_ptr,
+        in_cols, data, out_ptr, out_rows, out_cols, out_idxs);
   } else {  // without replacement
     const dim3 block(BLOCK_SIZE);
     const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE);
     CUDA_KERNEL_CALL(
-        (_CSRRowWiseSampleUniformKernel<IdType, TILE_SIZE>),
-        grid, block, 0, stream,
-        random_seed,
-        num_picks,
-        num_rows,
-        slice_rows,
-        in_ptr,
-        in_cols,
-        data,
-        out_ptr,
-        out_rows,
-        out_cols,
-        out_idxs);
+        (_CSRRowWiseSampleUniformKernel<IdType, TILE_SIZE>), grid, block, 0,
+        stream, random_seed, num_picks, num_rows, slice_rows, in_ptr, in_cols,
+        data, out_ptr, out_rows, out_cols, out_idxs);
   }
   device->FreeWorkspace(ctx, out_ptr);
@@ -372,8 +330,8 @@ COOMatrix _CSRRowWiseSamplingUniform(
   picked_col = picked_col.CreateView({new_len}, picked_col->dtype);
   picked_idx = picked_idx.CreateView({new_len}, picked_idx->dtype);
-  return COOMatrix(mat.num_rows, mat.num_cols, picked_row,
-      picked_col, picked_idx);
+  return COOMatrix(
+      mat.num_rows, mat.num_cols, picked_row, picked_col, picked_idx);
 }

 template <DGLDeviceType XPU, typename IdType>
@@ -383,9 +341,11 @@ COOMatrix CSRRowWiseSamplingUniform(
     // Basically this is UnitGraph::InEdges().
     COOMatrix coo = CSRToCOO(CSRSliceRows(mat, rows), false);
     IdArray sliced_rows = IndexSelect(rows, coo.row);
-    return COOMatrix(mat.num_rows, mat.num_cols, sliced_rows, coo.col, coo.data);
+    return COOMatrix(
+        mat.num_rows, mat.num_cols, sliced_rows, coo.col, coo.data);
   } else {
-    return _CSRRowWiseSamplingUniform<XPU, IdType>(mat, rows, num_picks, replace);
+    return _CSRRowWiseSamplingUniform<XPU, IdType>(
+        mat, rows, num_picks, replace);
   }
 }

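The sampling path above first computes each picked row's degree capped at num_picks and then turns the degree array into output offsets with cub::DeviceScan::ExclusiveSum; out_ptr[i] is where row i's picks start and the final entry is the total number of picks. A small CPU sketch of that offsets step (hypothetical degrees, not the device code):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t num_picks = 2;
  // Hypothetical in-degrees of the rows being sampled.
  std::vector<int64_t> row_degree = {5, 1, 3, 0};
  // One extra slot so the prefix sum yields the total count, as in the kernel.
  std::vector<int64_t> out_deg(row_degree.size() + 1, 0);
  for (size_t i = 0; i < row_degree.size(); ++i)
    out_deg[i] = std::min(num_picks, row_degree[i]);
  // Exclusive prefix sum: out_ptr[i] = sum of out_deg[0..i-1].
  std::vector<int64_t> out_ptr(out_deg.size(), 0);
  for (size_t i = 1; i < out_ptr.size(); ++i)
    out_ptr[i] = out_ptr[i - 1] + out_deg[i - 1];
  // out_ptr = {0, 2, 3, 5, 5}; the last entry is the total sample count.
  for (int64_t v : out_ptr) std::printf("%lld ", static_cast<long long>(v));
  std::printf("\n");
}
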
@@ -8,9 +8,10 @@
 #include <string>
 #include <vector>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
 #include "./atomic.cuh"
+#include "./utils.h"

 namespace dgl {
@@ -24,11 +25,9 @@ namespace cuda {
  * \note each blockthread is responsible for aggregation on a row
  * in the result tensor.
  */
-template <typename IdType, typename DType,
-          typename ReduceOp>
+template <typename IdType, typename DType, typename ReduceOp>
 __global__ void SegmentReduceKernel(
-    const DType* feat, const IdType* offsets,
-    DType* out, IdType* arg,
+    const DType* feat, const IdType* offsets, DType* out, IdType* arg,
     int64_t n, int64_t dim) {
   for (int row = blockIdx.x; row < n; row += gridDim.x) {
     int col = blockIdx.y * blockDim.x + threadIdx.x;
@@ -39,8 +38,7 @@ __global__ void SegmentReduceKernel(
         ReduceOp::Call(&local_accum, &local_arg, feat[i * dim + col], i);
       }
       out[row * dim + col] = local_accum;
-      if (ReduceOp::require_arg)
-        arg[row * dim + col] = local_arg;
+      if (ReduceOp::require_arg) arg[row * dim + col] = local_arg;
       col += gridDim.y * blockDim.x;
     }
   }
@@ -53,8 +51,7 @@ __global__ void SegmentReduceKernel(
  */
 template <typename IdType, typename DType>
 __global__ void ScatterAddKernel(
-    const DType *feat, const IdType *idx, DType *out,
-    int64_t n, int64_t dim) {
+    const DType* feat, const IdType* idx, DType* out, int64_t n, int64_t dim) {
   for (int row = blockIdx.x; row < n; row += gridDim.x) {
     const int write_row = idx[row];
     int col = blockIdx.y * blockDim.x + threadIdx.x;
@@ -73,7 +70,7 @@ __global__ void ScatterAddKernel(
 template <typename IdType, typename DType>
 __global__ void UpdateGradMinMaxHeteroKernel(
-    const DType *feat, const IdType *idx, const IdType *idx_type, DType *out,
+    const DType* feat, const IdType* idx, const IdType* idx_type, DType* out,
     int64_t n, int64_t dim, int type) {
   unsigned int tId = threadIdx.x;
   unsigned int laneId = tId & 31;
@@ -100,8 +97,7 @@ __global__ void UpdateGradMinMaxHeteroKernel(
  */
 template <typename IdType, typename DType>
 __global__ void BackwardSegmentCmpKernel(
-    const DType *feat, const IdType *arg, DType *out,
-    int64_t n, int64_t dim) {
+    const DType* feat, const IdType* arg, DType* out, int64_t n, int64_t dim) {
   for (int row = blockIdx.x; row < n; row += gridDim.x) {
     int col = blockIdx.y * blockDim.x + threadIdx.x;
     while (col < dim) {
@@ -122,11 +118,7 @@ __global__ void BackwardSegmentCmpKernel(
  * \param arg An auxiliary tensor storing ArgMax/Min information,
  */
 template <typename IdType, typename DType, typename ReduceOp>
-void SegmentReduce(
-    NDArray feat,
-    NDArray offsets,
-    NDArray out,
-    NDArray arg) {
+void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) {
   const DType* feat_data = feat.Ptr<DType>();
   const IdType* offsets_data = offsets.Ptr<IdType>();
   DType* out_data = out.Ptr<DType>();
@@ -135,8 +127,7 @@ void SegmentReduce(
   cudaStream_t stream = runtime::getCurrentCUDAStream();
   int64_t n = out->shape[0];
   int64_t dim = 1;
-  for (int i = 1; i < out->ndim; ++i)
-    dim *= out->shape[i];
+  for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];

   const int nbx = FindNumBlocks<'x'>(n);
   const int ntx = FindNumThreads(dim);
@@ -145,10 +136,9 @@ void SegmentReduce(
   const dim3 nblks(nbx, nby);
   const dim3 nthrs(ntx, nty);
   // TODO(zihao): try cub's DeviceSegmentedReduce and compare the performance.
-  CUDA_KERNEL_CALL((SegmentReduceKernel<IdType, DType, ReduceOp>),
-      nblks, nthrs, 0, stream,
-      feat_data, offsets_data, out_data, arg_data,
-      n, dim);
+  CUDA_KERNEL_CALL(
+      (SegmentReduceKernel<IdType, DType, ReduceOp>), nblks, nthrs, 0, stream,
+      feat_data, offsets_data, out_data, arg_data, n, dim);
 }

 /*!
@@ -159,19 +149,15 @@ void SegmentReduce(
  * \param out The output tensor.
  */
 template <typename IdType, typename DType>
-void ScatterAdd(
-    NDArray feat,
-    NDArray idx,
-    NDArray out) {
+void ScatterAdd(NDArray feat, NDArray idx, NDArray out) {
   const DType* feat_data = feat.Ptr<DType>();
   const IdType* idx_data = idx.Ptr<IdType>();
-  DType *out_data = out.Ptr<DType>();
+  DType* out_data = out.Ptr<DType>();

   cudaStream_t stream = runtime::getCurrentCUDAStream();
   int64_t n = feat->shape[0];
   int64_t dim = 1;
-  for (int i = 1; i < out->ndim; ++i)
-    dim *= out->shape[i];
+  for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];

   const int nbx = FindNumBlocks<'x'>(n);
   const int ntx = FindNumThreads(dim);
@@ -179,10 +165,9 @@ void ScatterAdd(
   const int nty = 1;
   const dim3 nblks(nbx, nby);
   const dim3 nthrs(ntx, nty);
-  CUDA_KERNEL_CALL((ScatterAddKernel<IdType, DType>),
-      nblks, nthrs, 0, stream,
-      feat_data, idx_data, out_data,
-      n, dim);
+  CUDA_KERNEL_CALL(
+      (ScatterAddKernel<IdType, DType>), nblks, nthrs, 0, stream, feat_data,
+      idx_data, out_data, n, dim);
 }

 /*!
@@ -195,24 +180,26 @@ void ScatterAdd(
  * \param list_out List of the output tensors.
  */
 template <typename IdType, typename DType>
-void UpdateGradMinMax_hetero(const HeteroGraphPtr& graph,
-    const std::string& op,
-    const std::vector<NDArray>& list_feat,
-    const std::vector<NDArray>& list_idx,
+void UpdateGradMinMax_hetero(
+    const HeteroGraphPtr& graph, const std::string& op,
+    const std::vector<NDArray>& list_feat, const std::vector<NDArray>& list_idx,
     const std::vector<NDArray>& list_idx_types,
     std::vector<NDArray>* list_out) {
   cudaStream_t stream = runtime::getCurrentCUDAStream();
   if (op == "copy_lhs" || op == "copy_rhs") {
-    std::vector<std::vector<dgl_id_t>> src_dst_ntypes(graph->NumVertexTypes(),
-        std::vector<dgl_id_t>());
+    std::vector<std::vector<dgl_id_t>> src_dst_ntypes(
+        graph->NumVertexTypes(), std::vector<dgl_id_t>());
     for (dgl_type_t etype = 0; etype < graph->NumEdgeTypes(); ++etype) {
       auto pair = graph->meta_graph()->FindEdge(etype);
       const dgl_id_t dst_ntype = pair.first;  // graph is reversed
       const dgl_id_t src_ntype = pair.second;
-      auto same_src_dst_ntype = std::find(std::begin(src_dst_ntypes[dst_ntype]),
+      auto same_src_dst_ntype = std::find(
+          std::begin(src_dst_ntypes[dst_ntype]),
           std::end(src_dst_ntypes[dst_ntype]), src_ntype);
-      // if op is "copy_lhs", relation type with same src and dst node type will be updated once
-      if (op == "copy_lhs" && same_src_dst_ntype != std::end(src_dst_ntypes[dst_ntype]))
+      // if op is "copy_lhs", relation type with same src and dst node type will
+      // be updated once
+      if (op == "copy_lhs" &&
+          same_src_dst_ntype != std::end(src_dst_ntypes[dst_ntype]))
         continue;
       src_dst_ntypes[dst_ntype].push_back(src_ntype);
       const DType* feat_data = list_feat[dst_ntype].Ptr<DType>();
@@ -229,35 +216,31 @@ void UpdateGradMinMax_hetero(const HeteroGraphPtr& graph,
       const int nbx = FindNumBlocks<'x'>((n * th_per_row + ntx - 1) / ntx);
       const dim3 nblks(nbx);
       const dim3 nthrs(ntx);
-      CUDA_KERNEL_CALL((UpdateGradMinMaxHeteroKernel<IdType, DType>),
-          nblks, nthrs, 0, stream,
-          feat_data, idx_data, idx_type_data,
-          out_data, n, dim, type);
+      CUDA_KERNEL_CALL(
+          (UpdateGradMinMaxHeteroKernel<IdType, DType>), nblks, nthrs, 0,
+          stream, feat_data, idx_data, idx_type_data, out_data, n, dim, type);
     }
   }
 }

 /*!
- * \brief CUDA implementation of backward phase of Segment Reduce with Min/Max reducer.
- * \note math equation: out[arg[i, k], k] = feat[i, k]
- * \param feat The input tensor.
+ * \brief CUDA implementation of backward phase of Segment Reduce with Min/Max
+ * reducer.
+ * \note math equation: out[arg[i, k], k] = feat[i, k] \param feat The input
+ * tensor.
  * \param arg The ArgMin/Max information, used for indexing.
  * \param out The output tensor.
  */
 template <typename IdType, typename DType>
-void BackwardSegmentCmp(
-    NDArray feat,
-    NDArray arg,
-    NDArray out) {
+void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) {
   const DType* feat_data = feat.Ptr<DType>();
   const IdType* arg_data = arg.Ptr<IdType>();
-  DType *out_data = out.Ptr<DType>();
+  DType* out_data = out.Ptr<DType>();

   cudaStream_t stream = runtime::getCurrentCUDAStream();
   int64_t n = feat->shape[0];
   int64_t dim = 1;
-  for (int i = 1; i < out->ndim; ++i)
-    dim *= out->shape[i];
+  for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i];

   const int nbx = FindNumBlocks<'x'>(n);
   const int ntx = FindNumThreads(dim);
@@ -265,10 +248,9 @@ void BackwardSegmentCmp(
   const int nty = 1;
   const dim3 nblks(nbx, nby);
   const dim3 nthrs(ntx, nty);
-  CUDA_KERNEL_CALL((BackwardSegmentCmpKernel<IdType, DType>),
-      nblks, nthrs, 0, stream,
-      feat_data, arg_data, out_data,
-      n, dim);
+  CUDA_KERNEL_CALL(
+      (BackwardSegmentCmpKernel<IdType, DType>), nblks, nthrs, 0, stream,
+      feat_data, arg_data, out_data, n, dim);
 }

 }  // namespace cuda

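SegmentReduceKernel in the hunks above reduces feat[j, k] over offsets[row] <= j < offsets[row + 1] for every output row and feature column. A CPU sketch of a segment sum under the same row-major layout (hypothetical data; the templated reducer and the argmin/argmax bookkeeping are omitted):

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t dim = 2;
  // Three segments over five input rows: [0,2), [2,2) (empty), and [2,5).
  std::vector<int64_t> offsets = {0, 2, 2, 5};
  std::vector<float> feat = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};  // 5 x 2, row-major
  std::vector<float> out((offsets.size() - 1) * dim, 0.f);
  for (size_t row = 0; row + 1 < offsets.size(); ++row)
    for (int64_t col = 0; col < dim; ++col)
      for (int64_t i = offsets[row]; i < offsets[row + 1]; ++i)
        out[row * dim + col] += feat[i * dim + col];
  // out = {4, 6, 0, 0, 21, 24}
  for (float v : out) std::printf("%g ", v);
  std::printf("\n");
}
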
@@ -4,12 +4,14 @@
  * \brief COO operator GPU implementation
  */
 #include <dgl/array.h>

-#include <vector>
-#include <unordered_set>
 #include <numeric>
+#include <unordered_set>
+#include <vector>

 #include "../../runtime/cuda/cuda_common.h"
-#include "./utils.h"
 #include "./atomic.cuh"
+#include "./utils.h"

 namespace dgl {
@@ -19,9 +21,8 @@ using namespace cuda;
 namespace aten {
 namespace impl {

 template <typename IdType>
-__device__ void _warpReduce(volatile IdType *sdata, IdType tid) {
+__device__ void _warpReduce(volatile IdType* sdata, IdType tid) {
   sdata[tid] += sdata[tid + 32];
   sdata[tid] += sdata[tid + 16];
   sdata[tid] += sdata[tid + 8];
@@ -32,10 +33,8 @@ __device__ void _warpReduce(volatile IdType *sdata, IdType tid) {
 template <typename IdType>
 __global__ void _COOGetRowNNZKernel(
-    const IdType* __restrict__ row_indices,
-    IdType* __restrict__ glb_cnt,
-    const int64_t row_query,
-    IdType nnz) {
+    const IdType* __restrict__ row_indices, IdType* __restrict__ glb_cnt,
+    const int64_t row_query, IdType nnz) {
   __shared__ IdType local_cnt[1024];
   IdType tx = threadIdx.x;
   IdType bx = blockIdx.x;
@@ -80,10 +79,9 @@ int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) {
   IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt);
   NDArray rst = NDArray::Empty({1}, coo.row->dtype, coo.row->ctx);
   _Fill(rst.Ptr<IdType>(), 1, IdType(0));
-  CUDA_KERNEL_CALL(_COOGetRowNNZKernel,
-      nb, nt, 0, stream,
-      coo.row.Ptr<IdType>(), rst.Ptr<IdType>(),
-      row, nnz);
+  CUDA_KERNEL_CALL(
+      _COOGetRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
+      rst.Ptr<IdType>(), row, nnz);
   rst = rst.CopyTo(DGLContext{kDGLCPU, 0});
   return *rst.Ptr<IdType>();
 }
@@ -93,8 +91,7 @@ template int64_t COOGetRowNNZ<kDGLCUDA, int64_t>(COOMatrix, int64_t);
 template <typename IdType>
 __global__ void _COOGetAllRowNNZKernel(
-    const IdType* __restrict__ row_indices,
-    IdType* __restrict__ glb_cnts,
+    const IdType* __restrict__ row_indices, IdType* __restrict__ glb_cnts,
     IdType nnz) {
   IdType eid = blockIdx.x * blockDim.x + threadIdx.x;
   while (eid < nnz) {
@@ -118,20 +115,18 @@ NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) {
     IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt);
     NDArray rst = NDArray::Empty({1}, coo.row->dtype, coo.row->ctx);
     _Fill(rst.Ptr<IdType>(), 1, IdType(0));
-    CUDA_KERNEL_CALL(_COOGetRowNNZKernel,
-        nb, nt, 0, stream,
-        coo.row.Ptr<IdType>(), rst.Ptr<IdType>(),
-        row, nnz);
+    CUDA_KERNEL_CALL(
+        _COOGetRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
+        rst.Ptr<IdType>(), row, nnz);
     return rst;
   } else {
     IdType nt = 1024;
     IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt);
     NDArray in_degrees = NDArray::Empty({num_rows}, rows->dtype, rows->ctx);
     _Fill(in_degrees.Ptr<IdType>(), num_rows, IdType(0));
-    CUDA_KERNEL_CALL(_COOGetAllRowNNZKernel,
-        nb, nt, 0, stream,
-        coo.row.Ptr<IdType>(), in_degrees.Ptr<IdType>(),
-        nnz);
+    CUDA_KERNEL_CALL(
+        _COOGetAllRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr<IdType>(),
+        in_degrees.Ptr<IdType>(), nnz);
     return IndexSelect(in_degrees, rows);
   }
 }

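COOGetRowNNZ above counts how many entries of coo.row equal the queried row, using a shared-memory block reduction followed by an atomic add into a single global counter. Functionally it matches a plain count over the row array, as in this host-side illustration (hypothetical data):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int64_t> coo_row = {0, 2, 2, 1, 2, 0};
  const int64_t row_query = 2;
  // Number of nonzeros in the queried row of the COO matrix.
  const int64_t nnz = std::count(coo_row.begin(), coo_row.end(), row_query);
  std::printf("%lld\n", static_cast<long long>(nnz));  // prints 3
}
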
...@@ -4,17 +4,18 @@ ...@@ -4,17 +4,18 @@
* \brief Array index select GPU implementation * \brief Array index select GPU implementation
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include "../../../runtime/cuda/cuda_common.h" #include "../../../runtime/cuda/cuda_common.h"
#include "../array_index_select.cuh" #include "../array_index_select.cuh"
#include "./array_index_select_uvm.cuh"
#include "../utils.h" #include "../utils.h"
#include "./array_index_select_uvm.cuh"
namespace dgl { namespace dgl {
using runtime::NDArray; using runtime::NDArray;
namespace aten { namespace aten {
namespace impl { namespace impl {
template<typename DType, typename IdType> template <typename DType, typename IdType>
NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) { NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); cudaStream_t stream = runtime::getCurrentCUDAStream();
const IdType* idx_data = static_cast<IdType*>(index->data); const IdType* idx_data = static_cast<IdType*>(index->data);
...@@ -34,30 +35,30 @@ NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) { ...@@ -34,30 +35,30 @@ NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
} }
NDArray ret = NDArray::Empty(shape, array->dtype, index->ctx); NDArray ret = NDArray::Empty(shape, array->dtype, index->ctx);
if (len == 0) if (len == 0) return ret;
return ret;
DType* ret_data = static_cast<DType*>(ret->data); DType* ret_data = static_cast<DType*>(ret->data);
if (num_feat == 1) { if (num_feat == 1) {
const int nt = cuda::FindNumThreads(len); const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt; const int nb = (len + nt - 1) / nt;
CUDA_KERNEL_CALL(IndexSelectSingleKernel, nb, nt, 0, CUDA_KERNEL_CALL(
stream, array_data, idx_data, len, arr_len, ret_data); IndexSelectSingleKernel, nb, nt, 0, stream, array_data, idx_data, len,
arr_len, ret_data);
} else { } else {
dim3 block(256, 1); dim3 block(256, 1);
while (static_cast<int64_t>(block.x) >= 2*num_feat) { while (static_cast<int64_t>(block.x) >= 2 * num_feat) {
block.x /= 2; block.x /= 2;
block.y *= 2; block.y *= 2;
} }
const dim3 grid((len+block.y-1)/block.y); const dim3 grid((len + block.y - 1) / block.y);
if (num_feat * sizeof(DType) < 2 * CACHE_LINE_SIZE) { if (num_feat * sizeof(DType) < 2 * CACHE_LINE_SIZE) {
CUDA_KERNEL_CALL(IndexSelectMultiKernel, grid, block, 0, CUDA_KERNEL_CALL(
stream, array_data, num_feat, idx_data, IndexSelectMultiKernel, grid, block, 0, stream, array_data, num_feat,
len, arr_len, ret_data); idx_data, len, arr_len, ret_data);
} else { } else {
CUDA_KERNEL_CALL(IndexSelectMultiKernelAligned, grid, block, 0, CUDA_KERNEL_CALL(
stream, array_data, num_feat, idx_data, IndexSelectMultiKernelAligned, grid, block, 0, stream, array_data,
len, arr_len, ret_data); num_feat, idx_data, len, arr_len, ret_data);
} }
} }
return ret; return ret;
...@@ -73,8 +74,7 @@ template NDArray IndexSelectCPUFromGPU<int32_t, int64_t>(NDArray, IdArray); ...@@ -73,8 +74,7 @@ template NDArray IndexSelectCPUFromGPU<int32_t, int64_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<int64_t, int32_t>(NDArray, IdArray); template NDArray IndexSelectCPUFromGPU<int64_t, int32_t>(NDArray, IdArray);
template NDArray IndexSelectCPUFromGPU<int64_t, int64_t>(NDArray, IdArray); template NDArray IndexSelectCPUFromGPU<int64_t, int64_t>(NDArray, IdArray);
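The 2-D launch geometry used here (and again in IndexScatterGPUToCPU below) keeps a block at 256 threads while shrinking block.x toward num_feat and growing block.y, so the x lanes stride over one row's features and each y lane handles a different output row. A standalone sketch of that loop (the helper name is hypothetical, not DGL code):

#include <cstdint>
#include <cstdio>
#include <utility>

// Mirrors the block-shape loop above: halve x and double y while 2 * num_feat
// still fits in x, so x ends up close to (but not below) num_feat.
std::pair<int, int> PickBlockShape(int64_t num_feat) {
  int x = 256, y = 1;
  while (static_cast<int64_t>(x) >= 2 * num_feat) {
    x /= 2;
    y *= 2;
  }
  return {x, y};
}

int main() {
  auto b = PickBlockShape(16);   // -> (16, 16): 16 feature lanes, 16 rows per block
  std::printf("block = (%d, %d)\n", b.first, b.second);
  b = PickBlockShape(300);       // -> (256, 1): wide rows keep the full x dimension
  std::printf("block = (%d, %d)\n", b.first, b.second);
  return 0;
}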
template <typename DType, typename IdType>
template<typename DType, typename IdType>
void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) { void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
cudaStream_t stream = runtime::getCurrentCUDAStream(); cudaStream_t stream = runtime::getCurrentCUDAStream();
const DType* source_data = static_cast<DType*>(source->data); const DType* source_data = static_cast<DType*>(source->data);
...@@ -94,24 +94,24 @@ void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) { ...@@ -94,24 +94,24 @@ void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
num_feat *= source->shape[d]; num_feat *= source->shape[d];
} }
if (len == 0) if (len == 0) return;
return;
if (num_feat == 1) { if (num_feat == 1) {
const int nt = cuda::FindNumThreads(len); const int nt = cuda::FindNumThreads(len);
const int nb = (len + nt - 1) / nt; const int nb = (len + nt - 1) / nt;
CUDA_KERNEL_CALL(IndexScatterSingleKernel, nb, nt, 0, CUDA_KERNEL_CALL(
stream, source_data, idx_data, len, arr_len, dest_data); IndexScatterSingleKernel, nb, nt, 0, stream, source_data, idx_data, len,
arr_len, dest_data);
} else { } else {
dim3 block(256, 1); dim3 block(256, 1);
while (static_cast<int64_t>(block.x) >= 2*num_feat) { while (static_cast<int64_t>(block.x) >= 2 * num_feat) {
block.x /= 2; block.x /= 2;
block.y *= 2; block.y *= 2;
} }
const dim3 grid((len+block.y-1)/block.y); const dim3 grid((len + block.y - 1) / block.y);
CUDA_KERNEL_CALL(IndexScatterMultiKernel, grid, block, 0, CUDA_KERNEL_CALL(
stream, source_data, num_feat, idx_data, IndexScatterMultiKernel, grid, block, 0, stream, source_data, num_feat,
len, arr_len, dest_data); idx_data, len, arr_len, dest_data);
} }
} }
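IndexScatterGPUToCPU is the mirror of the gather above: row i of the GPU-resident source is written to row index[i] of the pinned dest. A host-side reference of that semantics (sketch only, not DGL code):

#include <cassert>
#include <cstdint>
#include <vector>

// dest holds arr_len rows of num_feat values; source holds index.size() rows.
void IndexScatterReference(
    std::vector<float>* dest, const std::vector<int64_t>& index,
    const std::vector<float>& source, int64_t num_feat) {
  const int64_t arr_len = static_cast<int64_t>(dest->size()) / num_feat;
  for (std::size_t i = 0; i < index.size(); ++i) {
    assert(index[i] >= 0 && index[i] < arr_len);
    for (int64_t f = 0; f < num_feat; ++f)
      (*dest)[index[i] * num_feat + f] = source[i * num_feat + f];
  }
}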
......
...@@ -14,31 +14,28 @@ namespace aten { ...@@ -14,31 +14,28 @@ namespace aten {
namespace impl { namespace impl {
/* This is a cross-device access version of IndexSelectMultiKernel. /* This is a cross-device access version of IndexSelectMultiKernel.
* Since the memory access over PCIe is more sensitive to the * Since the memory access over PCIe is more sensitive to the
* data access alignment (cache line), we need a separate version here. * data access alignment (cache line), we need a separate version here.
*/ */
template <typename DType, typename IdType> template <typename DType, typename IdType>
__global__ void IndexSelectMultiKernelAligned( __global__ void IndexSelectMultiKernelAligned(
const DType* const array, const DType* const array, const int64_t num_feat, const IdType* const index,
const int64_t num_feat, const int64_t length, const int64_t arr_len, DType* const out) {
const IdType* const index, int64_t out_row = blockIdx.x * blockDim.y + threadIdx.y;
const int64_t length,
const int64_t arr_len,
DType* const out) {
int64_t out_row = blockIdx.x*blockDim.y+threadIdx.y;
const int64_t stride = blockDim.y*gridDim.x; const int64_t stride = blockDim.y * gridDim.x;
while (out_row < length) { while (out_row < length) {
int64_t col = threadIdx.x; int64_t col = threadIdx.x;
const int64_t in_row = index[out_row]; const int64_t in_row = index[out_row];
assert(in_row >= 0 && in_row < arr_len); assert(in_row >= 0 && in_row < arr_len);
const int64_t idx_offset = const int64_t idx_offset =
((uint64_t)(&array[in_row*num_feat]) % CACHE_LINE_SIZE) / sizeof(DType); ((uint64_t)(&array[in_row * num_feat]) % CACHE_LINE_SIZE) /
sizeof(DType);
col = col - idx_offset; col = col - idx_offset;
while (col < num_feat) { while (col < num_feat) {
if (col >= 0) if (col >= 0)
out[out_row*num_feat+col] = array[in_row*num_feat+col]; out[out_row * num_feat + col] = array[in_row * num_feat + col];
col += blockDim.x; col += blockDim.x;
} }
out_row += stride; out_row += stride;
......
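The idx_offset computation is what makes this kernel "aligned": every thread's starting column is shifted back by the row start's offset within its cache line, so each strided pass of the warp begins on a cache-line boundary of array, and the col >= 0 guard drops the positions the shift pushes in front of the row. The shift itself, extracted as a standalone sketch (the CACHE_LINE_SIZE value here is an assumption, not taken from DGL):

#include <cstdint>

constexpr std::uint64_t kCacheLineSize = 128;  // assumed stand-in for CACHE_LINE_SIZE

// Number of DType elements between the enclosing cache-line boundary and the
// start of row `in_row`; the kernel subtracts this from each thread's column.
template <typename DType>
std::int64_t AlignedStartShift(
    const DType* array, std::int64_t in_row, std::int64_t num_feat) {
  const auto addr = reinterpret_cast<std::uint64_t>(&array[in_row * num_feat]);
  return static_cast<std::int64_t>((addr % kCacheLineSize) / sizeof(DType));
}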
...@@ -6,49 +6,49 @@ ...@@ -6,49 +6,49 @@
#include "./filter.h" #include "./filter.h"
#include <dgl/runtime/registry.h>
#include <dgl/runtime/packed_func.h>
#include <dgl/packed_func_ext.h> #include <dgl/packed_func_ext.h>
#include <dgl/runtime/packed_func.h>
#include <dgl/runtime/registry.h>
namespace dgl { namespace dgl {
namespace array { namespace array {
using namespace dgl::runtime; using namespace dgl::runtime;
template<DGLDeviceType XPU, typename IdType> template <DGLDeviceType XPU, typename IdType>
FilterRef CreateSetFilter(IdArray set); FilterRef CreateSetFilter(IdArray set);
DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet") DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet")
.set_body([] (DGLArgs args, DGLRetValue* rv) { .set_body([](DGLArgs args, DGLRetValue* rv) {
IdArray array = args[0]; IdArray array = args[0];
auto ctx = array->ctx; auto ctx = array->ctx;
// TODO(nv-dlasalle): Implement CPU version. // TODO(nv-dlasalle): Implement CPU version.
if (ctx.device_type == kDGLCUDA) { if (ctx.device_type == kDGLCUDA) {
#ifdef DGL_USE_CUDA #ifdef DGL_USE_CUDA
ATEN_ID_TYPE_SWITCH(array->dtype, IdType, { ATEN_ID_TYPE_SWITCH(array->dtype, IdType, {
*rv = CreateSetFilter<kDGLCUDA, IdType>(array); *rv = CreateSetFilter<kDGLCUDA, IdType>(array);
}); });
#else #else
LOG(FATAL) << "GPU support not compiled."; LOG(FATAL) << "GPU support not compiled.";
#endif #endif
} else { } else {
LOG(FATAL) << "CPU support not yet implemented."; LOG(FATAL) << "CPU support not yet implemented.";
} }
}); });
DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterFindIncludedIndices") DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterFindIncludedIndices")
.set_body([] (DGLArgs args, DGLRetValue* rv) { .set_body([](DGLArgs args, DGLRetValue* rv) {
FilterRef filter = args[0]; FilterRef filter = args[0];
IdArray array = args[1]; IdArray array = args[1];
*rv = filter->find_included_indices(array); *rv = filter->find_included_indices(array);
}); });
DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterFindExcludedIndices") DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterFindExcludedIndices")
.set_body([] (DGLArgs args, DGLRetValue* rv) { .set_body([](DGLArgs args, DGLRetValue* rv) {
FilterRef filter = args[0]; FilterRef filter = args[0];
IdArray array = args[1]; IdArray array = args[1];
*rv = filter->find_excluded_indices(array); *rv = filter->find_excluded_indices(array);
}); });
} // namespace array } // namespace array
} // namespace dgl } // namespace dgl
...@@ -4,12 +4,11 @@ ...@@ -4,12 +4,11 @@
* \brief Object for selecting items in a set, or selecting items not in a set. * \brief Object for selecting items in a set, or selecting items not in a set.
*/ */
#ifndef DGL_ARRAY_FILTER_H_ #ifndef DGL_ARRAY_FILTER_H_
#define DGL_ARRAY_FILTER_H_ #define DGL_ARRAY_FILTER_H_
#include <dgl/runtime/object.h>
#include <dgl/array.h> #include <dgl/array.h>
#include <dgl/runtime/object.h>
namespace dgl { namespace dgl {
namespace array { namespace array {
...@@ -28,8 +27,7 @@ class Filter : public runtime::Object { ...@@ -28,8 +27,7 @@ class Filter : public runtime::Object {
* @return The indices of the items from `test` that are selected by * @return The indices of the items from `test` that are selected by
* this filter. * this filter.
*/ */
virtual IdArray find_included_indices( virtual IdArray find_included_indices(IdArray test) = 0;
IdArray test) = 0;
/** /**
* @brief From the test set of items, get the indices of those which are * @brief From the test set of items, get the indices of those which are
...@@ -40,8 +38,7 @@ class Filter : public runtime::Object { ...@@ -40,8 +38,7 @@ class Filter : public runtime::Object {
* @return The indices of the items from `test` that are not selected by this * @return The indices of the items from `test` that are not selected by this
* filter. * filter.
*/ */
virtual IdArray find_excluded_indices( virtual IdArray find_excluded_indices(IdArray test) = 0;
IdArray test) = 0;
}; };
DGL_DEFINE_OBJECT_REF(FilterRef, Filter); DGL_DEFINE_OBJECT_REF(FilterRef, Filter);
...@@ -50,4 +47,3 @@ DGL_DEFINE_OBJECT_REF(FilterRef, Filter); ...@@ -50,4 +47,3 @@ DGL_DEFINE_OBJECT_REF(FilterRef, Filter);
} // namespace dgl } // namespace dgl
#endif // DGL_ARRAY_FILTER_H_ #endif // DGL_ARRAY_FILTER_H_
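To make the two pure virtuals concrete: for a set filter built from {2, 5, 7}, find_included_indices on the test array [5, 1, 2, 9] yields [0, 2] and find_excluded_indices yields [1, 3]; the results are indices into test, not the values themselves. A host-side STL sketch of that semantics (illustrative only; the real implementations return IdArray and the CUDA filter runs on the device):

#include <cstdint>
#include <unordered_set>
#include <vector>

std::vector<int64_t> FindIncludedReference(
    const std::unordered_set<int64_t>& set, const std::vector<int64_t>& test) {
  std::vector<int64_t> idx;
  for (std::size_t i = 0; i < test.size(); ++i)
    if (set.count(test[i])) idx.push_back(static_cast<int64_t>(i));
  return idx;
}
// find_excluded_indices is the complement: keep i when set.count(test[i]) == 0.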
...@@ -4,7 +4,9 @@ ...@@ -4,7 +4,9 @@
* \brief DGL array utilities implementation * \brief DGL array utilities implementation
*/ */
#include <dgl/array.h> #include <dgl/array.h>
#include <sstream> #include <sstream>
#include "../c_api_common.h" #include "../c_api_common.h"
#include "./uvm_array_op.h" #include "./uvm_array_op.h"
...@@ -35,10 +37,12 @@ void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) { ...@@ -35,10 +37,12 @@ void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
#ifdef DGL_USE_CUDA #ifdef DGL_USE_CUDA
CHECK(dest.IsPinned()) << "Destination array must be in pinned memory."; CHECK(dest.IsPinned()) << "Destination array must be in pinned memory.";
CHECK_EQ(index->ctx.device_type, kDGLCUDA) << "Index must be on the GPU."; CHECK_EQ(index->ctx.device_type, kDGLCUDA) << "Index must be on the GPU.";
CHECK_EQ(source->ctx.device_type, kDGLCUDA) << "Source array must be on the GPU."; CHECK_EQ(source->ctx.device_type, kDGLCUDA)
<< "Source array must be on the GPU.";
CHECK_EQ(dest->dtype, source->dtype) << "Destination array and source " CHECK_EQ(dest->dtype, source->dtype) << "Destination array and source "
"array must have the same dtype."; "array must have the same dtype.";
CHECK_GE(dest->ndim, 1) << "Destination array must have at least 1 dimension."; CHECK_GE(dest->ndim, 1)
<< "Destination array must have at least 1 dimension.";
CHECK_EQ(index->ndim, 1) << "Index must be a 1D array."; CHECK_EQ(index->ndim, 1) << "Index must be a 1D array.";
ATEN_DTYPE_BITS_ONLY_SWITCH(source->dtype, DType, "values", { ATEN_DTYPE_BITS_ONLY_SWITCH(source->dtype, DType, "values", {
...@@ -52,21 +56,19 @@ void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) { ...@@ -52,21 +56,19 @@ void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) {
} }
DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexSelectCPUFromGPU") DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexSelectCPUFromGPU")
.set_body([] (DGLArgs args, DGLRetValue* rv) { .set_body([](DGLArgs args, DGLRetValue* rv) {
NDArray array = args[0]; NDArray array = args[0];
IdArray index = args[1]; IdArray index = args[1];
*rv = IndexSelectCPUFromGPU(array, index); *rv = IndexSelectCPUFromGPU(array, index);
}); });
DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexScatterGPUToCPU") DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexScatterGPUToCPU")
.set_body([] (DGLArgs args, DGLRetValue* rv) { .set_body([](DGLArgs args, DGLRetValue* rv) {
NDArray dest = args[0]; NDArray dest = args[0];
IdArray index = args[1]; IdArray index = args[1];
NDArray source = args[2]; NDArray source = args[2];
IndexScatterGPUToCPU(dest, index, source); IndexScatterGPUToCPU(dest, index, source);
}); });
} // namespace aten } // namespace aten
} // namespace dgl } // namespace dgl
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#define DGL_ARRAY_UVM_ARRAY_OP_H_ #define DGL_ARRAY_UVM_ARRAY_OP_H_
#include <dgl/array.h> #include <dgl/array.h>
#include <utility> #include <utility>
namespace dgl { namespace dgl {
......
...@@ -3,14 +3,15 @@ ...@@ -3,14 +3,15 @@
* \file c_runtime_api.cc * \file c_runtime_api.cc
* \brief DGL C API common implementations * \brief DGL C API common implementations
*/ */
#include <dgl/graph_interface.h>
#include "c_api_common.h" #include "c_api_common.h"
#include <dgl/graph_interface.h>
using dgl::runtime::DGLArgs; using dgl::runtime::DGLArgs;
using dgl::runtime::DGLArgValue; using dgl::runtime::DGLArgValue;
using dgl::runtime::DGLRetValue; using dgl::runtime::DGLRetValue;
using dgl::runtime::PackedFunc;
using dgl::runtime::NDArray; using dgl::runtime::NDArray;
using dgl::runtime::PackedFunc;
namespace dgl { namespace dgl {
...@@ -27,7 +28,7 @@ PackedFunc ConvertNDArrayVectorToPackedFunc(const std::vector<NDArray>& vec) { ...@@ -27,7 +28,7 @@ PackedFunc ConvertNDArrayVectorToPackedFunc(const std::vector<NDArray>& vec) {
} }
PackedFunc ConvertEdgeArrayToPackedFunc(const EdgeArray& ea) { PackedFunc ConvertEdgeArrayToPackedFunc(const EdgeArray& ea) {
auto body = [ea] (DGLArgs args, DGLRetValue* rv) { auto body = [ea](DGLArgs args, DGLRetValue* rv) {
const int which = args[0]; const int which = args[0];
if (which == 0) { if (which == 0) {
*rv = std::move(ea.src); *rv = std::move(ea.src);
......
...@@ -6,15 +6,16 @@ ...@@ -6,15 +6,16 @@
#ifndef DGL_C_API_COMMON_H_ #ifndef DGL_C_API_COMMON_H_
#define DGL_C_API_COMMON_H_ #define DGL_C_API_COMMON_H_
#include <dgl/array.h>
#include <dgl/graph_interface.h>
#include <dgl/runtime/ndarray.h> #include <dgl/runtime/ndarray.h>
#include <dgl/runtime/packed_func.h> #include <dgl/runtime/packed_func.h>
#include <dgl/runtime/registry.h> #include <dgl/runtime/registry.h>
#include <dgl/array.h>
#include <dgl/graph_interface.h>
#include <algorithm> #include <algorithm>
#include <vector>
#include <string> #include <string>
#include <utility> #include <utility>
#include <vector>
namespace dgl { namespace dgl {
...@@ -36,12 +37,12 @@ dgl::runtime::PackedFunc ConvertNDArrayVectorToPackedFunc( ...@@ -36,12 +37,12 @@ dgl::runtime::PackedFunc ConvertNDArrayVectorToPackedFunc(
* The data type of the NDArray will be IdType, which must be an integer type. * The data type of the NDArray will be IdType, which must be an integer type.
* The element type (DType) of the vector must be convertible to IdType. * The element type (DType) of the vector must be convertible to IdType.
*/ */
template<typename IdType, typename DType> template <typename IdType, typename DType>
dgl::runtime::NDArray CopyVectorToNDArray( dgl::runtime::NDArray CopyVectorToNDArray(const std::vector<DType>& vec) {
const std::vector<DType>& vec) {
using dgl::runtime::NDArray; using dgl::runtime::NDArray;
const int64_t len = vec.size(); const int64_t len = vec.size();
NDArray a = NDArray::Empty({len}, DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, NDArray a = NDArray::Empty(
{len}, DGLDataType{kDGLInt, sizeof(IdType) * 8, 1},
DGLContext{kDGLCPU, 0}); DGLContext{kDGLCPU, 0});
std::copy(vec.begin(), vec.end(), static_cast<IdType*>(a->data)); std::copy(vec.begin(), vec.end(), static_cast<IdType*>(a->data));
return a; return a;
......
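A hypothetical caller of CopyVectorToNDArray, sketched using only what the template above exposes (the wrapper name is made up, and the snippet assumes c_api_common.h is included):

#include <cstdint>
#include <vector>

// IdType = int64_t fixes the output dtype (kDGLInt, 64 bits, 1 lane);
// DType = int32_t is the element type of the input vector.
dgl::runtime::NDArray DegreesToNDArray(const std::vector<int32_t>& degrees) {
  return dgl::CopyVectorToNDArray<int64_t, int32_t>(degrees);
}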