Unverified Commit 880b3b1f authored by Xin Yao, committed by GitHub

[Fix] Enable lint check for cuh files and fix compiler warnings (#4585)

* disable warning for tensorpipe

* fix warning

* enable lint check for cuh files

* resolve comments
parent 166b273b
@@ -250,10 +250,15 @@ endif((NOT MSVC) AND USE_LIBXSMM)
if(NOT MSVC)
# Only build tensorpipe on linux
string(REPLACE "-pedantic" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
+# Disable -Wall for third-party tensorpipe due to too many warnings
+string(REPLACE "-Wall" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
+string(REPLACE "-Wall" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
set(TP_STATIC_OR_SHARED STATIC)
add_subdirectory(third_party/tensorpipe)
list(APPEND DGL_LINKER_LIBS tensorpipe)
target_include_directories(dgl PRIVATE third_party/tensorpipe)
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
endif(NOT MSVC)
# Compile TVM Runtime and Featgraph
...
@@ -13,6 +13,7 @@
#include <exception>
#include <vector>
#include <atomic>
+#include <utility>
namespace {
int64_t divup(int64_t x, int64_t y) {
...
@@ -29,8 +29,6 @@ void SpMMCsr(const std::string& op, const std::string& reduce,
SWITCH_BITS(bits, DType, {
SWITCH_OP(op, Op, {
DType *out_off = out.Ptr<DType>();
-IdType* argX = Op::use_lhs ? static_cast<IdType*>(out_aux[0]->data) : nullptr;
-IdType* argW = Op::use_rhs ? static_cast<IdType*>(out_aux[1]->data) : nullptr;
if (reduce == "max") {
std::fill(out_off, out_off + csr.num_rows * dim, cpu::op::Max<DType>::zero);
cpu::SpMMCmpCsr<IdType, DType, Op, cpu::op::Max<DType>>(
...
@@ -103,7 +101,6 @@ void SpMMCsrHetero(const std::string& op, const std::string& reduce,
const dgl_type_t src_id = ufeat_node_tids[etype];
const dgl_type_t dst_id = out_node_tids[etype];
CSRMatrix csr = vec_csr[etype];
-DType *out_off = (*vec_out)[out_node_tids[etype]].Ptr<DType>();
NDArray ufeat = (vec_ufeat.size() == 0) ? NullArray() : vec_ufeat[src_id];
NDArray efeat = (vec_efeat.size() == 0) ? NullArray() : vec_efeat[etype];
NDArray out = (*vec_out)[dst_id];
...
@@ -126,7 +126,7 @@ void SpMMSumCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
const IdType* edges = csr.data.Ptr<IdType>();
const DType* X = ufeat.Ptr<DType>();
const DType* W = efeat.Ptr<DType>();
-int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
+int64_t dim = bcast.out_len;
DType* O = out.Ptr<DType>();
CHECK_NOTNULL(indptr);
CHECK_NOTNULL(O);
...
@@ -93,4 +93,4 @@ __global__ void IndexScatterMultiKernel(
}  // namespace aten
}  // namespace dgl
-#endif
+#endif  // DGL_ARRAY_CUDA_ARRAY_INDEX_SELECT_CUH_
@@ -3,8 +3,8 @@
* \file array/cuda/atomic.cuh
* \brief Atomic functions
*/
-#ifndef DGL_ARRAY_CUDA_ATOMIC_H_
-#define DGL_ARRAY_CUDA_ATOMIC_H_
+#ifndef DGL_ARRAY_CUDA_ATOMIC_CUH_
+#define DGL_ARRAY_CUDA_ATOMIC_CUH_
#include <cuda_runtime.h>
#include <cassert>
@@ -22,15 +22,15 @@ namespace cuda {
template <int Bytes> struct Code { };
template <> struct Code<2> {
-typedef unsigned short int Type;
+typedef unsigned short int Type;  // NOLINT
};
template <> struct Code<4> {
-typedef unsigned int Type;
+typedef unsigned int Type;  // NOLINT
};
template <> struct Code<8> {
-typedef unsigned long long int Type;
+typedef unsigned long long int Type;  // NOLINT
};
// Helper class for converting to/from atomicCAS compatible types.
@@ -76,10 +76,10 @@ template <> struct Cast<double> {
}
};
-static __device__ __forceinline__ unsigned short int atomicCASshort(
-unsigned short int *address,
-unsigned short int compare,
-unsigned short int val) {
+static __device__ __forceinline__ unsigned short int atomicCASshort(  // NOLINT
+unsigned short int *address,  // NOLINT
+unsigned short int compare,  // NOLINT
+unsigned short int val) {  // NOLINT
static_assert(CUDART_VERSION >= 10000, "Requires at least CUDA 10");
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700)
return atomicCAS(address, compare, val);
@@ -112,7 +112,7 @@ static __device__ __forceinline__ unsigned short int atomicCASshort(
#define DEFINE_ATOMIC_HALF(NAME) \
template <> \
__device__ __forceinline__ half Atomic##NAME<half>(half* addr, half val) { \
-typedef unsigned short int CT; \
+typedef uint16_t CT; \
CT* addr_as_ui = reinterpret_cast<CT*>(addr); \
CT old = *addr_as_ui; \
CT assumed = old; \
@@ -282,4 +282,4 @@ __device__ __forceinline__ half AtomicAdd<half>(half* addr, half val) {
}  // namespace aten
}  // namespace dgl
-#endif  // DGL_ARRAY_CUDA_ATOMIC_H_
+#endif  // DGL_ARRAY_CUDA_ATOMIC_CUH_
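For reference, the compare-and-swap retry loop that `DEFINE_ATOMIC_HALF` expands to follows a standard pattern; a standalone sketch (not DGL's exact macro) is shown below. It assumes compute capability 7.0 or newer so that `atomicCAS` on `unsigned short int` is available, and uses the `__half_as_ushort` / `__ushort_as_half` intrinsics from `cuda_fp16.h`.

```cuda
#include <cuda_fp16.h>

// Hypothetical standalone sketch: emulate an atomic max on half via a 16-bit CAS loop.
__device__ __forceinline__ half AtomicMaxHalfSketch(half* addr, half val) {
  unsigned short int* addr_as_us = reinterpret_cast<unsigned short int*>(addr);  // NOLINT
  unsigned short int old = *addr_as_us;                                          // NOLINT
  unsigned short int assumed;                                                    // NOLINT
  do {
    assumed = old;
    const half cur = __ushort_as_half(assumed);
    const half next = __hgt(val, cur) ? val : cur;  // keep the larger value
    old = atomicCAS(addr_as_us, assumed, __half_as_ushort(next));
  } while (assumed != old);  // another thread won the race; retry with the new value
  return __ushort_as_half(old);
}
```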
@@ -14,4 +14,4 @@ static_assert(false, "THRUST_CUB_WRAPPED_NAMESPACE must be defined for DGL.");
#include "cub/cub.cuh"
-#endif
+#endif  // DGL_ARRAY_CUDA_DGL_CUB_CUH_
@@ -5,61 +5,98 @@
* \note this file is modified from TVM project:
* https://github.com/apache/tvm/blob/e561007f0c330e3d14c2bc8a3ef40fb741db9004/src/target/source/literal/cuda_half_t.h.
*/
-#ifndef DGL_ARRAY_FP16_CUH_
-#define DGL_ARRAY_FP16_CUH_
+#ifndef DGL_ARRAY_CUDA_FP16_CUH_
+#define DGL_ARRAY_CUDA_FP16_CUH_
#ifdef USE_FP16
#include <cuda_fp16.h>
+#include <algorithm>
-static __device__ __forceinline__ half max(half a, half b)
-{
+static __device__ __forceinline__ half max(half a, half b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hgt(__half(a), __half(b)) ? a : b;
#else
-return __half(max(float(a), float(b)));
+return __half(max(float(a), float(b)));  // NOLINT
#endif
}
-static __device__ __forceinline__ half min(half a, half b)
-{
+static __device__ __forceinline__ half min(half a, half b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hlt(__half(a), __half(b)) ? a : b;
#else
-return __half(min(float(a), float(b)));
+return __half(min(float(a), float(b)));  // NOLINT
#endif
}
#ifdef __CUDACC__
// Arithmetic FP16 operations for architecture >= 5.3 are already defined in cuda_fp16.h
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530)
-__device__ __forceinline__ __half operator+(const __half& lh, const __half& rh) { return __half(float(lh) + float(rh)); }
-__device__ __forceinline__ __half operator-(const __half& lh, const __half& rh) { return __half(float(lh) - float(rh)); }
-__device__ __forceinline__ __half operator*(const __half& lh, const __half& rh) { return __half(float(lh) * float(rh)); }
-__device__ __forceinline__ __half operator/(const __half& lh, const __half& rh) { return __half(float(lh) / float(rh)); }
+__device__ __forceinline__ __half operator+(const __half& lh, const __half& rh) {
+return __half(float(lh) + float(rh));  // NOLINT
+}
+__device__ __forceinline__ __half operator-(const __half& lh, const __half& rh) {
+return __half(float(lh) - float(rh));  // NOLINT
+}
+__device__ __forceinline__ __half operator*(const __half& lh, const __half& rh) {
+return __half(float(lh) * float(rh));  // NOLINT
+}
+__device__ __forceinline__ __half operator/(const __half& lh, const __half& rh) {
+return __half(float(lh) / float(rh));  // NOLINT
+}
-__device__ __forceinline__ __half& operator+=(__half& lh, const __half& rh) { lh = __half(float(lh) + float(rh)); return lh; }
-__device__ __forceinline__ __half& operator-=(__half& lh, const __half& rh) { lh = __half(float(lh) - float(rh)); return lh; }
-__device__ __forceinline__ __half& operator*=(__half& lh, const __half& rh) { lh = __half(float(lh) * float(rh)); return lh; }
-__device__ __forceinline__ __half& operator/=(__half& lh, const __half& rh) { lh = __half(float(lh) / float(rh)); return lh; }
+__device__ __forceinline__ __half& operator+=(__half& lh, const __half& rh) {  // NOLINT
+lh = __half(float(lh) + float(rh)); return lh;  // NOLINT
+}
+__device__ __forceinline__ __half& operator-=(__half& lh, const __half& rh) {  // NOLINT
+lh = __half(float(lh) - float(rh)); return lh;  // NOLINT
+}
+__device__ __forceinline__ __half& operator*=(__half& lh, const __half& rh) {  // NOLINT
+lh = __half(float(lh) * float(rh)); return lh;  // NOLINT
+}
+__device__ __forceinline__ __half& operator/=(__half& lh, const __half& rh) {  // NOLINT
+lh = __half(float(lh) / float(rh)); return lh;  // NOLINT
+}
-__device__ __forceinline__ __half& operator++(__half& h) { h = __half(float(h) + 1.0f); return h; }
-__device__ __forceinline__ __half& operator--(__half& h) { h = __half(float(h) - 1.0f); return h; }
-__device__ __forceinline__ __half operator++(__half& h, int) { __half ret = h; h = __half(float(h) + 1.0f); return ret; }
-__device__ __forceinline__ __half operator--(__half& h, int) { __half ret = h; h = __half(float(h) - 1.0f); return ret; }
+__device__ __forceinline__ __half& operator++(__half& h) {  // NOLINT
+h = __half(float(h) + 1.0f); return h;  // NOLINT
+}
+__device__ __forceinline__ __half& operator--(__half& h) {  // NOLINT
+h = __half(float(h) - 1.0f); return h;  // NOLINT
+}
+__device__ __forceinline__ __half operator++(__half& h, int) {  // NOLINT
+__half ret = h; h = __half(float(h) + 1.0f); return ret;  // NOLINT
+}
+__device__ __forceinline__ __half operator--(__half& h, int) {  // NOLINT
+__half ret = h; h = __half(float(h) - 1.0f); return ret;  // NOLINT
+}
__device__ __forceinline__ __half operator+(const __half& h) { return h; }
-__device__ __forceinline__ __half operator-(const __half& h) { return __half(-float(h)); }
+__device__ __forceinline__ __half operator-(const __half& h) {
+return __half(-float(h));  // NOLINT
+}
-__device__ __forceinline__ bool operator==(const __half& lh, const __half& rh) { return float(lh) == float(rh); }
-__device__ __forceinline__ bool operator!=(const __half& lh, const __half& rh) { return float(lh) != float(rh); }
-__device__ __forceinline__ bool operator> (const __half& lh, const __half& rh) { return float(lh) > float(rh); }
-__device__ __forceinline__ bool operator< (const __half& lh, const __half& rh) { return float(lh) < float(rh); }
-__device__ __forceinline__ bool operator>=(const __half& lh, const __half& rh) { return float(lh) >= float(rh); }
-__device__ __forceinline__ bool operator<=(const __half& lh, const __half& rh) { return float(lh) <= float(rh); }
+__device__ __forceinline__ bool operator==(const __half& lh, const __half& rh) {
+return float(lh) == float(rh);  // NOLINT
+}
+__device__ __forceinline__ bool operator!=(const __half& lh, const __half& rh) {
+return float(lh) != float(rh);  // NOLINT
+}
+__device__ __forceinline__ bool operator> (const __half& lh, const __half& rh) {
+return float(lh) > float(rh);  // NOLINT
+}
+__device__ __forceinline__ bool operator< (const __half& lh, const __half& rh) {
+return float(lh) < float(rh);  // NOLINT
+}
+__device__ __forceinline__ bool operator>=(const __half& lh, const __half& rh) {
+return float(lh) >= float(rh);  // NOLINT
+}
+__device__ __forceinline__ bool operator<=(const __half& lh, const __half& rh) {
+return float(lh) <= float(rh);  // NOLINT
+}
#endif  // __CUDA_ARCH__ < 530
#endif  // __CUDACC__
#endif  // USE_FP16
-#endif  // DGL_ARRAY_FP16_CUH_
+#endif  // DGL_ARRAY_CUDA_FP16_CUH_
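The fallback operators above exist so that generic templated kernels still compile when instantiated with `half` on architectures older than 5.3, where `cuda_fp16.h` does not provide them. A hypothetical kernel that relies on this (illustrative only, not part of the change):

```cuda
#include <cstdint>
#include <cuda_fp16.h>

// On __CUDA_ARCH__ < 530 the "+" and "*" below resolve to the float-fallback
// operators defined in fp16.cuh; on newer architectures they use native half math.
template <typename DType>
__global__ void AxpySketch(const DType* x, DType* y, DType alpha, int64_t n) {
  const int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    y[i] = y[i] + alpha * x[i];
  }
}
```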
@@ -6,9 +6,10 @@
#ifndef DGL_ARRAY_CUDA_FUNCTOR_CUH_
#define DGL_ARRAY_CUDA_FUNCTOR_CUH_
+#include <cmath>
+#include <limits>
#include "./atomic.cuh"
#include "./fp16.cuh"
-#include <cmath>
namespace dgl {
namespace aten {
@@ -128,7 +129,7 @@ template <typename Idx,
struct _Sum {
static constexpr __host__ __device__ __forceinline__ DType zero() {
return 0.;
-};
+}
static constexpr bool require_arg = false;
static __device__ __forceinline__ void Call(
DType *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
@@ -155,7 +156,7 @@ struct _Sum {
template <typename Idx,
typename DType,
-bool atomic=false>
+bool atomic = false>
struct Sum: _Sum<Idx, DType, atomic> { };
#ifdef USE_FP16
@@ -163,7 +164,7 @@ template <typename Idx, bool atomic>
struct Sum<Idx, half, atomic>: _Sum<Idx, half, atomic> {
static constexpr __host__ __device__ __forceinline__ half zero() {
return __float2half_rn(0.);
-};
+}
};
#endif  // USE_FP16
@@ -173,7 +174,7 @@ template <typename Idx,
struct _Max {
static constexpr __host__ __device__ __forceinline__ DType zero() {
return -std::numeric_limits<DType>::infinity();
-};
+}
static constexpr bool require_arg = true;
static __device__ __forceinline__ void Call(
DType *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
@@ -216,7 +217,7 @@ struct _Max {
template <typename Idx,
typename DType,
-bool atomic=false>
+bool atomic = false>
struct Max : _Max<Idx, DType, atomic> { };
#ifdef USE_FP16
@@ -225,7 +226,7 @@ template <typename Idx,
struct Max<Idx, half, atomic> : _Max<Idx, half, atomic> {
static constexpr __host__ __device__ __forceinline__ half zero() {
return __float2half_rn(-6.550400e+04f);
-};
+}
};
#endif
@@ -235,7 +236,7 @@ template <typename Idx,
struct _Min {
static constexpr __host__ __device__ __forceinline__ DType zero() {
return std::numeric_limits<DType>::infinity();
-};
+}
static constexpr bool require_arg = true;
static __device__ __forceinline__ void Call(
DType *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
@@ -278,7 +279,7 @@ struct _Min {
template <typename Idx,
typename DType,
-bool atomic=false>
+bool atomic = false>
struct Min : _Min<Idx, DType, atomic> { };
#ifdef USE_FP16
@@ -287,7 +288,7 @@ template <typename Idx,
struct Min<Idx, half, atomic> : _Min<Idx, half, atomic> {
static constexpr __host__ __device__ __forceinline__ half zero() {
return __float2half_rn(6.550400e+04f);
-};
+}
};
#endif  // USE_FP16
...
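As a mental model for the functors touched above: each reducer exposes an identity element via `zero()` plus a `require_arg` flag telling callers whether argmin/argmax indices must be tracked, and the fp16 specializations return ±65504 (the largest finite half value) instead of ±infinity. A simplified sketch of that shape, with illustrative names rather than DGL's actual `Call` signature:

```cuda
#include <limits>

// Simplified reducer sketch; mirrors the zero()/require_arg layout shown in the diff.
// Like the original header, the constexpr zero() assumes relaxed-constexpr compilation.
template <typename DType>
struct MaxReducerSketch {
  static constexpr bool require_arg = true;  // callers also record the argmax index
  static constexpr __host__ __device__ __forceinline__ DType zero() {
    return -std::numeric_limits<DType>::infinity();  // identity element for max
  }
  static __device__ __forceinline__ void Combine(DType* acc, DType val) {
    if (val > *acc) *acc = val;  // non-atomic variant, for illustration only
  }
};
```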
@@ -46,7 +46,7 @@ __global__ void GESpMMKernel(
if (left + 32 <= high) {
#pragma unroll
for (Idx i = 0; i < 32; ++i) {
const Idx eid = left + i;
const Idx cid = __ldg(indices + eid);
const Idx offset = feat_len * cid + fid;
if (BinaryOp::use_rhs) {
@@ -59,7 +59,7 @@ __global__ void GESpMMKernel(
}
} else {
for (Idx i = 0; left + i < high; ++i) {
const Idx eid = left + i;
const Idx cid = __ldg(indices + eid);
const Idx offset = feat_len * cid + fid;
if (BinaryOp::use_rhs) {
@@ -82,8 +82,8 @@ __global__ void GESpMMKernel(
if (left + 32 <= high) {
#pragma unroll
for (int i = 0; i < 32; ++i) {
const Idx eid = left + i;
const Idx cid = __ldg(indices + eid);
const Idx offset = feat_len * cid;
if (BinaryOp::use_rhs) {
accum_0 += BinaryOp::Call(ufeat + offset + fid_0, efeat + eid);
@@ -95,8 +95,8 @@ __global__ void GESpMMKernel(
}
} else {
for (int i = 0; i + left < high; ++i) {
const Idx eid = left + i;
const Idx cid = __ldg(indices + eid);
const Idx offset = feat_len * cid;
if (BinaryOp::use_rhs) {
accum_0 += BinaryOp::Call(ufeat + offset + fid_0, efeat + eid);
@@ -129,7 +129,7 @@ void GESpMMCsr(
DType *out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
const int ntx = 32;
const int nty = 32;
const int nby = (feat_len + (ntx * 2) - 1) / (ntx * 2);
@@ -150,4 +150,4 @@ void GESpMMCsr(
}  // namespace aten
}  // namespace dgl
-#endif
+#endif  // DGL_ARRAY_CUDA_GE_SPMM_CUH_
@@ -48,4 +48,4 @@
} \
} while (0)
-#endif
+#endif  // DGL_ARRAY_CUDA_MACRO_CUH_
@@ -294,7 +294,8 @@ void SDDMMCoo(
const dim3 nblks(nbx, nby);
const dim3 nthrs(ntx, nty);
BCAST_IDX_CTX_SWITCH(bcast, use_idx, out->ctx, lhs_off, rhs_off, {
-CUDA_KERNEL_CALL((SDDMMCooTreeReduceKernel<Idx, DType, UseBcast, UseIdx, LhsTarget, RhsTarget>),
+CUDA_KERNEL_CALL(
+(SDDMMCooTreeReduceKernel<Idx, DType, UseBcast, UseIdx, LhsTarget, RhsTarget>),
nblks, nthrs, 0, stream,
lhs_data, rhs_data, out_data,
row, col, edge_map,
@@ -376,4 +377,4 @@ void SDDMMCsr(
}  // namespace aten
}  // namespace dgl
-#endif
+#endif  // DGL_ARRAY_CUDA_SDDMM_CUH_
@@ -3,9 +3,11 @@
* \file array/cuda/segment_reduce.cuh
* \brief Segment reduce kernel function header.
*/
-#ifndef DGL_ARRAY_SEGMENT_REDUCE_CUH_
-#define DGL_ARRAY_SEGMENT_REDUCE_CUH_
+#ifndef DGL_ARRAY_CUDA_SEGMENT_REDUCE_CUH_
+#define DGL_ARRAY_CUDA_SEGMENT_REDUCE_CUH_
+#include <string>
+#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./atomic.cuh"
@@ -27,7 +29,7 @@ template <typename IdType, typename DType,
__global__ void SegmentReduceKernel(
const DType* feat, const IdType* offsets,
DType* out, IdType* arg,
-int64_t n, int64_t dim){
+int64_t n, int64_t dim) {
for (int row = blockIdx.x; row < n; row += gridDim.x) {
int col = blockIdx.y * blockDim.x + threadIdx.x;
while (col < dim) {
@@ -81,7 +83,7 @@ __global__ void UpdateGradMinMaxHeteroKernel(
unsigned int row = warpId;
while (row < n) {
-for(unsigned int col = laneId; col < dim; col += warp_size) {
+for (unsigned int col = laneId; col < dim; col += warp_size) {
if (type == idx_type[row * dim + col]) {
const int write_row = idx[row * dim + col];
cuda::AtomicAdd(out + write_row * dim + col, feat[row * dim + col]);
@@ -273,4 +275,4 @@ void BackwardSegmentCmp(
}  // namespace aten
}  // namespace dgl
-#endif
+#endif  // DGL_ARRAY_CUDA_SEGMENT_REDUCE_CUH_
@@ -7,6 +7,7 @@
#define DGL_ARRAY_CUDA_SPMM_CUH_
#include <dgl/bcast.h>
+#include <limits>
#include "macro.cuh"
#include "fp16.cuh"
#include "atomic.cuh"
@@ -615,7 +616,7 @@ __global__ void SpMMCmpCsrHeteroKernel(
while (ty < num_rows) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
while (tx < out_len) {
-DType new_out = out[ty * out_len + tx];//ReduceOp::zero();
+DType new_out = out[ty * out_len + tx];  // ReduceOp::zero();
Idx local_argu = 0, local_arge = 0;
const int lhs_add = UseBcast ? ubcast_off[tx] : tx;
const int rhs_add = UseBcast ? ebcast_off[tx] : tx;
@@ -698,7 +699,6 @@ void SpMMCoo(
const int nty = CUDA_MAX_NUM_THREADS / ntx;
const int nbx = (len + ntx - 1) / ntx;
const int nby = FindNumBlocks<'y'>((E + nty - 1) / nty);
-//LOG(INFO) << "nblks=(" << nbx << ", " << nby << ") nthrs=(" << ntx << ", " << nty << ")";
const dim3 nblks(nbx, nby);
const dim3 nthrs(ntx, nty);
const bool use_idx = !IsNullArray(coo.data);
@@ -761,9 +761,8 @@ void SpMMCsr(
rhs_len = bcast.rhs_len;
const int ntx = FindNumThreads(len);
const int nty = CUDA_MAX_NUM_THREADS / ntx;
-const int nby= (len + ntx - 1) / ntx;
+const int nby = (len + ntx - 1) / ntx;
const int nbx = FindNumBlocks<'x'>((csr.num_rows + nty - 1) / nty);
-//LOG(INFO) << "nblks=(" << nbx << ", " << nby << ") nthrs=(" << ntx << ", " << nty << ")";
const dim3 nblks(nbx, nby);
const dim3 nthrs(ntx, nty);
const bool use_idx = !IsNullArray(csr.data);
@@ -851,4 +850,4 @@ void SpMMCmpCsrHetero(
}  // namespace aten
}  // namespace dgl
-#endif
+#endif  // DGL_ARRAY_CUDA_SPMM_CUH_
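The launch-configuration lines touched above follow the usual ceil-division pattern for 2D kernels. A standalone sketch of the idea, using a fixed 1024-thread budget instead of DGL's `FindNumThreads`/`FindNumBlocks` helpers and an illustrative axis mapping:

```cuda
#include <algorithm>
#include <cstdint>
#include <cuda_runtime.h>

// Hypothetical helper: one thread per (row, feature) pair, both grid dimensions
// rounded up with ceil-division so no element is left uncovered.
inline void MakeLaunchConfigSketch(int64_t feat_len, int64_t num_rows,
                                   dim3* nblks, dim3* nthrs) {
  const int max_threads = 1024;  // assumed per-block thread budget
  const int ntx = static_cast<int>(
      std::min<int64_t>(std::max<int64_t>(feat_len, 1), 1024));  // threads along features
  const int nty = std::max(1, max_threads / ntx);                // remaining threads along rows
  const int nbx = static_cast<int>((feat_len + ntx - 1) / ntx);  // ceil(feat_len / ntx)
  const int nby = static_cast<int>((num_rows + nty - 1) / nty);  // ceil(num_rows / nty)
  *nblks = dim3(nbx, nby);
  *nthrs = dim3(ntx, nty);
}
```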
@@ -4,8 +4,8 @@
* \brief Array index select GPU kernel implementation
*/
-#ifndef DGL_ARRAY_CUDA_ARRAY_INDEX_SELECT_UVM_CUH_
-#define DGL_ARRAY_CUDA_ARRAY_INDEX_SELECT_UVM_CUH_
+#ifndef DGL_ARRAY_CUDA_UVM_ARRAY_INDEX_SELECT_UVM_CUH_
+#define DGL_ARRAY_CUDA_UVM_ARRAY_INDEX_SELECT_UVM_CUH_
#define CACHE_LINE_SIZE 128
@@ -49,4 +49,4 @@ __global__ void IndexSelectMultiKernelAligned(
}  // namespace aten
}  // namespace dgl
-#endif
+#endif  // DGL_ARRAY_CUDA_UVM_ARRAY_INDEX_SELECT_UVM_CUH_
@@ -329,7 +329,6 @@ void Edge_softmax_forward(const std::string& op,
NDArray efeat,
NDArray out) {
// TODO(zhejiang): add gpu op for edge_softmax
-SparseFormat format = graph->SelectFormat(0, CSC_CODE);
const auto& bcast = CalcBcastOff(op, ufeat, efeat);
ATEN_XPU_SWITCH(graph->Context().device_type, XPU, "edge_softmax", {
@@ -351,7 +350,6 @@ void Edge_softmax_backward(const std::string& op,
NDArray back_out,
NDArray ufeat) {
// TODO(zhejiang): add gpu op for edge_softmax
-SparseFormat format = graph->SelectFormat(0, CSC_CODE);
const auto& bcast = CalcBcastOff(op, ufeat, sds);
ATEN_XPU_SWITCH(graph->Context().device_type, XPU, "edge_softmax_back", {
...
@@ -270,7 +270,8 @@ void LibraVertexCut(
for (int64_t i=0; i < N_e; i++) {
if (out_ptr[i] == c)
-fprintf(fp, "%ld,%ld,%f\n", u_ptr[i], v_ptr[i], w_ptr[i]);
+fprintf(fp, "%ld,%ld,%ld\n", static_cast<int64_t>(u_ptr[i]),
+static_cast<int64_t>(v_ptr[i]), w_ptr[i]);
}
fclose(fp);
}
@@ -283,7 +284,7 @@ void LibraVertexCut(
printf("\nTotal replication: %ld\n", replication_list.size());
for (uint64_t i=0; i < replication_list.size(); i++)
-fprintf(fp, "%ld\n", replication_list[i]);
+fprintf(fp, "%ld\n", static_cast<int64_t>(replication_list[i]));
printf("Community weights:\n");
for (int64_t c=0; c < nc; c++)
...
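The `fprintf` fixes above address `-Wformat` mismatches between the conversion specifier and the argument type. When argument widths are not fixed, the usual remedies are an explicit cast (as done here) or the `<cinttypes>` macros, as in this illustrative snippet (not DGL code):

```cuda
#include <cinttypes>
#include <cstdio>

// Hypothetical example: print three 64-bit ids with a width-correct format on any
// platform, instead of relying on "%ld" being 64 bits wide.
void PrintEdgeSketch(std::FILE* fp, int64_t u, int64_t v, int64_t w) {
  std::fprintf(fp, "%" PRId64 ",%" PRId64 ",%" PRId64 "\n", u, v, w);
}
```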
@@ -37,12 +37,16 @@ gk_csr_t *Convert2GKCsr(const aten::CSRMatrix mat, bool is_row) {
size_t num_ptrs;
if (is_row) {
num_ptrs = gk_csr->nrows + 1;
-gk_indptr = gk_csr->rowptr = gk_zmalloc(gk_csr->nrows+1, "gk_csr_ExtractPartition: rowptr");
-gk_indices = gk_csr->rowind = gk_imalloc(nnz, "gk_csr_ExtractPartition: rowind");
+gk_indptr = gk_csr->rowptr = gk_zmalloc(gk_csr->nrows+1,
+const_cast<char*>("gk_csr_ExtractPartition: rowptr"));
+gk_indices = gk_csr->rowind = gk_imalloc(nnz,
+const_cast<char*>("gk_csr_ExtractPartition: rowind"));
} else {
num_ptrs = gk_csr->ncols + 1;
-gk_indptr = gk_csr->colptr = gk_zmalloc(gk_csr->ncols+1, "gk_csr_ExtractPartition: colptr");
-gk_indices = gk_csr->colind = gk_imalloc(nnz, "gk_csr_ExtractPartition: colind");
+gk_indptr = gk_csr->colptr = gk_zmalloc(gk_csr->ncols+1,
+const_cast<char*>("gk_csr_ExtractPartition: colptr"));
+gk_indices = gk_csr->colind = gk_imalloc(nnz,
+const_cast<char*>("gk_csr_ExtractPartition: colind"));
}
for (size_t i = 0; i < num_ptrs; i++) {
...
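The added `const_cast<char*>` silences the warning emitted when a string literal (a `const char` array) is passed to a legacy C API that takes a non-const `char*`. The pattern, with a hypothetical stand-in for `gk_zmalloc`/`gk_imalloc`:

```cuda
#include <cstddef>
#include <cstdlib>

// Hypothetical legacy C-style API: takes a non-const char* message it never modifies.
void* legacy_alloc(std::size_t nbytes, char* msg) {
  (void)msg;  // a real implementation would log the message on failure
  return std::malloc(nbytes);
}

void* AllocWithMessageSketch(std::size_t nbytes) {
  // Without the cast, passing a literal to char* triggers a write-strings style warning.
  return legacy_alloc(nbytes, const_cast<char*>("AllocWithMessageSketch: buffer"));
}
```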
@@ -772,10 +772,10 @@ DGL_REGISTER_GLOBAL("transform._CAPI_DGLHeteroSortInEdges")
DGL_REGISTER_GLOBAL("heterograph._CAPI_DGLFindSrcDstNtypes")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphRef metagraph = args[0];
-std::unordered_set<int64_t> dst_set;
-std::unordered_set<int64_t> src_set;
-for (int64_t eid = 0; eid < metagraph->NumEdges(); ++eid) {
+std::unordered_set<uint64_t> dst_set;
+std::unordered_set<uint64_t> src_set;
+for (uint64_t eid = 0; eid < metagraph->NumEdges(); ++eid) {
auto edge = metagraph->FindEdge(eid);
auto src = edge.first;
auto dst = edge.second;
@@ -785,16 +785,16 @@ DGL_REGISTER_GLOBAL("heterograph._CAPI_DGLFindSrcDstNtypes")
List<Value> srclist, dstlist;
List<List<Value>> ret_list;
-for (int64_t nid = 0; nid < metagraph->NumVertices(); ++nid) {
+for (uint64_t nid = 0; nid < metagraph->NumVertices(); ++nid) {
auto is_dst = dst_set.count(nid);
auto is_src = src_set.count(nid);
if (is_dst && is_src)
return;
else if (is_dst)
-dstlist.push_back(Value(MakeValue(nid)));
+dstlist.push_back(Value(MakeValue(static_cast<int64_t>(nid))));
else
// If a node type is isolated, put it in srctype as defined in the Python docstring.
-srclist.push_back(Value(MakeValue(nid)));
+srclist.push_back(Value(MakeValue(static_cast<int64_t>(nid))));
}
ret_list.push_back(srclist);
ret_list.push_back(dstlist);
...
@@ -935,7 +935,7 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_NeighborSampling")
if (aten::IsNullArray(probability)) {
prob = nullptr;
} else {
-CHECK(probability->shape[0] == gptr->NumEdges())
+CHECK(probability->shape[0] == static_cast<int64_t>(gptr->NumEdges()))
<< "transition probability must have same number of elements as edges";
CHECK(probability.IsContiguous())
<< "transition probability must be contiguous tensor";
...
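The `uint64_t` loop variables and `static_cast<int64_t>` in the two hunks above remove `-Wsign-compare` warnings: comparing a signed value against an unsigned count warns under `-Wall`, so one side is converted explicitly. A minimal illustration with made-up names:

```cuda
#include <cstdint>

// Hypothetical check mirroring the pattern: tensor shapes are signed, edge counts unsigned.
bool ShapeMatchesEdgeCountSketch(int64_t shape0, uint64_t num_edges) {
  return shape0 == static_cast<int64_t>(num_edges);  // compare in the signed domain
}
```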