Unverified Commit 880b3b1f authored by Xin Yao, committed by GitHub

[Fix] Enable lint check for cuh files and fix compiler warnings (#4585)

* disable warning for tensorpipe

* fix warning

* enable lint check for cuh files

* resolve comments
parent 166b273b
@@ -250,10 +250,15 @@ endif((NOT MSVC) AND USE_LIBXSMM)
if(NOT MSVC)
# Only build tensorpipe on linux
string(REPLACE "-pedantic" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
+# Disable -Wall for third-party tensorpipe due to too many warnings
+string(REPLACE "-Wall" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
+string(REPLACE "-Wall" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
set(TP_STATIC_OR_SHARED STATIC)
add_subdirectory(third_party/tensorpipe)
list(APPEND DGL_LINKER_LIBS tensorpipe)
target_include_directories(dgl PRIVATE third_party/tensorpipe)
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
endif(NOT MSVC)
# Compile TVM Runtime and Featgraph
...
@@ -13,6 +13,7 @@
#include <exception>
#include <vector>
#include <atomic>
+#include <utility>
namespace {
int64_t divup(int64_t x, int64_t y) {
...
@@ -29,8 +29,6 @@ void SpMMCsr(const std::string& op, const std::string& reduce,
SWITCH_BITS(bits, DType, {
SWITCH_OP(op, Op, {
DType *out_off = out.Ptr<DType>();
-IdType* argX = Op::use_lhs ? static_cast<IdType*>(out_aux[0]->data) : nullptr;
-IdType* argW = Op::use_rhs ? static_cast<IdType*>(out_aux[1]->data) : nullptr;
if (reduce == "max") {
std::fill(out_off, out_off + csr.num_rows * dim, cpu::op::Max<DType>::zero);
cpu::SpMMCmpCsr<IdType, DType, Op, cpu::op::Max<DType>>(
...
@@ -103,7 +101,6 @@ void SpMMCsrHetero(const std::string& op, const std::string& reduce,
const dgl_type_t src_id = ufeat_node_tids[etype];
const dgl_type_t dst_id = out_node_tids[etype];
CSRMatrix csr = vec_csr[etype];
-DType *out_off = (*vec_out)[out_node_tids[etype]].Ptr<DType>();
NDArray ufeat = (vec_ufeat.size() == 0) ? NullArray() : vec_ufeat[src_id];
NDArray efeat = (vec_efeat.size() == 0) ? NullArray() : vec_efeat[etype];
NDArray out = (*vec_out)[dst_id];
...
@@ -126,7 +126,7 @@ void SpMMSumCsr(const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat,
const IdType* edges = csr.data.Ptr<IdType>();
const DType* X = ufeat.Ptr<DType>();
const DType* W = efeat.Ptr<DType>();
-int64_t dim = bcast.out_len, lhs_dim = bcast.lhs_len, rhs_dim = bcast.rhs_len;
+int64_t dim = bcast.out_len;
DType* O = out.Ptr<DType>();
CHECK_NOTNULL(indptr);
CHECK_NOTNULL(O);
...
@@ -93,4 +93,4 @@ __global__ void IndexScatterMultiKernel(
}  // namespace aten
}  // namespace dgl
-#endif
+#endif  // DGL_ARRAY_CUDA_ARRAY_INDEX_SELECT_CUH_
@@ -3,8 +3,8 @@
* \file array/cuda/atomic.cuh
* \brief Atomic functions
*/
-#ifndef DGL_ARRAY_CUDA_ATOMIC_H_
-#define DGL_ARRAY_CUDA_ATOMIC_H_
+#ifndef DGL_ARRAY_CUDA_ATOMIC_CUH_
+#define DGL_ARRAY_CUDA_ATOMIC_CUH_
#include <cuda_runtime.h>
#include <cassert>
@@ -22,15 +22,15 @@ namespace cuda {
template <int Bytes> struct Code { };
template <> struct Code<2> {
-typedef unsigned short int Type;
+typedef unsigned short int Type;  // NOLINT
};
template <> struct Code<4> {
-typedef unsigned int Type;
+typedef unsigned int Type;  // NOLINT
};
template <> struct Code<8> {
-typedef unsigned long long int Type;
+typedef unsigned long long int Type;  // NOLINT
};
// Helper class for converting to/from atomicCAS compatible types.
@@ -76,10 +76,10 @@ template <> struct Cast<double> {
}
};
-static __device__ __forceinline__ unsigned short int atomicCASshort(
-unsigned short int *address,
-unsigned short int compare,
-unsigned short int val) {
+static __device__ __forceinline__ unsigned short int atomicCASshort(  // NOLINT
+unsigned short int *address,  // NOLINT
+unsigned short int compare,  // NOLINT
+unsigned short int val) {  // NOLINT
static_assert(CUDART_VERSION >= 10000, "Requires at least CUDA 10");
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700)
return atomicCAS(address, compare, val);
@@ -112,7 +112,7 @@ static __device__ __forceinline__ unsigned short int atomicCASshort(
#define DEFINE_ATOMIC_HALF(NAME) \
template <> \
__device__ __forceinline__ half Atomic##NAME<half>(half* addr, half val) { \
-typedef unsigned short int CT; \
+typedef uint16_t CT; \
CT* addr_as_ui = reinterpret_cast<CT*>(addr); \
CT old = *addr_as_ui; \
CT assumed = old; \
@@ -282,4 +282,4 @@ __device__ __forceinline__ half AtomicAdd<half>(half* addr, half val) {
}  // namespace aten
}  // namespace dgl
-#endif  // DGL_ARRAY_CUDA_ATOMIC_H_
+#endif  // DGL_ARRAY_CUDA_ATOMIC_CUH_
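For reference, the compare-and-swap retry loop that `DEFINE_ATOMIC_HALF` expands to follows a standard pattern; a standalone sketch (not DGL's exact macro) is shown below. It assumes compute capability 7.0 or newer so that `atomicCAS` on `unsigned short int` is available, and uses the `__half_as_ushort` / `__ushort_as_half` intrinsics from `cuda_fp16.h`.

```cuda
#include <cuda_fp16.h>

// Hypothetical standalone sketch: emulate an atomic max on half via a 16-bit CAS loop.
__device__ __forceinline__ half AtomicMaxHalfSketch(half* addr, half val) {
  unsigned short int* addr_as_us = reinterpret_cast<unsigned short int*>(addr);  // NOLINT
  unsigned short int old = *addr_as_us;                                          // NOLINT
  unsigned short int assumed;                                                    // NOLINT
  do {
    assumed = old;
    const half cur = __ushort_as_half(assumed);
    const half next = __hgt(val, cur) ? val : cur;  // keep the larger value
    old = atomicCAS(addr_as_us, assumed, __half_as_ushort(next));
  } while (assumed != old);  // another thread won the race; retry with the new value
  return __ushort_as_half(old);
}
```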
@@ -14,4 +14,4 @@ static_assert(false, "THRUST_CUB_WRAPPED_NAMESPACE must be defined for DGL.");
#include "cub/cub.cuh"
-#endif
+#endif  // DGL_ARRAY_CUDA_DGL_CUB_CUH_
@@ -5,61 +5,98 @@
* \note this file is modified from TVM project:
* https://github.com/apache/tvm/blob/e561007f0c330e3d14c2bc8a3ef40fb741db9004/src/target/source/literal/cuda_half_t.h.
*/
-#ifndef DGL_ARRAY_FP16_CUH_
-#define DGL_ARRAY_FP16_CUH_
+#ifndef DGL_ARRAY_CUDA_FP16_CUH_
+#define DGL_ARRAY_CUDA_FP16_CUH_
#ifdef USE_FP16
#include <cuda_fp16.h>
+#include <algorithm>
-static __device__ __forceinline__ half max(half a, half b)
-{
+static __device__ __forceinline__ half max(half a, half b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hgt(__half(a), __half(b)) ? a : b;
#else
-return __half(max(float(a), float(b)));
+return __half(max(float(a), float(b)));  // NOLINT
#endif
}
-static __device__ __forceinline__ half min(half a, half b)
-{
+static __device__ __forceinline__ half min(half a, half b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hlt(__half(a), __half(b)) ? a : b;
#else
-return __half(min(float(a), float(b)));
+return __half(min(float(a), float(b)));  // NOLINT
#endif
}
#ifdef __CUDACC__
// Arithmetic FP16 operations for architecture >= 5.3 are already defined in cuda_fp16.h
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530)
-__device__ __forceinline__ __half operator+(const __half& lh, const __half& rh) { return __half(float(lh) + float(rh)); }
-__device__ __forceinline__ __half operator-(const __half& lh, const __half& rh) { return __half(float(lh) - float(rh)); }
-__device__ __forceinline__ __half operator*(const __half& lh, const __half& rh) { return __half(float(lh) * float(rh)); }
-__device__ __forceinline__ __half operator/(const __half& lh, const __half& rh) { return __half(float(lh) / float(rh)); }
+__device__ __forceinline__ __half operator+(const __half& lh, const __half& rh) {
+return __half(float(lh) + float(rh));  // NOLINT
+}
+__device__ __forceinline__ __half operator-(const __half& lh, const __half& rh) {
+return __half(float(lh) - float(rh));  // NOLINT
+}
+__device__ __forceinline__ __half operator*(const __half& lh, const __half& rh) {
+return __half(float(lh) * float(rh));  // NOLINT
+}
+__device__ __forceinline__ __half operator/(const __half& lh, const __half& rh) {
+return __half(float(lh) / float(rh));  // NOLINT
+}
-__device__ __forceinline__ __half& operator+=(__half& lh, const __half& rh) { lh = __half(float(lh) + float(rh)); return lh; }
-__device__ __forceinline__ __half& operator-=(__half& lh, const __half& rh) { lh = __half(float(lh) - float(rh)); return lh; }
-__device__ __forceinline__ __half& operator*=(__half& lh, const __half& rh) { lh = __half(float(lh) * float(rh)); return lh; }
-__device__ __forceinline__ __half& operator/=(__half& lh, const __half& rh) { lh = __half(float(lh) / float(rh)); return lh; }
+__device__ __forceinline__ __half& operator+=(__half& lh, const __half& rh) {  // NOLINT
+lh = __half(float(lh) + float(rh)); return lh;  // NOLINT
+}
+__device__ __forceinline__ __half& operator-=(__half& lh, const __half& rh) {  // NOLINT
+lh = __half(float(lh) - float(rh)); return lh;  // NOLINT
+}
+__device__ __forceinline__ __half& operator*=(__half& lh, const __half& rh) {  // NOLINT
+lh = __half(float(lh) * float(rh)); return lh;  // NOLINT
+}
+__device__ __forceinline__ __half& operator/=(__half& lh, const __half& rh) {  // NOLINT
+lh = __half(float(lh) / float(rh)); return lh;  // NOLINT
+}
-__device__ __forceinline__ __half& operator++(__half& h) { h = __half(float(h) + 1.0f); return h; }
-__device__ __forceinline__ __half& operator--(__half& h) { h = __half(float(h) - 1.0f); return h; }
-__device__ __forceinline__ __half operator++(__half& h, int) { __half ret = h; h = __half(float(h) + 1.0f); return ret; }
-__device__ __forceinline__ __half operator--(__half& h, int) { __half ret = h; h = __half(float(h) - 1.0f); return ret; }
+__device__ __forceinline__ __half& operator++(__half& h) {  // NOLINT
+h = __half(float(h) + 1.0f); return h;  // NOLINT
+}
+__device__ __forceinline__ __half& operator--(__half& h) {  // NOLINT
+h = __half(float(h) - 1.0f); return h;  // NOLINT
+}
+__device__ __forceinline__ __half operator++(__half& h, int) {  // NOLINT
+__half ret = h; h = __half(float(h) + 1.0f); return ret;  // NOLINT
+}
+__device__ __forceinline__ __half operator--(__half& h, int) {  // NOLINT
+__half ret = h; h = __half(float(h) - 1.0f); return ret;  // NOLINT
+}
__device__ __forceinline__ __half operator+(const __half& h) { return h; }
-__device__ __forceinline__ __half operator-(const __half& h) { return __half(-float(h)); }
+__device__ __forceinline__ __half operator-(const __half& h) {
+return __half(-float(h));  // NOLINT
+}
-__device__ __forceinline__ bool operator==(const __half& lh, const __half& rh) { return float(lh) == float(rh); }
-__device__ __forceinline__ bool operator!=(const __half& lh, const __half& rh) { return float(lh) != float(rh); }
-__device__ __forceinline__ bool operator> (const __half& lh, const __half& rh) { return float(lh) > float(rh); }
-__device__ __forceinline__ bool operator< (const __half& lh, const __half& rh) { return float(lh) < float(rh); }
-__device__ __forceinline__ bool operator>=(const __half& lh, const __half& rh) { return float(lh) >= float(rh); }
-__device__ __forceinline__ bool operator<=(const __half& lh, const __half& rh) { return float(lh) <= float(rh); }
+__device__ __forceinline__ bool operator==(const __half& lh, const __half& rh) {
+return float(lh) == float(rh);  // NOLINT
+}
+__device__ __forceinline__ bool operator!=(const __half& lh, const __half& rh) {
+return float(lh) != float(rh);  // NOLINT
+}
+__device__ __forceinline__ bool operator> (const __half& lh, const __half& rh) {
+return float(lh) > float(rh);  // NOLINT
+}
+__device__ __forceinline__ bool operator< (const __half& lh, const __half& rh) {
+return float(lh) < float(rh);  // NOLINT
+}
+__device__ __forceinline__ bool operator>=(const __half& lh, const __half& rh) {
+return float(lh) >= float(rh);  // NOLINT
+}
+__device__ __forceinline__ bool operator<=(const __half& lh, const __half& rh) {
+return float(lh) <= float(rh);  // NOLINT
+}
#endif  // __CUDA_ARCH__ < 530
#endif  // __CUDACC__
#endif  // USE_FP16
-#endif  // DGL_ARRAY_FP16_CUH_
+#endif  // DGL_ARRAY_CUDA_FP16_CUH_
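The fallback operators above exist so that generic templated kernels still compile when instantiated with `half` on architectures older than 5.3, where `cuda_fp16.h` does not provide them. A hypothetical kernel that relies on this (illustrative only, not part of the change):

```cuda
#include <cstdint>
#include <cuda_fp16.h>

// On __CUDA_ARCH__ < 530 the "+" and "*" below resolve to the float-fallback
// operators defined in fp16.cuh; on newer architectures they use native half math.
template <typename DType>
__global__ void AxpySketch(const DType* x, DType* y, DType alpha, int64_t n) {
  const int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    y[i] = y[i] + alpha * x[i];
  }
}
```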
@@ -6,9 +6,10 @@
#ifndef DGL_ARRAY_CUDA_FUNCTOR_CUH_
#define DGL_ARRAY_CUDA_FUNCTOR_CUH_
+#include <cmath>
+#include <limits>
#include "./atomic.cuh"
#include "./fp16.cuh"
-#include <cmath>
namespace dgl {
namespace aten {
@@ -128,7 +129,7 @@ template <typename Idx,
struct _Sum {
static constexpr __host__ __device__ __forceinline__ DType zero() {
return 0.;
-};
+}
static constexpr bool require_arg = false;
static __device__ __forceinline__ void Call(
DType *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
@@ -155,7 +156,7 @@ struct _Sum {
template <typename Idx,
typename DType,
-bool atomic=false>
+bool atomic = false>
struct Sum: _Sum<Idx, DType, atomic> { };
#ifdef USE_FP16
@@ -163,7 +164,7 @@ template <typename Idx, bool atomic>
struct Sum<Idx, half, atomic>: _Sum<Idx, half, atomic> {
static constexpr __host__ __device__ __forceinline__ half zero() {
return __float2half_rn(0.);
-};
+}
};
#endif  // USE_FP16
@@ -173,7 +174,7 @@ template <typename Idx,
struct _Max {
static constexpr __host__ __device__ __forceinline__ DType zero() {
return -std::numeric_limits<DType>::infinity();
-};
+}
static constexpr bool require_arg = true;
static __device__ __forceinline__ void Call(
DType *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
@@ -216,7 +217,7 @@ struct _Max {
template <typename Idx,
typename DType,
-bool atomic=false>
+bool atomic = false>
struct Max : _Max<Idx, DType, atomic> { };
#ifdef USE_FP16
@@ -225,7 +226,7 @@ template <typename Idx,
struct Max<Idx, half, atomic> : _Max<Idx, half, atomic> {
static constexpr __host__ __device__ __forceinline__ half zero() {
return __float2half_rn(-6.550400e+04f);
-};
+}
};
#endif
@@ -235,7 +236,7 @@ template <typename Idx,
struct _Min {
static constexpr __host__ __device__ __forceinline__ DType zero() {
return std::numeric_limits<DType>::infinity();
-};
+}
static constexpr bool require_arg = true;
static __device__ __forceinline__ void Call(
DType *out_buf, Idx *arg_u_buf, Idx *arg_e_buf,
@@ -278,7 +279,7 @@ struct _Min {
template <typename Idx,
typename DType,
-bool atomic=false>
+bool atomic = false>
struct Min : _Min<Idx, DType, atomic> { };
#ifdef USE_FP16
@@ -287,7 +288,7 @@ template <typename Idx,
struct Min<Idx, half, atomic> : _Min<Idx, half, atomic> {
static constexpr __host__ __device__ __forceinline__ half zero() {
return __float2half_rn(6.550400e+04f);
-};
+}
};
#endif  // USE_FP16
...
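As a mental model for the functors touched above: each reducer exposes an identity element via `zero()` plus a `require_arg` flag telling callers whether argmin/argmax indices must be tracked, and the fp16 specializations return ±65504 (the largest finite half value) instead of ±infinity. A simplified sketch of that shape, with illustrative names rather than DGL's actual `Call` signature:

```cuda
#include <limits>

// Simplified reducer sketch; mirrors the zero()/require_arg layout shown in the diff.
// Like the original header, the constexpr zero() assumes relaxed-constexpr compilation.
template <typename DType>
struct MaxReducerSketch {
  static constexpr bool require_arg = true;  // callers also record the argmax index
  static constexpr __host__ __device__ __forceinline__ DType zero() {
    return -std::numeric_limits<DType>::infinity();  // identity element for max
  }
  static __device__ __forceinline__ void Combine(DType* acc, DType val) {
    if (val > *acc) *acc = val;  // non-atomic variant, for illustration only
  }
};
```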
@@ -46,7 +46,7 @@ __global__ void GESpMMKernel(
if (left + 32 <= high) {
#pragma unroll
for (Idx i = 0; i < 32; ++i) {
const Idx eid = left + i;
const Idx cid = __ldg(indices + eid);
const Idx offset = feat_len * cid + fid;
if (BinaryOp::use_rhs) {
@@ -59,7 +59,7 @@ __global__ void GESpMMKernel(
}
} else {
for (Idx i = 0; left + i < high; ++i) {
const Idx eid = left + i;
const Idx cid = __ldg(indices + eid);
const Idx offset = feat_len * cid + fid;
if (BinaryOp::use_rhs) {
@@ -82,8 +82,8 @@ __global__ void GESpMMKernel(
if (left + 32 <= high) {
#pragma unroll
for (int i = 0; i < 32; ++i) {
const Idx eid = left + i;
const Idx cid = __ldg(indices + eid);
const Idx offset = feat_len * cid;
if (BinaryOp::use_rhs) {
accum_0 += BinaryOp::Call(ufeat + offset + fid_0, efeat + eid);
@@ -95,8 +95,8 @@ __global__ void GESpMMKernel(
}
} else {
for (int i = 0; i + left < high; ++i) {
const Idx eid = left + i;
const Idx cid = __ldg(indices + eid);
const Idx offset = feat_len * cid;
if (BinaryOp::use_rhs) {
accum_0 += BinaryOp::Call(ufeat + offset + fid_0, efeat + eid);
@@ -129,7 +129,7 @@ void GESpMMCsr(
DType *out_data = out.Ptr<DType>();
cudaStream_t stream = runtime::getCurrentCUDAStream();
const int ntx = 32;
const int nty = 32;
const int nby = (feat_len + (ntx * 2) - 1) / (ntx * 2);
@@ -150,4 +150,4 @@ void GESpMMCsr(
}  // namespace aten
}  // namespace dgl
-#endif
+#endif  // DGL_ARRAY_CUDA_GE_SPMM_CUH_
@@ -48,4 +48,4 @@
} \
} while (0)
-#endif
+#endif  // DGL_ARRAY_CUDA_MACRO_CUH_
@@ -294,7 +294,8 @@ void SDDMMCoo(
const dim3 nblks(nbx, nby);
const dim3 nthrs(ntx, nty);
BCAST_IDX_CTX_SWITCH(bcast, use_idx, out->ctx, lhs_off, rhs_off, {
-CUDA_KERNEL_CALL((SDDMMCooTreeReduceKernel<Idx, DType, UseBcast, UseIdx, LhsTarget, RhsTarget>),
+CUDA_KERNEL_CALL(
+(SDDMMCooTreeReduceKernel<Idx, DType, UseBcast, UseIdx, LhsTarget, RhsTarget>),
nblks, nthrs, 0, stream,
lhs_data, rhs_data, out_data,
row, col, edge_map,
@@ -376,4 +377,4 @@ void SDDMMCsr(
}  // namespace aten
}  // namespace dgl
-#endif
+#endif  // DGL_ARRAY_CUDA_SDDMM_CUH_
@@ -3,9 +3,11 @@
* \file array/cuda/segment_reduce.cuh
* \brief Segment reduce kernel function header.
*/
-#ifndef DGL_ARRAY_SEGMENT_REDUCE_CUH_
-#define DGL_ARRAY_SEGMENT_REDUCE_CUH_
+#ifndef DGL_ARRAY_CUDA_SEGMENT_REDUCE_CUH_
+#define DGL_ARRAY_CUDA_SEGMENT_REDUCE_CUH_
+#include <string>
+#include <vector>
#include "../../runtime/cuda/cuda_common.h"
#include "./utils.h"
#include "./atomic.cuh"
@@ -27,7 +29,7 @@ template <typename IdType, typename DType,
__global__ void SegmentReduceKernel(
const DType* feat, const IdType* offsets,
DType* out, IdType* arg,
-int64_t n, int64_t dim){
+int64_t n, int64_t dim) {
for (int row = blockIdx.x; row < n; row += gridDim.x) {
int col = blockIdx.y * blockDim.x + threadIdx.x;
while (col < dim) {
@@ -81,7 +83,7 @@ __global__ void UpdateGradMinMaxHeteroKernel(
unsigned int row = warpId;
while (row < n) {
-for(unsigned int col = laneId; col < dim; col += warp_size) {
+for (unsigned int col = laneId; col < dim; col += warp_size) {
if (type == idx_type[row * dim + col]) {
const int write_row = idx[row * dim + col];
cuda::AtomicAdd(out + write_row * dim + col, feat[row * dim + col]);
@@ -273,4 +275,4 @@ void BackwardSegmentCmp(
}  // namespace aten
}  // namespace dgl
-#endif
+#endif  // DGL_ARRAY_CUDA_SEGMENT_REDUCE_CUH_
@@ -7,6 +7,7 @@
#define DGL_ARRAY_CUDA_SPMM_CUH_
#include <dgl/bcast.h>
+#include <limits>
#include "macro.cuh"
#include "fp16.cuh"
#include "atomic.cuh"
@@ -615,7 +616,7 @@ __global__ void SpMMCmpCsrHeteroKernel(
while (ty < num_rows) {
int tx = blockIdx.x * blockDim.x + threadIdx.x;
while (tx < out_len) {
-DType new_out = out[ty * out_len + tx];//ReduceOp::zero();
+DType new_out = out[ty * out_len + tx];  // ReduceOp::zero();
Idx local_argu = 0, local_arge = 0;
const int lhs_add = UseBcast ? ubcast_off[tx] : tx;
const int rhs_add = UseBcast ? ebcast_off[tx] : tx;
@@ -698,7 +699,6 @@ void SpMMCoo(
const int nty = CUDA_MAX_NUM_THREADS / ntx;
const int nbx = (len + ntx - 1) / ntx;
const int nby = FindNumBlocks<'y'>((E + nty - 1) / nty);
-//LOG(INFO) << "nblks=(" << nbx << ", " << nby << ") nthrs=(" << ntx << ", " << nty << ")";
const dim3 nblks(nbx, nby);
const dim3 nthrs(ntx, nty);
const bool use_idx = !IsNullArray(coo.data);
@@ -761,9 +761,8 @@ void SpMMCsr(
rhs_len = bcast.rhs_len;
const int ntx = FindNumThreads(len);
const int nty = CUDA_MAX_NUM_THREADS / ntx;
-const int nby= (len + ntx - 1) / ntx;
+const int nby = (len + ntx - 1) / ntx;
const int nbx = FindNumBlocks<'x'>((csr.num_rows + nty - 1) / nty);
-//LOG(INFO) << "nblks=(" << nbx << ", " << nby << ") nthrs=(" << ntx << ", " << nty << ")";
const dim3 nblks(nbx, nby);
const dim3 nthrs(ntx, nty);
const bool use_idx = !IsNullArray(csr.data);
@@ -851,4 +850,4 @@ void SpMMCmpCsrHetero(
}  // namespace aten
}  // namespace dgl
-#endif
+#endif  // DGL_ARRAY_CUDA_SPMM_CUH_
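The launch-configuration lines touched above follow the usual ceil-division pattern for 2D kernels. A standalone sketch of the idea, using a fixed 1024-thread budget instead of DGL's `FindNumThreads`/`FindNumBlocks` helpers and an illustrative axis mapping:

```cuda
#include <algorithm>
#include <cstdint>
#include <cuda_runtime.h>

// Hypothetical helper: one thread per (row, feature) pair, both grid dimensions
// rounded up with ceil-division so no element is left uncovered.
inline void MakeLaunchConfigSketch(int64_t feat_len, int64_t num_rows,
                                   dim3* nblks, dim3* nthrs) {
  const int max_threads = 1024;  // assumed per-block thread budget
  const int ntx = static_cast<int>(
      std::min<int64_t>(std::max<int64_t>(feat_len, 1), 1024));  // threads along features
  const int nty = std::max(1, max_threads / ntx);                // remaining threads along rows
  const int nbx = static_cast<int>((feat_len + ntx - 1) / ntx);  // ceil(feat_len / ntx)
  const int nby = static_cast<int>((num_rows + nty - 1) / nty);  // ceil(num_rows / nty)
  *nblks = dim3(nbx, nby);
  *nthrs = dim3(ntx, nty);
}
```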
@@ -4,8 +4,8 @@
* \brief Array index select GPU kernel implementation
*/
-#ifndef DGL_ARRAY_CUDA_ARRAY_INDEX_SELECT_UVM_CUH_
-#define DGL_ARRAY_CUDA_ARRAY_INDEX_SELECT_UVM_CUH_
+#ifndef DGL_ARRAY_CUDA_UVM_ARRAY_INDEX_SELECT_UVM_CUH_
+#define DGL_ARRAY_CUDA_UVM_ARRAY_INDEX_SELECT_UVM_CUH_
#define CACHE_LINE_SIZE 128
@@ -49,4 +49,4 @@ __global__ void IndexSelectMultiKernelAligned(
}  // namespace aten
}  // namespace dgl
-#endif
+#endif  // DGL_ARRAY_CUDA_UVM_ARRAY_INDEX_SELECT_UVM_CUH_
@@ -329,7 +329,6 @@ void Edge_softmax_forward(const std::string& op,
NDArray efeat,
NDArray out) {
// TODO(zhejiang): add gpu op for edge_softmax
-SparseFormat format = graph->SelectFormat(0, CSC_CODE);
const auto& bcast = CalcBcastOff(op, ufeat, efeat);
ATEN_XPU_SWITCH(graph->Context().device_type, XPU, "edge_softmax", {
@@ -351,7 +350,6 @@ void Edge_softmax_backward(const std::string& op,
NDArray back_out,
NDArray ufeat) {
// TODO(zhejiang): add gpu op for edge_softmax
-SparseFormat format = graph->SelectFormat(0, CSC_CODE);
const auto& bcast = CalcBcastOff(op, ufeat, sds);
ATEN_XPU_SWITCH(graph->Context().device_type, XPU, "edge_softmax_back", {
...
@@ -270,7 +270,8 @@ void LibraVertexCut(
for (int64_t i=0; i < N_e; i++) {
if (out_ptr[i] == c)
-fprintf(fp, "%ld,%ld,%f\n", u_ptr[i], v_ptr[i], w_ptr[i]);
+fprintf(fp, "%ld,%ld,%ld\n", static_cast<int64_t>(u_ptr[i]),
+static_cast<int64_t>(v_ptr[i]), w_ptr[i]);
}
fclose(fp);
}
@@ -283,7 +284,7 @@ void LibraVertexCut(
printf("\nTotal replication: %ld\n", replication_list.size());
for (uint64_t i=0; i < replication_list.size(); i++)
-fprintf(fp, "%ld\n", replication_list[i]);
+fprintf(fp, "%ld\n", static_cast<int64_t>(replication_list[i]));
printf("Community weights:\n");
for (int64_t c=0; c < nc; c++)
...
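The `fprintf` fixes above address `-Wformat` mismatches between the conversion specifier and the argument type. When argument widths are not fixed, the usual remedies are an explicit cast (as done here) or the `<cinttypes>` macros, as in this illustrative snippet (not DGL code):

```cuda
#include <cinttypes>
#include <cstdio>

// Hypothetical example: print three 64-bit ids with a width-correct format on any
// platform, instead of relying on "%ld" being 64 bits wide.
void PrintEdgeSketch(std::FILE* fp, int64_t u, int64_t v, int64_t w) {
  std::fprintf(fp, "%" PRId64 ",%" PRId64 ",%" PRId64 "\n", u, v, w);
}
```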
@@ -37,12 +37,16 @@ gk_csr_t *Convert2GKCsr(const aten::CSRMatrix mat, bool is_row) {
size_t num_ptrs;
if (is_row) {
num_ptrs = gk_csr->nrows + 1;
-gk_indptr = gk_csr->rowptr = gk_zmalloc(gk_csr->nrows+1, "gk_csr_ExtractPartition: rowptr");
-gk_indices = gk_csr->rowind = gk_imalloc(nnz, "gk_csr_ExtractPartition: rowind");
+gk_indptr = gk_csr->rowptr = gk_zmalloc(gk_csr->nrows+1,
+const_cast<char*>("gk_csr_ExtractPartition: rowptr"));
+gk_indices = gk_csr->rowind = gk_imalloc(nnz,
+const_cast<char*>("gk_csr_ExtractPartition: rowind"));
} else {
num_ptrs = gk_csr->ncols + 1;
-gk_indptr = gk_csr->colptr = gk_zmalloc(gk_csr->ncols+1, "gk_csr_ExtractPartition: colptr");
-gk_indices = gk_csr->colind = gk_imalloc(nnz, "gk_csr_ExtractPartition: colind");
+gk_indptr = gk_csr->colptr = gk_zmalloc(gk_csr->ncols+1,
+const_cast<char*>("gk_csr_ExtractPartition: colptr"));
+gk_indices = gk_csr->colind = gk_imalloc(nnz,
+const_cast<char*>("gk_csr_ExtractPartition: colind"));
}
for (size_t i = 0; i < num_ptrs; i++) {
...
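The added `const_cast<char*>` silences the warning emitted when a string literal (a `const char` array) is passed to a legacy C API that takes a non-const `char*`. The pattern, with a hypothetical stand-in for `gk_zmalloc`/`gk_imalloc`:

```cuda
#include <cstddef>
#include <cstdlib>

// Hypothetical legacy C-style API: takes a non-const char* message it never modifies.
void* legacy_alloc(std::size_t nbytes, char* msg) {
  (void)msg;  // a real implementation would log the message on failure
  return std::malloc(nbytes);
}

void* AllocWithMessageSketch(std::size_t nbytes) {
  // Without the cast, passing a literal to char* triggers a write-strings style warning.
  return legacy_alloc(nbytes, const_cast<char*>("AllocWithMessageSketch: buffer"));
}
```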
@@ -772,10 +772,10 @@ DGL_REGISTER_GLOBAL("transform._CAPI_DGLHeteroSortInEdges")
DGL_REGISTER_GLOBAL("heterograph._CAPI_DGLFindSrcDstNtypes")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
GraphRef metagraph = args[0];
-std::unordered_set<int64_t> dst_set;
-std::unordered_set<int64_t> src_set;
-for (int64_t eid = 0; eid < metagraph->NumEdges(); ++eid) {
+std::unordered_set<uint64_t> dst_set;
+std::unordered_set<uint64_t> src_set;
+for (uint64_t eid = 0; eid < metagraph->NumEdges(); ++eid) {
auto edge = metagraph->FindEdge(eid);
auto src = edge.first;
auto dst = edge.second;
@@ -785,16 +785,16 @@ DGL_REGISTER_GLOBAL("heterograph._CAPI_DGLFindSrcDstNtypes")
List<Value> srclist, dstlist;
List<List<Value>> ret_list;
-for (int64_t nid = 0; nid < metagraph->NumVertices(); ++nid) {
+for (uint64_t nid = 0; nid < metagraph->NumVertices(); ++nid) {
auto is_dst = dst_set.count(nid);
auto is_src = src_set.count(nid);
if (is_dst && is_src)
return;
else if (is_dst)
-dstlist.push_back(Value(MakeValue(nid)));
+dstlist.push_back(Value(MakeValue(static_cast<int64_t>(nid))));
else
// If a node type is isolated, put it in srctype as defined in the Python docstring.
-srclist.push_back(Value(MakeValue(nid)));
+srclist.push_back(Value(MakeValue(static_cast<int64_t>(nid))));
}
ret_list.push_back(srclist);
ret_list.push_back(dstlist);
...
@@ -935,7 +935,7 @@ DGL_REGISTER_GLOBAL("sampling._CAPI_NeighborSampling")
if (aten::IsNullArray(probability)) {
prob = nullptr;
} else {
-CHECK(probability->shape[0] == gptr->NumEdges())
+CHECK(probability->shape[0] == static_cast<int64_t>(gptr->NumEdges()))
<< "transition probability must have same number of elements as edges";
CHECK(probability.IsContiguous())
<< "transition probability must be contiguous tensor";
...
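The `uint64_t` loop variables and `static_cast<int64_t>` in the two hunks above remove `-Wsign-compare` warnings: comparing a signed value against an unsigned count warns under `-Wall`, so one side is converted explicitly. A minimal illustration with made-up names:

```cuda
#include <cstdint>

// Hypothetical check mirroring the pattern: tensor shapes are signed, edge counts unsigned.
bool ShapeMatchesEdgeCountSketch(int64_t shape0, uint64_t num_edges) {
  return shape0 == static_cast<int64_t>(num_edges);  // compare in the signed domain
}
```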