update atomicAdd and csr2coo.hip

910d6a98 · sangwzh · 8f9dcabf · 910d6a98 · 910d6a98
Commit 910d6a98 authored Sep 25, 2024 by sangwzh
Hide whitespace changes
Inline Side-by-side

Showing with 15 additions and 13 deletions

src/array/cuda/atomic.cuh +11 -10

src/array/cuda/csr2coo.hip +4 -3

No files found.
--- a/src/array/cuda/atomic.cuh
+++ b/src/array/cuda/atomic.cuh
@@ -169,7 +169,7 @@ static __host__ __device__ __forceinline__ unsigned short int atomicCASshort(  /
    return Cast<dtype>::Decode(old);                               \
  }
-#define DEFINE_ATOMIC_16BIT_BF(NAME, dtype)                           \
+#define DEFINE_ATOMIC_16BIT_MAX(NAME, dtype)                           \
  template <>                                                      \
  __device__ __forceinline__ dtype Atomic##NAME<dtype>(            \
      dtype * addr, dtype val) {                                   \
@@ -181,12 +181,12 @@ static __host__ __device__ __forceinline__ unsigned short int atomicCASshort(  /
      assumed = old;                                               \
      old = atomicCASshort(                                        \
          addr_as_ui, assumed,                                     \
-          Cast<dtype>::Encode(max((double)val, (double)dtype(old)))); \
+          Cast<dtype>::Encode(dtype(max((float)val, (float)dtype(old))))); \
    } while (assumed != old);                                      \
    return Cast<dtype>::Decode(old);                               \
  }
-#define DEFINE_ATOMIC_16BIT_Min(NAME, dtype)                           \
+#define DEFINE_ATOMIC_16BIT_MIN(NAME, dtype)                           \
  template <>                                                      \
  __device__ __forceinline__ dtype Atomic##NAME<dtype>(            \
      dtype * addr, dtype val) {                                   \
@@ -198,24 +198,25 @@ static __host__ __device__ __forceinline__ unsigned short int atomicCASshort(  /
      assumed = old;                                               \
      old = atomicCASshort(                                        \
          addr_as_ui, assumed,                                     \
-          Cast<dtype>::Encode(min(val, dtype(old)))); \
+          Cast<dtype>::Encode(dtype(min((float)val,(float)old)))); \
    } while (assumed != old);                                      \
    return Cast<dtype>::Decode(old);                               \
  }
-#define OP(a, b) max((double)a, (double)b)
+#define OP(a, b) max(a, b)
 DEFINE_ATOMIC(Max)
-DEFINE_ATOMIC_16BIT(Max, half)
+DEFINE_ATOMIC_16BIT_MAX(Max, half)
 #if BF16_ENABLED
-DEFINE_ATOMIC_16BIT_BF(Max, __hip_bfloat16)
+#define OP_BF(a, b) max_bf((float)a, (float)b)
+DEFINE_ATOMIC_16BIT_MAX(Max, __hip_bfloat16)
 #endif  // BF16_ENABLED
 #undef OP
-#define OP(a, b) min((double)a, (double)b)
+#define OP(a, b) min(a, b)
 DEFINE_ATOMIC(Min)
-DEFINE_ATOMIC_16BIT(Min, half)
+DEFINE_ATOMIC_16BIT_MIN(Min, half)
 #if BF16_ENABLED
-DEFINE_ATOMIC_16BIT_BF(Min, __hip_bfloat16)
+DEFINE_ATOMIC_16BIT_MIN(Min, __hip_bfloat16)
 #endif  // BF16_ENABLED
 #undef OP

--- a/src/array/cuda/csr2coo.hip
+++ b/src/array/cuda/csr2coo.hip
@@ -9,6 +9,7 @@
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
+#include <hipcub/backend/rocprim/device/device_copy.hpp>
 #include <hipcub/hipcub.hpp>
@@ -103,7 +104,7 @@ __global__ void _RepeatKernel(
 }
-#if 0
+#if 1
 template <>
 COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
  const auto& ctx = csr.indptr->ctx;
@@ -126,14 +127,14 @@ COOMatrix CSRToCOO<kDGLCUDA, int64_t>(CSRMatrix csr) {
  constexpr int64_t max_copy_at_once = std::numeric_limits<int32_t>::max();
  for (int64_t i = 0; i < csr.num_rows; i += max_copy_at_once) {
    std::size_t temp_storage_bytes = 0;
-    CUDA_CALL(cub::DeviceCopy::Batched(
+    CUDA_CALL(hipcub::DeviceCopy::Batched(
        nullptr, temp_storage_bytes, input_buffer + i, output_buffer + i,
        buffer_sizes + i, ::min(csr.num_rows - i, max_copy_at_once),
        stream));
    auto temp = allocator.alloc_unique<char>(temp_storage_bytes);
-    CUDA_CALL(cub::DeviceCopy::Batched(
+    CUDA_CALL(hipcub::DeviceCopy::Batched(
        temp.get(), temp_storage_bytes, input_buffer + i, output_buffer + i,
        buffer_sizes + i, ::min(csr.num_rows - i, max_copy_at_once),
        stream));