"vscode:/vscode.git/clone" did not exist on "2b2dedc3c0453c249c1b129a1c60e069be74b1ea"
Commit 74d88bf8 authored by sangwz

Merge branch 'dtk25.04' of http://developer.sourcefind.cn/codes/OpenDAS/dgl into 2.2.1

parents 2a1ac588 314cedc1
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* @file cuda/expand_indptr.cu
* @brief ExpandIndptr operator implementation on CUDA.
*/
#include <hip/hip_runtime.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include <limits>
#include "./common.h"
#include <hipcub/backend/rocprim/device/device_copy.hpp>
#include "common.h"
namespace graphbolt {
namespace ops {
@@ -86,7 +88,7 @@ torch::Tensor ExpandIndptrImpl(
CUB_CALL(
DeviceCopy::Batched, input_buffer + i,
output_buffer + i, buffer_sizes + i,
std::min(num_rows - i, max_copy_at_once));
::min(num_rows - i, max_copy_at_once));
}
}));
}));
......
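In the ExpandIndptr hunk above, the copy of num_rows variable-length ranges is chunked into groups of at most max_copy_at_once, and each chunk goes through CUB_CALL(DeviceCopy::Batched, ...); the CUB_CALL macro from graphbolt's common header takes care of the temporary-storage plumbing and the stream. A minimal sketch of the underlying two-phase call, assuming hipcub::DeviceCopy::Batched (pulled in via the rocprim device_copy.hpp backend header added above) mirrors cub::DeviceCopy::Batched; the function, pointer names, and the HIP_CHECK error macro are illustrative, not part of the patch:

// Copies num_ranges variable-length segments in one launch. d_in_ptrs/d_out_ptrs are
// device arrays of per-range source/destination pointers, d_sizes holds each range's
// element count. HIP_CHECK is an assumed error-checking macro.
void batched_copy(const int64_t* const* d_in_ptrs, int64_t* const* d_out_ptrs,
                  const int64_t* d_sizes, uint32_t num_ranges, hipStream_t stream) {
  size_t tmp_bytes = 0;  // first call only computes the temp-storage size
  HIP_CHECK(hipcub::DeviceCopy::Batched(nullptr, tmp_bytes, d_in_ptrs, d_out_ptrs,
                                        d_sizes, num_ranges, stream));
  void* tmp = nullptr;
  HIP_CHECK(hipMalloc(&tmp, tmp_bytes));
  HIP_CHECK(hipcub::DeviceCopy::Batched(tmp, tmp_bytes, d_in_ptrs, d_out_ptrs,
                                        d_sizes, num_ranges, stream));  // second call copies
  HIP_CHECK(hipFree(tmp));
}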
@@ -20,7 +20,7 @@ namespace cuda {
class GpuCache : public torch::CustomClassHolder {
using key_t = long long;
constexpr static int set_associativity = 2;
constexpr static int WARP_SIZE = 32;
constexpr static int WARP_SIZE = 64;
constexpr static int bucket_size = WARP_SIZE * set_associativity;
using gpu_cache_t = ::gpu_cache::gpu_cache<
key_t, uint64_t, std::numeric_limits<key_t>::max(), set_associativity,
......
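The WARP_SIZE change from 32 to 64 in gpu_cache.h matches the target hardware: GCN/CDNA-class AMD GPUs execute 64-lane wavefronts, so a bucket sized for 32 lanes (bucket_size = WARP_SIZE * set_associativity) would span only half a wavefront there. A hedged sketch of selecting the width at compile time when one source tree targets both vendors; the constant names are illustrative, __HIP_PLATFORM_AMD__ is the standard hipcc platform macro:

// Illustrative only; not part of the patch.
#if defined(__HIP_PLATFORM_AMD__)
constexpr int kWarpSize = 64;   // GCN/CDNA wavefront width
#else
constexpr int kWarpSize = 32;   // NVIDIA warp width
#endif
constexpr int kSetAssociativity = 2;
constexpr int kBucketSize = kWarpSize * kSetAssociativity;  // 128 on AMD, 64 on NVIDIA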
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -6,8 +7,8 @@
*/
#include <numeric>
#include "./common.h"
#include "./gpu_cache.h"
#include "common.h"
#include "gpu_cache.h"
namespace graphbolt {
namespace cuda {
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -10,12 +12,12 @@
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include <numeric>
#include "./common.h"
#include "./max_uva_threads.h"
#include "./utils.h"
#include "common.h"
#include "max_uva_threads.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
@@ -132,7 +134,7 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
torch::empty(output_size.value(), options.dtype(indices.scalar_type()));
const dim3 block(BLOCK_SIZE);
const dim3 grid(
(std::min(edge_count_aligned, cuda::max_uva_threads.value_or(1 << 20)) +
(::min(edge_count_aligned, cuda::max_uva_threads.value_or(1 << 20)) +
BLOCK_SIZE - 1) /
BLOCK_SIZE);
@@ -178,8 +180,10 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCImpl(
return GRAPHBOLT_DISPATCH_ELEMENT_SIZES(
indices.element_size(), "UVAIndexSelectCSCCopyIndices", ([&] {
return UVAIndexSelectCSCCopyIndices<indptr_t, element_size_t>(
indices, num_nodes, in_degree.data_ptr<indptr_t>(),
sliced_indptr.data_ptr<indptr_t>(),
// indices, num_nodes, in_degree.data_ptr<indptr_t>(),
indices, num_nodes, cuda::getTensorDevicePointer<indptr_t>(in_degree),
// sliced_indptr.data_ptr<indptr_t>(),
cuda::getTensorDevicePointer<indptr_t>(sliced_indptr),
sorted_idx.data_ptr<int64_t>(), nodes.options(),
sliced_indptr.scalar_type(), output_size);
}));
@@ -220,7 +224,7 @@ void IndexSelectCSCCopyIndices(
for (int64_t i = 0; i < num_nodes; i += max_copy_at_once) {
CUB_CALL(
DeviceMemcpy::Batched, input_buffer_it + i, output_buffer_it + i,
buffer_sizes + i, std::min(num_nodes - i, max_copy_at_once));
buffer_sizes + i, ::min(num_nodes - i, max_copy_at_once));
}
}
......
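In the UVAIndexSelectCSCCopyIndices hunk, the grid size is capped by cuda::max_uva_threads (defaulting to 1 << 20) before the ceiling division by BLOCK_SIZE, presumably to bound how many threads issue zero-copy reads over the host link at once; only the qualification of min changes in this diff. A small worked example of that arithmetic, with hypothetical values:

// Hypothetical values, purely to illustrate the grid computation above.
constexpr int64_t kBlockSize = 512;
const int64_t edge_count_aligned = 3'000'000;                                     // work items
const int64_t capped = std::min<int64_t>(edge_count_aligned, int64_t{1} << 20);   // 1'048'576
const int64_t num_blocks = (capped + kBlockSize - 1) / kBlockSize;                // 2048 blocks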
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -9,9 +11,9 @@
#include <numeric>
#include "./common.h"
#include "./max_uva_threads.h"
#include "./utils.h"
#include "common.h"
#include "max_uva_threads.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
@@ -110,7 +112,15 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
{return_len, original_feature_size}, torch::TensorOptions()
.dtype(input.dtype())
.device(c10::DeviceType::CUDA));
DType* input_ptr = reinterpret_cast<DType*>(input.data_ptr());
DType* input_ptr = nullptr;
if(input.is_pinned())
{
CUDA_CALL(hipHostGetDevicePointer((void**)&input_ptr, input.data_ptr(), 0));
}
else{
input_ptr= reinterpret_cast<DType*>(input.data_ptr());
}
DType* ret_ptr = reinterpret_cast<DType*>(ret.data_ptr());
// Sort the index to improve the memory access pattern.
@@ -124,7 +134,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
// Use a single thread to process each output row to avoid wasting threads.
const int num_threads = cuda::FindNumThreads(return_len);
const int num_blocks =
(std::min(return_len, cuda::max_uva_threads.value_or(1 << 20)) +
(::min(return_len, cuda::max_uva_threads.value_or(1 << 20)) +
num_threads - 1) /
num_threads;
CUDA_KERNEL_CALL(
@@ -137,7 +147,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
block.x >>= 1;
block.y <<= 1;
}
const dim3 grid(std::min(
const dim3 grid(::min(
(return_len + block.y - 1) / block.y,
cuda::max_uva_threads.value_or(1 << 20) / BLOCK_SIZE));
if (aligned_feature_size * sizeof(DType) <= GPU_CACHE_LINE_SIZE) {
......
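The UVAIndexSelectImpl_ change above stops assuming that input.data_ptr() can be dereferenced from the device: when the tensor is pinned (page-locked host memory), it first asks the runtime for the device-visible alias of that allocation via hipHostGetDevicePointer. A self-contained sketch of the same zero-copy pattern outside PyTorch; the function name and the HIP_CHECK error macro are illustrative:

#include <hip/hip_runtime.h>
// Allocates a pinned, mapped host buffer and returns the device-visible alias
// that kernels can read directly (zero-copy / UVA access).
float* make_pinned_device_alias(size_t n, float** host_out) {
  float* host_ptr = nullptr;
  HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&host_ptr),
                          n * sizeof(float), hipHostMallocMapped));
  float* dev_alias = nullptr;
  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&dev_alias), host_ptr, 0));
  *host_out = host_ptr;  // fill the data from the CPU through this pointer
  return dev_alias;      // pass this one to kernels; reads are served over PCIe/xGMI
}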
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -8,7 +9,7 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_sampling_ops.h>
#include "./common.h"
#include "common.h"
namespace graphbolt {
namespace ops {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -7,7 +8,7 @@
#include <graphbolt/cuda_ops.h>
#include <thrust/binary_search.h>
#include "./common.h"
#include "common.h"
namespace graphbolt {
namespace ops {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* @file cuda/max_uva_threads.cc
* @brief Max uva threads variable setter function.
*/
#include "./max_uva_threads.h"
#include "max_uva_threads.h"
namespace graphbolt {
namespace cuda {
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "hip/hip_bf16.h"
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -5,8 +8,8 @@
* @brief Index select operator implementation on CUDA.
*/
#include <c10/core/ScalarType.h>
#include <curand_kernel.h>
#include <graphbolt/continuous_seed.h>
#include <hiprand/hiprand_kernel.h>
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_sampling_ops.h>
#include <thrust/copy.h>
@@ -14,21 +17,43 @@
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <hipcub/backend/rocprim/device/device_copy.hpp>
#include <algorithm>
#include <array>
#include <cub/cub.cuh>
#if __CUDA_ARCH__ >= 700
#include <cuda/atomic>
#endif // __CUDA_ARCH__ >= 700
#include <hipcub/hipcub.hpp>
#include <limits>
#include <numeric>
#include <type_traits>
#include "../random.h"
#include "../utils.h"
#include "./common.h"
#include "./utils.h"
#include "common.h"
#include "utils.h"
namespace rocprim{
namespace detail{
template<>
struct float_bit_mask<__hip_bfloat16>
{
static constexpr uint16_t sign_bit = 0x8000;
static constexpr uint16_t exponent = 0x7F80;
static constexpr uint16_t mantissa = 0x007F;
using bit_type = uint16_t;
};
template<>
struct radix_key_codec_base<__hip_bfloat16> : radix_key_codec_floating<__hip_bfloat16, unsigned short> {
};
}
}
#if HIP_VERSION_MAJOR<6
__host__ __device__ bool operator>(const __hip_bfloat16& a, const __hip_bfloat16& b)
{
return float(a)>float(b);
}
#endif
namespace graphbolt {
namespace ops {
@@ -109,13 +134,23 @@ __global__ void _ComputeRandoms(
edge_id_t* edge_ids) {
int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
const int stride = gridDim.x * blockDim.x;
hiprandStatePhilox4_32_10_t rng;
const auto labor = indices != nullptr;
if (!labor) {
hiprand_init(random_seed, i, 0, &rng);
}
while (i < num_edges) {
const auto row_position = csr_rows[i];
const auto row_offset = i - sub_indptr[row_position];
const auto in_idx = sliced_indptr[row_position] + row_offset;
const auto rnd = random_seed.uniform(labor ? indices[in_idx] : i);
if (labor) {
constexpr uint64_t kCurandSeed = 999961;
hiprand_init(kCurandSeed, random_seed, indices[in_idx], &rng);
}
const auto rnd = hiprand_uniform(&rng);
const auto prob =
sliced_weights ? sliced_weights[i] : static_cast<weights_t>(1);
const auto exp_rnd = -__logf(rnd);
@@ -216,9 +251,9 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
}
// Finally, copy the adjusted fanout values to the device memory.
auto fanouts_device = allocator.AllocateStorage<int64_t>(fanouts.size());
CUDA_CALL(cudaMemcpyAsync(
CUDA_CALL(hipMemcpyAsync(
fanouts_device.get(), fanouts_pinned_ptr,
sizeof(int64_t) * fanouts.size(), cudaMemcpyHostToDevice,
sizeof(int64_t) * fanouts.size(), hipMemcpyHostToDevice,
cuda::GetCurrentStream()));
auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, seeds);
auto in_degree = std::get<0>(in_degree_and_sliced_indptr);
......
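The _ComputeRandoms hunk swaps graphbolt's continuous_seed-based uniform draw for an explicit Philox generator from hipRAND: in the non-LABOR path each thread seeds one hiprandStatePhilox4_32_10_t once, using its global thread index as the subsequence, while the LABOR path re-initializes the state per edge from the fixed kCurandSeed, the run's seed, and indices[in_idx], so the draw depends only on those values. A minimal, self-contained kernel using the same API; the kernel and buffer names are illustrative:

#include <hiprand/hiprand_kernel.h>
// One uniform draw per element, one counter-based Philox state per thread.
__global__ void uniform_fill(float* out, int64_t n, unsigned long long seed) {
  const int64_t i = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
  if (i >= n) return;
  hiprandStatePhilox4_32_10_t rng;
  // seed, subsequence, offset: distinct subsequences keep per-thread streams independent.
  hiprand_init(seed, /*subsequence=*/i, /*offset=*/0, &rng);
  out[i] = hiprand_uniform(&rng);  // uniform float in (0, 1]
}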
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -7,10 +8,10 @@
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "./common.h"
#include "./utils.h"
#include "common.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
@@ -55,7 +56,8 @@ std::tuple<torch::Tensor, torch::Tensor> SliceCSCIndptr(
THRUST_CALL(
for_each, iota, iota + num_nodes,
SliceFunc<indptr_t, nodes_t>{
nodes.data_ptr<nodes_t>(), indptr.data_ptr<indptr_t>(),
// nodes.data_ptr<nodes_t>(), indptr.data_ptr<indptr_t>(),
cuda::getTensorDevicePointer<nodes_t>(nodes), cuda::getTensorDevicePointer<indptr_t>(indptr),
in_degree.data_ptr<indptr_t>(),
sliced_indptr.data_ptr<indptr_t>()});
}));
@@ -71,8 +73,8 @@ std::tuple<torch::Tensor, torch::Tensor> SliceCSCIndptr(
using indptr_t = scalar_t;
CUB_CALL(
DeviceAdjacentDifference::SubtractLeftCopy,
indptr.data_ptr<indptr_t>(), in_degree.data_ptr<indptr_t>(),
num_nodes + 1, cub::Difference{});
cuda::getTensorDevicePointer<indptr_t>(indptr), in_degree.data_ptr<indptr_t>(),
num_nodes + 1, hipcub::Difference{});
}));
in_degree = in_degree.slice(0, 1);
return {in_degree, sliced_indptr};
@@ -126,7 +128,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> SliceCSCIndptrHetero(
CUB_CALL(
DeviceAdjacentDifference::SubtractLeftCopy,
new_sub_indptr.data_ptr<indptr_t>(),
new_indegree.data_ptr<indptr_t>(), num_rows + 1, cub::Difference{});
new_indegree.data_ptr<indptr_t>(), num_rows + 1, hipcub::Difference{});
}));
// Discard the first element of the SubtractLeftCopy result and ensure that
// new_indegree tensor has size num_rows + 1 so that its ExclusiveCumSum is
......
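Both SliceCSCIndptr hunks compute degrees with DeviceAdjacentDifference::SubtractLeftCopy over num_nodes + 1 (or num_rows + 1) indptr entries and then discard the first output element; the only changes here are qualifying the Difference functor with hipcub:: instead of cub:: and taking the indptr pointer through getTensorDevicePointer. For this use the device call is equivalent to the following scalar loop (names illustrative):

// in_degree[i] = indptr[i + 1] - indptr[i]: what SubtractLeftCopy over
// num_nodes + 1 entries followed by in_degree.slice(0, 1) produces.
void degrees_from_indptr(const int64_t* indptr, int64_t* in_degree, int64_t num_nodes) {
  for (int64_t i = 0; i < num_nodes; ++i)
    in_degree[i] = indptr[i + 1] - indptr[i];
}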
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -6,10 +7,10 @@
*/
#include <c10/core/ScalarType.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "./common.h"
#include "./utils.h"
#include "common.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* @file cuda/unique_and_compact_impl.cu
* @brief Unique and compact operator implementation on CUDA.
*/
#include <hip/hip_runtime.h>
#include <graphbolt/cuda_ops.h>
#include <thrust/binary_search.h>
#include <thrust/functional.h>
#include <thrust/gather.h>
#include <thrust/logical.h>
#include <cub/cub.cuh>
#include <mutex>
#include <hipcub/hipcub.hpp>
#include <type_traits>
#include <unordered_map>
@@ -119,12 +121,17 @@
// The code block above synchronizes, ensuring safe access to
// max_id_src and max_id_dst.
if (num_bits == 0) {
//
index_t max_id = 0;
for (std::size_t i = 0; i < max_id_src.size(); i++) {
max_id = std::max(max_id, static_cast<index_t>(max_id_src[i]));
max_id = std::max(max_id, static_cast<index_t>(max_id_dst[i]));
}
num_bits = cuda::NumberOfBits(1ll + max_id);
// num_bits = cuda::NumberOfBits(
// 1 + ::max(
// static_cast<scalar_t>(max_id_src),
// static_cast<scalar_t>(max_id_dst)));
}
// Sort the only_src tensor so that we can unique it later.
......
@@ -101,6 +101,17 @@ __device__ indices_t UpperBound(const indptr_t* A, indices_t n, indptr_t x) {
return l;
}
template<typename DType>
inline DType* getTensorDevicePointer(torch::Tensor inputTensor)
{
DType* ret = inputTensor.data_ptr<DType>();
if(inputTensor.is_pinned())
{
CUDA_CALL(hipHostGetDevicePointer((void**)&ret, (void*)ret, 0));
}
return ret;
}
} // namespace cuda
} // namespace graphbolt
......
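The cuda::getTensorDevicePointer helper added to utils.h centralizes the pinned-tensor handling used in the index-select and slice hunks above: it returns data_ptr<DType>() for an ordinary device tensor and the hipHostGetDevicePointer alias for a pinned host tensor. A hypothetical call site:

// Works whether indptr_tensor is a CUDA tensor or a pinned CPU tensor, so the
// caller no longer needs to special-case UVA inputs before a kernel launch.
int64_t* d_indptr = cuda::getTensorDevicePointer<int64_t>(indptr_tensor);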
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -7,8 +8,8 @@
#include <graphbolt/cuda_ops.h>
#include <torch/autograd.h>
#include "./macro.h"
#include "./utils.h"
#include "macro.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* @file fused_csc_sampling_graph.cc
@@ -24,6 +25,7 @@
#include "./shared_memory_helper.h"
#include "./utils.h"
namespace {
torch::optional<torch::Dict<std::string, torch::Tensor>> TensorizeDict(
const torch::optional<torch::Dict<std::string, int64_t>>& dict) {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* @file index_select.cc
@@ -8,8 +9,8 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/fused_csc_sampling_graph.h>
#include "./macro.h"
#include "./utils.h"
#include "macro.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
*
@@ -8,8 +9,8 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/isin.h>
#include "./macro.h"
#include "./utils.h"
#include "macro.h"
#include "utils.h"
namespace {
static constexpr int kSearchGrainSize = 4096;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* @file python_binding.cc
@@ -10,15 +11,17 @@
#include <graphbolt/unique_and_compact.h>
#ifdef GRAPHBOLT_USE_CUDA
#include "./cuda/max_uva_threads.h"
#include "cuda/max_uva_threads.h"
#endif
#include "./cnumpy.h"
#include "./expand_indptr.h"
#include "./index_select.h"
#include "./random.h"
#ifdef GRAPHBOLT_USE_CUDA
#include "./cuda/gpu_cache.h"
#include "cuda/gpu_cache.h"
#endif
namespace graphbolt {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* @file random.cc
* @brief Random Engine.
*/
#include "./random.h"
#include "random.h"
#include <torch/torch.h>
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
*
* @file shared_memory_helper.cc
* @brief Share memory helper implementation.
*/
#include "./shared_memory_helper.h"
#include "shared_memory_helper.h"
#include <graphbolt/serialize.h>
#include <graphbolt/shared_memory.h>
......