"doc/vscode:/vscode.git/clone" did not exist on "ca1dc1e7d16958893aa4ef3e005ad419e55a4b71"
Commit 6ac701f8 authored by sangwzh's avatar sangwzh
Browse files

update src and graphbolt code

parent 1547bd93
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2017-2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -7,11 +8,11 @@
#ifndef GRAPHBOLT_CUDA_COMMON_H_
#define GRAPHBOLT_CUDA_COMMON_H_
-#include <ATen/cuda/CUDAEvent.h>
-#include <c10/cuda/CUDACachingAllocator.h>
-#include <c10/cuda/CUDAException.h>
-#include <c10/cuda/CUDAStream.h>
-#include <cuda_runtime.h>
+#include <ATen/hip/HIPEvent.h>
+#include <ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h>
+#include <c10/hip/HIPException.h>
+#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
+#include <hip/hip_runtime.h>
#include <torch/script.h>
#include <memory>
@@ -26,8 +27,8 @@ namespace cuda {
* that uses torch's CUDA memory pool and the current cuda stream:
*
* cuda::CUDAWorkspaceAllocator allocator;
- * const auto stream = torch::cuda::getDefaultCUDAStream();
- * const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
+ * const auto stream = torch::hip::getDefaultHIPStreamMasqueradingAsCUDA();
+ * const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
*
* Now, one can pass exec_policy to thrust functions
*
@@ -47,13 +48,13 @@ struct CUDAWorkspaceAllocator {
CUDAWorkspaceAllocator& operator=(const CUDAWorkspaceAllocator&) = default;
void operator()(void* ptr) const {
-c10::cuda::CUDACachingAllocator::raw_delete(ptr);
+c10::hip::HIPCachingAllocator::raw_delete(ptr);
}
// Required by thrust to satisfy allocator requirements.
value_type* allocate(std::ptrdiff_t size) const {
return reinterpret_cast<value_type*>(
-c10::cuda::CUDACachingAllocator::raw_alloc(size));
+c10::hip::HIPCachingAllocator::raw_alloc(size));
}
// Required by thrust to satisfy allocator requirements.
@@ -69,7 +70,7 @@ struct CUDAWorkspaceAllocator {
inline auto GetAllocator() { return CUDAWorkspaceAllocator{}; }
-inline auto GetCurrentStream() { return c10::cuda::getCurrentCUDAStream(); }
+inline auto GetCurrentStream() { return c10::hip::getCurrentHIPStreamMasqueradingAsCUDA(); }
template <typename T>
inline bool is_zero(T size) {
@@ -81,15 +82,15 @@ inline bool is_zero<dim3>(dim3 size) {
return size.x == 0 || size.y == 0 || size.z == 0;
}
-#define CUDA_CALL(func) C10_CUDA_CHECK((func))
+#define CUDA_CALL(func) C10_HIP_CHECK((func))
#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, ...) \
{ \
if (!graphbolt::cuda::is_zero((nblks)) && \
!graphbolt::cuda::is_zero((nthrs))) { \
auto stream = graphbolt::cuda::GetCurrentStream(); \
-(kernel)<<<(nblks), (nthrs), (shmem), stream>>>(__VA_ARGS__); \
-C10_CUDA_KERNEL_LAUNCH_CHECK(); \
+hipLaunchKernelGGL(( (kernel)), dim3((nblks)), dim3((nthrs)), (shmem), stream, __VA_ARGS__); \
+C10_HIP_KERNEL_LAUNCH_CHECK(); \
} \
}
@@ -98,16 +99,16 @@ inline bool is_zero<dim3>(dim3 size) {
auto allocator = graphbolt::cuda::GetAllocator(); \
auto stream = graphbolt::cuda::GetCurrentStream(); \
size_t workspace_size = 0; \
-CUDA_CALL(cub::fn(nullptr, workspace_size, __VA_ARGS__, stream)); \
+CUDA_CALL(hipcub::fn(nullptr, workspace_size, __VA_ARGS__, stream)); \
auto workspace = allocator.AllocateStorage<char>(workspace_size); \
-CUDA_CALL(cub::fn(workspace.get(), workspace_size, __VA_ARGS__, stream)); \
+CUDA_CALL(hipcub::fn(workspace.get(), workspace_size, __VA_ARGS__, stream)); \
}
#define THRUST_CALL(fn, ...) \
[&] { \
auto allocator = graphbolt::cuda::GetAllocator(); \
auto stream = graphbolt::cuda::GetCurrentStream(); \
-const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); \
+const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream); \
return thrust::fn(exec_policy, __VA_ARGS__); \
}()
@@ -126,7 +127,7 @@ template <typename scalar_t>
struct CopyScalar {
CopyScalar() : is_ready_(true) { init_pinned_storage(); }
-void record(at::cuda::CUDAStream stream = GetCurrentStream()) {
+void record(at::hip::HIPStreamMasqueradingAsCUDA stream = GetCurrentStream()) {
copy_event_.record(stream);
is_ready_ = false;
}
@@ -138,9 +139,9 @@ struct CopyScalar {
CopyScalar(const scalar_t* device_ptr) {
init_pinned_storage();
auto stream = GetCurrentStream();
-CUDA_CALL(cudaMemcpyAsync(
+CUDA_CALL(hipMemcpyAsync(
reinterpret_cast<scalar_t*>(pinned_scalar_.data_ptr()), device_ptr,
-sizeof(scalar_t), cudaMemcpyDeviceToHost, stream));
+sizeof(scalar_t), hipMemcpyDeviceToHost, stream));
record(stream);
}
......
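Note: a minimal usage sketch (not part of this commit) of the hipified helpers above; SumOfFirstN is an illustrative name, and a ROCm build of PyTorch providing the masquerading wrappers is assumed.

#include <thrust/reduce.h>
#include <thrust/sequence.h>

#include "common.h"

// Fill a device buffer with 0..n-1 and reduce it; THRUST_CALL routes both
// calls through torch's caching allocator and the current stream.
int64_t SumOfFirstN(int64_t n) {
  auto allocator = graphbolt::cuda::GetAllocator();
  auto buffer = allocator.AllocateStorage<int64_t>(n);
  THRUST_CALL(sequence, buffer.get(), buffer.get() + n);
  return THRUST_CALL(reduce, buffer.get(), buffer.get() + n, int64_t{0});
}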
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* @file cuda/cumsum.cu
* @brief Cumsum operators implementation on CUDA.
*/
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
-#include "./common.h"
+#include "common.h"
namespace graphbolt {
namespace ops {
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -8,10 +9,10 @@
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
#include <limits>
#include "./common.h"
#include "common.h"
namespace graphbolt {
namespace ops {
@@ -86,7 +87,7 @@ torch::Tensor ExpandIndptrImpl(
CUB_CALL(
DeviceCopy::Batched, input_buffer + i,
output_buffer + i, buffer_sizes + i,
-std::min(num_rows - i, max_copy_at_once));
+::min(num_rows - i, max_copy_at_once));
}
}));
}));
......
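Note: the loop above issues DeviceCopy::Batched in chunks because cub/hipcub takes a 32-bit batch count; a generic sketch of that pattern follows (BatchedCopyChunked is a hypothetical name, not in the commit).

#include <hipcub/hipcub.hpp>

#include <algorithm>
#include <limits>

#include "common.h"

// Copy num_rows variable-length buffers in chunks no larger than the
// 32-bit batch count accepted by hipcub::DeviceCopy::Batched.
template <typename InIt, typename OutIt, typename SizeIt>
void BatchedCopyChunked(InIt in, OutIt out, SizeIt sizes, int64_t num_rows) {
  constexpr int64_t max_copy_at_once = std::numeric_limits<int32_t>::max();
  for (int64_t i = 0; i < num_rows; i += max_copy_at_once) {
    CUB_CALL(
        DeviceCopy::Batched, in + i, out + i, sizes + i,
        std::min(num_rows - i, max_copy_at_once));
  }
}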
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -6,8 +7,8 @@
*/
#include <numeric>
#include "./common.h"
#include "./gpu_cache.h"
#include "common.h"
#include "gpu_cache.h"
namespace graphbolt {
namespace cuda {
......
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -10,12 +12,12 @@
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
#include <numeric>
#include "./common.h"
#include "./max_uva_threads.h"
#include "./utils.h"
#include "common.h"
#include "max_uva_threads.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
@@ -132,7 +134,7 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
torch::empty(output_size.value(), options.dtype(indices.scalar_type()));
const dim3 block(BLOCK_SIZE);
const dim3 grid(
-(std::min(edge_count_aligned, cuda::max_uva_threads.value_or(1 << 20)) +
+(::min(edge_count_aligned, cuda::max_uva_threads.value_or(1 << 20)) +
BLOCK_SIZE - 1) /
BLOCK_SIZE);
@@ -220,7 +222,7 @@ void IndexSelectCSCCopyIndices(
for (int64_t i = 0; i < num_nodes; i += max_copy_at_once) {
CUB_CALL(
DeviceMemcpy::Batched, input_buffer_it + i, output_buffer_it + i,
-buffer_sizes + i, std::min(num_nodes - i, max_copy_at_once));
+buffer_sizes + i, ::min(num_nodes - i, max_copy_at_once));
}
}
......
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -9,9 +11,9 @@
#include <numeric>
#include "./common.h"
#include "./max_uva_threads.h"
#include "./utils.h"
#include "common.h"
#include "max_uva_threads.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
@@ -124,7 +126,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
// Use a single thread to process each output row to avoid wasting threads.
const int num_threads = cuda::FindNumThreads(return_len);
const int num_blocks =
-(std::min(return_len, cuda::max_uva_threads.value_or(1 << 20)) +
+(::min(return_len, cuda::max_uva_threads.value_or(1 << 20)) +
num_threads - 1) /
num_threads;
CUDA_KERNEL_CALL(
@@ -137,7 +139,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
block.x >>= 1;
block.y <<= 1;
}
-const dim3 grid(std::min(
+const dim3 grid(::min(
(return_len + block.y - 1) / block.y,
cuda::max_uva_threads.value_or(1 << 20) / BLOCK_SIZE));
if (aligned_feature_size * sizeof(DType) <= GPU_CACHE_LINE_SIZE) {
......
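Note: both UVA kernels above size their grids the same way; a sketch of that rule, under the assumption that cuda::max_uva_threads is an optional int64_t cap on resident threads (NumBlocksForUVA is a hypothetical helper, not in the commit).

#include <algorithm>

#include "common.h"
#include "max_uva_threads.h"

namespace graphbolt {

// Enough blocks to cover the work size after capping total threads at
// max_uva_threads (default cap here: 1 << 20), bounding in-flight UVA reads.
inline int NumBlocksForUVA(int64_t work_items, int num_threads) {
  const int64_t capped =
      std::min(work_items, cuda::max_uva_threads.value_or(1 << 20));
  return static_cast<int>((capped + num_threads - 1) / num_threads);
}

}  // namespace graphbolt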
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -8,7 +9,7 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_sampling_ops.h>
#include "./common.h"
#include "common.h"
namespace graphbolt {
namespace ops {
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -7,7 +8,7 @@
#include <graphbolt/cuda_ops.h>
#include <thrust/binary_search.h>
#include "./common.h"
#include "common.h"
namespace graphbolt {
namespace ops {
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* @file cuda/max_uva_threads.cc
* @brief Max uva threads variable setter function.
*/
#include "./max_uva_threads.h"
#include "max_uva_threads.h"
namespace graphbolt {
namespace cuda {
......
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
+#include "hip/hip_bf16.h"
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -5,7 +8,7 @@
* @brief Index select operator implementation on CUDA.
*/
#include <c10/core/ScalarType.h>
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_sampling_ops.h>
#include <thrust/gather.h>
@@ -15,14 +18,14 @@
#include <algorithm>
#include <array>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
#include <limits>
#include <numeric>
#include <type_traits>
#include "../random.h"
#include "./common.h"
#include "./utils.h"
#include "common.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
@@ -44,11 +47,11 @@ __global__ void _ComputeRandoms(
const uint64_t random_seed, float_t* random_arr, edge_id_t* edge_ids) {
int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
const int stride = gridDim.x * blockDim.x;
-curandStatePhilox4_32_10_t rng;
+hiprandStatePhilox4_32_10_t rng;
const auto labor = indices != nullptr;
if (!labor) {
-curand_init(random_seed, i, 0, &rng);
+hiprand_init(random_seed, i, 0, &rng);
}
while (i < num_edges) {
@@ -58,10 +61,10 @@
if (labor) {
constexpr uint64_t kCurandSeed = 999961;
-curand_init(kCurandSeed, random_seed, indices[in_idx], &rng);
+hiprand_init(kCurandSeed, random_seed, indices[in_idx], &rng);
}
-const auto rnd = curand_uniform(&rng);
+const auto rnd = hiprand_uniform(&rng);
const auto prob =
sliced_weights ? sliced_weights[i] : static_cast<weights_t>(1);
const auto exp_rnd = -__logf(rnd);
@@ -152,9 +155,9 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
}
// Finally, copy the adjusted fanout values to the device memory.
auto fanouts_device = allocator.AllocateStorage<int64_t>(fanouts.size());
-CUDA_CALL(cudaMemcpyAsync(
+CUDA_CALL(hipMemcpyAsync(
fanouts_device.get(), fanouts_pinned_ptr,
-sizeof(int64_t) * fanouts.size(), cudaMemcpyHostToDevice,
+sizeof(int64_t) * fanouts.size(), hipMemcpyHostToDevice,
cuda::GetCurrentStream()));
auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes);
auto in_degree = std::get<0>(in_degree_and_sliced_indptr);
@@ -271,7 +274,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
"Selected edge_id_t must be capable of storing edge_ids.");
// Using bfloat16 for random numbers works just as reliably as
// float32 and provides around 30% speedup.
-using rnd_t = nv_bfloat16;
+using rnd_t = __hip_bfloat16;
auto randoms =
allocator.AllocateStorage<rnd_t>(num_edges.value());
auto randoms_sorted =
@@ -362,7 +365,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
CUB_CALL(
DeviceCopy::Batched, input_buffer_it + i,
output_buffer_it + i, sampled_degree + i,
-std::min(num_rows - i, max_copy_at_once));
+::min(num_rows - i, max_copy_at_once));
}
}));
......
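Note: the _ComputeRandoms hunks above map curand's Philox API one-to-one onto hiprand. A self-contained sketch of that RNG pattern follows (fill_uniform is illustrative, not part of the commit); in the labor branch the state is instead re-seeded per edge from (kCurandSeed, random_seed, indices[in_idx]) so the drawn value depends only on the seed pair and the vertex index.

#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>

// Grid-stride loop where each thread seeds its own Philox4x32-10 state;
// hiprand_init(seed, subsequence, offset, &state) mirrors curand_init.
__global__ void fill_uniform(float* out, int64_t n, uint64_t seed) {
  int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = gridDim.x * blockDim.x;
  hiprandStatePhilox4_32_10_t rng;
  hiprand_init(seed, i, 0, &rng);
  while (i < n) {
    out[i] = hiprand_uniform(&rng);  // uniform float in (0, 1]
    i += stride;
  }
}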
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -7,10 +8,10 @@
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
-#include "./common.h"
-#include "./utils.h"
+#include "common.h"
+#include "utils.h"
namespace graphbolt {
namespace ops {
@@ -72,7 +73,7 @@ std::tuple<torch::Tensor, torch::Tensor> SliceCSCIndptr(
CUB_CALL(
DeviceAdjacentDifference::SubtractLeftCopy,
indptr.data_ptr<indptr_t>(), in_degree.data_ptr<indptr_t>(),
-num_nodes + 1, cub::Difference{});
+num_nodes + 1, hipcub::Difference{});
}));
in_degree = in_degree.slice(0, 1);
return {in_degree, sliced_indptr};
@@ -126,7 +127,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> SliceCSCIndptrHetero(
CUB_CALL(
DeviceAdjacentDifference::SubtractLeftCopy,
new_sub_indptr.data_ptr<indptr_t>(),
-new_indegree.data_ptr<indptr_t>(), num_rows + 1, cub::Difference{});
+new_indegree.data_ptr<indptr_t>(), num_rows + 1, hipcub::Difference{});
}));
// Discard the first element of the SubtractLeftCopy result and ensure that
// new_indegree tensor has size num_rows + 1 so that its ExclusiveCumSum is
......
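Note: both hunks above take consecutive differences of an indptr array; a sketch of that in-degree computation through the hipified CUB_CALL (InDegreeFromIndptr is a hypothetical name, not in the commit).

#include <hipcub/hipcub.hpp>

#include "common.h"

// in_degree[i] = indptr[i + 1] - indptr[i]. SubtractLeftCopy writes
// num_nodes + 1 outputs and leaves element 0 as a copy of indptr[0],
// which the callers above discard via in_degree.slice(0, 1).
void InDegreeFromIndptr(
    const int64_t* indptr, int64_t* in_degree, int64_t num_nodes) {
  CUB_CALL(
      DeviceAdjacentDifference::SubtractLeftCopy, indptr, in_degree,
      num_nodes + 1, hipcub::Difference{});
}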
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -6,10 +7,10 @@
*/
#include <c10/core/ScalarType.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
-#include "./common.h"
-#include "./utils.h"
+#include "common.h"
+#include "utils.h"
namespace graphbolt {
namespace ops {
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -10,11 +11,11 @@
#include <thrust/gather.h>
#include <thrust/logical.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
#include <type_traits>
#include "./common.h"
#include "./utils.h"
#include "common.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
@@ -97,7 +98,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> UniqueAndCompact(
// and max_id_dst.
if (num_bits == 0) {
num_bits = cuda::NumberOfBits(
-1 + std::max(
+1 + ::max(
static_cast<scalar_t>(max_id_src),
static_cast<scalar_t>(max_id_dst)));
}
......
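Note: the num_bits computation above bounds the radix-sort passes, since keys in [0, max_id] need only NumberOfBits(max_id + 1) bits. A standalone equivalent for reference (NumberOfBitsSketch is illustrative; cuda::NumberOfBits itself is not shown in this diff).

#include <cstdint>

// Smallest b with 2^b >= range; e.g. range = 1001 gives 10, since 1024 >= 1001.
inline int NumberOfBitsSketch(uint64_t range) {
  int bits = 0;
  while (bits < 64 && (uint64_t{1} << bits) < range) ++bits;
  return bits;
}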
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -6,8 +7,8 @@
*/
#include <graphbolt/cuda_ops.h>
#include "./macro.h"
#include "./utils.h"
#include "macro.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* @file fused_csc_sampling_graph.cc
@@ -17,10 +18,10 @@
#include <tuple>
#include <vector>
#include "./macro.h"
#include "./random.h"
#include "./shared_memory_helper.h"
#include "./utils.h"
#include "macro.h"
#include "random.h"
#include "shared_memory_helper.h"
#include "utils.h"
namespace {
torch::optional<torch::Dict<std::string, torch::Tensor>> TensorizeDict(
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* @file index_select.cc
@@ -6,8 +7,8 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/fused_csc_sampling_graph.h>
#include "./macro.h"
#include "./utils.h"
#include "macro.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
*
@@ -8,8 +9,8 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/isin.h>
#include "./macro.h"
#include "./utils.h"
#include "macro.h"
#include "utils.h"
namespace {
static constexpr int kSearchGrainSize = 4096;
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* @file python_binding.cc
@@ -10,14 +11,14 @@
#include <graphbolt/unique_and_compact.h>
#ifdef GRAPHBOLT_USE_CUDA
#include "./cuda/max_uva_threads.h"
#include "cuda/max_uva_threads.h"
#endif
#include "./expand_indptr.h"
#include "./index_select.h"
#include "./random.h"
#include "expand_indptr.h"
#include "index_select.h"
#include "random.h"
#ifdef GRAPHBOLT_USE_CUDA
#include "./cuda/gpu_cache.h"
#include "cuda/gpu_cache.h"
#endif
namespace graphbolt {
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* @file random.cc
* @brief Random Engine.
*/
#include "./random.h"
#include "random.h"
#include <torch/torch.h>
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
*
* @file shared_memory_helper.cc
* @brief Share memory helper implementation.
*/
#include "./shared_memory_helper.h"
#include "shared_memory_helper.h"
#include <graphbolt/serialize.h>
#include <graphbolt/shared_memory.h>
......