"vscode:/vscode.git/clone" did not exist on "68bd6934b1e683b6dcf2c9257db05ea5af69f1c5"
Commit 6ac701f8 authored by sangwzh

update src and graphbolt code

parent 1547bd93
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2017-2023 by Contributors
  * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -7,11 +8,11 @@
 #ifndef GRAPHBOLT_CUDA_COMMON_H_
 #define GRAPHBOLT_CUDA_COMMON_H_
-#include <ATen/cuda/CUDAEvent.h>
-#include <c10/cuda/CUDACachingAllocator.h>
-#include <c10/cuda/CUDAException.h>
-#include <c10/cuda/CUDAStream.h>
-#include <cuda_runtime.h>
+#include <ATen/hip/HIPEvent.h>
+#include <ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h>
+#include <c10/hip/HIPException.h>
+#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
+#include <hip/hip_runtime.h>
 #include <torch/script.h>
 #include <memory>
@@ -26,8 +27,8 @@ namespace cuda {
  * that uses torch's CUDA memory pool and the current cuda stream:
  *
  * cuda::CUDAWorkspaceAllocator allocator;
- * const auto stream = torch::cuda::getDefaultCUDAStream();
- * const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
+ * const auto stream = torch::hip::getDefaultHIPStreamMasqueradingAsCUDA();
+ * const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
  *
  * Now, one can pass exec_policy to thrust functions
  *
@@ -47,13 +48,13 @@ struct CUDAWorkspaceAllocator {
   CUDAWorkspaceAllocator& operator=(const CUDAWorkspaceAllocator&) = default;
   void operator()(void* ptr) const {
-    c10::cuda::CUDACachingAllocator::raw_delete(ptr);
+    c10::hip::HIPCachingAllocator::raw_delete(ptr);
   }
   // Required by thrust to satisfy allocator requirements.
   value_type* allocate(std::ptrdiff_t size) const {
     return reinterpret_cast<value_type*>(
-        c10::cuda::CUDACachingAllocator::raw_alloc(size));
+        c10::hip::HIPCachingAllocator::raw_alloc(size));
   }
   // Required by thrust to satisfy allocator requirements.
@@ -69,7 +70,7 @@ struct CUDAWorkspaceAllocator {
 inline auto GetAllocator() { return CUDAWorkspaceAllocator{}; }
-inline auto GetCurrentStream() { return c10::cuda::getCurrentCUDAStream(); }
+inline auto GetCurrentStream() { return c10::hip::getCurrentHIPStreamMasqueradingAsCUDA(); }
 template <typename T>
 inline bool is_zero(T size) {
@@ -81,15 +82,15 @@ inline bool is_zero<dim3>(dim3 size) {
   return size.x == 0 || size.y == 0 || size.z == 0;
 }
-#define CUDA_CALL(func) C10_CUDA_CHECK((func))
+#define CUDA_CALL(func) C10_HIP_CHECK((func))
 #define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, ...) \
   { \
     if (!graphbolt::cuda::is_zero((nblks)) && \
         !graphbolt::cuda::is_zero((nthrs))) { \
       auto stream = graphbolt::cuda::GetCurrentStream(); \
-      (kernel)<<<(nblks), (nthrs), (shmem), stream>>>(__VA_ARGS__); \
-      C10_CUDA_KERNEL_LAUNCH_CHECK(); \
+      hipLaunchKernelGGL(( (kernel)), dim3((nblks)), dim3((nthrs)), (shmem), stream, __VA_ARGS__); \
+      C10_HIP_KERNEL_LAUNCH_CHECK(); \
     } \
   }
@@ -98,16 +99,16 @@ inline bool is_zero<dim3>(dim3 size) {
     auto allocator = graphbolt::cuda::GetAllocator(); \
     auto stream = graphbolt::cuda::GetCurrentStream(); \
     size_t workspace_size = 0; \
-    CUDA_CALL(cub::fn(nullptr, workspace_size, __VA_ARGS__, stream)); \
+    CUDA_CALL(hipcub::fn(nullptr, workspace_size, __VA_ARGS__, stream)); \
     auto workspace = allocator.AllocateStorage<char>(workspace_size); \
-    CUDA_CALL(cub::fn(workspace.get(), workspace_size, __VA_ARGS__, stream)); \
+    CUDA_CALL(hipcub::fn(workspace.get(), workspace_size, __VA_ARGS__, stream)); \
   }
 #define THRUST_CALL(fn, ...) \
   [&] { \
     auto allocator = graphbolt::cuda::GetAllocator(); \
     auto stream = graphbolt::cuda::GetCurrentStream(); \
-    const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); \
+    const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream); \
     return thrust::fn(exec_policy, __VA_ARGS__); \
   }()
@@ -126,7 +127,7 @@ template <typename scalar_t>
 struct CopyScalar {
   CopyScalar() : is_ready_(true) { init_pinned_storage(); }
-  void record(at::cuda::CUDAStream stream = GetCurrentStream()) {
+  void record(at::hip::HIPStreamMasqueradingAsCUDA stream = GetCurrentStream()) {
     copy_event_.record(stream);
     is_ready_ = false;
   }
@@ -138,9 +139,9 @@ struct CopyScalar {
   CopyScalar(const scalar_t* device_ptr) {
     init_pinned_storage();
     auto stream = GetCurrentStream();
-    CUDA_CALL(cudaMemcpyAsync(
+    CUDA_CALL(hipMemcpyAsync(
         reinterpret_cast<scalar_t*>(pinned_scalar_.data_ptr()), device_ptr,
-        sizeof(scalar_t), cudaMemcpyDeviceToHost, stream));
+        sizeof(scalar_t), hipMemcpyDeviceToHost, stream));
     record(stream);
   }
...
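The point of the "masquerading" headers above is that existing call sites keep compiling unchanged: the macros still spell CUDA but now expand to HIP equivalents (hipLaunchKernelGGL, C10_HIP_CHECK, rocThrust's par_nosync). A minimal caller sketch under that assumption; the kernel name _FillKernel and the launch sizes are illustrative, not part of this commit:

// Hypothetical call site, assuming the hipified common.h above.
#include <thrust/execution_policy.h>
#include <thrust/fill.h>
#include "common.h"

__global__ void _FillKernel(int* out, int n, int value) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = gridDim.x * blockDim.x;
  while (i < n) {  // grid-stride loop, as in the kernels in this commit
    out[i] = value;
    i += stride;
  }
}

void Fill(int* out, int n, int value) {
  const dim3 block(256);
  const dim3 grid((n + block.x - 1) / block.x);
  // Expands to hipLaunchKernelGGL(...) + C10_HIP_KERNEL_LAUNCH_CHECK().
  CUDA_KERNEL_CALL(_FillKernel, grid, block, 0, out, n, value);
  // Expands to thrust::fill(thrust::hip::par_nosync(allocator).on(stream), ...).
  THRUST_CALL(fill, out, out + n, value);
}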
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
  * @file cuda/cumsum.cu
  * @brief Cumsum operators implementation on CUDA.
  */
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
-#include "./common.h"
+#include "common.h"
 namespace graphbolt {
 namespace ops {
...
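CUB_CALL (defined in common.h above) hides hipCUB's standard two-phase idiom: call the primitive once with a null workspace to learn the required size, allocate, then call again with the same arguments. The hunk bodies are collapsed in this diff, so here is a standalone sketch of that idiom using hipcub::DeviceScan::ExclusiveSum, a plausible but assumed primitive for a cumsum:

#include <hipcub/hipcub.hpp>

// Two-phase hipCUB call; error handling omitted for brevity.
void ExclusiveCumSum(const int64_t* d_in, int64_t* d_out, int num_items,
                     hipStream_t stream) {
  size_t workspace_size = 0;
  // Pass 1: null workspace pointer, only computes workspace_size.
  hipcub::DeviceScan::ExclusiveSum(
      nullptr, workspace_size, d_in, d_out, num_items, stream);
  void* workspace = nullptr;
  hipMalloc(&workspace, workspace_size);
  // Pass 2: same arguments with a real workspace, performs the scan.
  hipcub::DeviceScan::ExclusiveSum(
      workspace, workspace_size, d_in, d_out, num_items, stream);
  hipFree(workspace);
}

In the actual code the workspace comes from allocator.AllocateStorage<char>(workspace_size), i.e. torch's caching memory pool, rather than a raw hipMalloc.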
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -8,10 +9,10 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <limits>
-#include "./common.h"
+#include "common.h"
 namespace graphbolt {
 namespace ops {
@@ -86,7 +87,7 @@ torch::Tensor ExpandIndptrImpl(
       CUB_CALL(
           DeviceCopy::Batched, input_buffer + i,
           output_buffer + i, buffer_sizes + i,
-          std::min(num_rows - i, max_copy_at_once));
+          ::min(num_rows - i, max_copy_at_once));
     }
   }));
 }));
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -6,8 +7,8 @@
  */
 #include <numeric>
-#include "./common.h"
-#include "./gpu_cache.h"
+#include "common.h"
+#include "gpu_cache.h"
 namespace graphbolt {
 namespace cuda {
...
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2023 by Contributors
  * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -10,12 +12,12 @@
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <numeric>
-#include "./common.h"
-#include "./max_uva_threads.h"
-#include "./utils.h"
+#include "common.h"
+#include "max_uva_threads.h"
+#include "utils.h"
 namespace graphbolt {
 namespace ops {
@@ -132,7 +134,7 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
       torch::empty(output_size.value(), options.dtype(indices.scalar_type()));
   const dim3 block(BLOCK_SIZE);
   const dim3 grid(
-      (std::min(edge_count_aligned, cuda::max_uva_threads.value_or(1 << 20)) +
+      (::min(edge_count_aligned, cuda::max_uva_threads.value_or(1 << 20)) +
        BLOCK_SIZE - 1) /
       BLOCK_SIZE);
@@ -220,7 +222,7 @@ void IndexSelectCSCCopyIndices(
   for (int64_t i = 0; i < num_nodes; i += max_copy_at_once) {
     CUB_CALL(
         DeviceMemcpy::Batched, input_buffer_it + i, output_buffer_it + i,
-        buffer_sizes + i, std::min(num_nodes - i, max_copy_at_once));
+        buffer_sizes + i, ::min(num_nodes - i, max_copy_at_once));
   }
 }
...
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
 /**
  * Copyright (c) 2023 by Contributors
  * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -9,9 +11,9 @@
 #include <numeric>
-#include "./common.h"
-#include "./max_uva_threads.h"
-#include "./utils.h"
+#include "common.h"
+#include "max_uva_threads.h"
+#include "utils.h"
 namespace graphbolt {
 namespace ops {
@@ -124,7 +126,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
   // Use a single thread to process each output row to avoid wasting threads.
   const int num_threads = cuda::FindNumThreads(return_len);
   const int num_blocks =
-      (std::min(return_len, cuda::max_uva_threads.value_or(1 << 20)) +
+      (::min(return_len, cuda::max_uva_threads.value_or(1 << 20)) +
       num_threads - 1) /
       num_threads;
   CUDA_KERNEL_CALL(
@@ -137,7 +139,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
     block.x >>= 1;
     block.y <<= 1;
   }
-  const dim3 grid(std::min(
+  const dim3 grid(::min(
       (return_len + block.y - 1) / block.y,
       cuda::max_uva_threads.value_or(1 << 20) / BLOCK_SIZE));
   if (aligned_feature_size * sizeof(DType) <= GPU_CACHE_LINE_SIZE) {
...
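Both index_select kernels cap their launch at cuda::max_uva_threads (defaulting to 1 << 20 when unset) before the usual ceil-division, so the number of threads issuing UVA reads stays bounded while grid-stride loops cover the remainder. A host-side sketch of that computation with illustrative numbers; note that hipify rewrote std::min to ::min on these paths, which HIP provides as global overloads usable in device code:

#include <algorithm>
#include <cstdint>

// Thread-capped ceil-division, mirroring the grid computations above.
int64_t NumBlocks(int64_t work_items, int64_t threads_per_block,
                  int64_t max_threads = int64_t{1} << 20) {
  const int64_t capped = std::min(work_items, max_threads);
  return (capped + threads_per_block - 1) / threads_per_block;  // round up
}
// e.g. NumBlocks(3000000, 256) caps 3000000 to 1048576 and yields 4096.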
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -8,7 +9,7 @@
 #include <graphbolt/cuda_ops.h>
 #include <graphbolt/cuda_sampling_ops.h>
-#include "./common.h"
+#include "common.h"
 namespace graphbolt {
 namespace ops {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -7,7 +8,7 @@
 #include <graphbolt/cuda_ops.h>
 #include <thrust/binary_search.h>
-#include "./common.h"
+#include "common.h"
 namespace graphbolt {
 namespace ops {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
  * @file cuda/max_uva_threads.cc
  * @brief Max uva threads variable setter function.
  */
-#include "./max_uva_threads.h"
+#include "max_uva_threads.h"
 namespace graphbolt {
 namespace cuda {
...
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
+#include "hip/hip_bf16.h"
 /**
  * Copyright (c) 2023 by Contributors
  * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -5,7 +8,7 @@
  * @brief Index select operator implementation on CUDA.
  */
 #include <c10/core/ScalarType.h>
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
 #include <graphbolt/cuda_ops.h>
 #include <graphbolt/cuda_sampling_ops.h>
 #include <thrust/gather.h>
@@ -15,14 +18,14 @@
 #include <algorithm>
 #include <array>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <limits>
 #include <numeric>
 #include <type_traits>
 #include "../random.h"
-#include "./common.h"
-#include "./utils.h"
+#include "common.h"
+#include "utils.h"
 namespace graphbolt {
 namespace ops {
@@ -44,11 +47,11 @@ __global__ void _ComputeRandoms(
     const uint64_t random_seed, float_t* random_arr, edge_id_t* edge_ids) {
   int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
   const int stride = gridDim.x * blockDim.x;
-  curandStatePhilox4_32_10_t rng;
+  hiprandStatePhilox4_32_10_t rng;
   const auto labor = indices != nullptr;
   if (!labor) {
-    curand_init(random_seed, i, 0, &rng);
+    hiprand_init(random_seed, i, 0, &rng);
   }
   while (i < num_edges) {
@@ -58,10 +61,10 @@ __global__ void _ComputeRandoms(
     if (labor) {
       constexpr uint64_t kCurandSeed = 999961;
-      curand_init(kCurandSeed, random_seed, indices[in_idx], &rng);
+      hiprand_init(kCurandSeed, random_seed, indices[in_idx], &rng);
     }
-    const auto rnd = curand_uniform(&rng);
+    const auto rnd = hiprand_uniform(&rng);
     const auto prob =
         sliced_weights ? sliced_weights[i] : static_cast<weights_t>(1);
     const auto exp_rnd = -__logf(rnd);
@@ -152,9 +155,9 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
   }
   // Finally, copy the adjusted fanout values to the device memory.
   auto fanouts_device = allocator.AllocateStorage<int64_t>(fanouts.size());
-  CUDA_CALL(cudaMemcpyAsync(
+  CUDA_CALL(hipMemcpyAsync(
       fanouts_device.get(), fanouts_pinned_ptr,
-      sizeof(int64_t) * fanouts.size(), cudaMemcpyHostToDevice,
+      sizeof(int64_t) * fanouts.size(), hipMemcpyHostToDevice,
       cuda::GetCurrentStream()));
   auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes);
   auto in_degree = std::get<0>(in_degree_and_sliced_indptr);
@@ -271,7 +274,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
           "Selected edge_id_t must be capable of storing edge_ids.");
       // Using bfloat16 for random numbers works just as reliably as
       // float32 and provides around %30 percent speedup.
-      using rnd_t = nv_bfloat16;
+      using rnd_t = __hip_bfloat16;
       auto randoms =
           allocator.AllocateStorage<rnd_t>(num_edges.value());
       auto randoms_sorted =
@@ -362,7 +365,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
         CUB_CALL(
             DeviceCopy::Batched, input_buffer_it + i,
             output_buffer_it + i, sampled_degree + i,
-            std::min(num_rows - i, max_copy_at_once));
+            ::min(num_rows - i, max_copy_at_once));
       }
     }));
...
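The curand-to-hiprand renames above are one-to-one: the Philox state type, the (seed, subsequence, offset) seeding, and the uniform draw keep their signatures. Note also how the LABOR branch re-seeds per edge with a fixed kCurandSeed so that a given (random_seed, index) pair always reproduces the same random number. A minimal self-contained sketch of the same pattern; kernel and buffer names are illustrative:

#include <cstdint>
#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>

// Per-thread Philox uniforms, mirroring the hipified _ComputeRandoms.
__global__ void UniformRandoms(float* out, int n, uint64_t seed) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = gridDim.x * blockDim.x;
  hiprandStatePhilox4_32_10_t rng;
  // Same (seed, subsequence, offset) contract as curand_init.
  hiprand_init(seed, i, 0, &rng);
  while (i < n) {
    out[i] = hiprand_uniform(&rng);  // uniform in (0, 1]
    i += stride;
  }
}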
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -7,10 +8,10 @@
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
-#include "./common.h"
-#include "./utils.h"
+#include "common.h"
+#include "utils.h"
 namespace graphbolt {
 namespace ops {
@@ -72,7 +73,7 @@ std::tuple<torch::Tensor, torch::Tensor> SliceCSCIndptr(
     CUB_CALL(
         DeviceAdjacentDifference::SubtractLeftCopy,
         indptr.data_ptr<indptr_t>(), in_degree.data_ptr<indptr_t>(),
-        num_nodes + 1, cub::Difference{});
+        num_nodes + 1, hipcub::Difference{});
   }));
   in_degree = in_degree.slice(0, 1);
   return {in_degree, sliced_indptr};
@@ -126,7 +127,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> SliceCSCIndptrHetero(
     CUB_CALL(
         DeviceAdjacentDifference::SubtractLeftCopy,
         new_sub_indptr.data_ptr<indptr_t>(),
-        new_indegree.data_ptr<indptr_t>(), num_rows + 1, cub::Difference{});
+        new_indegree.data_ptr<indptr_t>(), num_rows + 1, hipcub::Difference{});
   }));
   // Discard the first element of the SubtractLeftCopy result and ensure that
   // new_indegree tensor has size num_rows + 1 so that its ExclusiveCumSum is
...
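DeviceAdjacentDifference::SubtractLeftCopy turns an indptr slice into in-degrees: out[k] = in[k] - in[k-1] for k > 0, with out[0] copied through and later discarded by in_degree.slice(0, 1). For indptr = {0, 2, 5, 9} it produces {0, 2, 3, 4}, i.e. degrees {2, 3, 4} after the slice. A standalone sketch of the hipCUB call, with the workspace handling simplified to a raw hipMalloc:

#include <hipcub/hipcub.hpp>

// indptr -> in-degree, as in SliceCSCIndptr above; error checks omitted.
void InDegreeFromIndptr(const int64_t* d_indptr, int64_t* d_out,
                        int num_nodes, hipStream_t stream) {
  size_t workspace_size = 0;
  hipcub::DeviceAdjacentDifference::SubtractLeftCopy(
      nullptr, workspace_size, d_indptr, d_out, num_nodes + 1,
      hipcub::Difference{}, stream);
  void* workspace = nullptr;
  hipMalloc(&workspace, workspace_size);
  hipcub::DeviceAdjacentDifference::SubtractLeftCopy(
      workspace, workspace_size, d_indptr, d_out, num_nodes + 1,
      hipcub::Difference{}, stream);
  hipFree(workspace);
}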
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -6,10 +7,10 @@
  */
 #include <c10/core/ScalarType.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
-#include "./common.h"
-#include "./utils.h"
+#include "common.h"
+#include "utils.h"
 namespace graphbolt {
 namespace ops {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -10,11 +11,11 @@
 #include <thrust/gather.h>
 #include <thrust/logical.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
 #include <type_traits>
-#include "./common.h"
-#include "./utils.h"
+#include "common.h"
+#include "utils.h"
 namespace graphbolt {
 namespace ops {
@@ -97,7 +98,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> UniqueAndCompact(
   // and max_id_dst.
   if (num_bits == 0) {
     num_bits = cuda::NumberOfBits(
-        1 + std::max(
+        1 + ::max(
                 static_cast<scalar_t>(max_id_src),
                 static_cast<scalar_t>(max_id_dst)));
   }
...
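Here cuda::NumberOfBits(1 + max(max_id_src, max_id_dst)) bounds how many bits the subsequent sort has to inspect; cub/hipcub radix sorts accept begin_bit/end_bit, so sorting only the significant bits of the node ids saves passes. The helper itself is defined elsewhere and not shown in this diff; an illustrative stand-in:

#include <cstdint>

// Illustrative stand-in for cuda::NumberOfBits: the smallest b with
// 2^b >= range. The real helper lives outside this diff and may differ.
int NumberOfBits(uint64_t range) {
  int bits = 0;
  while ((uint64_t{1} << bits) < range) ++bits;
  return bits;
}
// e.g. max id 1000 across src and dst: NumberOfBits(1001) == 10,
// so a radix sort can stop after the low 10 bits.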
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -6,8 +7,8 @@
  */
 #include <graphbolt/cuda_ops.h>
-#include "./macro.h"
-#include "./utils.h"
+#include "macro.h"
+#include "utils.h"
 namespace graphbolt {
 namespace ops {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  * @file fused_csc_sampling_graph.cc
@@ -17,10 +18,10 @@
 #include <tuple>
 #include <vector>
-#include "./macro.h"
-#include "./random.h"
-#include "./shared_memory_helper.h"
-#include "./utils.h"
+#include "macro.h"
+#include "random.h"
+#include "shared_memory_helper.h"
+#include "utils.h"
 namespace {
 torch::optional<torch::Dict<std::string, torch::Tensor>> TensorizeDict(
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  * @file index_select.cc
@@ -6,8 +7,8 @@
 #include <graphbolt/cuda_ops.h>
 #include <graphbolt/fused_csc_sampling_graph.h>
-#include "./macro.h"
-#include "./utils.h"
+#include "macro.h"
+#include "utils.h"
 namespace graphbolt {
 namespace ops {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  *
@@ -8,8 +9,8 @@
 #include <graphbolt/cuda_ops.h>
 #include <graphbolt/isin.h>
-#include "./macro.h"
-#include "./utils.h"
+#include "macro.h"
+#include "utils.h"
 namespace {
 static constexpr int kSearchGrainSize = 4096;
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  * @file python_binding.cc
@@ -10,14 +11,14 @@
 #include <graphbolt/unique_and_compact.h>
 #ifdef GRAPHBOLT_USE_CUDA
-#include "./cuda/max_uva_threads.h"
+#include "cuda/max_uva_threads.h"
 #endif
-#include "./expand_indptr.h"
-#include "./index_select.h"
-#include "./random.h"
+#include "expand_indptr.h"
+#include "index_select.h"
+#include "random.h"
 #ifdef GRAPHBOLT_USE_CUDA
-#include "./cuda/gpu_cache.h"
+#include "cuda/gpu_cache.h"
 #endif
 namespace graphbolt {
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  * @file random.cc
  * @brief Random Engine.
  */
-#include "./random.h"
+#include "random.h"
 #include <torch/torch.h>
...
+// !!! This is a file automatically generated by hipify!!!
 /**
  * Copyright (c) 2023 by Contributors
  *
  * @file shared_memory_helper.cc
  * @brief Share memory helper implementation.
  */
-#include "./shared_memory_helper.h"
+#include "shared_memory_helper.h"
 #include <graphbolt/serialize.h>
 #include <graphbolt/shared_memory.h>
...