"doc/vscode:/vscode.git/clone" did not exist on "ca1dc1e7d16958893aa4ef3e005ad419e55a4b71"
Commit 6ac701f8 authored by sangwzh's avatar sangwzh
Browse files

update src and graphbolt code

parent 1547bd93
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2017-2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -7,11 +8,11 @@
#ifndef GRAPHBOLT_CUDA_COMMON_H_
#define GRAPHBOLT_CUDA_COMMON_H_
-#include <ATen/cuda/CUDAEvent.h>
-#include <c10/cuda/CUDACachingAllocator.h>
-#include <c10/cuda/CUDAException.h>
-#include <c10/cuda/CUDAStream.h>
-#include <cuda_runtime.h>
+#include <ATen/hip/HIPEvent.h>
+#include <ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h>
+#include <c10/hip/HIPException.h>
+#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
+#include <hip/hip_runtime.h>
#include <torch/script.h>
#include <memory>
@@ -26,8 +27,8 @@ namespace cuda {
* that uses torch's CUDA memory pool and the current cuda stream:
*
* cuda::CUDAWorkspaceAllocator allocator;
- * const auto stream = torch::cuda::getDefaultCUDAStream();
- * const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
+ * const auto stream = torch::hip::getDefaultHIPStreamMasqueradingAsCUDA();
+ * const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
*
* Now, one can pass exec_policy to thrust functions
*
@@ -47,13 +48,13 @@ struct CUDAWorkspaceAllocator {
CUDAWorkspaceAllocator& operator=(const CUDAWorkspaceAllocator&) = default;
void operator()(void* ptr) const {
-c10::cuda::CUDACachingAllocator::raw_delete(ptr);
+c10::hip::HIPCachingAllocator::raw_delete(ptr);
}
// Required by thrust to satisfy allocator requirements.
value_type* allocate(std::ptrdiff_t size) const {
return reinterpret_cast<value_type*>(
-c10::cuda::CUDACachingAllocator::raw_alloc(size));
+c10::hip::HIPCachingAllocator::raw_alloc(size));
}
// Required by thrust to satisfy allocator requirements.
@@ -69,7 +70,7 @@ struct CUDAWorkspaceAllocator {
inline auto GetAllocator() { return CUDAWorkspaceAllocator{}; }
-inline auto GetCurrentStream() { return c10::cuda::getCurrentCUDAStream(); }
+inline auto GetCurrentStream() { return c10::hip::getCurrentHIPStreamMasqueradingAsCUDA(); }
template <typename T>
inline bool is_zero(T size) {
@@ -81,15 +82,15 @@ inline bool is_zero<dim3>(dim3 size) {
return size.x == 0 || size.y == 0 || size.z == 0;
}
-#define CUDA_CALL(func) C10_CUDA_CHECK((func))
+#define CUDA_CALL(func) C10_HIP_CHECK((func))
#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, ...) \
{ \
if (!graphbolt::cuda::is_zero((nblks)) && \
!graphbolt::cuda::is_zero((nthrs))) { \
auto stream = graphbolt::cuda::GetCurrentStream(); \
-(kernel)<<<(nblks), (nthrs), (shmem), stream>>>(__VA_ARGS__); \
-C10_CUDA_KERNEL_LAUNCH_CHECK(); \
+hipLaunchKernelGGL(( (kernel)), dim3((nblks)), dim3((nthrs)), (shmem), stream, __VA_ARGS__); \
+C10_HIP_KERNEL_LAUNCH_CHECK(); \
} \
}
@@ -98,16 +99,16 @@ inline bool is_zero<dim3>(dim3 size) {
auto allocator = graphbolt::cuda::GetAllocator(); \
auto stream = graphbolt::cuda::GetCurrentStream(); \
size_t workspace_size = 0; \
-CUDA_CALL(cub::fn(nullptr, workspace_size, __VA_ARGS__, stream)); \
+CUDA_CALL(hipcub::fn(nullptr, workspace_size, __VA_ARGS__, stream)); \
auto workspace = allocator.AllocateStorage<char>(workspace_size); \
-CUDA_CALL(cub::fn(workspace.get(), workspace_size, __VA_ARGS__, stream)); \
+CUDA_CALL(hipcub::fn(workspace.get(), workspace_size, __VA_ARGS__, stream)); \
}
#define THRUST_CALL(fn, ...) \
[&] { \
auto allocator = graphbolt::cuda::GetAllocator(); \
auto stream = graphbolt::cuda::GetCurrentStream(); \
-const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); \
+const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream); \
return thrust::fn(exec_policy, __VA_ARGS__); \
}()
@@ -126,7 +127,7 @@ template <typename scalar_t>
struct CopyScalar {
CopyScalar() : is_ready_(true) { init_pinned_storage(); }
-void record(at::cuda::CUDAStream stream = GetCurrentStream()) {
+void record(at::hip::HIPStreamMasqueradingAsCUDA stream = GetCurrentStream()) {
copy_event_.record(stream);
is_ready_ = false;
}
@@ -138,9 +139,9 @@ struct CopyScalar {
CopyScalar(const scalar_t* device_ptr) {
init_pinned_storage();
auto stream = GetCurrentStream();
-CUDA_CALL(cudaMemcpyAsync(
+CUDA_CALL(hipMemcpyAsync(
reinterpret_cast<scalar_t*>(pinned_scalar_.data_ptr()), device_ptr,
-sizeof(scalar_t), cudaMemcpyDeviceToHost, stream));
+sizeof(scalar_t), hipMemcpyDeviceToHost, stream));
record(stream);
}
......
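Note: a minimal usage sketch (not part of this commit) of the hipified helpers above; SumOfFirstN is an illustrative name, and a ROCm build of PyTorch providing the masquerading wrappers is assumed.

#include <thrust/reduce.h>
#include <thrust/sequence.h>

#include "common.h"

// Fill a device buffer with 0..n-1 and reduce it; THRUST_CALL routes both
// calls through torch's caching allocator and the current stream.
int64_t SumOfFirstN(int64_t n) {
  auto allocator = graphbolt::cuda::GetAllocator();
  auto buffer = allocator.AllocateStorage<int64_t>(n);
  THRUST_CALL(sequence, buffer.get(), buffer.get() + n);
  return THRUST_CALL(reduce, buffer.get(), buffer.get() + n, int64_t{0});
}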
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* @file cuda/cumsum.cu
* @brief Cumsum operators implementation on CUDA.
*/
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
-#include "./common.h"
+#include "common.h"
namespace graphbolt {
namespace ops {
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -8,10 +9,10 @@
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
#include <limits>
#include "./common.h"
#include "common.h"
namespace graphbolt {
namespace ops {
@@ -86,7 +87,7 @@ torch::Tensor ExpandIndptrImpl(
CUB_CALL(
DeviceCopy::Batched, input_buffer + i,
output_buffer + i, buffer_sizes + i,
-std::min(num_rows - i, max_copy_at_once));
+::min(num_rows - i, max_copy_at_once));
}
}));
}));
......
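Note: the loop above issues DeviceCopy::Batched in chunks because cub/hipcub takes a 32-bit batch count; a generic sketch of that pattern follows (BatchedCopyChunked is a hypothetical name, not in the commit).

#include <hipcub/hipcub.hpp>

#include <algorithm>
#include <limits>

#include "common.h"

// Copy num_rows variable-length buffers in chunks no larger than the
// 32-bit batch count accepted by hipcub::DeviceCopy::Batched.
template <typename InIt, typename OutIt, typename SizeIt>
void BatchedCopyChunked(InIt in, OutIt out, SizeIt sizes, int64_t num_rows) {
  constexpr int64_t max_copy_at_once = std::numeric_limits<int32_t>::max();
  for (int64_t i = 0; i < num_rows; i += max_copy_at_once) {
    CUB_CALL(
        DeviceCopy::Batched, in + i, out + i, sizes + i,
        std::min(num_rows - i, max_copy_at_once));
  }
}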
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -6,8 +7,8 @@
*/
#include <numeric>
#include "./common.h"
#include "./gpu_cache.h"
#include "common.h"
#include "gpu_cache.h"
namespace graphbolt {
namespace cuda {
......
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -10,12 +12,12 @@
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
#include <numeric>
#include "./common.h"
#include "./max_uva_threads.h"
#include "./utils.h"
#include "common.h"
#include "max_uva_threads.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
@@ -132,7 +134,7 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
torch::empty(output_size.value(), options.dtype(indices.scalar_type()));
const dim3 block(BLOCK_SIZE);
const dim3 grid(
-(std::min(edge_count_aligned, cuda::max_uva_threads.value_or(1 << 20)) +
+(::min(edge_count_aligned, cuda::max_uva_threads.value_or(1 << 20)) +
BLOCK_SIZE - 1) /
BLOCK_SIZE);
@@ -220,7 +222,7 @@ void IndexSelectCSCCopyIndices(
for (int64_t i = 0; i < num_nodes; i += max_copy_at_once) {
CUB_CALL(
DeviceMemcpy::Batched, input_buffer_it + i, output_buffer_it + i,
-buffer_sizes + i, std::min(num_nodes - i, max_copy_at_once));
+buffer_sizes + i, ::min(num_nodes - i, max_copy_at_once));
}
}
......
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -9,9 +11,9 @@
#include <numeric>
#include "./common.h"
#include "./max_uva_threads.h"
#include "./utils.h"
#include "common.h"
#include "max_uva_threads.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
@@ -124,7 +126,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
// Use a single thread to process each output row to avoid wasting threads.
const int num_threads = cuda::FindNumThreads(return_len);
const int num_blocks =
-(std::min(return_len, cuda::max_uva_threads.value_or(1 << 20)) +
+(::min(return_len, cuda::max_uva_threads.value_or(1 << 20)) +
num_threads - 1) /
num_threads;
CUDA_KERNEL_CALL(
@@ -137,7 +139,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
block.x >>= 1;
block.y <<= 1;
}
-const dim3 grid(std::min(
+const dim3 grid(::min(
(return_len + block.y - 1) / block.y,
cuda::max_uva_threads.value_or(1 << 20) / BLOCK_SIZE));
if (aligned_feature_size * sizeof(DType) <= GPU_CACHE_LINE_SIZE) {
......
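Note: both UVA kernels above size their grids the same way; a sketch of that rule, under the assumption that cuda::max_uva_threads is an optional int64_t cap on resident threads (NumBlocksForUVA is a hypothetical helper, not in the commit).

#include <algorithm>

#include "common.h"
#include "max_uva_threads.h"

namespace graphbolt {

// Enough blocks to cover the work size after capping total threads at
// max_uva_threads (default cap here: 1 << 20), bounding in-flight UVA reads.
inline int NumBlocksForUVA(int64_t work_items, int num_threads) {
  const int64_t capped =
      std::min(work_items, cuda::max_uva_threads.value_or(1 << 20));
  return static_cast<int>((capped + num_threads - 1) / num_threads);
}

}  // namespace graphbolt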
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -8,7 +9,7 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_sampling_ops.h>
#include "./common.h"
#include "common.h"
namespace graphbolt {
namespace ops {
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -7,7 +8,7 @@
#include <graphbolt/cuda_ops.h>
#include <thrust/binary_search.h>
#include "./common.h"
#include "common.h"
namespace graphbolt {
namespace ops {
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* @file cuda/max_uva_threads.cc
* @brief Max uva threads variable setter function.
*/
#include "./max_uva_threads.h"
#include "max_uva_threads.h"
namespace graphbolt {
namespace cuda {
......
+// !!! This is a file automatically generated by hipify!!!
+#include "hip/hip_runtime.h"
+#include "hip/hip_bf16.h"
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -5,7 +8,7 @@
* @brief Index select operator implementation on CUDA.
*/
#include <c10/core/ScalarType.h>
-#include <curand_kernel.h>
+#include <hiprand/hiprand_kernel.h>
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_sampling_ops.h>
#include <thrust/gather.h>
@@ -15,14 +18,14 @@
#include <algorithm>
#include <array>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
#include <limits>
#include <numeric>
#include <type_traits>
#include "../random.h"
#include "./common.h"
#include "./utils.h"
#include "common.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
@@ -44,11 +47,11 @@ __global__ void _ComputeRandoms(
const uint64_t random_seed, float_t* random_arr, edge_id_t* edge_ids) {
int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
const int stride = gridDim.x * blockDim.x;
-curandStatePhilox4_32_10_t rng;
+hiprandStatePhilox4_32_10_t rng;
const auto labor = indices != nullptr;
if (!labor) {
-curand_init(random_seed, i, 0, &rng);
+hiprand_init(random_seed, i, 0, &rng);
}
while (i < num_edges) {
@@ -58,10 +61,10 @@
if (labor) {
constexpr uint64_t kCurandSeed = 999961;
-curand_init(kCurandSeed, random_seed, indices[in_idx], &rng);
+hiprand_init(kCurandSeed, random_seed, indices[in_idx], &rng);
}
-const auto rnd = curand_uniform(&rng);
+const auto rnd = hiprand_uniform(&rng);
const auto prob =
sliced_weights ? sliced_weights[i] : static_cast<weights_t>(1);
const auto exp_rnd = -__logf(rnd);
@@ -152,9 +155,9 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
}
// Finally, copy the adjusted fanout values to the device memory.
auto fanouts_device = allocator.AllocateStorage<int64_t>(fanouts.size());
-CUDA_CALL(cudaMemcpyAsync(
+CUDA_CALL(hipMemcpyAsync(
fanouts_device.get(), fanouts_pinned_ptr,
-sizeof(int64_t) * fanouts.size(), cudaMemcpyHostToDevice,
+sizeof(int64_t) * fanouts.size(), hipMemcpyHostToDevice,
cuda::GetCurrentStream()));
auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, nodes);
auto in_degree = std::get<0>(in_degree_and_sliced_indptr);
@@ -271,7 +274,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
"Selected edge_id_t must be capable of storing edge_ids.");
// Using bfloat16 for random numbers works just as reliably as
// float32 and provides around 30% speedup.
-using rnd_t = nv_bfloat16;
+using rnd_t = __hip_bfloat16;
auto randoms =
allocator.AllocateStorage<rnd_t>(num_edges.value());
auto randoms_sorted =
@@ -362,7 +365,7 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
CUB_CALL(
DeviceCopy::Batched, input_buffer_it + i,
output_buffer_it + i, sampled_degree + i,
-std::min(num_rows - i, max_copy_at_once));
+::min(num_rows - i, max_copy_at_once));
}
}));
......
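Note: the _ComputeRandoms hunks above map curand's Philox API one-to-one onto hiprand. A self-contained sketch of that RNG pattern follows (fill_uniform is illustrative, not part of the commit); in the labor branch the state is instead re-seeded per edge from (kCurandSeed, random_seed, indices[in_idx]) so the drawn value depends only on the seed pair and the vertex index.

#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>

// Grid-stride loop where each thread seeds its own Philox4x32-10 state;
// hiprand_init(seed, subsequence, offset, &state) mirrors curand_init.
__global__ void fill_uniform(float* out, int64_t n, uint64_t seed) {
  int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = gridDim.x * blockDim.x;
  hiprandStatePhilox4_32_10_t rng;
  hiprand_init(seed, i, 0, &rng);
  while (i < n) {
    out[i] = hiprand_uniform(&rng);  // uniform float in (0, 1]
    i += stride;
  }
}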
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -7,10 +8,10 @@
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
-#include "./common.h"
-#include "./utils.h"
+#include "common.h"
+#include "utils.h"
namespace graphbolt {
namespace ops {
@@ -72,7 +73,7 @@ std::tuple<torch::Tensor, torch::Tensor> SliceCSCIndptr(
CUB_CALL(
DeviceAdjacentDifference::SubtractLeftCopy,
indptr.data_ptr<indptr_t>(), in_degree.data_ptr<indptr_t>(),
-num_nodes + 1, cub::Difference{});
+num_nodes + 1, hipcub::Difference{});
}));
in_degree = in_degree.slice(0, 1);
return {in_degree, sliced_indptr};
@@ -126,7 +127,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> SliceCSCIndptrHetero(
CUB_CALL(
DeviceAdjacentDifference::SubtractLeftCopy,
new_sub_indptr.data_ptr<indptr_t>(),
-new_indegree.data_ptr<indptr_t>(), num_rows + 1, cub::Difference{});
+new_indegree.data_ptr<indptr_t>(), num_rows + 1, hipcub::Difference{});
}));
// Discard the first element of the SubtractLeftCopy result and ensure that
// new_indegree tensor has size num_rows + 1 so that its ExclusiveCumSum is
......
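Note: both hunks above take consecutive differences of an indptr array; a sketch of that in-degree computation through the hipified CUB_CALL (InDegreeFromIndptr is a hypothetical name, not in the commit).

#include <hipcub/hipcub.hpp>

#include "common.h"

// in_degree[i] = indptr[i + 1] - indptr[i]. SubtractLeftCopy writes
// num_nodes + 1 outputs and leaves element 0 as a copy of indptr[0],
// which the callers above discard via in_degree.slice(0, 1).
void InDegreeFromIndptr(
    const int64_t* indptr, int64_t* in_degree, int64_t num_nodes) {
  CUB_CALL(
      DeviceAdjacentDifference::SubtractLeftCopy, indptr, in_degree,
      num_nodes + 1, hipcub::Difference{});
}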
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -6,10 +7,10 @@
*/
#include <c10/core/ScalarType.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
-#include "./common.h"
-#include "./utils.h"
+#include "common.h"
+#include "utils.h"
namespace graphbolt {
namespace ops {
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -10,11 +11,11 @@
#include <thrust/gather.h>
#include <thrust/logical.h>
-#include <cub/cub.cuh>
+#include <hipcub/hipcub.hpp>
#include <type_traits>
#include "./common.h"
#include "./utils.h"
#include "common.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
@@ -97,7 +98,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> UniqueAndCompact(
// and max_id_dst.
if (num_bits == 0) {
num_bits = cuda::NumberOfBits(
-1 + std::max(
+1 + ::max(
static_cast<scalar_t>(max_id_src),
static_cast<scalar_t>(max_id_dst)));
}
......
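Note: the num_bits computation above bounds the radix-sort passes, since keys in [0, max_id] need only NumberOfBits(max_id + 1) bits. A standalone equivalent for reference (NumberOfBitsSketch is illustrative; cuda::NumberOfBits itself is not shown in this diff).

#include <cstdint>

// Smallest b with 2^b >= range; e.g. range = 1001 gives 10, since 1024 >= 1001.
inline int NumberOfBitsSketch(uint64_t range) {
  int bits = 0;
  while (bits < 64 && (uint64_t{1} << bits) < range) ++bits;
  return bits;
}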
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -6,8 +7,8 @@
*/
#include <graphbolt/cuda_ops.h>
#include "./macro.h"
#include "./utils.h"
#include "macro.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* @file fused_csc_sampling_graph.cc
@@ -17,10 +18,10 @@
#include <tuple>
#include <vector>
#include "./macro.h"
#include "./random.h"
#include "./shared_memory_helper.h"
#include "./utils.h"
#include "macro.h"
#include "random.h"
#include "shared_memory_helper.h"
#include "utils.h"
namespace {
torch::optional<torch::Dict<std::string, torch::Tensor>> TensorizeDict(
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* @file index_select.cc
@@ -6,8 +7,8 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/fused_csc_sampling_graph.h>
#include "./macro.h"
#include "./utils.h"
#include "macro.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
*
@@ -8,8 +9,8 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/isin.h>
#include "./macro.h"
#include "./utils.h"
#include "macro.h"
#include "utils.h"
namespace {
static constexpr int kSearchGrainSize = 4096;
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* @file python_binding.cc
@@ -10,14 +11,14 @@
#include <graphbolt/unique_and_compact.h>
#ifdef GRAPHBOLT_USE_CUDA
#include "./cuda/max_uva_threads.h"
#include "cuda/max_uva_threads.h"
#endif
#include "./expand_indptr.h"
#include "./index_select.h"
#include "./random.h"
#include "expand_indptr.h"
#include "index_select.h"
#include "random.h"
#ifdef GRAPHBOLT_USE_CUDA
#include "./cuda/gpu_cache.h"
#include "cuda/gpu_cache.h"
#endif
namespace graphbolt {
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* @file random.cc
* @brief Random Engine.
*/
#include "./random.h"
#include "random.h"
#include <torch/torch.h>
......
+// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
*
* @file shared_memory_helper.cc
* @brief Share memory helper implementation.
*/
#include "./shared_memory_helper.h"
#include "shared_memory_helper.h"
#include <graphbolt/serialize.h>
#include <graphbolt/shared_memory.h>
......