"vscode:/vscode.git/clone" did not exist on "2b2dedc3c0453c249c1b129a1c60e069be74b1ea"
Commit 74d88bf8 authored by sangwz

Merge branch 'dtk25.04' of http://developer.sourcefind.cn/codes/OpenDAS/dgl into 2.2.1

parents 2a1ac588 314cedc1
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* @file cuda/expand_indptr.cu
* @brief ExpandIndptr operator implementation on CUDA.
*/
#include <hip/hip_runtime.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include <limits>
#include "./common.h"
#include <hipcub/backend/rocprim/device/device_copy.hpp>
#include "common.h"
namespace graphbolt {
namespace ops {
@@ -86,7 +88,7 @@ torch::Tensor ExpandIndptrImpl(
CUB_CALL(
DeviceCopy::Batched, input_buffer + i,
output_buffer + i, buffer_sizes + i,
std::min(num_rows - i, max_copy_at_once));
::min(num_rows - i, max_copy_at_once));
}
}));
}));
......
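In the ExpandIndptr hunk above, the copy of num_rows variable-length ranges is chunked into groups of at most max_copy_at_once, and each chunk goes through CUB_CALL(DeviceCopy::Batched, ...); the CUB_CALL macro from graphbolt's common header takes care of the temporary-storage plumbing and the stream. A minimal sketch of the underlying two-phase call, assuming hipcub::DeviceCopy::Batched (pulled in via the rocprim device_copy.hpp backend header added above) mirrors cub::DeviceCopy::Batched; the function, pointer names, and the HIP_CHECK error macro are illustrative, not part of the patch:

// Copies num_ranges variable-length segments in one launch. d_in_ptrs/d_out_ptrs are
// device arrays of per-range source/destination pointers, d_sizes holds each range's
// element count. HIP_CHECK is an assumed error-checking macro.
void batched_copy(const int64_t* const* d_in_ptrs, int64_t* const* d_out_ptrs,
                  const int64_t* d_sizes, uint32_t num_ranges, hipStream_t stream) {
  size_t tmp_bytes = 0;  // first call only computes the temp-storage size
  HIP_CHECK(hipcub::DeviceCopy::Batched(nullptr, tmp_bytes, d_in_ptrs, d_out_ptrs,
                                        d_sizes, num_ranges, stream));
  void* tmp = nullptr;
  HIP_CHECK(hipMalloc(&tmp, tmp_bytes));
  HIP_CHECK(hipcub::DeviceCopy::Batched(tmp, tmp_bytes, d_in_ptrs, d_out_ptrs,
                                        d_sizes, num_ranges, stream));  // second call copies
  HIP_CHECK(hipFree(tmp));
}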
@@ -20,7 +20,7 @@ namespace cuda {
class GpuCache : public torch::CustomClassHolder {
using key_t = long long;
constexpr static int set_associativity = 2;
constexpr static int WARP_SIZE = 32;
constexpr static int WARP_SIZE = 64;
constexpr static int bucket_size = WARP_SIZE * set_associativity;
using gpu_cache_t = ::gpu_cache::gpu_cache<
key_t, uint64_t, std::numeric_limits<key_t>::max(), set_associativity,
......
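The WARP_SIZE change from 32 to 64 in gpu_cache.h matches the target hardware: GCN/CDNA-class AMD GPUs execute 64-lane wavefronts, so a bucket sized for 32 lanes (bucket_size = WARP_SIZE * set_associativity) would span only half a wavefront there. A hedged sketch of selecting the width at compile time when one source tree targets both vendors; the constant names are illustrative, __HIP_PLATFORM_AMD__ is the standard hipcc platform macro:

// Illustrative only; not part of the patch.
#if defined(__HIP_PLATFORM_AMD__)
constexpr int kWarpSize = 64;   // GCN/CDNA wavefront width
#else
constexpr int kWarpSize = 32;   // NVIDIA warp width
#endif
constexpr int kSetAssociativity = 2;
constexpr int kBucketSize = kWarpSize * kSetAssociativity;  // 128 on AMD, 64 on NVIDIA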
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -6,8 +7,8 @@
*/
#include <numeric>
#include "./common.h"
#include "./gpu_cache.h"
#include "common.h"
#include "gpu_cache.h"
namespace graphbolt {
namespace cuda {
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -10,12 +12,12 @@
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include <numeric>
#include "./common.h"
#include "./max_uva_threads.h"
#include "./utils.h"
#include "common.h"
#include "max_uva_threads.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
@@ -132,7 +134,7 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
torch::empty(output_size.value(), options.dtype(indices.scalar_type()));
const dim3 block(BLOCK_SIZE);
const dim3 grid(
(std::min(edge_count_aligned, cuda::max_uva_threads.value_or(1 << 20)) +
(::min(edge_count_aligned, cuda::max_uva_threads.value_or(1 << 20)) +
BLOCK_SIZE - 1) /
BLOCK_SIZE);
@@ -178,8 +180,10 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCImpl(
return GRAPHBOLT_DISPATCH_ELEMENT_SIZES(
indices.element_size(), "UVAIndexSelectCSCCopyIndices", ([&] {
return UVAIndexSelectCSCCopyIndices<indptr_t, element_size_t>(
indices, num_nodes, in_degree.data_ptr<indptr_t>(),
sliced_indptr.data_ptr<indptr_t>(),
// indices, num_nodes, in_degree.data_ptr<indptr_t>(),
indices, num_nodes, cuda::getTensorDevicePointer<indptr_t>(in_degree),
// sliced_indptr.data_ptr<indptr_t>(),
cuda::getTensorDevicePointer<indptr_t>(sliced_indptr),
sorted_idx.data_ptr<int64_t>(), nodes.options(),
sliced_indptr.scalar_type(), output_size);
}));
@@ -220,7 +224,7 @@ void IndexSelectCSCCopyIndices(
for (int64_t i = 0; i < num_nodes; i += max_copy_at_once) {
CUB_CALL(
DeviceMemcpy::Batched, input_buffer_it + i, output_buffer_it + i,
buffer_sizes + i, std::min(num_nodes - i, max_copy_at_once));
buffer_sizes + i, ::min(num_nodes - i, max_copy_at_once));
}
}
......
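In the UVAIndexSelectCSCCopyIndices hunk, the grid size is capped by cuda::max_uva_threads (defaulting to 1 << 20) before the ceiling division by BLOCK_SIZE, presumably to bound how many threads issue zero-copy reads over the host link at once; only the qualification of min changes in this diff. A small worked example of that arithmetic, with hypothetical values:

// Hypothetical values, purely to illustrate the grid computation above.
constexpr int64_t kBlockSize = 512;
const int64_t edge_count_aligned = 3'000'000;                                     // work items
const int64_t capped = std::min<int64_t>(edge_count_aligned, int64_t{1} << 20);   // 1'048'576
const int64_t num_blocks = (capped + kBlockSize - 1) / kBlockSize;                // 2048 blocks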
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -9,9 +11,9 @@
#include <numeric>
#include "./common.h"
#include "./max_uva_threads.h"
#include "./utils.h"
#include "common.h"
#include "max_uva_threads.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
@@ -110,7 +112,15 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
{return_len, original_feature_size}, torch::TensorOptions()
.dtype(input.dtype())
.device(c10::DeviceType::CUDA));
DType* input_ptr = reinterpret_cast<DType*>(input.data_ptr());
DType* input_ptr = nullptr;
if(input.is_pinned())
{
CUDA_CALL(hipHostGetDevicePointer((void**)&input_ptr, input.data_ptr(), 0));
}
else{
input_ptr= reinterpret_cast<DType*>(input.data_ptr());
}
DType* ret_ptr = reinterpret_cast<DType*>(ret.data_ptr());
// Sort the index to improve the memory access pattern.
@@ -124,7 +134,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
// Use a single thread to process each output row to avoid wasting threads.
const int num_threads = cuda::FindNumThreads(return_len);
const int num_blocks =
(std::min(return_len, cuda::max_uva_threads.value_or(1 << 20)) +
(::min(return_len, cuda::max_uva_threads.value_or(1 << 20)) +
num_threads - 1) /
num_threads;
CUDA_KERNEL_CALL(
@@ -137,7 +147,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
block.x >>= 1;
block.y <<= 1;
}
const dim3 grid(std::min(
const dim3 grid(::min(
(return_len + block.y - 1) / block.y,
cuda::max_uva_threads.value_or(1 << 20) / BLOCK_SIZE));
if (aligned_feature_size * sizeof(DType) <= GPU_CACHE_LINE_SIZE) {
......
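The UVAIndexSelectImpl_ change above stops assuming that input.data_ptr() can be dereferenced from the device: when the tensor is pinned (page-locked host memory), it first asks the runtime for the device-visible alias of that allocation via hipHostGetDevicePointer. A self-contained sketch of the same zero-copy pattern outside PyTorch; the function name and the HIP_CHECK error macro are illustrative:

#include <hip/hip_runtime.h>
// Allocates a pinned, mapped host buffer and returns the device-visible alias
// that kernels can read directly (zero-copy / UVA access).
float* make_pinned_device_alias(size_t n, float** host_out) {
  float* host_ptr = nullptr;
  HIP_CHECK(hipHostMalloc(reinterpret_cast<void**>(&host_ptr),
                          n * sizeof(float), hipHostMallocMapped));
  float* dev_alias = nullptr;
  HIP_CHECK(hipHostGetDevicePointer(reinterpret_cast<void**>(&dev_alias), host_ptr, 0));
  *host_out = host_ptr;  // fill the data from the CPU through this pointer
  return dev_alias;      // pass this one to kernels; reads are served over PCIe/xGMI
}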
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -8,7 +9,7 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_sampling_ops.h>
#include "./common.h"
#include "common.h"
namespace graphbolt {
namespace ops {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -7,7 +8,7 @@
#include <graphbolt/cuda_ops.h>
#include <thrust/binary_search.h>
#include "./common.h"
#include "common.h"
namespace graphbolt {
namespace ops {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* @file cuda/max_uva_threads.cc
* @brief Max uva threads variable setter function.
*/
#include "./max_uva_threads.h"
#include "max_uva_threads.h"
namespace graphbolt {
namespace cuda {
......
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include "hip/hip_bf16.h"
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -5,8 +8,8 @@
* @brief Index select operator implementation on CUDA.
*/
#include <c10/core/ScalarType.h>
#include <curand_kernel.h>
#include <graphbolt/continuous_seed.h>
#include <hiprand/hiprand_kernel.h>
#include <graphbolt/cuda_ops.h>
#include <graphbolt/cuda_sampling_ops.h>
#include <thrust/copy.h>
@@ -14,21 +17,43 @@
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <hipcub/backend/rocprim/device/device_copy.hpp>
#include <algorithm>
#include <array>
#include <cub/cub.cuh>
#if __CUDA_ARCH__ >= 700
#include <cuda/atomic>
#endif // __CUDA_ARCH__ >= 700
#include <hipcub/hipcub.hpp>
#include <limits>
#include <numeric>
#include <type_traits>
#include "../random.h"
#include "../utils.h"
#include "./common.h"
#include "./utils.h"
#include "common.h"
#include "utils.h"
namespace rocprim{
namespace detail{
template<>
struct float_bit_mask<__hip_bfloat16>
{
static constexpr uint16_t sign_bit = 0x8000;
static constexpr uint16_t exponent = 0x7F80;
static constexpr uint16_t mantissa = 0x007F;
using bit_type = uint16_t;
};
template<>
struct radix_key_codec_base<__hip_bfloat16> : radix_key_codec_floating<__hip_bfloat16, unsigned short> {
};
}
}
#if HIP_VERSION_MAJOR<6
__host__ __device__ bool operator>(const __hip_bfloat16& a, const __hip_bfloat16& b)
{
return float(a)>float(b);
}
#endif
namespace graphbolt {
namespace ops {
@@ -109,13 +134,23 @@ __global__ void _ComputeRandoms(
edge_id_t* edge_ids) {
int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
const int stride = gridDim.x * blockDim.x;
hiprandStatePhilox4_32_10_t rng;
const auto labor = indices != nullptr;
if (!labor) {
hiprand_init(random_seed, i, 0, &rng);
}
while (i < num_edges) {
const auto row_position = csr_rows[i];
const auto row_offset = i - sub_indptr[row_position];
const auto in_idx = sliced_indptr[row_position] + row_offset;
const auto rnd = random_seed.uniform(labor ? indices[in_idx] : i);
if (labor) {
constexpr uint64_t kCurandSeed = 999961;
hiprand_init(kCurandSeed, random_seed, indices[in_idx], &rng);
}
const auto rnd = hiprand_uniform(&rng);
const auto prob =
sliced_weights ? sliced_weights[i] : static_cast<weights_t>(1);
const auto exp_rnd = -__logf(rnd);
@@ -216,9 +251,9 @@ c10::intrusive_ptr<sampling::FusedSampledSubgraph> SampleNeighbors(
}
// Finally, copy the adjusted fanout values to the device memory.
auto fanouts_device = allocator.AllocateStorage<int64_t>(fanouts.size());
CUDA_CALL(cudaMemcpyAsync(
CUDA_CALL(hipMemcpyAsync(
fanouts_device.get(), fanouts_pinned_ptr,
sizeof(int64_t) * fanouts.size(), cudaMemcpyHostToDevice,
sizeof(int64_t) * fanouts.size(), hipMemcpyHostToDevice,
cuda::GetCurrentStream()));
auto in_degree_and_sliced_indptr = SliceCSCIndptr(indptr, seeds);
auto in_degree = std::get<0>(in_degree_and_sliced_indptr);
......
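The _ComputeRandoms hunk swaps graphbolt's continuous_seed-based uniform draw for an explicit Philox generator from hipRAND: in the non-LABOR path each thread seeds one hiprandStatePhilox4_32_10_t once, using its global thread index as the subsequence, while the LABOR path re-initializes the state per edge from the fixed kCurandSeed, the run's seed, and indices[in_idx], so the draw depends only on those values. A minimal, self-contained kernel using the same API; the kernel and buffer names are illustrative:

#include <hiprand/hiprand_kernel.h>
// One uniform draw per element, one counter-based Philox state per thread.
__global__ void uniform_fill(float* out, int64_t n, unsigned long long seed) {
  const int64_t i = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
  if (i >= n) return;
  hiprandStatePhilox4_32_10_t rng;
  // seed, subsequence, offset: distinct subsequences keep per-thread streams independent.
  hiprand_init(seed, /*subsequence=*/i, /*offset=*/0, &rng);
  out[i] = hiprand_uniform(&rng);  // uniform float in (0, 1]
}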
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -7,10 +8,10 @@
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "./common.h"
#include "./utils.h"
#include "common.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
@@ -55,7 +56,8 @@ std::tuple<torch::Tensor, torch::Tensor> SliceCSCIndptr(
THRUST_CALL(
for_each, iota, iota + num_nodes,
SliceFunc<indptr_t, nodes_t>{
nodes.data_ptr<nodes_t>(), indptr.data_ptr<indptr_t>(),
// nodes.data_ptr<nodes_t>(), indptr.data_ptr<indptr_t>(),
cuda::getTensorDevicePointer<nodes_t>(nodes), cuda::getTensorDevicePointer<indptr_t>(indptr),
in_degree.data_ptr<indptr_t>(),
sliced_indptr.data_ptr<indptr_t>()});
}));
@@ -71,8 +73,8 @@ std::tuple<torch::Tensor, torch::Tensor> SliceCSCIndptr(
using indptr_t = scalar_t;
CUB_CALL(
DeviceAdjacentDifference::SubtractLeftCopy,
indptr.data_ptr<indptr_t>(), in_degree.data_ptr<indptr_t>(),
num_nodes + 1, cub::Difference{});
cuda::getTensorDevicePointer<indptr_t>(indptr), in_degree.data_ptr<indptr_t>(),
num_nodes + 1, hipcub::Difference{});
}));
in_degree = in_degree.slice(0, 1);
return {in_degree, sliced_indptr};
@@ -126,7 +128,7 @@ std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> SliceCSCIndptrHetero(
CUB_CALL(
DeviceAdjacentDifference::SubtractLeftCopy,
new_sub_indptr.data_ptr<indptr_t>(),
new_indegree.data_ptr<indptr_t>(), num_rows + 1, cub::Difference{});
new_indegree.data_ptr<indptr_t>(), num_rows + 1, hipcub::Difference{});
}));
// Discard the first element of the SubtractLeftCopy result and ensure that
// new_indegree tensor has size num_rows + 1 so that its ExclusiveCumSum is
......
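Both SliceCSCIndptr hunks compute degrees with DeviceAdjacentDifference::SubtractLeftCopy over num_nodes + 1 (or num_rows + 1) indptr entries and then discard the first output element; the only changes here are qualifying the Difference functor with hipcub:: instead of cub:: and taking the indptr pointer through getTensorDevicePointer. For this use the device call is equivalent to the following scalar loop (names illustrative):

// in_degree[i] = indptr[i + 1] - indptr[i]: what SubtractLeftCopy over
// num_nodes + 1 entries followed by in_degree.slice(0, 1) produces.
void degrees_from_indptr(const int64_t* indptr, int64_t* in_degree, int64_t num_nodes) {
  for (int64_t i = 0; i < num_nodes; ++i)
    in_degree[i] = indptr[i + 1] - indptr[i];
}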
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -6,10 +7,10 @@
*/
#include <c10/core/ScalarType.h>
#include <cub/cub.cuh>
#include <hipcub/hipcub.hpp>
#include "./common.h"
#include "./utils.h"
#include "common.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
* @file cuda/unique_and_compact_impl.cu
* @brief Unique and compact operator implementation on CUDA.
*/
#include <hip/hip_runtime.h>
#include <graphbolt/cuda_ops.h>
#include <thrust/binary_search.h>
#include <thrust/functional.h>
#include <thrust/gather.h>
#include <thrust/logical.h>
#include <cub/cub.cuh>
#include <mutex>
#include <hipcub/hipcub.hpp>
#include <type_traits>
#include <unordered_map>
@@ -119,12 +121,17 @@
// The code block above synchronizes, ensuring safe access to
// max_id_src and max_id_dst.
if (num_bits == 0) {
//
index_t max_id = 0;
for (std::size_t i = 0; i < max_id_src.size(); i++) {
max_id = std::max(max_id, static_cast<index_t>(max_id_src[i]));
max_id = std::max(max_id, static_cast<index_t>(max_id_dst[i]));
}
num_bits = cuda::NumberOfBits(1ll + max_id);
// num_bits = cuda::NumberOfBits(
// 1 + ::max(
// static_cast<scalar_t>(max_id_src),
// static_cast<scalar_t>(max_id_dst)));
}
// Sort the only_src tensor so that we can unique it later.
......
@@ -101,6 +101,17 @@ __device__ indices_t UpperBound(const indptr_t* A, indices_t n, indptr_t x) {
return l;
}
template<typename DType>
inline DType* getTensorDevicePointer(torch::Tensor inputTensor)
{
DType* ret = inputTensor.data_ptr<DType>();
if(inputTensor.is_pinned())
{
CUDA_CALL(hipHostGetDevicePointer((void**)&ret, (void*)ret, 0));
}
return ret;
}
} // namespace cuda
} // namespace graphbolt
......
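The cuda::getTensorDevicePointer helper added to utils.h centralizes the pinned-tensor handling used in the index-select and slice hunks above: it returns data_ptr<DType>() for an ordinary device tensor and the hipHostGetDevicePointer alias for a pinned host tensor. A hypothetical call site:

// Works whether indptr_tensor is a CUDA tensor or a pinned CPU tensor, so the
// caller no longer needs to special-case UVA inputs before a kernel launch.
int64_t* d_indptr = cuda::getTensorDevicePointer<int64_t>(indptr_tensor);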
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
@@ -7,8 +8,8 @@
#include <graphbolt/cuda_ops.h>
#include <torch/autograd.h>
#include "./macro.h"
#include "./utils.h"
#include "macro.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* @file fused_csc_sampling_graph.cc
@@ -24,6 +25,7 @@
#include "./shared_memory_helper.h"
#include "./utils.h"
namespace {
torch::optional<torch::Dict<std::string, torch::Tensor>> TensorizeDict(
const torch::optional<torch::Dict<std::string, int64_t>>& dict) {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* @file index_select.cc
@@ -8,8 +9,8 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/fused_csc_sampling_graph.h>
#include "./macro.h"
#include "./utils.h"
#include "macro.h"
#include "utils.h"
namespace graphbolt {
namespace ops {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
*
@@ -8,8 +9,8 @@
#include <graphbolt/cuda_ops.h>
#include <graphbolt/isin.h>
#include "./macro.h"
#include "./utils.h"
#include "macro.h"
#include "utils.h"
namespace {
static constexpr int kSearchGrainSize = 4096;
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* @file python_binding.cc
@@ -10,15 +11,17 @@
#include <graphbolt/unique_and_compact.h>
#ifdef GRAPHBOLT_USE_CUDA
#include "./cuda/max_uva_threads.h"
#include "cuda/max_uva_threads.h"
#endif
#include "./cnumpy.h"
#include "./expand_indptr.h"
#include "./index_select.h"
#include "./random.h"
#ifdef GRAPHBOLT_USE_CUDA
#include "./cuda/gpu_cache.h"
#include "cuda/gpu_cache.h"
#endif
namespace graphbolt {
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
* @file random.cc
* @brief Random Engine.
*/
#include "./random.h"
#include "random.h"
#include <torch/torch.h>
......
// !!! This is a file automatically generated by hipify!!!
/**
* Copyright (c) 2023 by Contributors
*
* @file shared_memory_helper.cc
* @brief Share memory helper implementation.
*/
#include "./shared_memory_helper.h"
#include "shared_memory_helper.h"
#include <graphbolt/serialize.h>
#include <graphbolt/shared_memory.h>
......