"git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "b3d5cc4a3622d91f387c7c732b3a9bdcea2a1e8f"
Unverified Commit d873b09a authored by Muhammed Fatih BALIN, committed by GitHub

[Graphbolt][CUDA] Optimize UVA by using CUB sort (#6635)

parent a67d9e6f
@@ -8,10 +8,68 @@
#include <c10/cuda/CUDAException.h>
#include <cuda_runtime.h>
#include <torch/script.h>
#include <memory>
#include <unordered_map>
namespace graphbolt {
namespace cuda {
/**
 * @brief This class is designed to allocate workspace storage
 * and to get a nonblocking thrust execution policy
 * that uses torch's CUDA memory pool and the current cuda stream:
 *
 *   cuda::CUDAWorkspaceAllocator allocator;
 *   const auto stream = torch::cuda::getDefaultCUDAStream();
 *   const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
 *
 * Now, one can pass exec_policy to thrust functions.
 *
 * To get an integer array of size 1000 whose lifetime is managed by
 * unique_ptr, use:
 *
 *   auto int_array = allocator.AllocateStorage<int>(1000);
 *
 * int_array.get() gives the raw pointer.
 */
class CUDAWorkspaceAllocator {
  using TensorPtrMapType = std::unordered_map<void*, torch::Tensor>;
  std::shared_ptr<TensorPtrMapType> ptr_map_;

 public:
  // Required by thrust to satisfy allocator requirements.
  using value_type = char;

  explicit CUDAWorkspaceAllocator()
      : ptr_map_(std::make_shared<TensorPtrMapType>()) {}

  CUDAWorkspaceAllocator& operator=(const CUDAWorkspaceAllocator&) = default;

  void operator()(void* ptr) const { ptr_map_->erase(ptr); }

  // Required by thrust to satisfy allocator requirements.
  value_type* allocate(std::ptrdiff_t size) const {
    auto tensor = torch::empty(
        size, torch::TensorOptions()
                  .dtype(torch::kByte)
                  .device(c10::DeviceType::CUDA));
    ptr_map_->operator[](tensor.data_ptr()) = tensor;
    return reinterpret_cast<value_type*>(tensor.data_ptr());
  }

  // Required by thrust to satisfy allocator requirements.
  void deallocate(value_type* ptr, std::size_t) const { operator()(ptr); }

  template <typename T>
  std::unique_ptr<T, CUDAWorkspaceAllocator> AllocateStorage(
      std::size_t size) const {
    return std::unique_ptr<T, CUDAWorkspaceAllocator>(
        reinterpret_cast<T*>(allocate(sizeof(T) * size)), *this);
  }
};
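For context, a minimal usage sketch of the allocator along the lines of the class comment above; the function name and raw buffer are illustrative and not part of this commit. It routes thrust's temporary allocations through torch's CUDA caching allocator on the default stream, so repeated calls can reuse pooled memory instead of hitting cudaMalloc/cudaFree each time.

#include <thrust/execution_policy.h>
#include <thrust/sort.h>

// Illustrative only: sort a raw device buffer, letting thrust draw its
// scratch memory from torch's CUDA memory pool via CUDAWorkspaceAllocator.
inline void SortDeviceBufferExample(int* d_data, std::size_t n) {
  graphbolt::cuda::CUDAWorkspaceAllocator allocator;
  const auto stream = torch::cuda::getDefaultCUDAStream();
  const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);
  thrust::sort(exec_policy, d_data, d_data + n);
}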
template <typename T>
inline bool is_zero(T size) {
  return size == 0;
@@ -22,6 +80,8 @@ inline bool is_zero<dim3>(dim3 size) {
  return size.x == 0 || size.y == 0 || size.z == 0;
}
#define CUDA_CALL(func) C10_CUDA_CHECK((func))
#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...) \
  {                                                                \
    if (!graphbolt::cuda::is_zero((nblks)) &&                      \
...
@@ -5,17 +5,49 @@
 */
#include <c10/core/ScalarType.h>
#include <c10/cuda/CUDAStream.h>
#include <torch/script.h>
#include <numeric>

#include "../index_select.h"
#include "./common.h"
#include "./utils.h"
#include "cub/cub.cuh"
namespace graphbolt {
namespace ops {
std::pair<torch::Tensor, torch::Tensor> Sort(
    torch::Tensor input, int num_bits) {
  int64_t num_items = input.size(0);
  // We utilize int64_t for the values array. (torch::kLong == int64_t)
  auto original_idx =
      torch::arange(num_items, input.options().dtype(torch::kLong));
  auto sorted_array = torch::empty_like(input);
  auto sorted_idx = torch::empty_like(original_idx);
  cuda::CUDAWorkspaceAllocator allocator;
  AT_DISPATCH_INDEX_TYPES(
      input.scalar_type(), "SortImpl", ([&] {
        using IdType = index_t;
        const auto input_keys = input.data_ptr<index_t>();
        const int64_t* input_values = original_idx.data_ptr<int64_t>();
        IdType* sorted_keys = sorted_array.data_ptr<index_t>();
        int64_t* sorted_values = sorted_idx.data_ptr<int64_t>();
        cudaStream_t stream = torch::cuda::getDefaultCUDAStream();
        if (num_bits == 0) {
          num_bits = sizeof(index_t) * 8;
        }
        // First call with a null workspace pointer only computes the
        // required temporary storage size.
        size_t workspace_size = 0;
        CUDA_CALL(cub::DeviceRadixSort::SortPairs(
            nullptr, workspace_size, input_keys, sorted_keys, input_values,
            sorted_values, num_items, 0, num_bits, stream));
        // Second call performs the radix sort, using workspace drawn from
        // torch's CUDA memory pool.
        auto temp = allocator.AllocateStorage<char>(workspace_size);
        CUDA_CALL(cub::DeviceRadixSort::SortPairs(
            temp.get(), workspace_size, input_keys, sorted_keys, input_values,
            sorted_values, num_items, 0, num_bits, stream));
      }));
  return std::make_pair(sorted_array, sorted_idx);
}
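As a quick illustration of the contract (a sketch, not part of the diff; the names and sizes are made up): the second tensor returned by Sort is the permutation mapping each sorted position back to the element's original position.

// Hypothetical usage: sort CUDA indices drawn from [0, num_nodes) and keep
// the permutation that records each element's original slot.
inline void SortUsageExample() {
  const int64_t num_nodes = 1024;
  torch::Tensor index = torch::randint(
      num_nodes, {1000},
      torch::TensorOptions().dtype(torch::kLong).device(torch::kCUDA));
  torch::Tensor sorted, perm;
  std::tie(sorted, perm) =
      graphbolt::ops::Sort(index, graphbolt::cuda::NumberOfBits(num_nodes));
  // For every i: sorted[i] == index[perm[i]].
}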
/** @brief Index select operator implementation for feature size 1. */
template <typename DType, typename IdType>
__global__ void IndexSelectSingleKernel(
@@ -115,7 +147,8 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
   // Sort the index to improve the memory access pattern.
   torch::Tensor sorted_index, permutation;
-  std::tie(sorted_index, permutation) = torch::sort(index);
+  std::tie(sorted_index, permutation) =
+      Sort(index, cuda::NumberOfBits(input_len));
   const IdType* index_sorted_ptr = sorted_index.data_ptr<IdType>();
   const int64_t* permutation_ptr = permutation.data_ptr<int64_t>();
...
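The kernels that consume sorted_index and permutation are outside this hunk. As a rough, serial sketch of why the permutation is carried along (an assumption about the surrounding code, not something shown in this diff): gathering in sorted index order gives coalesced UVA reads, and the permutation scatters each result back to the slot the caller asked for.

// Sketch only: what the index-select kernels do, written as a plain loop.
// `input`, `output`, and `num_items` are illustrative names.
for (int64_t i = 0; i < num_items; ++i) {
  // index_sorted_ptr[i] == index[permutation_ptr[i]], so this restores the
  // original ordering: output[j] == input[index[j]] for every j.
  output[permutation_ptr[i]] = input[index_sorted_ptr[i]];
}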
@@ -31,6 +31,27 @@ inline int FindNumThreads(int size) {
  return ret;
}
/**
 * @brief Calculate the smallest number of bits needed to represent a given
 * range of integers [0, range).
 */
template <typename T>
int NumberOfBits(const T& range) {
  if (range <= 1) {
    // Ranges of 0 or 1 require no bits to store.
    return 0;
  }
  int bits = 1;
  const auto urange = static_cast<std::make_unsigned_t<T>>(range);
  while (bits < static_cast<int>(sizeof(T) * 8) && (1ull << bits) < urange) {
    ++bits;
  }
  return bits;
}
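A few concrete values, as a sanity-check sketch rather than part of the commit: the result is the smallest b with 2^b >= range, which Sort forwards to CUB as the end bit so the radix sort only examines the low b bits of each key.

#include <cassert>

// Illustrative checks of the helper above (the function name is made up).
inline void NumberOfBitsExamples() {
  assert(graphbolt::cuda::NumberOfBits(1) == 0);      // single value, no bits
  assert(graphbolt::cuda::NumberOfBits(2) == 1);      // values {0, 1}
  assert(graphbolt::cuda::NumberOfBits(1000) == 10);  // 2^10 = 1024 >= 1000
  assert(graphbolt::cuda::NumberOfBits(1 << 20) == 20);
}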
} // namespace cuda
} // namespace graphbolt
...