Unverified commit 177dc133, authored by Muhammed Fatih BALIN, committed by GitHub
Browse files

[GraphBolt][CUDA] IndexSelectCSC kernel launch config change. (#7056)

parent 50eb1014
...@@ -14,12 +14,13 @@ ...@@ -14,12 +14,13 @@
#include <numeric> #include <numeric>
#include "./common.h" #include "./common.h"
#include "./max_uva_threads.h"
#include "./utils.h" #include "./utils.h"
namespace graphbolt { namespace graphbolt {
namespace ops { namespace ops {
constexpr int BLOCK_SIZE = 128; constexpr int BLOCK_SIZE = CUDA_MAX_NUM_THREADS;
// Given the in_degree array and a permutation, returns in_degree of the output // Given the in_degree array and a permutation, returns in_degree of the output
// and the permuted and modified in_degree of the input. The modified in_degree // and the permuted and modified in_degree of the input. The modified in_degree
...@@ -130,7 +131,10 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices( ...@@ -130,7 +131,10 @@ std::tuple<torch::Tensor, torch::Tensor> UVAIndexSelectCSCCopyIndices(
torch::Tensor output_indices = torch::Tensor output_indices =
torch::empty(output_size.value(), options.dtype(indices.scalar_type())); torch::empty(output_size.value(), options.dtype(indices.scalar_type()));
const dim3 block(BLOCK_SIZE); const dim3 block(BLOCK_SIZE);
const dim3 grid((edge_count_aligned + BLOCK_SIZE - 1) / BLOCK_SIZE); const dim3 grid(
(std::min(edge_count_aligned, cuda::max_uva_threads.value_or(1 << 20)) +
BLOCK_SIZE - 1) /
BLOCK_SIZE);
// Find the smallest integer type to store the coo_aligned_rows tensor. // Find the smallest integer type to store the coo_aligned_rows tensor.
const int num_bits = cuda::NumberOfBits(num_nodes); const int num_bits = cuda::NumberOfBits(num_nodes);
......
...@@ -131,7 +131,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) { ...@@ -131,7 +131,7 @@ torch::Tensor UVAIndexSelectImpl_(torch::Tensor input, torch::Tensor index) {
IndexSelectSingleKernel, num_blocks, num_threads, 0, input_ptr, IndexSelectSingleKernel, num_blocks, num_threads, 0, input_ptr,
input_len, index_sorted_ptr, return_len, ret_ptr, permutation_ptr); input_len, index_sorted_ptr, return_len, ret_ptr, permutation_ptr);
} else { } else {
constexpr int BLOCK_SIZE = 512; constexpr int BLOCK_SIZE = CUDA_MAX_NUM_THREADS;
dim3 block(BLOCK_SIZE, 1); dim3 block(BLOCK_SIZE, 1);
while (static_cast<int64_t>(block.x) >= 2 * aligned_feature_size) { while (static_cast<int64_t>(block.x) >= 2 * aligned_feature_size) {
block.x >>= 1; block.x >>= 1;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment