Unverified commit f5f7e08e, authored by Muhammed Fatih BALIN and committed by GitHub

[Performance][CUDA] Sorting for indices for UVM code path. (#5882)

parent a64ff482
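
The idea behind this change, stated briefly: IndexSelectCPUFromGPU gathers feature rows that live in pinned host memory (the UVM / zero-copy path, read over PCIe), and visiting those rows in random index order is expensive. Sorting the indices first makes neighbouring threads read neighbouring rows of the host array, and the permutation produced by the sort is used to write each gathered row back to its originally requested output position, so the result is unchanged. Below is a minimal sketch of that pattern, not the DGL implementation: DGL uses its own radix Sort bounded by cuda::_NumberOfBits plus the kernels in this diff, whereas the names GatherWithPerm and SortedGather are invented here and Thrust stands in for the sort.

#include <cstdint>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>

// Gather one element per output row: read through the sorted indices,
// write through the permutation so the caller still sees the original order.
template <typename DType, typename IdType>
__global__ void GatherWithPerm(
    const DType* array, const IdType* sorted_index, const int64_t* perm,
    int64_t length, DType* out) {
  int64_t i = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
  if (i < length) out[perm[i]] = array[sorted_index[i]];
}

// Host-side driver: sort the requested indices, remember where each one
// came from, then launch the gather over the sorted order.
template <typename DType, typename IdType>
void SortedGather(
    const DType* array, const IdType* index, int64_t length, DType* out) {
  thrust::device_ptr<const IdType> idx(index);
  thrust::device_vector<IdType> sorted_index(idx, idx + length);
  thrust::device_vector<int64_t> perm(length);
  thrust::sequence(perm.begin(), perm.end());  // perm[i] = i before the sort
  thrust::sort_by_key(sorted_index.begin(), sorted_index.end(), perm.begin());
  const int nt = 256;
  const int nb = static_cast<int>((length + nt - 1) / nt);
  GatherWithPerm<<<nb, nt>>>(
      array, thrust::raw_pointer_cast(sorted_index.data()),
      thrust::raw_pointer_cast(perm.data()), length, out);
}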
@@ -14,12 +14,13 @@ namespace impl {
 template <typename DType, typename IdType>
 __global__ void IndexSelectSingleKernel(
     const DType* array, const IdType* index, const int64_t length,
-    const int64_t arr_len, DType* out) {
-  int tx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int64_t arr_len, DType* out, const int64_t* perm = nullptr) {
+  int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
   int stride_x = gridDim.x * blockDim.x;
   while (tx < length) {
     assert(index[tx] >= 0 && index[tx] < arr_len);
-    out[tx] = array[index[tx]];
+    const auto out_row = perm ? perm[tx] : tx;
+    out[out_row] = array[index[tx]];
     tx += stride_x;
   }
 }
@@ -27,20 +28,22 @@ __global__ void IndexSelectSingleKernel(
 template <typename DType, typename IdType>
 __global__ void IndexSelectMultiKernel(
     const DType* const array, const int64_t num_feat, const IdType* const index,
-    const int64_t length, const int64_t arr_len, DType* const out) {
-  int64_t out_row = blockIdx.x * blockDim.y + threadIdx.y;
+    const int64_t length, const int64_t arr_len, DType* const out,
+    const int64_t* perm = nullptr) {
+  int64_t out_row_index = blockIdx.x * blockDim.y + threadIdx.y;
   const int64_t stride = blockDim.y * gridDim.x;
-  while (out_row < length) {
+  while (out_row_index < length) {
     int64_t col = threadIdx.x;
-    const int64_t in_row = index[out_row];
+    const int64_t in_row = index[out_row_index];
     assert(in_row >= 0 && in_row < arr_len);
+    const auto out_row = perm ? perm[out_row_index] : out_row_index;
     while (col < num_feat) {
       out[out_row * num_feat + col] = array[in_row * num_feat + col];
       col += blockDim.x;
     }
-    out_row += stride;
+    out_row_index += stride;
   }
 }
......
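
A concrete example of the perm convention used by these kernels (values made up for illustration): if the caller asks for index = [7, 2, 9, 2], the sort produces the index array [2, 2, 7, 9] together with perm = [1, 3, 0, 2], the original position of each sorted entry. Thread tx = 0 then reads array[2] and writes it to out[perm[0]] = out[1], exactly the row that requested index 2 in the original order, so passing perm leaves the output identical to the unsorted gather, while perm == nullptr falls back to the old behaviour.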
@@ -63,32 +63,14 @@ __global__ void _COODecodeEdgesKernel(
   }
 }
-template <typename T>
-int _NumberOfBits(const T& range) {
-  if (range <= 1) {
-    // ranges of 0 or 1 require no bits to store
-    return 0;
-  }
-  int bits = 1;
-  while (bits < static_cast<int>(sizeof(T) * 8) && (1 << bits) < range) {
-    ++bits;
-  }
-  CHECK_EQ((range - 1) >> bits, 0);
-  CHECK_NE((range - 1) >> (bits - 1), 0);
-  return bits;
-}
 template <DGLDeviceType XPU, typename IdType>
 void COOSort_(COOMatrix* coo, bool sort_column) {
   cudaStream_t stream = runtime::getCurrentCUDAStream();
-  const int row_bits = _NumberOfBits(coo->num_rows);
+  const int row_bits = cuda::_NumberOfBits(coo->num_rows);
   const int64_t nnz = coo->row->shape[0];
   if (sort_column) {
-    const int col_bits = _NumberOfBits(coo->num_cols);
+    const int col_bits = cuda::_NumberOfBits(coo->num_cols);
     const int num_bits = row_bits + col_bits;
     const int nt = 256;
......
@@ -38,6 +38,24 @@ inline int FindNumThreads(int dim, int max_nthrs = CUDA_MAX_NUM_THREADS) {
   return ret;
 }
+template <typename T>
+int _NumberOfBits(const T& range) {
+  if (range <= 1) {
+    // ranges of 0 or 1 require no bits to store
+    return 0;
+  }
+  int bits = 1;
+  while (bits < static_cast<int>(sizeof(T) * 8) && (1 << bits) < range) {
+    ++bits;
+  }
+  CHECK_EQ((range - 1) >> bits, 0);
+  CHECK_NE((range - 1) >> (bits - 1), 0);
+  return bits;
+}
 /**
  * @brief Find number of blocks is smaller than nblks and max_nblks
  * on the given axis ('x', 'y' or 'z').
......
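
For reference, _NumberOfBits(range) (moved next to FindNumThreads above so both the COO sort and the UVM path can share it) returns how many bits are needed to store values in [0, range): for example _NumberOfBits(1) == 0, _NumberOfBits(2) == 1, _NumberOfBits(5) == 3 and _NumberOfBits(1024) == 10. COOSort_ and the new Sort call below both use it to cap the number of bits the radix sort has to process.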
@@ -18,7 +18,6 @@ namespace impl {
 template <typename DType, typename IdType>
 NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
   cudaStream_t stream = runtime::getCurrentCUDAStream();
-  const IdType* idx_data = static_cast<IdType*>(index->data);
   const int64_t arr_len = array->shape[0];
   const int64_t len = index->shape[0];
   int64_t num_feat = 1;
@@ -37,12 +36,16 @@ NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
   if (len == 0 || arr_len * num_feat == 0) return ret;
   DType* ret_data = static_cast<DType*>(ret->data);
+  auto res = Sort(index, cuda::_NumberOfBits(arr_len));
+  const IdType* idx_data = static_cast<IdType*>(res.first->data);
+  const int64_t* perm_data = static_cast<int64_t*>(res.second->data);
   if (num_feat == 1) {
     const int nt = cuda::FindNumThreads(len);
     const int nb = (len + nt - 1) / nt;
     CUDA_KERNEL_CALL(
         IndexSelectSingleKernel, nb, nt, 0, stream, array_data, idx_data, len,
-        arr_len, ret_data);
+        arr_len, ret_data, perm_data);
   } else {
     dim3 block(256, 1);
     while (static_cast<int64_t>(block.x) >= 2 * num_feat) {
@@ -53,11 +56,11 @@ NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) {
     if (num_feat * sizeof(DType) < 2 * CACHE_LINE_SIZE) {
       CUDA_KERNEL_CALL(
           IndexSelectMultiKernel, grid, block, 0, stream, array_data, num_feat,
-          idx_data, len, arr_len, ret_data);
+          idx_data, len, arr_len, ret_data, perm_data);
     } else {
       CUDA_KERNEL_CALL(
           IndexSelectMultiKernelAligned, grid, block, 0, stream, array_data,
-          num_feat, idx_data, len, arr_len, ret_data);
+          num_feat, idx_data, len, arr_len, ret_data, perm_data);
     }
   }
   return ret;
......
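
A note on the wrapper above: Sort(index, cuda::_NumberOfBits(arr_len)) only has to process as many bits as the index values can occupy; with arr_len = 1,000,000, for instance, the indices fit in 20 bits, so the radix sort does far fewer passes than a full 64-bit sort would. The pair it returns supplies both new kernel arguments: res.first is the index array in ascending order (idx_data) and res.second is the original position of each sorted entry (perm_data), and the same perm_data is threaded through the single-feature, multi-feature and aligned kernels so every branch of the UVM gather benefits.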
@@ -21,25 +21,27 @@ namespace impl {
 template <typename DType, typename IdType>
 __global__ void IndexSelectMultiKernelAligned(
     const DType* const array, const int64_t num_feat, const IdType* const index,
-    const int64_t length, const int64_t arr_len, DType* const out) {
-  int64_t out_row = blockIdx.x * blockDim.y + threadIdx.y;
+    const int64_t length, const int64_t arr_len, DType* const out,
+    const int64_t* perm = nullptr) {
+  int64_t out_row_index = blockIdx.x * blockDim.y + threadIdx.y;
   const int64_t stride = blockDim.y * gridDim.x;
-  while (out_row < length) {
+  while (out_row_index < length) {
     int64_t col = threadIdx.x;
-    const int64_t in_row = index[out_row];
+    const int64_t in_row = index[out_row_index];
     assert(in_row >= 0 && in_row < arr_len);
     const int64_t idx_offset =
         ((uint64_t)(&array[in_row * num_feat]) % CACHE_LINE_SIZE) /
         sizeof(DType);
     col = col - idx_offset;
+    const auto out_row = perm ? perm[out_row_index] : out_row_index;
     while (col < num_feat) {
       if (col >= 0)
         out[out_row * num_feat + col] = array[in_row * num_feat + col];
       col += blockDim.x;
     }
-    out_row += stride;
+    out_row_index += stride;
  }
 }
......
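
A worked example of the alignment shift in IndexSelectMultiKernelAligned, purely for illustration (assuming DType is a 4-byte float and CACHE_LINE_SIZE is 128): if the row's base address &array[in_row * num_feat] lies 32 bytes past a cache-line boundary, then idx_offset = 32 / 4 = 8, so thread t starts at col = t - 8 and touches address (base - 32) + 4 * t. The first warp (threads 0-31, with threads 0-7 masked off by the col >= 0 test) then covers exactly one aligned 128-byte line instead of straddling two, and subsequent iterations keep that alignment as col advances by blockDim.x. The perm indirection added in this commit only changes where the row is written in out; the aligned read pattern from the pinned array is unchanged.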