array_index_select_uvm.cuh 1.68 KB
Newer Older
sangwzh's avatar
sangwzh committed
1
2
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
3
/**
4
 *  Copyright (c) 2021 by Contributors
5
6
 * @file array/cpu/array_index_select_uvm.cuh
 * @brief Array index select GPU kernel implementation
7
8
 */

9
10
#ifndef DGL_ARRAY_CUDA_UVM_ARRAY_INDEX_SELECT_UVM_CUH_
#define DGL_ARRAY_CUDA_UVM_ARRAY_INDEX_SELECT_UVM_CUH_
11

12
13
#define CACHE_LINE_SIZE 128

14
15
16
17
namespace dgl {
namespace aten {
namespace impl {

18
19
20
21
/**
 *  This is a cross-device access version of IndexSelectMultiKernel.
 *  Since the memory access over PCIe is more sensitive to the
 *  data access aligment (cacheline), we need a separate version here.
22
 */
23
template <typename DType, typename IdType>
24
__global__ void IndexSelectMultiKernelAligned(
25
    const DType* const array, const int64_t num_feat, const IdType* const index,
26
27
28
    const int64_t length, const int64_t arr_len, DType* const out,
    const int64_t* perm = nullptr) {
  int64_t out_row_index = blockIdx.x * blockDim.y + threadIdx.y;
29

30
  const int64_t stride = blockDim.y * gridDim.x;
31

32
  while (out_row_index < length) {
33
    int64_t col = threadIdx.x;
34
    const int64_t in_row = index[out_row_index];
35
    assert(in_row >= 0 && in_row < arr_len);
36
    const int64_t idx_offset =
37
38
        ((uint64_t)(&array[in_row * num_feat]) % CACHE_LINE_SIZE) /
        sizeof(DType);
39
    col = col - idx_offset;
40
    const auto out_row = perm ? perm[out_row_index] : out_row_index;
41
    while (col < num_feat) {
42
      if (col >= 0)
43
        out[out_row * num_feat + col] = array[in_row * num_feat + col];
44
45
      col += blockDim.x;
    }
46
    out_row_index += stride;
47
48
49
50
51
52
53
  }
}

}  // namespace impl
}  // namespace aten
}  // namespace dgl

54
#endif  // DGL_ARRAY_CUDA_UVM_ARRAY_INDEX_SELECT_UVM_CUH_