array_index_select.cu 3.77 KB
Newer Older
1
2
3
4
5
6
7
/*!
 *  Copyright (c) 2019 by Contributors
 * \file array/cuda/array_index_select.cu
 * \brief Array index select GPU implementation
 */
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
8
#include "./array_index_select.cuh"
9
#include "./utils.h"
10
11
12
13
14
15

namespace dgl {
using runtime::NDArray;
namespace aten {
namespace impl {

16
/*!
 * \brief Select entries of \a array along its first dimension.
 *
 * The result has shape (index->shape[0], array->shape[1], ...,
 * array->shape[ndim-1]) and the dtype of \a array. The copy itself is done by
 * IndexSelectSingleKernel / IndexSelectMultiKernel, launched on the stream
 * returned by runtime::getCurrentCUDAStream().
 *
 * \tparam XPU Device type tag (not referenced in the body; used for dispatch
 *         through the explicit instantiations).
 * \tparam DType Element type of \a array.
 * \tparam IdType Element type of \a index.
 * \param array Source array; dimension 0 is the one being indexed.
 * \param index Indices to select. Only index->shape[0] is read here.
 * \return A new array allocated on index->ctx. It is returned without
 *         launching a kernel when it would contain no elements.
 */
template<DGLDeviceType XPU, typename DType, typename IdType>
NDArray IndexSelect(NDArray array, IdArray index) {
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  const DType* array_data = static_cast<DType*>(array->data);
  const IdType* idx_data = static_cast<IdType*>(index->data);
  const int64_t arr_len = array->shape[0];
  const int64_t len = index->shape[0];

  // num_feat = number of elements per entry along dimension 0, i.e. the
  // product of all trailing dimensions (1 for a 1-D array).
  int64_t num_feat = 1;
  std::vector<int64_t> shape{len};
  for (int d = 1; d < array->ndim; ++d) {
    num_feat *= array->shape[d];
    shape.emplace_back(array->shape[d]);
  }

  // use index->ctx for pinned array
  // NOTE(review): presumably `array` may be pinned host memory while `index`
  // lives on the GPU, so the output follows the index's context -- confirm.
  NDArray ret = NDArray::Empty(shape, array->dtype, index->ctx);

  // Nothing to copy when the output holds no elements. The num_feat == 0
  // check is required for correctness, not just speed: with num_feat == 0 the
  // block-shaping loop below would test `block.x >= 0`, which is always true,
  // and the host would spin forever.
  if (len == 0 || num_feat == 0)
    return ret;
  DType* ret_data = static_cast<DType*>(ret->data);

  if (num_feat == 1) {
    // One scalar per selected entry: plain 1-D launch covering `len` threads.
    const int nt = cuda::FindNumThreads(len);
    const int nb = (len + nt - 1) / nt;
    CUDA_KERNEL_CALL(IndexSelectSingleKernel, nb, nt, 0, stream,
        array_data, idx_data, len, arr_len, ret_data);
  } else {
    // Start from a 256x1 block and, while the x extent is at least twice
    // num_feat, halve x and double y. The thread count stays at 256.
    dim3 block(256, 1);
    while (static_cast<int64_t>(block.x) >= 2*num_feat) {
      block.x /= 2;
      block.y *= 2;
    }
    // ceil(len / block.y) blocks along x.
    const dim3 grid((len+block.y-1)/block.y);
    CUDA_KERNEL_CALL(IndexSelectMultiKernel, grid, block, 0, stream,
        array_data, num_feat, idx_data, len, arr_len, ret_data);
  }
  return ret;
}

54
55
56
57
// Explicit CUDA instantiations of the array-indexed IndexSelect, one per
// (element type, index type) pair.

// Floating-point element types.
template NDArray IndexSelect<kDGLCUDA, float, int32_t>(
    NDArray, IdArray);
template NDArray IndexSelect<kDGLCUDA, float, int64_t>(
    NDArray, IdArray);
template NDArray IndexSelect<kDGLCUDA, double, int32_t>(
    NDArray, IdArray);
template NDArray IndexSelect<kDGLCUDA, double, int64_t>(
    NDArray, IdArray);

// Half-precision element type, only when FP16 support is compiled in.
#ifdef USE_FP16
template NDArray IndexSelect<kDGLCUDA, __half, int32_t>(
    NDArray, IdArray);
template NDArray IndexSelect<kDGLCUDA, __half, int64_t>(
    NDArray, IdArray);
#endif

// Integer element types.
template NDArray IndexSelect<kDGLCUDA, int32_t, int32_t>(
    NDArray, IdArray);
template NDArray IndexSelect<kDGLCUDA, int32_t, int64_t>(
    NDArray, IdArray);
template NDArray IndexSelect<kDGLCUDA, int64_t, int32_t>(
    NDArray, IdArray);
template NDArray IndexSelect<kDGLCUDA, int64_t, int64_t>(
    NDArray, IdArray);
66

67
/*!
 * \brief Fetch a single element of \a array into host memory.
 *
 * Copies sizeof(DType) bytes starting at element \a index of array->data from
 * array->ctx to the CPU context (device id 0) through the DeviceAPI of
 * array->ctx, and returns the copied value.
 *
 * \tparam XPU Device type tag (not referenced in the body; used for dispatch
 *         through the explicit instantiations).
 * \tparam DType Element type of \a array.
 * \param array Source array.
 * \param index Element offset into array->data. No range check is done here.
 * \return The element at \a index.
 */
template <DGLDeviceType XPU, typename DType>
DType IndexSelect(NDArray array, int64_t index) {
  auto device = runtime::DeviceAPI::Get(array->ctx);

#ifdef USE_FP16
  // In some setups __half can only be value-constructed in device code, and
  // this function runs on the host. Stage the bytes in a uint16_t instead
  // whenever DType is __half; every other DType is staged as itself.
  using HostDType = typename std::conditional<
      std::is_same<DType, __half>::value, uint16_t, DType>::type;
#else
  using HostDType = DType;
#endif
  HostDType host_value = 0;

  DType* const src = static_cast<DType*>(array->data) + index;
  DType* const dst = reinterpret_cast<DType*>(&host_value);
  device->CopyDataFromTo(
      src, 0, dst, 0,
      sizeof(DType), array->ctx, DGLContext{kDGLCPU, 0}, array->dtype);

  // Reinterpret the staging storage as DType (a no-op unless it is uint16_t).
  return reinterpret_cast<DType&>(host_value);
}

86
87
88
89
// Explicit CUDA instantiations of the scalar-indexed IndexSelect, one per
// element type.

// Floating-point element types.
template float IndexSelect<kDGLCUDA, float>(NDArray, int64_t);
template double IndexSelect<kDGLCUDA, double>(NDArray, int64_t);

// Half-precision element type, only when FP16 support is compiled in.
#ifdef USE_FP16
template __half IndexSelect<kDGLCUDA, __half>(NDArray, int64_t);
#endif

// Signed and unsigned integer element types.
template int32_t IndexSelect<kDGLCUDA, int32_t>(NDArray, int64_t);
template int64_t IndexSelect<kDGLCUDA, int64_t>(NDArray, int64_t);
template uint32_t IndexSelect<kDGLCUDA, uint32_t>(NDArray, int64_t);
template uint64_t IndexSelect<kDGLCUDA, uint64_t>(NDArray, int64_t);
95
96
97
98

}  // namespace impl
}  // namespace aten
}  // namespace dgl