/*!
 *  Copyright (c) 2020 by Contributors
 * \file array/cuda/utils.h
 * \brief Utilities for CUDA kernels.
 */
#ifndef DGL_ARRAY_CUDA_UTILS_H_
#define DGL_ARRAY_CUDA_UTILS_H_

#include <dmlc/logging.h>
#include <dgl/runtime/c_runtime_api.h>
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/ndarray.h>

#include "../../runtime/cuda/cuda_common.h"
#include "dgl_cub.cuh"
namespace dgl {
namespace cuda {

// Maximum number of blocks per grid axis used when sizing kernel launches.
#define CUDA_MAX_NUM_BLOCKS_X 0x7FFFFFFF
#define CUDA_MAX_NUM_BLOCKS_Y 0xFFFF
#define CUDA_MAX_NUM_BLOCKS_Z 0xFFFF

// The max number of threads per block
#define CUDA_MAX_NUM_THREADS 256
#ifdef USE_FP16
/*!
 * \brief Dispatch code according to the bit-width of a data type.
 *
 * Typedefs \c DType to half (16), float (32) or double (64) and then runs
 * \c __VA_ARGS__ with that typedef in scope. Any other bit-width is a
 * fatal error.
 */
#define SWITCH_BITS(bits, DType, ...)                           \
  do {                                                          \
    if ((bits) == 16) {                                         \
      typedef half DType;                                       \
      { __VA_ARGS__ }                                           \
    } else if ((bits) == 32) {                                  \
      typedef float DType;                                      \
      { __VA_ARGS__ }                                           \
    } else if ((bits) == 64) {                                  \
      typedef double DType;                                     \
      { __VA_ARGS__ }                                           \
    } else {                                                    \
      LOG(FATAL) << "Data type not recognized with bits " << bits; \
    }                                                           \
  } while (0)
#else  // USE_FP16
/*!
 * \brief Dispatch code according to the bit-width of a data type.
 *
 * Without USE_FP16 only float (32) and double (64) are supported; any
 * other bit-width (including 16) is a fatal error.
 */
#define SWITCH_BITS(bits, DType, ...)                           \
  do {                                                          \
    if ((bits) == 32) {                                         \
      typedef float DType;                                      \
      { __VA_ARGS__ }                                           \
    } else if ((bits) == 64) {                                  \
      typedef double DType;                                     \
      { __VA_ARGS__ }                                           \
    } else {                                                    \
      LOG(FATAL) << "Data type not recognized with bits " << bits; \
    }                                                           \
  } while (0)
#endif  // USE_FP16
/*! \brief Calculate the number of threads needed given the dimension length.
 *
 * It finds the biggest number that is smaller than min(dim, max_nthrs)
 * and is also power of two (assuming max_nthrs itself is a power of two,
 * as the default CUDA_MAX_NUM_THREADS is).
 */
inline int FindNumThreads(int dim, int max_nthrs = CUDA_MAX_NUM_THREADS) {
  CHECK_GE(dim, 0);
  if (dim == 0)
    return 1;  // a zero-thread block is an invalid launch config; use 1 thread
  // Repeatedly halve until the candidate no longer exceeds dim.
  int ret = max_nthrs;
  while (ret > dim) {
    ret = ret >> 1;
  }
  return ret;
}
/*!
 * \brief Find a number of blocks that is no larger than both nblks and the
 * limit for the given axis ('x', 'y' or 'z'), where the limit defaults to
 * the hardware maximum for that axis when max_nblks is -1.
 */
template <char axis>
inline int FindNumBlocks(int nblks, int max_nblks = -1) {
  // Per-axis hardware grid-dimension limit.
  int axis_limit = -1;
  switch (axis) {
    case 'x':
      axis_limit = CUDA_MAX_NUM_BLOCKS_X;
      break;
    case 'y':
      axis_limit = CUDA_MAX_NUM_BLOCKS_Y;
      break;
    case 'z':
      axis_limit = CUDA_MAX_NUM_BLOCKS_Z;
      break;
    default:
      LOG(FATAL) << "Axis " << axis << " not recognized";
      break;
  }
  if (max_nblks == -1)
    max_nblks = axis_limit;
  CHECK_NE(nblks, 0);
  // Clamp the requested block count to the effective maximum.
  return (nblks < max_nblks) ? nblks : max_nblks;
}

/*!
 * \brief Load a value, going through the read-only data cache on devices
 * of compute capability 3.5+ (via __ldg) and falling back to a plain
 * dereference otherwise.
 *
 * The parameter is const-qualified (matching __ldg's own signature) so
 * both const and non-const pointers can be passed; existing callers with
 * non-const pointers are unaffected.
 */
template <typename T>
__device__ __forceinline__ T _ldg(const T* addr) {
#if __CUDA_ARCH__ >= 350
  return __ldg(addr);
#else
  return *addr;
#endif
}

/*!
 * \brief Return true if the given bool flag array is all true.
 * The input bool array is in int8_t type so it is aligned with byte address.
 *
 * \param flags The bool array.
 * \param length The length.
 * \param ctx Device context.
 * \return True if all the flags are true.
 */
bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx);

/*!
 * \brief CUDA Kernel of filling the vector started from ptr of size length
 *        with val.
 * \note internal use only.
 */
template <typename DType>
__global__ void _FillKernel(DType* ptr, size_t length, DType val) {
  // 64-bit indexing: blockIdx.x * blockDim.x can overflow a 32-bit int
  // when length exceeds INT_MAX.
  size_t tx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t stride_x = static_cast<size_t>(gridDim.x) * blockDim.x;
  // Grid-stride loop: correct for any launch configuration.
  while (tx < length) {
    ptr[tx] = val;
    tx += stride_x;
  }
}

/*! \brief Fill the vector started from ptr of size length with val.
 *
 * Launches _FillKernel on the current CUDA stream. A zero-length fill is
 * a no-op (launching with zero blocks would be an invalid configuration).
 */
template <typename DType>
void _Fill(DType* ptr, size_t length, DType val) {
  if (length == 0)
    return;
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  int nt = FindNumThreads(length);
  int nb = (length + nt - 1) / nt;  // on x-axis, no need to worry about upperbound.
  CUDA_KERNEL_CALL(cuda::_FillKernel, nb, nt, 0, stream, ptr, length, val);
}
/*!
 * \brief Search adjacency list linearly for each (row, col) pair and
 * write the data under the matched position in the indices array to the output.
 *
 * If there is no match, the value in \c filler is written.
 * If there are multiple matches, only the first match is written.
 * If the given data array is null, write the matched position to the output.
 */
template <typename IdType, typename DType>
__global__ void _LinearSearchKernel(
    const IdType* indptr, const IdType* indices, const IdType* data,
    const IdType* row, const IdType* col,
    int64_t row_stride, int64_t col_stride,
    int64_t length, const DType* weights, DType filler, DType* out) {
  // 64-bit indexing: both the global index and the tx * stride products
  // can exceed the range of a 32-bit int for large inputs.
  int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const int64_t stride_x = static_cast<int64_t>(gridDim.x) * blockDim.x;
  while (tx < length) {
    const int64_t rpos = tx * row_stride, cpos = tx * col_stride;
    IdType v = -1;  // sentinel: no match found yet
    const IdType r = row[rpos], c = col[cpos];
    // Linear scan over row r's adjacency list; keep only the first match.
    for (IdType i = indptr[r]; i < indptr[r + 1]; ++i) {
      if (indices[i] == c) {
        v = data ? data[i] : i;
        break;
      }
    }
    if (v == -1) {
      out[tx] = filler;
    } else {
      // The casts here are to be able to handle DType being __half.
      // GCC treats int64_t as a distinct type from long long, so
      // without the explicit cast to long long, it errors out saying
      // that the implicit cast results in an ambiguous choice of
      // constructor for __half.
      // The using statement is to avoid a linter error about using
      // long or long long.
      using LongLong = long long;  // NOLINT
      out[tx] = weights ? weights[v] : DType(LongLong(v));
    }
    tx += stride_x;
  }
}

/*!
 * \brief Read a single scalar that lives in GPU memory back to the host.
 *
 * \param device_api Device API used to issue the device-to-host copy.
 * \param ctx The device context that \c cuda_ptr belongs to.
 * \param cuda_ptr Device pointer to the scalar.
 * \return The scalar value, copied into host memory.
 *
 * NOTE(review): assumes CopyDataFromTo blocks until the copy completes
 * (result is read immediately after) — confirm against the DeviceAPI
 * implementation for CUDA.
 */
template <typename DType>
inline DType GetCUDAScalar(
    runtime::DeviceAPI* device_api,
    DGLContext ctx,
    const DType* cuda_ptr) {
  DType result;
  device_api->CopyDataFromTo(
      cuda_ptr, 0,
      &result, 0,
      sizeof(result),
      ctx,
      DGLContext{kDGLCPU, 0},
      DGLDataTypeTraits<DType>::dtype);
  return result;
}

/*!
 * \brief Given a sorted array and a value this function returns the index
 * of the first element which compares greater than value.
 *
 * This function assumes 0-based index
 * @param A: ascending sorted array
 * @param n: size of the A
 * @param x: value to search in A
 * @return index, i, of the first element st. A[i]>x. If x>=A[n-1] returns n.
 * if x<A[0] then it returns 0.
 */
template <typename IdType>
__device__ IdType _UpperBound(const IdType *A, int64_t n, IdType x) {
  // Binary search maintaining the invariant A[0..l) <= x < A[r..n).
  IdType l = 0, r = n, m = 0;
  while (l < r) {
    m = l + (r-l)/2;  // midpoint without (l + r) overflow
    if (x >= A[m]) {
      l = m+1;
    } else {
      r = m;
    }
  }
  return l;
}

/*!
 * \brief Given a sorted array and a value this function returns the index
 * of the element who is equal to val. If not exist returns n.
 *
 * This function assumes 0-based index
 * @param A: ascending sorted array
 * @param n: size of the A
 * @param x: value to search in A
 * @return index, i, st. A[i]==x. If such an index not exists returns 'n'.
 */
template <typename IdType>
__device__ IdType _BinarySearch(const IdType *A, int64_t n, IdType x) {
  // Classic binary search over A[l..r] (inclusive bounds).
  IdType l = 0, r = n-1, m = 0;
  while (l <= r) {
    m = l + (r-l)/2;  // midpoint computed without (l + r) overflow
    if (A[m] == x) {
      return m;
    }
    if (A[m] < x) {
      l = m+1;
    } else {
      r = m-1;
    }
  }
  return n;  // not found
}

/*!
 * \brief Compact the elements of \c input whose corresponding \c mask entry
 *        is nonzero into \c output, using cub::DeviceSelect::Flagged.
 *
 * \param device Device API used to allocate the temporary cub workspace.
 * \param ctx Device context for the workspace allocation.
 * \param input Device array of n input elements.
 * \param mask Device array of n flags; element i is kept iff mask[i] is true.
 * \param output Device array receiving the selected elements.
 * \param n Number of input elements.
 * \param rst Pointer receiving the number of selected elements
 *        (cub's d_num_selected_out; presumably a device pointer — confirm).
 * \param stream CUDA stream the selection is enqueued on.
 */
template <typename DType, typename BoolType>
void MaskSelect(
    runtime::DeviceAPI* device, const DGLContext& ctx,
    const DType* input, const BoolType* mask, DType* output, int64_t n,
    int64_t* rst, cudaStream_t stream) {
  // First call with a null workspace only computes the required size.
  size_t workspace_size = 0;
  CUDA_CALL(cub::DeviceSelect::Flagged(
      nullptr, workspace_size, input, mask, output, rst, n, stream));
  void* workspace = device->AllocWorkspace(ctx, workspace_size);
  // Second call performs the actual selection.
  CUDA_CALL(cub::DeviceSelect::Flagged(
      workspace, workspace_size, input, mask, output, rst, n, stream));
  device->FreeWorkspace(ctx, workspace);
}
}  // namespace cuda
}  // namespace dgl

#endif  // DGL_ARRAY_CUDA_UTILS_H_