utils.h 7.92 KB
Newer Older
1
2
/*!
 *  Copyright (c) 2020 by Contributors
3
 * \file array/cuda/utils.h
4
5
 * \brief Utilities for CUDA kernels.
 */
6
7
#ifndef DGL_ARRAY_CUDA_UTILS_H_
#define DGL_ARRAY_CUDA_UTILS_H_
8
9

#include <dmlc/logging.h>
10
11
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/ndarray.h>
12
#include <dlpack/dlpack.h>
13
#include "../../runtime/cuda/cuda_common.h"
14
15
16
17
18
19
20
21
22

namespace dgl {
namespace cuda {

#define CUDA_MAX_NUM_BLOCKS_X 0x7FFFFFFF
#define CUDA_MAX_NUM_BLOCKS_Y 0xFFFF
#define CUDA_MAX_NUM_BLOCKS_Z 0xFFFF
#define CUDA_MAX_NUM_THREADS 1024

23
24
25
26
27
28
29
30
31
32
33
34
35
#ifdef USE_FP16
#define SWITCH_BITS(bits, DType, ...)                           \
  do {                                                          \
    if ((bits) == 16) {                                         \
      typedef half DType;                                       \
      { __VA_ARGS__ }                                           \
    } else if ((bits) == 32) {                                  \
      typedef float DType;                                      \
      { __VA_ARGS__ }                                           \
    } else if ((bits) == 64) {                                  \
      typedef double DType;                                     \
      { __VA_ARGS__ }                                           \
    } else {                                                    \
36
      LOG(FATAL) << "Data type not recognized with bits " << bits; \
37
38
39
40
41
42
43
44
45
46
47
48
    }                                                           \
  } while (0)
#else  // USE_FP16
#define SWITCH_BITS(bits, DType, ...)                           \
  do {                                                          \
    if ((bits) == 32) {                                         \
      typedef float DType;                                      \
      { __VA_ARGS__ }                                           \
    } else if ((bits) == 64) {                                  \
      typedef double DType;                                     \
      { __VA_ARGS__ }                                           \
    } else {                                                    \
49
      LOG(FATAL) << "Data type not recognized with bits " << bits; \
50
51
52
53
    }                                                           \
  } while (0)
#endif  // USE_FP16

54
55
56
57
58
59
/*! \brief Calculate the number of threads needed given the dimension length.
 *
 * It finds the biggest number that is smaller than min(dim, max_nthrs)
 * and is also power of two.
 */
inline int FindNumThreads(int dim, int max_nthrs = CUDA_MAX_NUM_THREADS) {
60
61
62
  CHECK_GE(dim, 0);
  if (dim == 0)
    return 1;
63
64
65
66
67
68
69
  int ret = max_nthrs;
  while (ret > dim) {
    ret = ret >> 1;
  }
  return ret;
}

70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/*
 * !\brief Find number of blocks is smaller than nblks and max_nblks
 * on the given axis ('x', 'y' or 'z').
 */
template <char axis>
inline int FindNumBlocks(int nblks, int max_nblks = -1) {
  int default_max_nblks = -1;
  switch (axis) {
    case 'x':
      default_max_nblks = CUDA_MAX_NUM_BLOCKS_X;
      break;
    case 'y':
      default_max_nblks = CUDA_MAX_NUM_BLOCKS_Y;
      break;
    case 'z':
      default_max_nblks = CUDA_MAX_NUM_BLOCKS_Z;
      break;
    default:
      LOG(FATAL) << "Axis " << axis << " not recognized";
      break;
  }
  if (max_nblks == -1)
    max_nblks = default_max_nblks;
  CHECK_NE(nblks, 0);
  if (nblks < max_nblks)
    return nblks;
  return max_nblks;
}

template <typename T>
__device__ __forceinline__ T _ldg(T* addr) {
#if __CUDA_ARCH__ >= 350
  return __ldg(addr);
#else
  return *addr;
#endif
}

108
109
110
111
112
113
114
115
116
117
118
/*!
 * \brief Return true if the given bool flag array is all true.
 * The input bool array is in int8_t type so it is aligned with byte address.
 *
 * \param flags The bool array.
 * \param length The length.
 * \param ctx Device context.
 * \return True if all the flags are true.
 */
bool AllTrue(int8_t* flags, int64_t length, const DLContext& ctx);

119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
/*!
 * \brief CUDA Kernel of filling the vector started from ptr of size length
 *        with val.
 * \note internal use only.
 */
template <typename DType>
__global__ void _FillKernel(DType* ptr, size_t length, DType val) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    ptr[tx] = val;
    tx += stride_x;
  }
}

/*! \brief Fill the vector started from ptr of size length with val */
template <typename DType>
void _Fill(DType* ptr, size_t length, DType val) {
  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
  int nt = FindNumThreads(length);
  int nb = (length + nt - 1) / nt;  // on x-axis, no need to worry about upperbound.
  CUDA_KERNEL_CALL(cuda::_FillKernel, nb, nt, 0, thr_entry->stream, ptr, length, val);
}

143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
/*!
 * \brief Search adjacency list linearly for each (row, col) pair and
 * write the data under the matched position in the indices array to the output.
 *
 * If there is no match, the value in \c filler is written.
 * If there are multiple matches, only the first match is written.
 * If the given data array is null, write the matched position to the output.
 */
template <typename IdType, typename DType>
__global__ void _LinearSearchKernel(
    const IdType* indptr, const IdType* indices, const IdType* data,
    const IdType* row, const IdType* col,
    int64_t row_stride, int64_t col_stride,
    int64_t length, const DType* weights, DType filler, DType* out) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    int rpos = tx * row_stride, cpos = tx * col_stride;
    IdType v = -1;
    const IdType r = row[rpos], c = col[cpos];
    for (IdType i = indptr[r]; i < indptr[r + 1]; ++i) {
      if (indices[i] == c) {
        v = data ? data[i] : i;
        break;
      }
    }
169
    if (v == -1) {
170
      out[tx] = filler;
171
172
173
174
175
176
177
178
179
180
181
    } else {
      // The casts here are to be able to handle DType being __half.
      // GCC treats int64_t as a distinct type from long long, so
      // without the explcit cast to long long, it errors out saying
      // that the implicit cast results in an ambiguous choice of
      // constructor for __half.
      // The using statement is to avoid a linter error about using
      // long or long long.
      using LongLong = long long; // NOLINT
      out[tx] = weights ? weights[v] : DType(LongLong(v));
    }
182
183
184
185
    tx += stride_x;
  }
}

186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
template <typename DType>
inline DType GetCUDAScalar(
    runtime::DeviceAPI* device_api,
    DLContext ctx,
    const DType* cuda_ptr,
    cudaStream_t stream) {
  DType result;
  device_api->CopyDataFromTo(
      cuda_ptr, 0,
      &result, 0,
      sizeof(result),
      ctx,
      DLContext{kDLCPU, 0},
      DLDataTypeTraits<DType>::dtype,
      stream);
  return result;
}

204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
/*!
 * \brief Given a sorted array and a value this function returns the index
 * of the first element which compares greater than value.
 *
 * This function assumes 0-based index
 * @param A: ascending sorted array
 * @param n: size of the A
 * @param x: value to search in A
 * @return index, i, of the first element st. A[i]>x. If x>=A[n-1] returns n.
 * if x<A[0] then it returns 0.
 */
template <typename IdType>
__device__ IdType _UpperBound(const IdType *A, int64_t n, IdType x) {
  IdType l = 0, r = n, m = 0;
  while (l < r) {
    m = l + (r-l)/2;
    if (x >= A[m]) {
      l = m+1;
    } else {
      r = m;
    }
  }
  return l;
}

/*!
 * \brief Given a sorted array and a value this function returns the index
 * of the element who is equal to val. If not exist returns n+1
 *
 * This function assumes 0-based index
 * @param A: ascending sorted array
 * @param n: size of the A
 * @param x: value to search in A
 * @return index, i, st. A[i]==x. If such an index not exists returns 'n'.
 */
template <typename IdType>
__device__ IdType _BinarySearch(const IdType *A, int64_t n, IdType x) {
  IdType l = 0, r = n-1, m = 0;
  while (l <= r) {
    m = l + (r-l)/2;
    if (A[m] == x) {
      return m;
    }
    if (A[m] < x) {
      l = m+1;
    } else {
      r = m-1;
    }
  }
  return n;  // not found
}

256
257
258
}  // namespace cuda
}  // namespace dgl

259
#endif  // DGL_ARRAY_CUDA_UTILS_H_