"vscode:/vscode.git/clone" did not exist on "790212f4d992641f2cb17d2d6c4eaeff0c91e741"
array_op_impl.cu 15.5 KB
Newer Older
/*!
 *  Copyright (c) 2020-2021 by Contributors
 * \file array/cuda/array_op_impl.cu
 * \brief Array operator GPU implementation
 */
#include <dgl/array.h>
#include "../../runtime/cuda/cuda_common.h"
#include "../../runtime/cuda/cuda_hashtable.cuh"
#include "./utils.h"
#include "../arith.h"

namespace dgl {
using runtime::NDArray;
using namespace runtime::cuda;
namespace aten {
namespace impl {

///////////////////////////// BinaryElewise /////////////////////////////

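/*!
 * \brief Kernel computing out[i] = Op(lhs[i], rhs[i]) element-wise, using a
 *        grid-stride loop so any launch configuration covers the full array.
 */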
template <typename IdType, typename Op>
__global__ void _BinaryElewiseKernel(
    const IdType* lhs, const IdType* rhs, IdType* out, int64_t length) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = Op::Call(lhs[tx], rhs[tx]);
    tx += stride_x;
  }
}

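/*!
 * \brief Element-wise binary operator on two equal-length ID arrays,
 *        launched on the current CUDA stream. Usage sketch (hypothetical
 *        call site): BinaryElewise<kDLGPU, int64_t, arith::Add>(a, b).
 */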
template <DLDeviceType XPU, typename IdType, typename Op>
IdArray BinaryElewise(IdArray lhs, IdArray rhs) {
  const int64_t len = lhs->shape[0];
  IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
  const IdType* lhs_data = static_cast<IdType*>(lhs->data);
  const IdType* rhs_data = static_cast<IdType*>(rhs->data);
  IdType* ret_data = static_cast<IdType*>(ret->data);
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  int nt = cuda::FindNumThreads(len);
  int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
      nb, nt, 0, stream,
      lhs_data, rhs_data, ret_data, len);
  return ret;
}

template IdArray BinaryElewise<kDLGPU, int32_t, arith::Add>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Sub>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mul>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Div>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mod>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::EQ>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::NE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Add>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Sub>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mul>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Div>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mod>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LT>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LE>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::EQ>(IdArray lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::NE>(IdArray lhs, IdArray rhs);


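/*!
 * \brief Kernel computing out[i] = Op(lhs[i], rhs) for a scalar rhs.
 */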
template <typename IdType, typename Op>
__global__ void _BinaryElewiseKernel(
    const IdType* lhs, IdType rhs, IdType* out, int64_t length) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = Op::Call(lhs[tx], rhs);
    tx += stride_x;
  }
}

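/*!
 * \brief Element-wise binary operator between an ID array and a scalar rhs.
 */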
template <DLDeviceType XPU, typename IdType, typename Op>
IdArray BinaryElewise(IdArray lhs, IdType rhs) {
  const int64_t len = lhs->shape[0];
  IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
  const IdType* lhs_data = static_cast<IdType*>(lhs->data);
  IdType* ret_data = static_cast<IdType*>(ret->data);
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  int nt = cuda::FindNumThreads(len);
  int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
      nb, nt, 0, stream,
      lhs_data, rhs, ret_data, len);
  return ret;
}

template IdArray BinaryElewise<kDLGPU, int32_t, arith::Add>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Sub>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mul>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Div>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mod>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GT>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LT>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::EQ>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::NE>(IdArray lhs, int32_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Add>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Sub>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mul>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Div>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mod>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GT>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LT>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GE>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LE>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::EQ>(IdArray lhs, int64_t rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::NE>(IdArray lhs, int64_t rhs);



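/*!
 * \brief Kernel computing out[i] = Op(lhs, rhs[i]) for a scalar lhs.
 */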
template <typename IdType, typename Op>
__global__ void _BinaryElewiseKernel(
    IdType lhs, const IdType* rhs, IdType* out, int64_t length) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = Op::Call(lhs, rhs[tx]);
    tx += stride_x;
  }
}

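/*!
 * \brief Element-wise binary operator between a scalar lhs and an ID array.
 */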
template <DLDeviceType XPU, typename IdType, typename Op>
IdArray BinaryElewise(IdType lhs, IdArray rhs) {
  const int64_t len = rhs->shape[0];
  IdArray ret = NewIdArray(rhs->shape[0], rhs->ctx, rhs->dtype.bits);
  const IdType* rhs_data = static_cast<IdType*>(rhs->data);
  IdType* ret_data = static_cast<IdType*>(ret->data);
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  int nt = cuda::FindNumThreads(len);
  int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL((_BinaryElewiseKernel<IdType, Op>),
      nb, nt, 0, stream,
      lhs, rhs_data, ret_data, len);
  return ret;
}

template IdArray BinaryElewise<kDLGPU, int32_t, arith::Add>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Sub>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mul>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Div>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::Mod>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GT>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LT>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::GE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::LE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::EQ>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int32_t, arith::NE>(int32_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Add>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Sub>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mul>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Div>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::Mod>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GT>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LT>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::GE>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::LE>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::EQ>(int64_t lhs, IdArray rhs);
template IdArray BinaryElewise<kDLGPU, int64_t, arith::NE>(int64_t lhs, IdArray rhs);

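/*!
 * \brief Kernel computing out[i] = Op(lhs[i]) element-wise.
 */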
template <typename IdType, typename Op>
__global__ void _UnaryElewiseKernel(
    const IdType* lhs, IdType* out, int64_t length) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = Op::Call(lhs[tx]);
    tx += stride_x;
  }
}

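/*!
 * \brief Element-wise unary operator over an ID array; only arith::Neg is
 *        instantiated below.
 */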
template <DLDeviceType XPU, typename IdType, typename Op>
IdArray UnaryElewise(IdArray lhs) {
  const int64_t len = lhs->shape[0];
  IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits);
  const IdType* lhs_data = static_cast<IdType*>(lhs->data);
  IdType* ret_data = static_cast<IdType*>(ret->data);
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  int nt = cuda::FindNumThreads(len);
  int nb = (len + nt - 1) / nt;
  CUDA_KERNEL_CALL((_UnaryElewiseKernel<IdType, Op>),
      nb, nt, 0, stream,
      lhs_data, ret_data, len);
  return ret;
}

template IdArray UnaryElewise<kDLGPU, int32_t, arith::Neg>(IdArray lhs);
template IdArray UnaryElewise<kDLGPU, int64_t, arith::Neg>(IdArray lhs);

///////////////////////////// Full /////////////////////////////

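/*!
 * \brief Kernel filling every element of out with val.
 */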
template <typename DType>
__global__ void _FullKernel(
    DType* out, int64_t length, DType val) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = val;
    tx += stride_x;
  }
}

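/*!
 * \brief Create a 1D array of the given length on the given device, with
 *        every element set to val.
 */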
template <DLDeviceType XPU, typename DType>
NDArray Full(DType val, int64_t length, DLContext ctx) {
  NDArray ret = NDArray::Empty({length}, DLDataTypeTraits<DType>::dtype, ctx);
  DType* ret_data = static_cast<DType*>(ret->data);
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  int nt = cuda::FindNumThreads(length);
  int nb = (length + nt - 1) / nt;
  CUDA_KERNEL_CALL((_FullKernel<DType>), nb, nt, 0, stream,
      ret_data, length, val);
  return ret;
}

template IdArray Full<kDLGPU, int32_t>(int32_t val, int64_t length, DLContext ctx);
template IdArray Full<kDLGPU, int64_t>(int64_t val, int64_t length, DLContext ctx);
#ifdef USE_FP16
template IdArray Full<kDLGPU, __half>(__half val, int64_t length, DLContext ctx);
#endif
template IdArray Full<kDLGPU, float>(float val, int64_t length, DLContext ctx);
template IdArray Full<kDLGPU, double>(double val, int64_t length, DLContext ctx);

///////////////////////////// Range /////////////////////////////

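/*!
 * \brief Kernel writing out[i] = low + i for i in [0, length).
 */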
template <typename IdType>
__global__ void _RangeKernel(IdType* out, IdType low, IdType length) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = low + tx;
    tx += stride_x;
  }
}

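/*!
 * \brief Create the integer range [low, high) as a 1D array on the device.
 */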
template <DLDeviceType XPU, typename IdType>
IdArray Range(IdType low, IdType high, DLContext ctx) {
  CHECK(high >= low) << "high must be greater than or equal to low";
  const IdType length = high - low;
  IdArray ret = NewIdArray(length, ctx, sizeof(IdType) * 8);
  if (length == 0)
    return ret;
  IdType* ret_data = static_cast<IdType*>(ret->data);
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  int nt = cuda::FindNumThreads(length);
  int nb = (length + nt - 1) / nt;
  CUDA_KERNEL_CALL((_RangeKernel<IdType>),
      nb, nt, 0, stream,
      ret_data, low, length);
  return ret;
}

template IdArray Range<kDLGPU, int32_t>(int32_t, int32_t, DLContext);
template IdArray Range<kDLGPU, int64_t>(int64_t, int64_t, DLContext);

///////////////////////////// Relabel_ //////////////////////////////

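/*!
 * \brief Kernel replacing each ID in out with its relabeled ID from the
 *        hash table, in place.
 */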
template <typename IdType>
__global__ void _RelabelKernel(
    IdType* out, int64_t length, DeviceOrderedHashTable<IdType> table) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = table.Search(out[tx])->local;
    tx += stride_x;
  }
}

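/*!
 * \brief Relabel the IDs in the given arrays to a consecutive range starting
 *        at 0 (in place) and return the induced set of unique nodes.
 */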
template <DLDeviceType XPU, typename IdType>
IdArray Relabel_(const std::vector<IdArray>& arrays) {
  IdArray all_nodes = Concat(arrays);
  const int64_t total_length = all_nodes->shape[0];

  if (total_length == 0) {
    return all_nodes;
  }

  const auto& ctx = arrays[0]->ctx;
  auto device = runtime::DeviceAPI::Get(ctx);
  cudaStream_t stream = runtime::getCurrentCUDAStream();

  // build node maps and get the induced nodes
  OrderedHashTable<IdType> node_map(total_length, ctx, stream);
  int64_t num_induced = 0;
  int64_t * num_induced_device = static_cast<int64_t*>(
      device->AllocWorkspace(ctx, sizeof(int64_t)));
  IdArray induced_nodes = NewIdArray(total_length, ctx, sizeof(IdType)*8);

  CUDA_CALL(cudaMemsetAsync(
    num_induced_device,
    0,
    sizeof(*num_induced_device),
    stream));

  node_map.FillWithDuplicates(
    all_nodes.Ptr<IdType>(),
    all_nodes->shape[0],
    induced_nodes.Ptr<IdType>(),
    num_induced_device,
    stream);
  // copy using the internal current stream
  device->CopyDataFromTo(
    num_induced_device, 0,
    &num_induced, 0,
    sizeof(num_induced),
    ctx,
    DGLContext{kDLCPU, 0},
    DGLType{kDLInt, 64, 1});

  device->StreamSync(ctx, stream);
  device->FreeWorkspace(ctx, num_induced_device);

  // resize the induced nodes
  induced_nodes->shape[0] = num_induced;

  // relabel
  const int nt = 128;
  for (IdArray arr : arrays) {
    const int64_t length = arr->shape[0];
    int nb = (length + nt - 1) / nt;
    CUDA_KERNEL_CALL((_RelabelKernel<IdType>),
      nb, nt, 0, stream,
      arr.Ptr<IdType>(), length, node_map.DeviceHandle());
  }

  return induced_nodes;
}

template IdArray Relabel_<kDLGPU, int32_t>(const std::vector<IdArray>& arrays);
template IdArray Relabel_<kDLGPU, int64_t>(const std::vector<IdArray>& arrays);

///////////////////////////// AsNumBits /////////////////////////////

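/*!
 * \brief Kernel copying in[] to out[] with an element-wise cast between
 *        integer types.
 */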
template <typename InType, typename OutType>
__global__ void _CastKernel(const InType* in, OutType* out, size_t length) {
  int tx = blockIdx.x * blockDim.x + threadIdx.x;
  int stride_x = gridDim.x * blockDim.x;
  while (tx < length) {
    out[tx] = in[tx];
    tx += stride_x;
  }
}

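/*!
 * \brief Return a copy of arr cast to the requested integer bit width
 *        (32 or 64). Usage sketch: AsNumBits<kDLGPU, int32_t>(arr, 64).
 */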
template <DLDeviceType XPU, typename IdType>
IdArray AsNumBits(IdArray arr, uint8_t bits) {
  const std::vector<int64_t> shape(arr->shape, arr->shape + arr->ndim);
  IdArray ret = IdArray::Empty(shape, DLDataType{kDLInt, bits, 1}, arr->ctx);
  const int64_t length = ret.NumElements();
  cudaStream_t stream = runtime::getCurrentCUDAStream();
  int nt = cuda::FindNumThreads(length);
  int nb = (length + nt - 1) / nt;
  if (bits == 32) {
    CUDA_KERNEL_CALL((_CastKernel<IdType, int32_t>),
        nb, nt, 0, stream,
        static_cast<IdType*>(arr->data), static_cast<int32_t*>(ret->data), length);
  } else {
    CUDA_KERNEL_CALL((_CastKernel<IdType, int64_t>),
        nb, nt, 0, stream,
        static_cast<IdType*>(arr->data), static_cast<int64_t*>(ret->data), length);
  }
  return ret;
}


template IdArray AsNumBits<kDLGPU, int32_t>(IdArray arr, uint8_t bits);
template IdArray AsNumBits<kDLGPU, int64_t>(IdArray arr, uint8_t bits);

}  // namespace impl
}  // namespace aten
}  // namespace dgl