// !!! This is a file automatically generated by hipify!!!
/**
 *  Copyright (c) 2017-2022 by Contributors
 * @file cuda_device_api.cc
 * @brief GPU specific API
 */
#include <hip/hip_runtime.h>
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/registry.h>
#include <dgl/runtime/tensordispatch.h>
#include <dmlc/thread_local.h>

#include "cuda_common.h"

namespace dgl {
namespace runtime {

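// HIP-backed implementation of the DGL DeviceAPI for GPU devices. The
// CUDA-flavored names are kept from the original file; all runtime calls
// below are the hipified equivalents.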
class CUDADeviceAPI final : public DeviceAPI {
 public:
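  // Probe the number of visible devices once at construction; any error from
  // hipGetDeviceCount() is cleared with hipGetLastError() and treated as
  // "no GPU available".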
  CUDADeviceAPI() {
    int count;
    auto err = hipGetDeviceCount(&count);
    switch (err) {
      case hipSuccess:
        break;
      default:
        count = 0;
        hipGetLastError();
    }
    is_available_ = count > 0;
  }

  bool IsAvailable() final { return is_available_; }

  void SetDevice(DGLContext ctx) final {
    CUDA_CALL(hipSetDevice(ctx.device_id));
  }
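  // Query a single device attribute. String-valued attributes (compute
  // version, device name, max thread dimensions) are written to *rv and
  // return early; all other attributes are returned as an int.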
  void GetAttr(DGLContext ctx, DeviceAttrKind kind, DGLRetValue* rv) final {
    int value = 0;
    switch (kind) {
      case kExist:
        value =
            (hipDeviceGetAttribute(
                 &value, hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id) ==
             hipSuccess);
        break;
      case kMaxThreadsPerBlock: {
        CUDA_CALL(hipDeviceGetAttribute(
            &value, hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id));
        break;
      }
      case kWarpSize: {
        CUDA_CALL(
            hipDeviceGetAttribute(
                &value, hipDeviceAttributeWarpSize, ctx.device_id));
        break;
      }
      case kMaxSharedMemoryPerBlock: {
        CUDA_CALL(hipDeviceGetAttribute(
            &value, hipDeviceAttributeMaxSharedMemoryPerBlock, ctx.device_id));
        break;
      }
      case kComputeVersion: {
        std::ostringstream os;
        CUDA_CALL(hipDeviceGetAttribute(
            &value, hipDeviceAttributeComputeCapabilityMajor, ctx.device_id));
        os << value << ".";
        CUDA_CALL(hipDeviceGetAttribute(
            &value, hipDeviceAttributeComputeCapabilityMinor, ctx.device_id));
        os << value;
        *rv = os.str();
        return;
      }
      case kDeviceName: {
        hipDeviceProp_t props;
        CUDA_CALL(hipGetDeviceProperties(&props, ctx.device_id));
        *rv = std::string(props.name);
        return;
      }
      case kMaxClockRate: {
        CUDA_CALL(hipDeviceGetAttribute(
            &value, hipDeviceAttributeClockRate, ctx.device_id));
        break;
      }
      case kMultiProcessorCount: {
        CUDA_CALL(hipDeviceGetAttribute(
            &value, hipDeviceAttributeMultiprocessorCount, ctx.device_id));
        break;
      }
      case kMaxThreadDimensions: {
        int dims[3];
        CUDA_CALL(hipDeviceGetAttribute(
            &dims[0], hipDeviceAttributeMaxBlockDimX, ctx.device_id));
        CUDA_CALL(hipDeviceGetAttribute(
            &dims[1], hipDeviceAttributeMaxBlockDimY, ctx.device_id));
        CUDA_CALL(hipDeviceGetAttribute(
            &dims[2], hipDeviceAttributeMaxBlockDimZ, ctx.device_id));

        std::stringstream ss;  // use a JSON string to return multiple ints
        ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]";
        *rv = ss.str();
        return;
      }
    }
    *rv = value;
  }
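  // Device-memory allocation. Redirected to the backend's caching allocator
  // (via TensorDispatcher) when available; otherwise a plain hipMalloc with
  // the 256-byte alignment guarantee checked below.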
  void* AllocDataSpace(
      DGLContext ctx, size_t nbytes, size_t alignment,
      DGLDataType type_hint) final {
    SetDevice(ctx);
    // Redirect to PyTorch's allocator when available.
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    if (tensor_dispatcher->IsAvailable()) {
      return tensor_dispatcher->CUDAAllocWorkspace(
          nbytes, getCurrentHIPStreamMasqueradingAsCUDA());
    }
    CHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes";
    void* ret;
    CUDA_CALL(hipMalloc(&ret, nbytes));
    return ret;
  }

  void FreeDataSpace(DGLContext ctx, void* ptr) final {
    SetDevice(ctx);
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    if (tensor_dispatcher->IsAvailable()) {
      return tensor_dispatcher->CUDAFreeWorkspace(ptr);
    }
    CUDA_CALL(hipFree(ptr));
  }

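  // Generic copy on the given stream. Same-device GPU copies use an async
  // device-to-device memcpy, cross-device GPU copies use hipMemcpyPeerAsync,
  // and GPU<->CPU copies use the matching async memcpy kind.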
  void CopyDataFromTo(
      const void* from, size_t from_offset, void* to, size_t to_offset,
      size_t size, DGLContext ctx_from, DGLContext ctx_to,
      DGLDataType type_hint, DGLStreamHandle stream) {
    hipStream_t cu_stream = static_cast<hipStream_t>(stream);
    from = static_cast<const char*>(from) + from_offset;
    to = static_cast<char*>(to) + to_offset;
    if ((ctx_from.device_type == kDGLCUDA &&
         ctx_to.device_type == kDGLCUDA) ||
        (ctx_from.device_type == kDGLROCM &&
         ctx_to.device_type == kDGLROCM)) {
      CUDA_CALL(hipSetDevice(ctx_from.device_id));
      if (ctx_from.device_id == ctx_to.device_id) {
        GPUCopy(from, to, size, hipMemcpyDeviceToDevice, cu_stream);
      } else {
        CUDA_CALL(hipMemcpyPeerAsync(
            to, ctx_to.device_id, from, ctx_from.device_id, size, cu_stream));
      }
    } else if (
        (ctx_from.device_type == kDGLCUDA ||
         ctx_from.device_type == kDGLROCM) &&
        ctx_to.device_type == kDGLCPU) {
      CUDA_CALL(hipSetDevice(ctx_from.device_id));
      GPUCopy(from, to, size, hipMemcpyDeviceToHost, cu_stream);
    } else if (
        ctx_from.device_type == kDGLCPU &&
        (ctx_to.device_type == kDGLCUDA ||
         ctx_to.device_type == kDGLROCM)) {
      CUDA_CALL(hipSetDevice(ctx_to.device_id));
      GPUCopy(from, to, size, hipMemcpyHostToDevice, cu_stream);
    } else {
      LOG(FATAL) << "expect copy from/to GPU or between GPUs";
    }
  }

  void CopyDataFromTo(
      const void* from, size_t from_offset, void* to, size_t to_offset,
      size_t size, DGLContext ctx_from, DGLContext ctx_to,
      DGLDataType type_hint) final {
    auto stream = GetStream();
    CopyDataFromTo(
        from, from_offset, to, to_offset, size, ctx_from, ctx_to, type_hint,
        stream);
  }

  // To ensure correct behavior, `record_event` must be invoked anytime a
  // pointer from PyTorch CachingHostAllocator is used in a hipMemcpyAsync
  // call. It provides a way to re-use freed pinned (page-locked) memory
  // allocations and avoid device sync due to hipHostFree calls.
  void RecordedCopyDataFromTo(
      void* from, size_t from_offset, void* to, size_t to_offset, size_t size,
      DGLContext ctx_from, DGLContext ctx_to, DGLDataType type_hint,
      void* pytorch_ctx) final {
    auto stream = GetStream();
    CopyDataFromTo(
        from, from_offset, to, to_offset, size, ctx_from, ctx_to, type_hint,
        stream);
    auto tensor_dispatcher = TensorDispatcher::Global();
    if (tensor_dispatcher->IsAvailable()) {
      auto custream = static_cast<hipStream_t>(stream);
      void* ptr = ctx_to.device_type == kDGLCPU ? to : from;
      int id =
          ctx_to.device_type == kDGLCPU ? ctx_from.device_id : ctx_to.device_id;
      tensor_dispatcher->CUDARecordHostAlloc(ptr, pytorch_ctx, custream, id);
    }
  }

  DGLStreamHandle CreateStream(DGLContext ctx) {
    CUDA_CALL(hipSetDevice(ctx.device_id));
    hipStream_t retval;
    // make sure the legacy default stream won't block on this stream
    CUDA_CALL(hipStreamCreateWithFlags(&retval, hipStreamNonBlocking));
    return static_cast<DGLStreamHandle>(retval);
  }

  void FreeStream(DGLContext ctx, DGLStreamHandle stream) {
    CUDA_CALL(hipSetDevice(ctx.device_id));
    hipStream_t cu_stream = static_cast<hipStream_t>(stream);
    CUDA_CALL(hipStreamDestroy(cu_stream));
  }

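  // Make `event_dst` wait for all work currently queued on `event_src`:
  // record a temporary event on the source stream and let the destination
  // stream wait on it, without blocking the host.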
  void SyncStreamFromTo(
      DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst) {
    CUDA_CALL(hipSetDevice(ctx.device_id));
    hipStream_t src_stream = static_cast<hipStream_t>(event_src);
    hipStream_t dst_stream = static_cast<hipStream_t>(event_dst);
    hipEvent_t evt;
    CUDA_CALL(hipEventCreate(&evt));
    CUDA_CALL(hipEventRecord(evt, src_stream));
    CUDA_CALL(hipStreamWaitEvent(dst_stream, evt, 0));
    CUDA_CALL(hipEventDestroy(evt));
  }

  void StreamSync(DGLContext ctx, DGLStreamHandle stream) final {
    CUDA_CALL(hipSetDevice(ctx.device_id));
    CUDA_CALL(hipStreamSynchronize(static_cast<hipStream_t>(stream)));
  }

  /** NOTE: If the backend is PyTorch, we will use PyTorch's stream management,
   *        so just avoid calling our SetStream/CreateStream unless
   *        you really need advanced stream control.
   * TODO(Xin): Redirect this to PyTorch or remove it.
   * PyTorch allows external CUDA streams to be set as current since v1.11.
   */
  void SetStream(DGLContext ctx, DGLStreamHandle stream) final {}

  DGLStreamHandle GetStream() const final {
    return static_cast<DGLStreamHandle>(
        getCurrentHIPStreamMasqueradingAsCUDA());
  }

  /** NOTE: hipHostRegister can be called from an arbitrary GPU device,
   *        so we don't need to specify a ctx.
   *        The pinned memory can be seen by all CUDA contexts,
   *        not just the one that performed the allocation
   */
  bool PinData(void* ptr, size_t nbytes) override {
    // prevent users from pinning empty tensors or graphs
    if (ptr == nullptr || nbytes == 0) return false;
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    // Minimize the pinned memory pool allocated by the backend (via
    // tensoradapter) to leave enough memory for DGL's own in-place pin-memory
    // operation.
    if (tensor_dispatcher->IsAvailable()) {
      tensor_dispatcher->CUDAHostAllocatorEmptyCache();
    }
    CUDA_CALL(hipHostRegister(ptr, nbytes, hipHostRegisterDefault));
    return true;
  }

  void UnpinData(void* ptr) {
    if (ptr == nullptr) return;
    CUDA_CALL(hipHostUnregister(ptr));
  }

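  // Pinned (page-locked) host allocation served by the backend's
  // CachingHostAllocator through the tensoradapter; requires PyTorch 1.11+
  // as enforced by the checks below.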
  void* AllocPinnedDataSpace(
      size_t nbytes, void** ctx, void** deleter) override {
    // prevent pinning empty tensors or graphs
    if (nbytes == 0) return nullptr;
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    CHECK(tensor_dispatcher->IsAvailable())
        << "CachingHostAllocator is not available in the current backend "
           "PyTorch. Please update the PyTorch version to 1.11+";
    return tensor_dispatcher->CUDAAllocHostWorkspace(nbytes, ctx, deleter);
  }

  void FreePinnedDataSpace(void** deleter) override {
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    CHECK(tensor_dispatcher->IsAvailable())
        << "CachingHostAllocator is not available in the current backend "
           "PyTorch. Please update the PyTorch version to 1.11+";
    tensor_dispatcher->CUDAFreeHostWorkspace(deleter);
  }

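  // Best-effort check whether `ptr` points to page-locked host memory, based
  // on hipPointerGetAttributes. Benign failures (plain CPU pointer, missing
  // device/driver, forked subprocess) are swallowed and reported as false.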
  bool IsPinned(const void* ptr) override {
    // can't be a pinned tensor if CUDA context is unavailable.
    if (!is_available_) return false;

    hipPointerAttribute_t attr;
    hipError_t status = hipPointerGetAttributes(&attr, ptr);
    bool result = false;

    switch (status) {
      case hipErrorInvalidValue:
        // might be a normal CPU tensor in CUDA 10.2-
        hipGetLastError();  // clear error
        break;
      case hipSuccess:
        result = (attr.type == hipMemoryTypeHost);
        break;
      case hipErrorInitializationError:
      case hipErrorNoDevice:
      case hipErrorInsufficientDriver:
      case hipErrorInvalidDevice:
        // We don't want to fail in these particular cases since this function
        // can be called when users only want to run on CPU even if CUDA API is
        // enabled, or in a forked subprocess where CUDA context cannot be
        // initialized.  So we just mark the CUDA context as unavailable and
        // return.
        is_available_ = false;
        hipGetLastError();  // clear error
        break;
      default:
        LOG(FATAL) << "error while determining memory status: "
                   << hipGetErrorString(status);
        break;
    }

    return result;
  }

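  // Temporary workspace allocation: served by the backend's caching allocator
  // when available, otherwise by DGL's per-thread workspace pool.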
  void* AllocWorkspace(
      DGLContext ctx, size_t size, DGLDataType type_hint) final {
    SetDevice(ctx);
    // Redirect to PyTorch's allocator when available.
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    if (tensor_dispatcher->IsAvailable())
      return tensor_dispatcher->CUDAAllocWorkspace(
          size, getCurrentHIPStreamMasqueradingAsCUDA());

    return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size);
  }

  void FreeWorkspace(DGLContext ctx, void* data) final {
    SetDevice(ctx);
    TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
    if (tensor_dispatcher->IsAvailable())
      return tensor_dispatcher->CUDAFreeWorkspace(data);

    CUDAThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data);
  }

  static const std::shared_ptr<CUDADeviceAPI>& Global() {
    static std::shared_ptr<CUDADeviceAPI> inst =
        std::make_shared<CUDADeviceAPI>();
    return inst;
  }

 private:
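  // Asynchronous memcpy on `stream`. For device-to-host copies on the legacy
  // default stream, synchronize so the host buffer is valid on return.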
  static void GPUCopy(
      const void* from, void* to, size_t size, hipMemcpyKind kind,
      hipStream_t stream) {
    CUDA_CALL(hipMemcpyAsync(to, from, size, kind, stream));
    if (stream == 0 && kind == hipMemcpyDeviceToHost) {
      // only wait for the copy, when it's on the default stream, and it's to
      // host memory
      CUDA_CALL(hipStreamSynchronize(stream));
    }
  }

  bool is_available_ = true;
};

typedef dmlc::ThreadLocalStore<CUDAThreadEntry> CUDAThreadStore;

CUDAThreadEntry::CUDAThreadEntry() : pool(kDGLCUDA, CUDADeviceAPI::Global()) {}

CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() {
  return CUDAThreadStore::Get();
}

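// Returns the backend's current stream when the TensorDispatcher is
// available; otherwise nullptr, i.e. the legacy default stream.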
hipStream_t getCurrentHIPStreamMasqueradingAsCUDA() {
  TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global();
  if (tensor_dispatcher->IsAvailable())
    return tensor_dispatcher->CUDAGetCurrentStream();
  else  // return the default stream when TA is not available
    return nullptr;
}

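// Expose the singleton device API to the DGL FFI under "device_api.cuda".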
DGL_REGISTER_GLOBAL("device_api.cuda")
    .set_body([](DGLArgs args, DGLRetValue* rv) {
      DeviceAPI* ptr = CUDADeviceAPI::Global().get();
      *rv = static_cast<void*>(ptr);
    });

}  // namespace runtime
}  // namespace dgl