/**
 *  Copyright (c) 2017 by Contributors
 * @file cuda_common.h
 * @brief Common utilities for CUDA
 */
#ifndef DGL_RUNTIME_CUDA_CUDA_COMMON_H_
#define DGL_RUNTIME_CUDA_CUDA_COMMON_H_

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <curand.h>
#include <cusparse.h>
#include <dgl/runtime/packed_func.h>

#include <memory>
#include <string>

#include "../workspace_pool.h"

namespace dgl {
namespace runtime {

/*
  How to use this class to get a nonblocking thrust execution policy that uses
  DGL's memory pool and the current cuda stream

  runtime::CUDAWorkspaceAllocator allocator(ctx);
  const auto stream = runtime::getCurrentCUDAStream();
  const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream);

  now, one can pass exec_policy to thrust functions

  to get an integer array of size 1000 whose lifetime is managed by unique_ptr,
  use: auto int_array = allocator.alloc_unique<int>(1000); int_array.get() gives
  the raw pointer.
*/
class CUDAWorkspaceAllocator {
  // Device context whose workspace pool backs every allocation below.
  DGLContext ctx;

 public:
  // Allocator protocol used by thrust: allocations are byte-addressed.
  typedef char value_type;

  // Deleter interface (used as std::unique_ptr's deleter by alloc_unique):
  // returns the pointer to DGL's workspace pool.
  void operator()(void* ptr) const {
    runtime::DeviceAPI::Get(ctx)->FreeWorkspace(ctx, ptr);
  }

  explicit CUDAWorkspaceAllocator(DGLContext ctx) : ctx(ctx) {}

  CUDAWorkspaceAllocator& operator=(const CUDAWorkspaceAllocator&) = default;

  // Allocate workspace for `size` objects of type T and wrap it in a
  // unique_ptr that frees through this allocator.
  // NOTE(review): memory is raw workspace — T's constructor is NOT run.
  template <typename T>
  std::unique_ptr<T, CUDAWorkspaceAllocator> alloc_unique(
      std::size_t size) const {
    return std::unique_ptr<T, CUDAWorkspaceAllocator>(
        reinterpret_cast<T*>(runtime::DeviceAPI::Get(ctx)->AllocWorkspace(
            ctx, sizeof(T) * size)),
        *this);
  }

  // Allocator protocol: raw byte allocation from the workspace pool.
  char* allocate(std::ptrdiff_t size) const {
    return reinterpret_cast<char*>(
        runtime::DeviceAPI::Get(ctx)->AllocWorkspace(ctx, size));
  }

  // Allocator protocol: return memory to the pool (byte count is unused).
  void deallocate(char* ptr, std::size_t) const {
    runtime::DeviceAPI::Get(ctx)->FreeWorkspace(ctx, ptr);
  }
};

/**
 * @brief Return true when a launch-dimension value is zero, i.e. the
 * corresponding kernel launch would cover no work.
 */
template <typename T>
inline bool is_zero(T size) {
  return size == 0;
}

/**
 * @brief Specialization for dim3: a launch is empty when ANY of the three
 * dimensions is zero.
 */
template <>
inline bool is_zero<dim3>(dim3 size) {
  return size.x == 0 || size.y == 0 || size.z == 0;
}

/**
 * @brief Protector macro for CUDA driver API calls: checks the returned
 * CUresult and logs a fatal error with the error name on failure.
 * CUDA_ERROR_DEINITIALIZED is tolerated so calls issued during process
 * teardown do not abort.
 */
#define CUDA_DRIVER_CALL(x)                                             \
  {                                                                     \
    CUresult result = x;                                                \
    if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \
      const char* msg;                                                  \
      cuGetErrorName(result, &msg);                                     \
      LOG(FATAL) << "CUDAError: " #x " failed with error: " << msg;     \
    }                                                                   \
  }

/**
 * @brief Protector macro for CUDA runtime API calls: checks the returned
 * cudaError_t. cudaErrorCudartUnloading is tolerated so calls issued while
 * the runtime is shutting down do not abort.
 */
#define CUDA_CALL(func)                                      \
  {                                                          \
    cudaError_t e = (func);                                  \
    CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
        << "CUDA: " << cudaGetErrorString(e);                \
  }

/**
 * @brief Launch a CUDA kernel and immediately check for launch errors via
 * cudaGetLastError(). The launch is skipped entirely when any grid or block
 * dimension is zero — a zero-sized launch is a no-op and would otherwise
 * raise a launch-configuration error.
 */
#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...)            \
  {                                                                           \
    if (!dgl::runtime::is_zero((nblks)) && !dgl::runtime::is_zero((nthrs))) { \
      (kernel)<<<(nblks), (nthrs), (shmem), (stream)>>>(__VA_ARGS__);         \
      cudaError_t e = cudaGetLastError();                                     \
      CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading)                \
          << "CUDA kernel launch error: " << cudaGetErrorString(e);           \
    }                                                                         \
  }

/**
 * @brief Protector macro for cuSPARSE calls: fails on any status other than
 * CUSPARSE_STATUS_SUCCESS, printing the numeric status code.
 */
#define CUSPARSE_CALL(func)                                         \
  {                                                                 \
    cusparseStatus_t e = (func);                                    \
    CHECK(e == CUSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e; \
  }

/**
 * @brief Protector macro for cuBLAS calls: fails on any status other than
 * CUBLAS_STATUS_SUCCESS, printing the numeric status code.
 */
#define CUBLAS_CALL(func)                                       \
  {                                                             \
    cublasStatus_t e = (func);                                  \
    CHECK(e == CUBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \
  }

/**
 * @brief Protector macro for cuRAND calls: fails with the textual status
 * name (plus file/line of the call site) on any status other than
 * CURAND_STATUS_SUCCESS.
 */
#define CURAND_CALL(func)                                                      \
  {                                                                            \
    curandStatus_t e = (func);                                                 \
    CHECK(e == CURAND_STATUS_SUCCESS)                                          \
        << "CURAND Error: " << dgl::runtime::curandGetErrorString(e) << " at " \
        << __FILE__ << ":" << __LINE__;                                        \
  }
126
127
128

inline const char* curandGetErrorString(curandStatus_t error) {
  switch (error) {
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
    case CURAND_STATUS_SUCCESS:
      return "CURAND_STATUS_SUCCESS";
    case CURAND_STATUS_VERSION_MISMATCH:
      return "CURAND_STATUS_VERSION_MISMATCH";
    case CURAND_STATUS_NOT_INITIALIZED:
      return "CURAND_STATUS_NOT_INITIALIZED";
    case CURAND_STATUS_ALLOCATION_FAILED:
      return "CURAND_STATUS_ALLOCATION_FAILED";
    case CURAND_STATUS_TYPE_ERROR:
      return "CURAND_STATUS_TYPE_ERROR";
    case CURAND_STATUS_OUT_OF_RANGE:
      return "CURAND_STATUS_OUT_OF_RANGE";
    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
    case CURAND_STATUS_LAUNCH_FAILURE:
      return "CURAND_STATUS_LAUNCH_FAILURE";
    case CURAND_STATUS_PREEXISTING_FAILURE:
      return "CURAND_STATUS_PREEXISTING_FAILURE";
    case CURAND_STATUS_INITIALIZATION_FAILED:
      return "CURAND_STATUS_INITIALIZATION_FAILED";
    case CURAND_STATUS_ARCH_MISMATCH:
      return "CURAND_STATUS_ARCH_MISMATCH";
    case CURAND_STATUS_INTERNAL_ERROR:
      return "CURAND_STATUS_INTERNAL_ERROR";
155
156
157
158
159
  }
  // To suppress compiler warning.
  return "Unrecognized curand error string";
}

160
/**
161
 * @brief Cast data type to cudaDataType_t.
162
163
164
165
166
167
168
 */
template <typename T>
struct cuda_dtype {
  static constexpr cudaDataType_t value = CUDA_R_32F;
};

template <>
169
struct cuda_dtype<__half> {
170
171
172
  static constexpr cudaDataType_t value = CUDA_R_16F;
};

// Specialization: bfloat16 maps to CUDA_R_16BF (only when BF16 is enabled).
#if BF16_ENABLED
template <>
struct cuda_dtype<__nv_bfloat16> {
  static constexpr cudaDataType_t value = CUDA_R_16BF;
};
#endif  // BF16_ENABLED

180
181
182
183
184
185
186
187
188
189
template <>
struct cuda_dtype<float> {
  static constexpr cudaDataType_t value = CUDA_R_32F;
};

// Specialization: double precision maps to CUDA_R_64F.
template <>
struct cuda_dtype<double> {
  static constexpr cudaDataType_t value = CUDA_R_64F;
};

/*
 * \brief Accumulator type for SpMM.
 *
 * Maps a value type to the type used for accumulating partial results;
 * defaults to float for types without an explicit specialization.
 */
template <typename T>
struct accum_dtype {
  typedef float type;
};

// Specialization: half values accumulate in float.
template <>
struct accum_dtype<__half> {
  typedef float type;
};

#if BF16_ENABLED
// Specialization: bfloat16 values accumulate in float.
template <>
struct accum_dtype<__nv_bfloat16> {
  typedef float type;
};
#endif  // BF16_ENABLED

// Specialization: float values accumulate in float.
template <>
struct accum_dtype<float> {
  typedef float type;
};

// Specialization: double values accumulate in double.
template <>
struct accum_dtype<double> {
  typedef double type;
};

#if CUDART_VERSION >= 11000
/**
 * @brief Cast index data type to cusparseIndexType_t.
 *
 * NOTE(review): the primary template falls back to CUSPARSE_INDEX_32I for
 * types without an explicit specialization.
 */
template <typename T>
struct cusparse_idtype {
  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
};

// Specialization: 32-bit indices map to CUSPARSE_INDEX_32I.
template <>
struct cusparse_idtype<int32_t> {
  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
};

// Specialization: 64-bit indices map to CUSPARSE_INDEX_64I.
template <>
struct cusparse_idtype<int64_t> {
  static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I;
};
#endif

240
/** @brief Thread local workspace */
241
242
class CUDAThreadEntry {
 public:
243
  /** @brief The cusparse handler */
244
  cusparseHandle_t cusparse_handle{nullptr};
245
  /** @brief The cublas handler */
246
  cublasHandle_t cublas_handle{nullptr};
247
  /** @brief The curand generator */
248
  curandGenerator_t curand_gen{nullptr};
249
  /** @brief thread local pool*/
250
  WorkspacePool pool;
251
  /** @brief constructor */
252
253
254
255
  CUDAThreadEntry();
  // get the threadlocal workspace
  static CUDAThreadEntry* ThreadLocal();
};
256

257
/** @brief Get the current CUDA stream */
258
cudaStream_t getCurrentCUDAStream();
259
260
261
}  // namespace runtime
}  // namespace dgl
#endif  // DGL_RUNTIME_CUDA_CUDA_COMMON_H_