// !!! This is a file automatically generated by hipify!!!
/**
 *  Copyright (c) 2017-2023 by Contributors
 *  Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
 * @file cuda/common.h
 * @brief Common utilities for CUDA
 */
#ifndef GRAPHBOLT_CUDA_COMMON_H_
#define GRAPHBOLT_CUDA_COMMON_H_

#include <thrust/execution_policy.h>
#include <ATen/hip/HIPEvent.h>
#include <ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h>
#include <ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h>
#include <c10/hip/HIPException.h>
#include <hip/hip_runtime.h>
#include <torch/script.h>

#include <memory>
#include <unordered_map>

namespace graphbolt {
namespace cuda {

/**
 * @brief This class is designed to allocate workspace storage
 * and to get a nonblocking thrust execution policy
 * that uses torch's CUDA memory pool and the current cuda stream:
 *
 * cuda::CUDAWorkspaceAllocator allocator;
 * const auto stream = torch::hip::getDefaultHIPStreamMasqueradingAsCUDA();
 * const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
 *
 * Now, one can pass exec_policy to thrust functions
 *
 * To get an integer array of size 1000 whose lifetime is managed by
 * unique_ptr, use:
 *
 * auto int_array = allocator.AllocateStorage<int>(1000);
 *
 * int_array.get() gives the raw pointer.
 */
template <typename value_t = char>
struct CUDAWorkspaceAllocator {
  static_assert(sizeof(char) == 1, "sizeof(char) == 1 should hold.");
  // Required by thrust to satisfy allocator requirements.
  using value_type = value_t;

  explicit CUDAWorkspaceAllocator() { at::globalContext().lazyInitCUDA(); }

  template <class U>
  CUDAWorkspaceAllocator(CUDAWorkspaceAllocator<U> const&) noexcept {}

  CUDAWorkspaceAllocator& operator=(const CUDAWorkspaceAllocator&) = default;

  void operator()(void* ptr) const {
    at::hip::HIPCachingAllocator::raw_delete(ptr);
  }

  // Required by thrust to satisfy allocator requirements.
  value_type* allocate(std::ptrdiff_t size) const {
    return reinterpret_cast<value_type*>(
        at::hip::HIPCachingAllocator::raw_alloc(size * sizeof(value_type)));
  }

  // Required by thrust to satisfy allocator requirements.
  void deallocate(value_type* ptr, std::size_t) const { operator()(ptr); }

  template <typename T>
  std::unique_ptr<T, CUDAWorkspaceAllocator> AllocateStorage(
      std::size_t size) const {
    return std::unique_ptr<T, CUDAWorkspaceAllocator>(
        reinterpret_cast<T*>(
            at::hip::HIPCachingAllocator::raw_alloc(sizeof(T) * size)),
        *this);
  }
};

inline auto GetAllocator() { return CUDAWorkspaceAllocator{}; }

inline auto GetCurrentStream() {
  return c10::hip::getCurrentHIPStreamMasqueradingAsCUDA();
}
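
// Example (illustrative sketch): combining the allocator with the current
// stream for a thrust call; `d_keys` and `n` are hypothetical.
//
//   auto allocator = graphbolt::cuda::GetAllocator();
//   auto stream = graphbolt::cuda::GetCurrentStream();
//   const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);
//   thrust::sort(exec_policy, d_keys, d_keys + n);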

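// Helpers used by CUDA_KERNEL_CALL below to skip kernel launches whose grid
// or block configuration is empty.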
template <typename T>
inline bool is_zero(T size) {
  return size == 0;
}

template <>
inline bool is_zero<dim3>(dim3 size) {
  return size.x == 0 || size.y == 0 || size.z == 0;
}

#define CUDA_RUNTIME_CHECK(EXPR)                           \
  do {                                                     \
    hipError_t __err = EXPR;                               \
    if (__err != hipSuccess) {                             \
      auto get_error_str_err = hipGetErrorString(__err);   \
      AT_ERROR("HIP runtime error: ", get_error_str_err);  \
    }                                                      \
  } while (0)
#define CUDA_CALL(func) C10_HIP_CHECK((func))
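
// Example (illustrative sketch): wrapping raw HIP runtime calls; `ptr`,
// `size` and `stream` are hypothetical.
//
//   CUDA_RUNTIME_CHECK(hipMemsetAsync(ptr, 0, size, stream));
//   CUDA_CALL(hipStreamSynchronize(stream));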

#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, ...)          \
  {                                                                 \
    if (!graphbolt::cuda::is_zero((nblks)) &&                       \
        !graphbolt::cuda::is_zero((nthrs))) {                       \
      auto stream = graphbolt::cuda::GetCurrentStream();            \
      hipLaunchKernelGGL(                                           \
          (kernel), dim3((nblks)), dim3((nthrs)), (shmem), stream,  \
          __VA_ARGS__);                                             \
      C10_HIP_KERNEL_LAUNCH_CHECK();                                \
    }                                                               \
  }
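
// Example (illustrative sketch): launching a hypothetical kernel `FillKernel`
// over `n` elements with 256 threads per block and no shared memory.
//
//   const dim3 block(256);
//   const dim3 grid((n + block.x - 1) / block.x);
//   CUDA_KERNEL_CALL(FillKernel, grid, block, 0, out_ptr, n);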

#define CUB_CALL(fn, ...)                                                  \
  {                                                                        \
    auto allocator = graphbolt::cuda::GetAllocator();                      \
    auto stream = graphbolt::cuda::GetCurrentStream();                     \
    size_t workspace_size = 0;                                             \
    CUDA_CALL(hipcub::fn(nullptr, workspace_size, __VA_ARGS__, stream));   \
    auto workspace = allocator.AllocateStorage<char>(workspace_size);      \
    CUDA_CALL(                                                             \
        hipcub::fn(workspace.get(), workspace_size, __VA_ARGS__, stream)); \
  }
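
// Example (illustrative sketch): an inclusive prefix sum over `n` items via
// hipcub's DeviceScan; `in` and `out` are hypothetical device pointers. The
// macro supplies the temporary workspace and stream arguments itself.
//
//   CUB_CALL(DeviceScan::InclusiveSum, in, out, n);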

#define THRUST_CALL(fn, ...)                                                 \
  [&] {                                                                      \
    auto allocator = graphbolt::cuda::GetAllocator();                        \
    auto stream = graphbolt::cuda::GetCurrentStream();                       \
    const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream);  \
    return thrust::fn(exec_policy, __VA_ARGS__);                             \
  }()
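
// Example (illustrative sketch): reducing a hypothetical device array
// `d_ptr` of length `n`.
//
//   const auto sum = THRUST_CALL(reduce, d_ptr, d_ptr + n, int64_t{0});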

/**
 * @brief This class is designed to handle the copy operation of a single
 * scalar_t item from a given CUDA device pointer. Later, if the object is cast
 * into scalar_t, the value can be read.
 *
 * auto num_edges = cuda::CopyScalar(indptr.data_ptr<scalar_t>() +
 *     indptr.size(0) - 1);
 * // Perform many operations here, they will run as normal.
 * // We finally need to read num_edges.
 * auto indices = torch::empty(static_cast<scalar_t>(num_edges));
 */
template <typename scalar_t>
struct CopyScalar {
  CopyScalar() : is_ready_(true) { init_pinned_storage(); }

  void record(
      at::hip::HIPStreamMasqueradingAsCUDA stream = GetCurrentStream()) {
    copy_event_.record(stream);
    is_ready_ = false;
  }

  scalar_t* get() {
    return reinterpret_cast<scalar_t*>(pinned_scalar_.data_ptr());
  }

  CopyScalar(const scalar_t* device_ptr) {
    init_pinned_storage();
    auto stream = GetCurrentStream();
    CUDA_CALL(hipMemcpyAsync(
        reinterpret_cast<scalar_t*>(pinned_scalar_.data_ptr()), device_ptr,
        sizeof(scalar_t), hipMemcpyDeviceToHost, stream));
    record(stream);
  }

  operator scalar_t() {
    if (!is_ready_) {
      copy_event_.synchronize();
      is_ready_ = true;
    }
    return *get();
  }

 private:
  void init_pinned_storage() {
    pinned_scalar_ = torch::empty(
        sizeof(scalar_t),
        c10::TensorOptions().dtype(torch::kBool).pinned_memory(true));
  }

  torch::Tensor pinned_scalar_;
  at::cuda::CUDAEvent copy_event_;
  bool is_ready_;
};
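
// Example (illustrative sketch): the default-constructed form lets the caller
// issue the device-to-host copy into get() and record the event manually;
// `device_ptr` is hypothetical.
//
//   CopyScalar<int64_t> num_items;
//   CUDA_CALL(hipMemcpyAsync(
//       num_items.get(), device_ptr, sizeof(int64_t), hipMemcpyDeviceToHost,
//       GetCurrentStream()));
//   num_items.record();
//   // ... enqueue more GPU work here; reading the value syncs on the event.
//   const auto value = static_cast<int64_t>(num_items);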

// This includes all integer, float and boolean types.
#define GRAPHBOLT_DISPATCH_CASE_ALL_TYPES(...)            \
  AT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__)                 \
  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)     \
  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
  AT_DISPATCH_CASE(at::ScalarType::Bool, __VA_ARGS__)

#define GRAPHBOLT_DISPATCH_ALL_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, GRAPHBOLT_DISPATCH_CASE_ALL_TYPES(__VA_ARGS__))
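
// Example (illustrative sketch): dispatching on a hypothetical tensor's
// dtype; the lambda body sees the usual `scalar_t` alias from AT_DISPATCH.
//
//   GRAPHBOLT_DISPATCH_ALL_TYPES(tensor.scalar_type(), "ExampleOp", ([&] {
//     auto* data = tensor.data_ptr<scalar_t>();
//     // ... operate on data as scalar_t* ...
//   }));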

#define GRAPHBOLT_DISPATCH_ELEMENT_SIZES(element_size, name, ...)             \
  [&] {                                                                       \
    switch (element_size) {                                                   \
      case 1: {                                                               \
        using element_size_t = uint8_t;                                       \
        return __VA_ARGS__();                                                 \
      }                                                                       \
      case 2: {                                                               \
        using element_size_t = uint16_t;                                      \
        return __VA_ARGS__();                                                 \
      }                                                                       \
      case 4: {                                                               \
        using element_size_t = uint32_t;                                      \
        return __VA_ARGS__();                                                 \
      }                                                                       \
      case 8: {                                                               \
        using element_size_t = uint64_t;                                      \
        return __VA_ARGS__();                                                 \
      }                                                                       \
      case 16: {                                                              \
        using element_size_t = float4;                                        \
        return __VA_ARGS__();                                                 \
      }                                                                       \
      default:                                                                \
        TORCH_CHECK(false, name, " with this element_size is not supported!"); \
        using element_size_t = uint8_t;                                       \
        return __VA_ARGS__();                                                 \
    }                                                                         \
  }()
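
// Example (illustrative sketch): dispatching on a byte width so a copy kernel
// can move payloads as fixed-width values; `feature` and `ptr` are
// hypothetical.
//
//   GRAPHBOLT_DISPATCH_ELEMENT_SIZES(
//       feature.element_size(), "IndexSelectOp", ([&] {
//         auto* typed = reinterpret_cast<element_size_t*>(ptr);
//         // ... copy element_size_t values ...
//       }));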

}  // namespace cuda
}  // namespace graphbolt
#endif  // GRAPHBOLT_CUDA_COMMON_H_