cuda_utils.hu 6.53 KB
Newer Older
1
/*!
2
 * Copyright (c) 2020-2021 IBM Corporation, Microsoft Corporation. All rights reserved.
3
4
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
5

6
7
8
#ifndef LIGHTGBM_CUDA_CUDA_UTILS_H_
#define LIGHTGBM_CUDA_CUDA_UTILS_H_

9
#ifdef USE_CUDA
10

11
12
13
#if defined(USE_ROCM)
#include <LightGBM/cuda/cuda_rocm_interop.h>
#else
14
15
#include <cuda.h>
#include <cuda_runtime.h>
16
#endif
17
#include <stdio.h>
18

19
#include <LightGBM/utils/log.h>
20
21

#include <algorithm>
22
#include <vector>
23
#include <cmath>
24

25
26
namespace LightGBM {

27
28
typedef unsigned long long atomic_add_long_t;

29
30
31
32
33
34
35
#define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
  if (code != cudaSuccess) {
    LightGBM::Log::Fatal("[CUDA] %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
  }
}
36
37
38
39
40

#define CUDASUCCESS_OR_FATAL_OUTER(ans) { gpuAssert((ans), file, line); }

void SetCUDADevice(int gpu_device_id, const char* file, int line);

41
42
int GetCUDADevice(const char* file, int line);

43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
template <typename T>
void AllocateCUDAMemory(T** out_ptr, size_t size, const char* file, const int line) {
  void* tmp_ptr = nullptr;
  CUDASUCCESS_OR_FATAL_OUTER(cudaMalloc(&tmp_ptr, size * sizeof(T)));
  *out_ptr = reinterpret_cast<T*>(tmp_ptr);
}

template <typename T>
void CopyFromHostToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) {
  void* void_dst_ptr = reinterpret_cast<void*>(dst_ptr);
  const void* void_src_ptr = reinterpret_cast<const void*>(src_ptr);
  size_t size_in_bytes = size * sizeof(T);
  CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyHostToDevice));
}

template <typename T>
void InitCUDAMemoryFromHostMemory(T** dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) {
  AllocateCUDAMemory<T>(dst_ptr, size, file, line);
  CopyFromHostToCUDADevice<T>(*dst_ptr, src_ptr, size, file, line);
}

template <typename T>
void CopyFromCUDADeviceToHost(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) {
  void* void_dst_ptr = reinterpret_cast<void*>(dst_ptr);
  const void* void_src_ptr = reinterpret_cast<const void*>(src_ptr);
  size_t size_in_bytes = size * sizeof(T);
  CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost));
}

template <typename T>
void CopyFromCUDADeviceToHostAsync(T* dst_ptr, const T* src_ptr, size_t size, cudaStream_t stream, const char* file, const int line) {
  void* void_dst_ptr = reinterpret_cast<void*>(dst_ptr);
  const void* void_src_ptr = reinterpret_cast<const void*>(src_ptr);
  size_t size_in_bytes = size * sizeof(T);
  CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost, stream));
}

template <typename T>
void CopyFromCUDADeviceToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) {
  void* void_dst_ptr = reinterpret_cast<void*>(dst_ptr);
  const void* void_src_ptr = reinterpret_cast<const void*>(src_ptr);
  size_t size_in_bytes = size * sizeof(T);
  CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToDevice));
}

template <typename T>
void CopyFromCUDADeviceToCUDADeviceAsync(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) {
  void* void_dst_ptr = reinterpret_cast<void*>(dst_ptr);
  const void* void_src_ptr = reinterpret_cast<const void*>(src_ptr);
  size_t size_in_bytes = size * sizeof(T);
  CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToDevice));
}

void SynchronizeCUDADevice(const char* file, const int line);

template <typename T>
void SetCUDAMemory(T* dst_ptr, int value, size_t size, const char* file, const int line) {
  CUDASUCCESS_OR_FATAL_OUTER(cudaMemset(reinterpret_cast<void*>(dst_ptr), value, size * sizeof(T)));
  SynchronizeCUDADevice(file, line);
}

template <typename T>
void DeallocateCUDAMemory(T** ptr, const char* file, const int line) {
  if (*ptr != nullptr) {
    CUDASUCCESS_OR_FATAL_OUTER(cudaFree(reinterpret_cast<void*>(*ptr)));
    *ptr = nullptr;
  }
}

void PrintLastCUDAError();

template <typename T>
class CUDAVector {
 public:
  CUDAVector() {
    size_ = 0;
    data_ = nullptr;
  }

  explicit CUDAVector(size_t size) {
    size_ = size;
    AllocateCUDAMemory<T>(&data_, size_, __FILE__, __LINE__);
  }

  void Resize(size_t size) {
128
129
130
    if (size == size_) {
      return;
    }
131
132
    if (size == 0) {
      Clear();
133
      return;
134
135
136
137
    }
    T* new_data = nullptr;
    AllocateCUDAMemory<T>(&new_data, size, __FILE__, __LINE__);
    if (size_ > 0 && data_ != nullptr) {
138
139
      const size_t size_for_old_content = std::min<size_t>(size_, size);
      CopyFromCUDADeviceToCUDADevice<T>(new_data, data_, size_for_old_content, __FILE__, __LINE__);
140
141
142
143
144
145
    }
    DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
    data_ = new_data;
    size_ = size;
  }

146
147
148
149
150
  void InitFromHostVector(const std::vector<T>& host_vector) {
    Resize(host_vector.size());
    CopyFromHostToCUDADevice(data_, host_vector.data(), host_vector.size(), __FILE__, __LINE__);
  }

151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
  void Clear() {
    if (size_ > 0 && data_ != nullptr) {
      DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
    }
    size_ = 0;
  }

  void PushBack(const T* values, size_t len) {
    T* new_data = nullptr;
    AllocateCUDAMemory<T>(&new_data, size_ + len, __FILE__, __LINE__);
    if (size_ > 0 && data_ != nullptr) {
      CopyFromCUDADeviceToCUDADevice<T>(new_data, data_, size_, __FILE__, __LINE__);
    }
    CopyFromCUDADeviceToCUDADevice<T>(new_data + size_, values, len, __FILE__, __LINE__);
    DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
    size_ += len;
    data_ = new_data;
  }

  size_t Size() {
    return size_;
  }

  ~CUDAVector() {
    DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
  }

  std::vector<T> ToHost() {
    std::vector<T> host_vector(size_);
    if (size_ > 0 && data_ != nullptr) {
      CopyFromCUDADeviceToHost(host_vector.data(), data_, size_, __FILE__, __LINE__);
    }
    return host_vector;
  }

186
  T* RawData() const {
187
188
189
    return data_;
  }

190
191
192
193
  void SetValue(int value) {
    SetCUDAMemory<T>(data_, value, size_, __FILE__, __LINE__);
  }

shiyu1994's avatar
shiyu1994 committed
194
195
196
197
  const T* RawDataReadOnly() const {
    return data_;
  }

198
199
200
201
202
 private:
  T* data_;
  size_t size_;
};

203
204
205
206
207
208
209
210
211
template <typename T>
static __device__ T SafeLog(T x) {
  if (x > 0) {
    return std::log(x);
  } else {
    return -INFINITY;
  }
}

212
}  // namespace LightGBM
213

214
#endif  // USE_CUDA
215

216
#endif  // LIGHTGBM_CUDA_CUDA_UTILS_H_