cuda_utils.hu 6.45 KB
Newer Older
1
/*!
2
 * Copyright (c) 2020-2021 IBM Corporation, Microsoft Corporation. All rights reserved.
3
4
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
5

6
7
8
#ifndef LIGHTGBM_CUDA_CUDA_UTILS_H_
#define LIGHTGBM_CUDA_CUDA_UTILS_H_

9
#ifdef USE_CUDA
10

11
12
13
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
14

15
#include <LightGBM/utils/log.h>
16
17

#include <algorithm>
18
#include <vector>
19
#include <cmath>
20

21
22
namespace LightGBM {

23
24
typedef unsigned long long atomic_add_long_t;

25
26
27
28
29
30
31
#define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
  if (code != cudaSuccess) {
    LightGBM::Log::Fatal("[CUDA] %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
  }
}
32
33
34
35
36

#define CUDASUCCESS_OR_FATAL_OUTER(ans) { gpuAssert((ans), file, line); }

void SetCUDADevice(int gpu_device_id, const char* file, int line);

37
38
int GetCUDADevice(const char* file, int line);

39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
template <typename T>
void AllocateCUDAMemory(T** out_ptr, size_t size, const char* file, const int line) {
  void* tmp_ptr = nullptr;
  CUDASUCCESS_OR_FATAL_OUTER(cudaMalloc(&tmp_ptr, size * sizeof(T)));
  *out_ptr = reinterpret_cast<T*>(tmp_ptr);
}

template <typename T>
void CopyFromHostToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) {
  void* void_dst_ptr = reinterpret_cast<void*>(dst_ptr);
  const void* void_src_ptr = reinterpret_cast<const void*>(src_ptr);
  size_t size_in_bytes = size * sizeof(T);
  CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyHostToDevice));
}

template <typename T>
void InitCUDAMemoryFromHostMemory(T** dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) {
  AllocateCUDAMemory<T>(dst_ptr, size, file, line);
  CopyFromHostToCUDADevice<T>(*dst_ptr, src_ptr, size, file, line);
}

template <typename T>
void CopyFromCUDADeviceToHost(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) {
  void* void_dst_ptr = reinterpret_cast<void*>(dst_ptr);
  const void* void_src_ptr = reinterpret_cast<const void*>(src_ptr);
  size_t size_in_bytes = size * sizeof(T);
  CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost));
}

template <typename T>
void CopyFromCUDADeviceToHostAsync(T* dst_ptr, const T* src_ptr, size_t size, cudaStream_t stream, const char* file, const int line) {
  void* void_dst_ptr = reinterpret_cast<void*>(dst_ptr);
  const void* void_src_ptr = reinterpret_cast<const void*>(src_ptr);
  size_t size_in_bytes = size * sizeof(T);
  CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost, stream));
}

template <typename T>
void CopyFromCUDADeviceToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) {
  void* void_dst_ptr = reinterpret_cast<void*>(dst_ptr);
  const void* void_src_ptr = reinterpret_cast<const void*>(src_ptr);
  size_t size_in_bytes = size * sizeof(T);
  CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToDevice));
}

template <typename T>
void CopyFromCUDADeviceToCUDADeviceAsync(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) {
  void* void_dst_ptr = reinterpret_cast<void*>(dst_ptr);
  const void* void_src_ptr = reinterpret_cast<const void*>(src_ptr);
  size_t size_in_bytes = size * sizeof(T);
  CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToDevice));
}

void SynchronizeCUDADevice(const char* file, const int line);

template <typename T>
void SetCUDAMemory(T* dst_ptr, int value, size_t size, const char* file, const int line) {
  CUDASUCCESS_OR_FATAL_OUTER(cudaMemset(reinterpret_cast<void*>(dst_ptr), value, size * sizeof(T)));
  SynchronizeCUDADevice(file, line);
}

template <typename T>
void DeallocateCUDAMemory(T** ptr, const char* file, const int line) {
  if (*ptr != nullptr) {
    CUDASUCCESS_OR_FATAL_OUTER(cudaFree(reinterpret_cast<void*>(*ptr)));
    *ptr = nullptr;
  }
}

void PrintLastCUDAError();

template <typename T>
class CUDAVector {
 public:
  CUDAVector() {
    size_ = 0;
    data_ = nullptr;
  }

  explicit CUDAVector(size_t size) {
    size_ = size;
    AllocateCUDAMemory<T>(&data_, size_, __FILE__, __LINE__);
  }

  void Resize(size_t size) {
124
125
126
    if (size == size_) {
      return;
    }
127
128
    if (size == 0) {
      Clear();
129
      return;
130
131
132
133
    }
    T* new_data = nullptr;
    AllocateCUDAMemory<T>(&new_data, size, __FILE__, __LINE__);
    if (size_ > 0 && data_ != nullptr) {
134
135
      const size_t size_for_old_content = std::min<size_t>(size_, size);
      CopyFromCUDADeviceToCUDADevice<T>(new_data, data_, size_for_old_content, __FILE__, __LINE__);
136
137
138
139
140
141
    }
    DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
    data_ = new_data;
    size_ = size;
  }

142
143
144
145
146
  void InitFromHostVector(const std::vector<T>& host_vector) {
    Resize(host_vector.size());
    CopyFromHostToCUDADevice(data_, host_vector.data(), host_vector.size(), __FILE__, __LINE__);
  }

147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
  void Clear() {
    if (size_ > 0 && data_ != nullptr) {
      DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
    }
    size_ = 0;
  }

  void PushBack(const T* values, size_t len) {
    T* new_data = nullptr;
    AllocateCUDAMemory<T>(&new_data, size_ + len, __FILE__, __LINE__);
    if (size_ > 0 && data_ != nullptr) {
      CopyFromCUDADeviceToCUDADevice<T>(new_data, data_, size_, __FILE__, __LINE__);
    }
    CopyFromCUDADeviceToCUDADevice<T>(new_data + size_, values, len, __FILE__, __LINE__);
    DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
    size_ += len;
    data_ = new_data;
  }

  size_t Size() {
    return size_;
  }

  ~CUDAVector() {
    DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
  }

  std::vector<T> ToHost() {
    std::vector<T> host_vector(size_);
    if (size_ > 0 && data_ != nullptr) {
      CopyFromCUDADeviceToHost(host_vector.data(), data_, size_, __FILE__, __LINE__);
    }
    return host_vector;
  }

182
  T* RawData() const {
183
184
185
    return data_;
  }

186
187
188
189
  void SetValue(int value) {
    SetCUDAMemory<T>(data_, value, size_, __FILE__, __LINE__);
  }

shiyu1994's avatar
shiyu1994 committed
190
191
192
193
  const T* RawDataReadOnly() const {
    return data_;
  }

194
195
196
197
198
 private:
  T* data_;
  size_t size_;
};

199
200
201
202
203
204
205
206
207
template <typename T>
static __device__ T SafeLog(T x) {
  if (x > 0) {
    return std::log(x);
  } else {
    return -INFINITY;
  }
}

208
}  // namespace LightGBM
209

210
211
#endif  // USE_CUDA

212
#endif  // LIGHTGBM_CUDA_CUDA_UTILS_H_