Unverified Commit 5c9e61d1 authored by shiyu1994's avatar shiyu1994 Committed by GitHub
Browse files

[CUDA] Set GPU device ID in threads (#6028)



* set gpu device id in open mp threads

* move SetCUDADevice outside for loop

---------
Co-authored-by: James Lamb <jaylamb20@gmail.com>
parent fe838d88
...@@ -98,6 +98,7 @@ class CUDAColumnData { ...@@ -98,6 +98,7 @@ class CUDAColumnData {
void ResizeWhenCopySubrow(const data_size_t num_used_indices); void ResizeWhenCopySubrow(const data_size_t num_used_indices);
int gpu_device_id_;
int num_threads_; int num_threads_;
data_size_t num_data_; data_size_t num_data_;
int num_columns_; int num_columns_;
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#ifdef USE_CUDA #ifdef USE_CUDA
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/metric.h> #include <LightGBM/metric.h>
namespace LightGBM { namespace LightGBM {
...@@ -19,6 +20,8 @@ class CUDAMetricInterface: public HOST_METRIC { ...@@ -19,6 +20,8 @@ class CUDAMetricInterface: public HOST_METRIC {
explicit CUDAMetricInterface(const Config& config): HOST_METRIC(config) { explicit CUDAMetricInterface(const Config& config): HOST_METRIC(config) {
cuda_labels_ = nullptr; cuda_labels_ = nullptr;
cuda_weights_ = nullptr; cuda_weights_ = nullptr;
const int gpu_device_id = config.gpu_device_id >= 0 ? config.gpu_device_id : 0;
SetCUDADevice(gpu_device_id, __FILE__, __LINE__);
} }
void Init(const Metadata& metadata, data_size_t num_data) override { void Init(const Metadata& metadata, data_size_t num_data) override {
......
...@@ -21,7 +21,10 @@ namespace LightGBM { ...@@ -21,7 +21,10 @@ namespace LightGBM {
template <typename HOST_OBJECTIVE> template <typename HOST_OBJECTIVE>
class CUDAObjectiveInterface: public HOST_OBJECTIVE { class CUDAObjectiveInterface: public HOST_OBJECTIVE {
public: public:
explicit CUDAObjectiveInterface(const Config& config): HOST_OBJECTIVE(config) {} explicit CUDAObjectiveInterface(const Config& config): HOST_OBJECTIVE(config) {
const int gpu_device_id = config.gpu_device_id >= 0 ? config.gpu_device_id : 0;
SetCUDADevice(gpu_device_id, __FILE__, __LINE__);
}
explicit CUDAObjectiveInterface(const std::vector<std::string>& strs): HOST_OBJECTIVE(strs) {} explicit CUDAObjectiveInterface(const std::vector<std::string>& strs): HOST_OBJECTIVE(strs) {}
......
...@@ -28,6 +28,8 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = ...@@ -28,6 +28,8 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort =
void SetCUDADevice(int gpu_device_id, const char* file, int line); void SetCUDADevice(int gpu_device_id, const char* file, int line);
int GetCUDADevice(const char* file, int line);
template <typename T> template <typename T>
void AllocateCUDAMemory(T** out_ptr, size_t size, const char* file, const int line) { void AllocateCUDAMemory(T** out_ptr, size_t size, const char* file, const int line) {
void* tmp_ptr = nullptr; void* tmp_ptr = nullptr;
......
...@@ -26,6 +26,12 @@ void SetCUDADevice(int gpu_device_id, const char* file, int line) { ...@@ -26,6 +26,12 @@ void SetCUDADevice(int gpu_device_id, const char* file, int line) {
} }
} }
// Returns the index of the CUDA device currently bound to the calling
// host thread (as reported by cudaGetDevice).
// `file` and `line` identify the call site for error reporting, mirroring
// the signature of SetCUDADevice above.
// NOTE(review): the body does not reference `file`/`line` directly —
// presumably CUDASUCCESS_OR_FATAL_OUTER reports its own location; confirm.
int GetCUDADevice(const char* file, int line) {
  int device_index = 0;
  CUDASUCCESS_OR_FATAL_OUTER(cudaGetDevice(&device_index));
  return device_index;
}
} // namespace LightGBM } // namespace LightGBM
#endif // USE_CUDA #endif // USE_CUDA
...@@ -12,11 +12,8 @@ namespace LightGBM { ...@@ -12,11 +12,8 @@ namespace LightGBM {
CUDAColumnData::CUDAColumnData(const data_size_t num_data, const int gpu_device_id) { CUDAColumnData::CUDAColumnData(const data_size_t num_data, const int gpu_device_id) {
num_threads_ = OMP_NUM_THREADS(); num_threads_ = OMP_NUM_THREADS();
num_data_ = num_data; num_data_ = num_data;
if (gpu_device_id >= 0) { gpu_device_id_ = gpu_device_id >= 0 ? gpu_device_id : 0;
SetCUDADevice(gpu_device_id, __FILE__, __LINE__); SetCUDADevice(gpu_device_id_, __FILE__, __LINE__);
} else {
SetCUDADevice(0, __FILE__, __LINE__);
}
cuda_used_indices_ = nullptr; cuda_used_indices_ = nullptr;
cuda_data_by_column_ = nullptr; cuda_data_by_column_ = nullptr;
cuda_column_bit_type_ = nullptr; cuda_column_bit_type_ = nullptr;
...@@ -117,7 +114,10 @@ void CUDAColumnData::Init(const int num_columns, ...@@ -117,7 +114,10 @@ void CUDAColumnData::Init(const int num_columns,
feature_mfb_is_na_ = feature_mfb_is_na; feature_mfb_is_na_ = feature_mfb_is_na;
data_by_column_.resize(num_columns_, nullptr); data_by_column_.resize(num_columns_, nullptr);
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) num_threads(num_threads_) #pragma omp parallel num_threads(num_threads_)
{
SetCUDADevice(gpu_device_id_, __FILE__, __LINE__);
#pragma omp for schedule(static)
for (int column_index = 0; column_index < num_columns_; ++column_index) { for (int column_index = 0; column_index < num_columns_; ++column_index) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
const int8_t bit_type = column_bit_type[column_index]; const int8_t bit_type = column_bit_type[column_index];
...@@ -149,6 +149,7 @@ void CUDAColumnData::Init(const int num_columns, ...@@ -149,6 +149,7 @@ void CUDAColumnData::Init(const int num_columns,
} }
OMP_LOOP_EX_END(); OMP_LOOP_EX_END();
} }
}
OMP_THROW_EX(); OMP_THROW_EX();
feature_to_column_ = feature_to_column; feature_to_column_ = feature_to_column;
InitCUDAMemoryFromHostMemory<void*>(&cuda_data_by_column_, InitCUDAMemoryFromHostMemory<void*>(&cuda_data_by_column_,
...@@ -182,7 +183,10 @@ void CUDAColumnData::CopySubrow( ...@@ -182,7 +183,10 @@ void CUDAColumnData::CopySubrow(
AllocateCUDAMemory<data_size_t>(&cuda_used_indices_, num_used_indices_size, __FILE__, __LINE__); AllocateCUDAMemory<data_size_t>(&cuda_used_indices_, num_used_indices_size, __FILE__, __LINE__);
data_by_column_.resize(num_columns_, nullptr); data_by_column_.resize(num_columns_, nullptr);
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) num_threads(num_threads_) #pragma omp parallel num_threads(num_threads_)
{
SetCUDADevice(gpu_device_id_, __FILE__, __LINE__);
#pragma omp for schedule(static)
for (int column_index = 0; column_index < num_columns_; ++column_index) { for (int column_index = 0; column_index < num_columns_; ++column_index) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
const uint8_t bit_type = column_bit_type_[column_index]; const uint8_t bit_type = column_bit_type_[column_index];
...@@ -201,6 +205,7 @@ void CUDAColumnData::CopySubrow( ...@@ -201,6 +205,7 @@ void CUDAColumnData::CopySubrow(
} }
OMP_LOOP_EX_END(); OMP_LOOP_EX_END();
} }
}
OMP_THROW_EX(); OMP_THROW_EX();
InitCUDAMemoryFromHostMemory<void*>(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemory<void*>(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), __FILE__, __LINE__);
InitColumnMetaInfo(); InitColumnMetaInfo();
...@@ -221,7 +226,10 @@ void CUDAColumnData::ResizeWhenCopySubrow(const data_size_t num_used_indices) { ...@@ -221,7 +226,10 @@ void CUDAColumnData::ResizeWhenCopySubrow(const data_size_t num_used_indices) {
DeallocateCUDAMemory<data_size_t>(&cuda_used_indices_, __FILE__, __LINE__); DeallocateCUDAMemory<data_size_t>(&cuda_used_indices_, __FILE__, __LINE__);
AllocateCUDAMemory<data_size_t>(&cuda_used_indices_, num_used_indices_size, __FILE__, __LINE__); AllocateCUDAMemory<data_size_t>(&cuda_used_indices_, num_used_indices_size, __FILE__, __LINE__);
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) num_threads(num_threads_) #pragma omp parallel num_threads(num_threads_)
{
SetCUDADevice(gpu_device_id_, __FILE__, __LINE__);
#pragma omp for schedule(static)
for (int column_index = 0; column_index < num_columns_; ++column_index) { for (int column_index = 0; column_index < num_columns_; ++column_index) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
const uint8_t bit_type = column_bit_type_[column_index]; const uint8_t bit_type = column_bit_type_[column_index];
...@@ -243,6 +251,7 @@ void CUDAColumnData::ResizeWhenCopySubrow(const data_size_t num_used_indices) { ...@@ -243,6 +251,7 @@ void CUDAColumnData::ResizeWhenCopySubrow(const data_size_t num_used_indices) {
} }
OMP_LOOP_EX_END(); OMP_LOOP_EX_END();
} }
}
OMP_THROW_EX(); OMP_THROW_EX();
DeallocateCUDAMemory<void*>(&cuda_data_by_column_, __FILE__, __LINE__); DeallocateCUDAMemory<void*>(&cuda_data_by_column_, __FILE__, __LINE__);
InitCUDAMemoryFromHostMemory<void*>(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), __FILE__, __LINE__); InitCUDAMemoryFromHostMemory<void*>(&cuda_data_by_column_, data_by_column_.data(), data_by_column_.size(), __FILE__, __LINE__);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment