Unverified commit 4f47547c authored by James Lamb, committed by GitHub

[CUDA] consolidate CUDA versions (#5677)



* [ci] speed up if-else, swig, and lint conda setup

* add 'source activate'

* python constraint

* start removing cuda v1

* comment out CI

* remove more references

* revert some unnecessary changes

* revert a few more mistakes

* revert another change that ignored params

* sigh

* remove CUDATreeLearner

* fix tests, docs

* fix quoting in setup.py

* restore all CI

* Apply suggestions from code review
Co-authored-by: shiyu1994 <shiyu_k1994@qq.com>

* Apply suggestions from code review

* completely remove cuda_exp, update docs

---------
Co-authored-by: shiyu1994 <shiyu_k1994@qq.com>
parent 5ffd7571
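For context, a minimal usage sketch (not part of this commit): after this change, device_type=cuda selects the consolidated CUDASingleGPUTreeLearner and the old cuda_exp device type is gone. The sketch assumes LightGBM was built with cmake -DUSE_CUDA=1; the data shapes and parameter values are placeholders, and error-code checks are omitted.

// Hypothetical example via the C API; placeholder data, no error handling.
#include <LightGBM/c_api.h>
#include <vector>

int main() {
  std::vector<float> features(100 * 4, 1.0f);  // 100 rows x 4 features
  std::vector<float> labels(100, 0.0f);

  DatasetHandle dataset = nullptr;
  LGBM_DatasetCreateFromMat(features.data(), C_API_DTYPE_FLOAT32,
                            100, 4, /*is_row_major=*/1, "", nullptr, &dataset);
  LGBM_DatasetSetField(dataset, "label", labels.data(), 100, C_API_DTYPE_FLOAT32);

  // "device_type=cuda" now maps to the single consolidated CUDA learner.
  BoosterHandle booster = nullptr;
  LGBM_BoosterCreate(dataset, "objective=regression device_type=cuda", &booster);

  int is_finished = 0;
  for (int i = 0; i < 10; ++i) {
    LGBM_BoosterUpdateOneIter(booster, &is_finished);
  }

  LGBM_BoosterFree(booster);
  LGBM_DatasetFree(dataset);
  return 0;
}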
@@ -4,7 +4,7 @@
  * license information.
  */
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include "cuda_histogram_constructor.hpp"
@@ -429,4 +429,4 @@ void CUDAHistogramConstructor::LaunchSubtractHistogramKernel(
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
@@ -6,7 +6,7 @@
 #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_
 #define LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include <LightGBM/cuda/cuda_row_data.hpp>
 #include <LightGBM/feature_group.h>
@@ -165,5 +165,5 @@ class CUDAHistogramConstructor {
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_
@@ -4,7 +4,7 @@
  * license information.
  */
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include "cuda_leaf_splits.hpp"
@@ -68,4 +68,4 @@ void CUDALeafSplits::Resize(const data_size_t num_data) {
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
@@ -5,7 +5,7 @@
  */
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include "cuda_leaf_splits.hpp"
 #include <LightGBM/cuda/cuda_algorithms.hpp>
@@ -126,4 +126,4 @@ void CUDALeafSplits::LaunchInitValuesKernal(
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
@@ -6,7 +6,7 @@
 #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_
 #define LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include <LightGBM/cuda/cuda_utils.h>
 #include <LightGBM/bin.h>
@@ -156,5 +156,5 @@ class CUDALeafSplits {
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_
@@ -4,7 +4,7 @@
  * license information.
  */
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include "cuda_single_gpu_tree_learner.hpp"
@@ -515,4 +515,4 @@ void CUDASingleGPUTreeLearner::CheckSplitValid(
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
@@ -4,7 +4,7 @@
  * license information.
  */
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include <LightGBM/cuda/cuda_algorithms.hpp>
@@ -258,4 +258,4 @@ void CUDASingleGPUTreeLearner::LaunchConstructBitsetForCategoricalSplitKernel(
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
@@ -9,7 +9,7 @@
 #include <memory>
 #include <vector>
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include "cuda_leaf_splits.hpp"
 #include "cuda_histogram_constructor.hpp"
@@ -137,7 +137,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner {
 }  // namespace LightGBM
-#else  // USE_CUDA_EXP
+#else  // USE_CUDA
 // When GPU support is not compiled in, quit with an error message
@@ -147,12 +147,12 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner {
  public:
   #pragma warning(disable : 4702)
   explicit CUDASingleGPUTreeLearner(const Config* tree_config, const bool /*boosting_on_cuda*/) : SerialTreeLearner(tree_config) {
-    Log::Fatal("CUDA Tree Learner experimental version was not enabled in this build.\n"
-               "Please recompile with CMake option -DUSE_CUDA_EXP=1");
+    Log::Fatal("CUDA Tree Learner was not enabled in this build.\n"
+               "Please recompile with CMake option -DUSE_CUDA=1");
   }
 };
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_TREELEARNER_CUDA_CUDA_SINGLE_GPU_TREE_LEARNER_HPP_
/*!
* Copyright (c) 2020 IBM Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifdef USE_CUDA
#include "cuda_kernel_launcher.h"
#include <LightGBM/utils/log.h>
#include <cuda_runtime.h>
#include <cstdio>
namespace LightGBM {
void cuda_histogram(
int histogram_size,
data_size_t leaf_num_data,
data_size_t num_data,
bool use_all_features,
bool is_constant_hessian,
int num_workgroups,
cudaStream_t stream,
uint8_t* arg0,
uint8_t* arg1,
data_size_t arg2,
data_size_t* arg3,
data_size_t arg4,
score_t* arg5,
score_t* arg6,
score_t arg6_const,
char* arg7,
volatile int* arg8,
void* arg9,
size_t exp_workgroups_per_feature) {
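  // Dispatch: one kernel family per histogram size (16, 64, or 256 bins), each
  // launched with a matching thread-block size. Within a family, the *_fulldata
  // variant is used when the leaf spans the whole dataset but feature masks are
  // still needed; the hessian is passed either as a per-row array (arg6) or as
  // a single constant (arg6_const) when it is identical for every row.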
if (histogram_size == 16) {
if (leaf_num_data == num_data) {
if (use_all_features) {
if (!is_constant_hessian)
histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
} else {
if (!is_constant_hessian)
histogram16_fulldata<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram16_fulldata<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
}
} else {
if (use_all_features) {
// seems all features are always enabled, so this should be the same as fulldata
if (!is_constant_hessian)
histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
} else {
if (!is_constant_hessian)
histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
}
}
} else if (histogram_size == 64) {
if (leaf_num_data == num_data) {
if (use_all_features) {
if (!is_constant_hessian)
histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
} else {
if (!is_constant_hessian)
histogram64_fulldata<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram64_fulldata<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
}
} else {
if (use_all_features) {
// seems all features are always enabled, so this should be the same as fulldata
if (!is_constant_hessian)
histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
} else {
if (!is_constant_hessian)
histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
}
}
} else {
if (leaf_num_data == num_data) {
if (use_all_features) {
if (!is_constant_hessian)
histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
} else {
if (!is_constant_hessian)
histogram256_fulldata<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram256_fulldata<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
}
} else {
if (use_all_features) {
// seems all features are always enabled, so this should be the same as fulldata
if (!is_constant_hessian)
histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
} else {
if (!is_constant_hessian)
histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
}
}
}
}
} // namespace LightGBM
#endif // USE_CUDA
/*!
* Copyright (c) 2020 IBM Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_
#define LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_
#ifdef USE_CUDA
#include <chrono>
#include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t
namespace LightGBM {
struct ThreadData {
// device id
int device_id;
// parameters for cuda_histogram
int histogram_size;
data_size_t leaf_num_data;
data_size_t num_data;
bool use_all_features;
bool is_constant_hessian;
int num_workgroups;
cudaStream_t stream;
uint8_t* device_features;
uint8_t* device_feature_masks;
data_size_t* device_data_indices;
score_t* device_gradients;
score_t* device_hessians;
score_t hessians_const;
char* device_subhistograms;
volatile int* sync_counters;
void* device_histogram_outputs;
size_t exp_workgroups_per_feature;
// cuda events
cudaEvent_t* kernel_start;
cudaEvent_t* kernel_wait_obj;
std::chrono::duration<double, std::milli>* kernel_input_wait_time;
// copy histogram
size_t output_size;
char* host_histogram_output;
cudaEvent_t* histograms_wait_obj;
};
void cuda_histogram(
int histogram_size,
data_size_t leaf_num_data,
data_size_t num_data,
bool use_all_features,
bool is_constant_hessian,
int num_workgroups,
cudaStream_t stream,
uint8_t* arg0,
uint8_t* arg1,
data_size_t arg2,
data_size_t* arg3,
data_size_t arg4,
score_t* arg5,
score_t* arg6,
score_t arg6_const,
char* arg7,
volatile int* arg8,
void* arg9,
size_t exp_workgroups_per_feature);
} // namespace LightGBM
#endif // USE_CUDA
#endif // LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_
This diff is collapsed.
/*!
* Copyright (c) 2020 IBM Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_
#define LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/dataset.h>
#include <LightGBM/feature_group.h>
#include <LightGBM/tree.h>
#include <string>
#include <cmath>
#include <cstdio>
#include <memory>
#include <random>
#include <vector>
#ifdef USE_CUDA
#include <cuda_runtime.h>
#endif
#include "feature_histogram.hpp"
#include "serial_tree_learner.h"
#include "data_partition.hpp"
#include "split_info.hpp"
#include "leaf_splits.hpp"
#ifdef USE_CUDA
#include <LightGBM/cuda/vector_cudahost.h>
#include "cuda_kernel_launcher.h"
using json11::Json;
namespace LightGBM {
/*!
* \brief CUDA-based parallel learning algorithm.
*/
class CUDATreeLearner: public SerialTreeLearner {
public:
explicit CUDATreeLearner(const Config* tree_config);
~CUDATreeLearner();
void Init(const Dataset* train_data, bool is_constant_hessian) override;
void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override;
Tree* Train(const score_t* gradients, const score_t *hessians, bool is_first_tree) override;
void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override {
SerialTreeLearner::SetBaggingData(subset, used_indices, num_data);
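    // bagging is only considered in effect when training on a row-index subset
    // of the full dataset (no materialized subset, fewer rows than num_data_)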
if (subset == nullptr && used_indices != nullptr) {
if (num_data != num_data_) {
use_bagging_ = true;
return;
}
}
use_bagging_ = false;
}
protected:
void BeforeTrain() override;
bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override;
void FindBestSplits(const Tree* tree) override;
void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;
void ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) override;
private:
typedef float gpu_hist_t;
/*!
* \brief Find the number of workgroups per feature that maximizes efficiency
* \param leaf_num_data The number of data examples on the current leaf being processed
* \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature
*/
int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data);
/*!
* \brief Initialize GPU device
* \param num_gpu Maximum number of GPUs to use
*/
void InitGPU(int num_gpu);
/*!
* \brief Allocate memory for GPU computation // alloc only
*/
void CountDenseFeatureGroups(); // compute num_dense_feature_group
void prevAllocateGPUMemory(); // CPU-side parameter calculation & host-memory pinning
void AllocateGPUMemory();
/*!
 * \brief Reset GPU memory
 */
void ResetGPUMemory();
/*!
 * \brief Copy dense features from CPU to GPU
 */
void copyDenseFeature();
/*!
* \brief Compute GPU feature histogram for the current leaf.
* Indices, gradients and Hessians have been copied to the device.
* \param leaf_num_data Number of data on current leaf
* \param use_all_features Set to true to not use feature masks, with a faster kernel
*/
void GPUHistogram(data_size_t leaf_num_data, bool use_all_features);
void SetThreadData(ThreadData* thread_data, int device_id, int histogram_size,
int leaf_num_data, bool use_all_features,
int num_workgroups, int exp_workgroups_per_feature) {
ThreadData* td = &thread_data[device_id];
td->device_id = device_id;
td->histogram_size = histogram_size;
td->leaf_num_data = leaf_num_data;
td->num_data = num_data_;
td->use_all_features = use_all_features;
td->is_constant_hessian = share_state_->is_constant_hessian;
td->num_workgroups = num_workgroups;
td->stream = stream_[device_id];
td->device_features = device_features_[device_id];
td->device_feature_masks = reinterpret_cast<uint8_t *>(device_feature_masks_[device_id]);
td->device_data_indices = device_data_indices_[device_id];
td->device_gradients = device_gradients_[device_id];
td->device_hessians = device_hessians_[device_id];
td->hessians_const = hessians_[0];
td->device_subhistograms = device_subhistograms_[device_id];
td->sync_counters = sync_counters_[device_id];
td->device_histogram_outputs = device_histogram_outputs_[device_id];
td->exp_workgroups_per_feature = exp_workgroups_per_feature;
td->kernel_start = &(kernel_start_[device_id]);
td->kernel_wait_obj = &(kernel_wait_obj_[device_id]);
td->kernel_input_wait_time = &(kernel_input_wait_time_[device_id]);
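    // bytes of histogram output this device produces: its feature groups x
    // features packed per DWORD x bins per feature x bytes per bin entry;
    // host_output_offset skips the output of all earlier devices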
size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_;
size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_;
td->output_size = output_size;
td->host_histogram_output = reinterpret_cast<char*>(host_histogram_outputs_) + host_output_offset;
td->histograms_wait_obj = &(histograms_wait_obj_[device_id]);
}
/*!
* \brief Wait for GPU kernel execution and read histogram
* \param histograms Destination of histogram results from GPU.
*/
template <typename HistType>
void WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array);
/*!
* \brief Construct GPU histogram asynchronously.
* Interface is similar to Dataset::ConstructHistograms().
* \param is_feature_used A predicate vector for enabling each feature
* \param data_indices Array of data example IDs to be included in histogram, will be copied to GPU.
* Set to nullptr to skip copy to GPU.
* \param num_data Number of data examples to be included in histogram
* \return true if GPU kernel is launched, false if GPU is not used
*/
bool ConstructGPUHistogramsAsync(
const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data);
/*! \brief Log2 of max number of workgroups per feature */
const int kMaxLogWorkgroupsPerFeature = 10;  // 2^10
/*! \brief Max total number of workgroups with preallocated workspace.
 *  If we use more than this number of workgroups, we have to reallocate subhistograms */
std::vector<int> preallocd_max_num_wg_;
/*! \brief True if bagging is used */
bool use_bagging_;
/*! \brief GPU command queue object */
std::vector<cudaStream_t> stream_;
/*! \brief total number of feature-groups */
int num_feature_groups_;
/*! \brief total number of dense feature-groups, which will be processed on GPU */
int num_dense_feature_groups_;
std::vector<int> num_gpu_feature_groups_;
std::vector<int> offset_gpu_feature_groups_;
/*! \brief On GPU we read one DWORD (4-byte) of features of one example once.
* With bin size > 16, there are 4 features per DWORD.
* With bin size <=16, there are 8 features per DWORD.
*/
int dword_features_;
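  // e.g. with 256 bins a feature occupies one byte, so dword_features_ = 4;
  // with bin size <= 16 a feature occupies 4 bits, so dword_features_ = 8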
/*! \brief Max number of bins of training data, used to determine
* which GPU kernel to use */
int max_num_bin_;
/*! \brief Used GPU kernel bin size (64, 256) */
int histogram_size_;
int device_bin_size_;
/*! \brief Size of histogram bin entry, depending if single or double precision is used */
size_t hist_bin_entry_sz_;
/*! \brief Indices of all dense feature-groups */
std::vector<int> dense_feature_group_map_;
/*! \brief Indices of all sparse feature-groups */
std::vector<int> sparse_feature_group_map_;
/*! \brief GPU memory object holding the training data */
std::vector<uint8_t*> device_features_;
/*! \brief GPU memory object holding the ordered gradient */
std::vector<score_t*> device_gradients_;
/*! \brief GPU memory object holding the ordered hessian */
std::vector<score_t*> device_hessians_;
/*! \brief A vector of feature mask. 1 = feature used, 0 = feature not used */
std::vector<char> feature_masks_;
/*! \brief GPU memory object holding the feature masks */
std::vector<char*> device_feature_masks_;
/*! \brief Pointer to pinned memory of feature masks */
char* ptr_pinned_feature_masks_ = nullptr;
/*! \brief GPU memory object holding indices of the leaf being processed */
std::vector<data_size_t*> device_data_indices_;
/*! \brief GPU memory object holding counters for workgroup coordination */
std::vector<int*> sync_counters_;
/*! \brief GPU memory object holding temporary sub-histograms per workgroup */
std::vector<char*> device_subhistograms_;
/*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */
std::vector<void*> device_histogram_outputs_;
/*! \brief Host memory pointer for histogram outputs */
void *host_histogram_outputs_;
/*! CUDA waitlist object for waiting for data transfer before kernel execution */
std::vector<cudaEvent_t> kernel_wait_obj_;
/*! CUDA waitlist object for reading output histograms after kernel execution */
std::vector<cudaEvent_t> histograms_wait_obj_;
/*! CUDA Asynchronous waiting object for copying indices */
std::vector<cudaEvent_t> indices_future_;
/*! Asynchronous waiting object for copying gradients */
std::vector<cudaEvent_t> gradients_future_;
/*! Asynchronous waiting object for copying Hessians */
std::vector<cudaEvent_t> hessians_future_;
/*! Asynchronous waiting object for copying dense features */
std::vector<cudaEvent_t> features_future_;
// host-side buffer for converting feature data into feature4 data
int nthreads_;  // number of Feature4* vectors in host4_vecs_
std::vector<cudaEvent_t> kernel_start_;
std::vector<float> kernel_time_; // measure histogram kernel time
std::vector<std::chrono::duration<double, std::milli>> kernel_input_wait_time_;
int num_gpu_;
int allocated_num_data_; // allocated data instances
pthread_t **cpu_threads_; // pthread, 1 cpu thread / gpu
};
} // namespace LightGBM
#else // USE_CUDA
// When GPU support is not compiled in, quit with an error message
namespace LightGBM {
class CUDATreeLearner: public SerialTreeLearner {
public:
#pragma warning(disable : 4702)
explicit CUDATreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) {
Log::Fatal("CUDA Tree Learner was not enabled in this build.\n"
"Please recompile with CMake option -DUSE_CUDA=1");
}
};
} // namespace LightGBM
#endif // USE_CUDA
#endif // LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_
@@ -276,7 +276,6 @@ void DataParallelTreeLearner<TREELEARNER_T>::Split(Tree* tree, int best_Leaf, in
 }
 // instantiate template classes, otherwise linker cannot find the code
-template class DataParallelTreeLearner<CUDATreeLearner>;
 template class DataParallelTreeLearner<GPUTreeLearner>;
 template class DataParallelTreeLearner<SerialTreeLearner>;
@@ -77,7 +77,6 @@ void FeatureParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(
 }
 // instantiate template classes, otherwise linker cannot find the code
-template class FeatureParallelTreeLearner<CUDATreeLearner>;
 template class FeatureParallelTreeLearner<GPUTreeLearner>;
 template class FeatureParallelTreeLearner<SerialTreeLearner>;
 }  // namespace LightGBM
@@ -12,7 +12,6 @@
 #include <memory>
 #include <vector>
-#include "cuda_tree_learner.h"
 #include "gpu_tree_learner.h"
 #include "serial_tree_learner.h"
@@ -344,15 +344,7 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree, const std::set<int>* fo
   }
   bool use_subtract = parent_leaf_histogram_array_ != nullptr;
-#ifdef USE_CUDA
-  if (LGBM_config_::current_learner == use_cpu_learner) {
-    SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract);
-  } else {
-    ConstructHistograms(is_feature_used, use_subtract);
-  }
-#else
   ConstructHistograms(is_feature_used, use_subtract);
-#endif
   FindBestSplitsFromHistograms(is_feature_used, use_subtract, tree);
 }
@@ -211,7 +211,7 @@ class SerialTreeLearner: public TreeLearner {
   std::vector<score_t, boost::alignment::aligned_allocator<score_t, 4096>> ordered_gradients_;
   /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */
   std::vector<score_t, boost::alignment::aligned_allocator<score_t, 4096>> ordered_hessians_;
-#elif defined(USE_CUDA) || defined(USE_CUDA_EXP)
+#elif defined(USE_CUDA)
   /*! \brief gradients of current iteration, ordered for cache optimized */
   std::vector<score_t, CHAllocator<score_t>> ordered_gradients_;
   /*! \brief hessians of current iteration, ordered for cache optimized */
@@ -4,7 +4,6 @@
  */
 #include <LightGBM/tree_learner.h>
-#include "cuda_tree_learner.h"
 #include "gpu_tree_learner.h"
 #include "linear_tree_learner.h"
 #include "parallel_tree_learner.h"
@@ -40,24 +39,14 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con
       return new VotingParallelTreeLearner<GPUTreeLearner>(config);
     }
   } else if (device_type == std::string("cuda")) {
-    if (learner_type == std::string("serial")) {
-      return new CUDATreeLearner(config);
-    } else if (learner_type == std::string("feature")) {
-      return new FeatureParallelTreeLearner<CUDATreeLearner>(config);
-    } else if (learner_type == std::string("data")) {
-      return new DataParallelTreeLearner<CUDATreeLearner>(config);
-    } else if (learner_type == std::string("voting")) {
-      return new VotingParallelTreeLearner<CUDATreeLearner>(config);
-    }
-  } else if (device_type == std::string("cuda_exp")) {
     if (learner_type == std::string("serial")) {
       if (config->num_gpu == 1) {
         return new CUDASingleGPUTreeLearner(config, boosting_on_cuda);
       } else {
-        Log::Fatal("cuda_exp only supports training on a single GPU.");
+        Log::Fatal("Currently cuda version only supports training on a single GPU.");
       }
     } else {
-      Log::Fatal("cuda_exp only supports training on a single machine.");
+      Log::Fatal("Currently cuda version only supports training on a single machine.");
     }
   }
   return nullptr;
@@ -501,7 +501,6 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Split(Tree* tree, int best_Leaf,
 }
 // instantiate template classes, otherwise linker cannot find the code
-template class VotingParallelTreeLearner<CUDATreeLearner>;
 template class VotingParallelTreeLearner<GPUTreeLearner>;
 template class VotingParallelTreeLearner<SerialTreeLearner>;
 }  // namespace LightGBM
@@ -48,7 +48,7 @@ def test_basic(tmp_path):
     assert bst.current_iteration() == 20
     assert bst.num_trees() == 20
     assert bst.num_model_per_iteration() == 1
-    if getenv('TASK', '') != 'cuda_exp':
+    if getenv('TASK', '') != 'cuda':
         assert bst.lower_bound() == pytest.approx(-2.9040190126976606)
         assert bst.upper_bound() == pytest.approx(3.3182142872462883)