Unverified Commit f901f471 authored by shiyu1994's avatar shiyu1994 Committed by GitHub
Browse files

[CUDA] CUDA Quantized Training (fixes #5606) (#5933)

* add quantized training (first stage)

* add histogram construction functions for integer gradients

* add stochastic rounding

* update docs

* fix compilation errors by adding template instantiations

* update files for compilation

* fix compilation of gpu version

* initialize gradient discretizer before share states

* add a test case for quantized training

* add quantized training for data distributed training

* Delete origin.pred

* Delete ifelse.pred

* Delete LightGBM_model.txt

* remove useless changes

* fix lint error

* remove debug logging

* fix mismatch of vector and allocator types

* remove changes in main.cpp

* fix bugs with uninitialized gradient discretizer

* initialize ordered gradients in gradient discretizer

* disable quantized training with gpu and cuda

fix msvc compilation errors and warnings

* fix bug in data parallel tree learner

* make quantized training test deterministic

* make quantized training in test case more accurate

* refactor test_quantized_training

* fix leaf splits initialization with quantized training

* check distributed quantized training result

* add cuda gradient discretizer

* add quantized training for CUDA version in tree learner

* remove CUDA compute capability 6.1 and 6.2

* fix parts of gpu quantized training errors and warnings

* fix build-python.sh to install locally built version

* fix memory access bugs

* fix lint errors

* mark quantized training on CUDA with categorical features as unsupported

* rename cuda_utils.h to cuda_utils.hu

* enable quantized training with cuda

* fix cuda quantized training with sparse row data

* allow using global memory buffer in histogram construction with cuda quantized training

* recover build-python.sh

enlarge allowed package size to 100M
parent 3d9ada76
...@@ -25,7 +25,7 @@ if [ $PY_MINOR_VER -gt 7 ]; then ...@@ -25,7 +25,7 @@ if [ $PY_MINOR_VER -gt 7 ]; then
pydistcheck \ pydistcheck \
--inspect \ --inspect \
--ignore 'compiled-objects-have-debug-symbols,distro-too-large-compressed' \ --ignore 'compiled-objects-have-debug-symbols,distro-too-large-compressed' \
--max-allowed-size-uncompressed '70M' \ --max-allowed-size-uncompressed '100M' \
--max-allowed-files 800 \ --max-allowed-files 800 \
${DIST_DIR}/* || exit -1 ${DIST_DIR}/* || exit -1
elif { test $(uname -m) = "aarch64"; }; then elif { test $(uname -m) = "aarch64"; }; then
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
#include <stdio.h> #include <stdio.h>
#include <LightGBM/bin.h> #include <LightGBM/bin.h>
#include <LightGBM/cuda/cuda_utils.h> #include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/utils/log.h> #include <LightGBM/utils/log.h>
#include <algorithm> #include <algorithm>
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
#define LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_ #define LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_
#include <LightGBM/config.h> #include <LightGBM/config.h>
#include <LightGBM/cuda/cuda_utils.h> #include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/bin.h> #include <LightGBM/bin.h>
#include <LightGBM/utils/openmp_wrapper.h> #include <LightGBM/utils/openmp_wrapper.h>
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
#ifndef LIGHTGBM_CUDA_CUDA_METADATA_HPP_ #ifndef LIGHTGBM_CUDA_CUDA_METADATA_HPP_
#define LIGHTGBM_CUDA_CUDA_METADATA_HPP_ #define LIGHTGBM_CUDA_CUDA_METADATA_HPP_
#include <LightGBM/cuda/cuda_utils.h> #include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/meta.h> #include <LightGBM/meta.h>
#include <vector> #include <vector>
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
#ifdef USE_CUDA #ifdef USE_CUDA
#include <LightGBM/cuda/cuda_utils.h> #include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/metric.h> #include <LightGBM/metric.h>
namespace LightGBM { namespace LightGBM {
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
#ifdef USE_CUDA #ifdef USE_CUDA
#include <LightGBM/cuda/cuda_utils.h> #include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/objective_function.h> #include <LightGBM/objective_function.h>
#include <LightGBM/meta.h> #include <LightGBM/meta.h>
......
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
#include <LightGBM/bin.h> #include <LightGBM/bin.h>
#include <LightGBM/config.h> #include <LightGBM/config.h>
#include <LightGBM/cuda/cuda_utils.h> #include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/dataset.h> #include <LightGBM/dataset.h>
#include <LightGBM/train_share_states.h> #include <LightGBM/train_share_states.h>
#include <LightGBM/utils/openmp_wrapper.h> #include <LightGBM/utils/openmp_wrapper.h>
......
...@@ -24,12 +24,14 @@ class CUDASplitInfo { ...@@ -24,12 +24,14 @@ class CUDASplitInfo {
double left_sum_gradients; double left_sum_gradients;
double left_sum_hessians; double left_sum_hessians;
int64_t left_sum_of_gradients_hessians;
data_size_t left_count; data_size_t left_count;
double left_gain; double left_gain;
double left_value; double left_value;
double right_sum_gradients; double right_sum_gradients;
double right_sum_hessians; double right_sum_hessians;
int64_t right_sum_of_gradients_hessians;
data_size_t right_count; data_size_t right_count;
double right_gain; double right_gain;
double right_value; double right_value;
......
...@@ -7,15 +7,21 @@ ...@@ -7,15 +7,21 @@
#define LIGHTGBM_CUDA_CUDA_UTILS_H_ #define LIGHTGBM_CUDA_CUDA_UTILS_H_
#ifdef USE_CUDA #ifdef USE_CUDA
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <stdio.h> #include <stdio.h>
#include <LightGBM/utils/log.h> #include <LightGBM/utils/log.h>
#include <algorithm>
#include <vector> #include <vector>
#include <cmath> #include <cmath>
namespace LightGBM { namespace LightGBM {
// Alias for `unsigned long long`.
// NOTE(review): judging by the name, this is the operand type for 64-bit
// integer atomic additions (CUDA's 64-bit integer atomicAdd overload takes
// `unsigned long long int`) — confirm at the use sites, which are not visible here.
typedef unsigned long long atomic_add_long_t;
#define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); } #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
if (code != cudaSuccess) { if (code != cudaSuccess) {
...@@ -125,13 +131,19 @@ class CUDAVector { ...@@ -125,13 +131,19 @@ class CUDAVector {
T* new_data = nullptr; T* new_data = nullptr;
AllocateCUDAMemory<T>(&new_data, size, __FILE__, __LINE__); AllocateCUDAMemory<T>(&new_data, size, __FILE__, __LINE__);
if (size_ > 0 && data_ != nullptr) { if (size_ > 0 && data_ != nullptr) {
CopyFromCUDADeviceToCUDADevice<T>(new_data, data_, size, __FILE__, __LINE__); const size_t size_for_old_content = std::min<size_t>(size_, size);
CopyFromCUDADeviceToCUDADevice<T>(new_data, data_, size_for_old_content, __FILE__, __LINE__);
} }
DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__); DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
data_ = new_data; data_ = new_data;
size_ = size; size_ = size;
} }
// Loads the contents of a host-side std::vector into this CUDAVector.
// The buffer is first passed through Resize() with the host vector's element
// count, then that many elements are copied from host_vector.data() into data_.
void InitFromHostVector(const std::vector<T>& host_vector) {
  const auto num_elements = host_vector.size();
  Resize(num_elements);
  CopyFromHostToCUDADevice(
    data_, host_vector.data(), num_elements, __FILE__, __LINE__);
}
void Clear() { void Clear() {
if (size_ > 0 && data_ != nullptr) { if (size_ > 0 && data_ != nullptr) {
DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__); DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
...@@ -171,6 +183,10 @@ class CUDAVector { ...@@ -171,6 +183,10 @@ class CUDAVector {
return data_; return data_;
} }
// Fills the whole buffer (size_ elements starting at data_) via SetCUDAMemory,
// passing `value` through unchanged.
// NOTE(review): if SetCUDAMemory wraps cudaMemset, `value` is applied per byte
// rather than per element of T, so only byte-uniform patterns (e.g. 0) yield
// the expected element values — confirm against the helper's definition.
void SetValue(int value) {
  const auto num_elements = size_;
  SetCUDAMemory<T>(
    data_, value, num_elements, __FILE__, __LINE__);
}
const T* RawDataReadOnly() const { const T* RawDataReadOnly() const {
return data_; return data_;
} }
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#ifndef LIGHTGBM_SAMPLE_STRATEGY_H_ #ifndef LIGHTGBM_SAMPLE_STRATEGY_H_
#define LIGHTGBM_SAMPLE_STRATEGY_H_ #define LIGHTGBM_SAMPLE_STRATEGY_H_
#include <LightGBM/cuda/cuda_utils.h> #include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/utils/random.h> #include <LightGBM/utils/random.h>
#include <LightGBM/utils/common.h> #include <LightGBM/utils/common.h>
#include <LightGBM/utils/threading.h> #include <LightGBM/utils/threading.h>
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
#ifdef USE_CUDA #ifdef USE_CUDA
#include <LightGBM/cuda/cuda_utils.h> #include <LightGBM/cuda/cuda_utils.hu>
#include "../score_updater.hpp" #include "../score_updater.hpp"
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
#ifdef USE_CUDA #ifdef USE_CUDA
#include <LightGBM/cuda/cuda_utils.h> #include <LightGBM/cuda/cuda_utils.hu>
namespace LightGBM { namespace LightGBM {
......
...@@ -389,10 +389,6 @@ void Config::CheckParamConflict() { ...@@ -389,10 +389,6 @@ void Config::CheckParamConflict() {
if (deterministic) { if (deterministic) {
Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic."); Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic.");
} }
if (use_quantized_grad) {
Log::Warning("Quantized training is not supported by CUDA tree learner. Switch to full precision training.");
use_quantized_grad = false;
}
} }
// linear tree learner must be serial type and run on CPU device // linear tree learner must be serial type and run on CPU device
if (linear_tree) { if (linear_tree) {
......
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
#ifdef USE_CUDA #ifdef USE_CUDA
#include <LightGBM/cuda/cuda_metric.hpp> #include <LightGBM/cuda/cuda_metric.hpp>
#include <LightGBM/cuda/cuda_utils.h> #include <LightGBM/cuda/cuda_utils.hu>
#include <vector> #include <vector>
......
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
#ifdef USE_CUDA #ifdef USE_CUDA
#include <LightGBM/cuda/cuda_metric.hpp> #include <LightGBM/cuda/cuda_metric.hpp>
#include <LightGBM/cuda/cuda_utils.h> #include <LightGBM/cuda/cuda_utils.hu>
#include <vector> #include <vector>
......
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
#ifdef USE_CUDA #ifdef USE_CUDA
#include <LightGBM/cuda/cuda_metric.hpp> #include <LightGBM/cuda/cuda_metric.hpp>
#include <LightGBM/cuda/cuda_utils.h> #include <LightGBM/cuda/cuda_utils.hu>
#include <vector> #include <vector>
......
...@@ -40,6 +40,9 @@ CUDABestSplitFinder::CUDABestSplitFinder( ...@@ -40,6 +40,9 @@ CUDABestSplitFinder::CUDABestSplitFinder(
select_features_by_node_(select_features_by_node), select_features_by_node_(select_features_by_node),
cuda_hist_(cuda_hist) { cuda_hist_(cuda_hist) {
InitFeatureMetaInfo(train_data); InitFeatureMetaInfo(train_data);
if (has_categorical_feature_ && config->use_quantized_grad) {
Log::Fatal("Quantized training on GPU with categorical features is not supported yet.");
}
cuda_leaf_best_split_info_ = nullptr; cuda_leaf_best_split_info_ = nullptr;
cuda_best_split_info_ = nullptr; cuda_best_split_info_ = nullptr;
cuda_best_split_info_buffer_ = nullptr; cuda_best_split_info_buffer_ = nullptr;
...@@ -326,13 +329,23 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf( ...@@ -326,13 +329,23 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(
const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_smaller_leaf,
const data_size_t num_data_in_larger_leaf, const data_size_t num_data_in_larger_leaf,
const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_smaller_leaf,
const double sum_hessians_in_larger_leaf) { const double sum_hessians_in_larger_leaf,
const score_t* grad_scale,
const score_t* hess_scale,
const uint8_t smaller_num_bits_in_histogram_bins,
const uint8_t larger_num_bits_in_histogram_bins) {
const bool is_smaller_leaf_valid = (num_data_in_smaller_leaf > min_data_in_leaf_ && const bool is_smaller_leaf_valid = (num_data_in_smaller_leaf > min_data_in_leaf_ &&
sum_hessians_in_smaller_leaf > min_sum_hessian_in_leaf_); sum_hessians_in_smaller_leaf > min_sum_hessian_in_leaf_);
const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ && const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ &&
sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_ && larger_leaf_index >= 0); sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_ && larger_leaf_index >= 0);
LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits, if (grad_scale != nullptr && hess_scale != nullptr) {
smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid); LaunchFindBestSplitsDiscretizedForLeafKernel(smaller_leaf_splits, larger_leaf_splits,
smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid,
grad_scale, hess_scale, smaller_num_bits_in_histogram_bins, larger_num_bits_in_histogram_bins);
} else {
LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits,
smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
}
global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel");
LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid); LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
SynchronizeCUDADevice(__FILE__, __LINE__); SynchronizeCUDADevice(__FILE__, __LINE__);
......
...@@ -67,7 +67,11 @@ class CUDABestSplitFinder { ...@@ -67,7 +67,11 @@ class CUDABestSplitFinder {
const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_smaller_leaf,
const data_size_t num_data_in_larger_leaf, const data_size_t num_data_in_larger_leaf,
const double sum_hessians_in_smaller_leaf, const double sum_hessians_in_smaller_leaf,
const double sum_hessians_in_larger_leaf); const double sum_hessians_in_larger_leaf,
const score_t* grad_scale,
const score_t* hess_scale,
const uint8_t smaller_num_bits_in_histogram_bins,
const uint8_t larger_num_bits_in_histogram_bins);
const CUDASplitInfo* FindBestFromAllSplits( const CUDASplitInfo* FindBestFromAllSplits(
const int cur_num_leaves, const int cur_num_leaves,
...@@ -114,6 +118,31 @@ class CUDABestSplitFinder { ...@@ -114,6 +118,31 @@ class CUDABestSplitFinder {
#undef LaunchFindBestSplitsForLeafKernel_PARAMS #undef LaunchFindBestSplitsForLeafKernel_PARAMS
// Shared parameter list for the discretized (quantized-gradient) best-split
// launchers declared below. Defined as a macro so that the entry point and
// each templated Inner* stage repeat exactly the same signature; it is
// #undef'd right after the declarations so it does not leak further.
// Compared with the call made to the non-discretized launcher, this list adds
// grad_scale / hess_scale and the per-leaf histogram bit widths.
// NOTE(review): grad_scale / hess_scale are presumably device pointers to the
// gradient / hessian quantization scales — confirm against the caller.
#define LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS \
const CUDALeafSplitsStruct* smaller_leaf_splits, \
const CUDALeafSplitsStruct* larger_leaf_splits, \
const int smaller_leaf_index, \
const int larger_leaf_index, \
const bool is_smaller_leaf_valid, \
const bool is_larger_leaf_valid, \
const score_t* grad_scale, \
const score_t* hess_scale, \
const uint8_t smaller_num_bits_in_histogram_bins, \
const uint8_t larger_num_bits_in_histogram_bins
// Entry point; FindBestSplitsForLeaf calls this instead of
// LaunchFindBestSplitsForLeafKernel when both grad_scale and hess_scale are
// non-null.
void LaunchFindBestSplitsDiscretizedForLeafKernel(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS);
// Inner0 -> Inner1 -> Inner2 each add one more compile-time bool
// (USE_RAND, then USE_L1, then USE_SMOOTHING).
// NOTE(review): presumably each stage turns one runtime config flag into a
// template argument before the kernel launch — definitions are not visible here.
template <bool USE_RAND>
void LaunchFindBestSplitsDiscretizedForLeafKernelInner0(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS);
template <bool USE_RAND, bool USE_L1>
void LaunchFindBestSplitsDiscretizedForLeafKernelInner1(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS);
template <bool USE_RAND, bool USE_L1, bool USE_SMOOTHING>
void LaunchFindBestSplitsDiscretizedForLeafKernelInner2(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS);
// The parameter-list macro is only needed for the declarations above.
#undef LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS
void LaunchSyncBestSplitForLeafKernel( void LaunchSyncBestSplitForLeafKernel(
const int host_smaller_leaf_index, const int host_smaller_leaf_index,
const int host_larger_leaf_index, const int host_larger_leaf_index,
......
...@@ -368,6 +368,12 @@ void CUDADataPartition::ResetByLeafPred(const std::vector<int>& leaf_pred, int n ...@@ -368,6 +368,12 @@ void CUDADataPartition::ResetByLeafPred(const std::vector<int>& leaf_pred, int n
cur_num_leaves_ = num_leaves; cur_num_leaves_ = num_leaves;
} }
// Public wrapper around LaunchReduceLeafGradStat: forwards every argument
// unchanged and does no work of its own.
// NOTE(review): by the names, the two output buffers receive per-leaf
// gradient and hessian statistics; their memory space and required length are
// not visible here — confirm against LaunchReduceLeafGradStat.
void CUDADataPartition::ReduceLeafGradStat(
  const score_t* gradients,
  const score_t* hessians,
  CUDATree* tree,
  double* leaf_grad_stat_buffer,
  double* leaf_hess_state_buffer) const {
  LaunchReduceLeafGradStat(
    gradients, hessians, tree,
    leaf_grad_stat_buffer, leaf_hess_state_buffer);
}
} // namespace LightGBM } // namespace LightGBM
#endif // USE_CUDA #endif // USE_CUDA
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment