Unverified Commit f901f471 authored by shiyu1994's avatar shiyu1994 Committed by GitHub
Browse files

[CUDA] CUDA Quantized Training (fixes #5606) (#5933)

* add quantized training (first stage)

* add histogram construction functions for integer gradients

* add stochastic rounding

* update docs

* fix compilation errors by adding template instantiations

* update files for compilation

* fix compilation of gpu version

* initialize gradient discretizer before share states

* add a test case for quantized training

* add quantized training for data distributed training

* Delete origin.pred

* Delete ifelse.pred

* Delete LightGBM_model.txt

* remove useless changes

* fix lint error

* remove debug loggings

* fix mismatch of vector and allocator types

* remove changes in main.cpp

* fix bugs with uninitialized gradient discretizer

* initialize ordered gradients in gradient discretizer

* disable quantized training with gpu and cuda

fix msvc compilation errors and warnings

* fix bug in data parallel tree learner

* make quantized training test deterministic

* make quantized training in test case more accurate

* refactor test_quantized_training

* fix leaf splits initialization with quantized training

* check distributed quantized training result

* add cuda gradient discretizer

* add quantized training for CUDA version in tree learner

* remove cuda compute capability 6.1 and 6.2

* fix parts of gpu quantized training errors and warnings

* fix build-python.sh to install locally built version

* fix memory access bugs

* fix lint errors

* mark cuda quantized training on cuda with categorical features as unsupported

* rename cuda_utils.h to cuda_utils.hu

* enable quantized training with cuda

* fix cuda quantized training with sparse row data

* allow using global memory buffer in histogram construction with cuda quantized training

* recover build-python.sh

enlarge allowed package size to 100M
parent 3d9ada76
......@@ -25,7 +25,7 @@ if [ $PY_MINOR_VER -gt 7 ]; then
pydistcheck \
--inspect \
--ignore 'compiled-objects-have-debug-symbols,distro-too-large-compressed' \
--max-allowed-size-uncompressed '70M' \
--max-allowed-size-uncompressed '100M' \
--max-allowed-files 800 \
${DIST_DIR}/* || exit -1
elif { test $(uname -m) = "aarch64"; }; then
......
......@@ -13,7 +13,7 @@
#include <stdio.h>
#include <LightGBM/bin.h>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/utils/log.h>
#include <algorithm>
......
......@@ -9,7 +9,7 @@
#define LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_
#include <LightGBM/config.h>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/bin.h>
#include <LightGBM/utils/openmp_wrapper.h>
......
......@@ -8,7 +8,7 @@
#ifndef LIGHTGBM_CUDA_CUDA_METADATA_HPP_
#define LIGHTGBM_CUDA_CUDA_METADATA_HPP_
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/meta.h>
#include <vector>
......
......@@ -9,7 +9,7 @@
#ifdef USE_CUDA
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/metric.h>
namespace LightGBM {
......
......@@ -9,7 +9,7 @@
#ifdef USE_CUDA
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/objective_function.h>
#include <LightGBM/meta.h>
......
......@@ -10,7 +10,7 @@
#include <LightGBM/bin.h>
#include <LightGBM/config.h>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/dataset.h>
#include <LightGBM/train_share_states.h>
#include <LightGBM/utils/openmp_wrapper.h>
......
......@@ -24,12 +24,14 @@ class CUDASplitInfo {
double left_sum_gradients;
double left_sum_hessians;
int64_t left_sum_of_gradients_hessians;
data_size_t left_count;
double left_gain;
double left_value;
double right_sum_gradients;
double right_sum_hessians;
int64_t right_sum_of_gradients_hessians;
data_size_t right_count;
double right_gain;
double right_value;
......
......@@ -7,15 +7,21 @@
#define LIGHTGBM_CUDA_CUDA_UTILS_H_
#ifdef USE_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <LightGBM/utils/log.h>
#include <algorithm>
#include <vector>
#include <cmath>
namespace LightGBM {
typedef unsigned long long atomic_add_long_t;
#define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
if (code != cudaSuccess) {
......@@ -125,13 +131,19 @@ class CUDAVector {
T* new_data = nullptr;
AllocateCUDAMemory<T>(&new_data, size, __FILE__, __LINE__);
if (size_ > 0 && data_ != nullptr) {
CopyFromCUDADeviceToCUDADevice<T>(new_data, data_, size, __FILE__, __LINE__);
const size_t size_for_old_content = std::min<size_t>(size_, size);
CopyFromCUDADeviceToCUDADevice<T>(new_data, data_, size_for_old_content, __FILE__, __LINE__);
}
DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
data_ = new_data;
size_ = size;
}
// Copies the contents of a host-side std::vector into this device vector,
// first resizing the device buffer to hold exactly host_vector.size() elements.
void InitFromHostVector(const std::vector<T>& host_vector) {
  const size_t element_count = host_vector.size();
  Resize(element_count);
  CopyFromHostToCUDADevice(data_, host_vector.data(), element_count, __FILE__, __LINE__);
}
void Clear() {
if (size_ > 0 && data_ != nullptr) {
DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
......@@ -171,6 +183,10 @@ class CUDAVector {
return data_;
}
// Fills the entire device buffer with `value` via the SetCUDAMemory helper.
// NOTE(review): if SetCUDAMemory wraps cudaMemset, the fill is byte-wise and
// only value == 0 is meaningful for multi-byte T — confirm against its
// definition before using non-zero values.
void SetValue(int value) {
SetCUDAMemory<T>(data_, value, size_, __FILE__, __LINE__);
}
// Read-only accessor for the underlying device memory pointer.
const T* RawDataReadOnly() const {
return data_;
}
......
......@@ -6,7 +6,7 @@
#ifndef LIGHTGBM_SAMPLE_STRATEGY_H_
#define LIGHTGBM_SAMPLE_STRATEGY_H_
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/threading.h>
......
......@@ -8,7 +8,7 @@
#ifdef USE_CUDA
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include "../score_updater.hpp"
......
......@@ -5,7 +5,7 @@
#ifdef USE_CUDA
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
namespace LightGBM {
......
......@@ -389,10 +389,6 @@ void Config::CheckParamConflict() {
if (deterministic) {
Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic.");
}
if (use_quantized_grad) {
Log::Warning("Quantized training is not supported by CUDA tree learner. Switch to full precision training.");
use_quantized_grad = false;
}
}
// linear tree learner must be serial type and run on CPU device
if (linear_tree) {
......
......@@ -10,7 +10,7 @@
#ifdef USE_CUDA
#include <LightGBM/cuda/cuda_metric.hpp>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <vector>
......
......@@ -10,7 +10,7 @@
#ifdef USE_CUDA
#include <LightGBM/cuda/cuda_metric.hpp>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <vector>
......
......@@ -10,7 +10,7 @@
#ifdef USE_CUDA
#include <LightGBM/cuda/cuda_metric.hpp>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <vector>
......
......@@ -40,6 +40,9 @@ CUDABestSplitFinder::CUDABestSplitFinder(
select_features_by_node_(select_features_by_node),
cuda_hist_(cuda_hist) {
InitFeatureMetaInfo(train_data);
if (has_categorical_feature_ && config->use_quantized_grad) {
Log::Fatal("Quantized training on GPU with categorical features is not supported yet.");
}
cuda_leaf_best_split_info_ = nullptr;
cuda_best_split_info_ = nullptr;
cuda_best_split_info_buffer_ = nullptr;
......@@ -326,13 +329,23 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(
const data_size_t num_data_in_smaller_leaf,
const data_size_t num_data_in_larger_leaf,
const double sum_hessians_in_smaller_leaf,
const double sum_hessians_in_larger_leaf) {
const double sum_hessians_in_larger_leaf,
const score_t* grad_scale,
const score_t* hess_scale,
const uint8_t smaller_num_bits_in_histogram_bins,
const uint8_t larger_num_bits_in_histogram_bins) {
const bool is_smaller_leaf_valid = (num_data_in_smaller_leaf > min_data_in_leaf_ &&
sum_hessians_in_smaller_leaf > min_sum_hessian_in_leaf_);
const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ &&
sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_ && larger_leaf_index >= 0);
if (grad_scale != nullptr && hess_scale != nullptr) {
LaunchFindBestSplitsDiscretizedForLeafKernel(smaller_leaf_splits, larger_leaf_splits,
smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid,
grad_scale, hess_scale, smaller_num_bits_in_histogram_bins, larger_num_bits_in_histogram_bins);
} else {
LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits,
smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
}
global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel");
LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
SynchronizeCUDADevice(__FILE__, __LINE__);
......
......@@ -67,7 +67,11 @@ class CUDABestSplitFinder {
const data_size_t num_data_in_smaller_leaf,
const data_size_t num_data_in_larger_leaf,
const double sum_hessians_in_smaller_leaf,
const double sum_hessians_in_larger_leaf);
const double sum_hessians_in_larger_leaf,
const score_t* grad_scale,
const score_t* hess_scale,
const uint8_t smaller_num_bits_in_histogram_bins,
const uint8_t larger_num_bits_in_histogram_bins);
const CUDASplitInfo* FindBestFromAllSplits(
const int cur_num_leaves,
......@@ -114,6 +118,31 @@ class CUDABestSplitFinder {
#undef LaunchFindBestSplitsForLeafKernel_PARAMS
#define LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS \
const CUDALeafSplitsStruct* smaller_leaf_splits, \
const CUDALeafSplitsStruct* larger_leaf_splits, \
const int smaller_leaf_index, \
const int larger_leaf_index, \
const bool is_smaller_leaf_valid, \
const bool is_larger_leaf_valid, \
const score_t* grad_scale, \
const score_t* hess_scale, \
const uint8_t smaller_num_bits_in_histogram_bins, \
const uint8_t larger_num_bits_in_histogram_bins
void LaunchFindBestSplitsDiscretizedForLeafKernel(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS);
template <bool USE_RAND>
void LaunchFindBestSplitsDiscretizedForLeafKernelInner0(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS);
template <bool USE_RAND, bool USE_L1>
void LaunchFindBestSplitsDiscretizedForLeafKernelInner1(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS);
template <bool USE_RAND, bool USE_L1, bool USE_SMOOTHING>
void LaunchFindBestSplitsDiscretizedForLeafKernelInner2(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS);
#undef LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS
void LaunchSyncBestSplitForLeafKernel(
const int host_smaller_leaf_index,
const int host_larger_leaf_index,
......
......@@ -368,6 +368,12 @@ void CUDADataPartition::ResetByLeafPred(const std::vector<int>& leaf_pred, int n
cur_num_leaves_ = num_leaves;
}
// Accumulates per-leaf gradient and hessian statistics for `tree` into the
// provided device-side buffers by forwarding to the CUDA kernel launcher.
// NOTE(review): `leaf_hess_state_buffer` looks like a typo for
// `leaf_hess_stat_buffer` (cf. `leaf_grad_stat_buffer`) — confirm against the
// declaration in the header before renaming.
void CUDADataPartition::ReduceLeafGradStat(
const score_t* gradients, const score_t* hessians,
CUDATree* tree, double* leaf_grad_stat_buffer, double* leaf_hess_state_buffer) const {
LaunchReduceLeafGradStat(gradients, hessians, tree, leaf_grad_stat_buffer, leaf_hess_state_buffer);
}
} // namespace LightGBM
#endif // USE_CUDA
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment