Unverified commit 17ecfab3 authored by shiyu1994, committed by GitHub

Add quantized training (CPU part) (#5800)

* add quantized training (first stage)

* add histogram construction functions for integer gradients

* add stochastic rounding

* update docs

* fix compilation errors by adding template instantiations

* update files for compilation

* fix compilation of gpu version

* initialize gradient discretizer before share states

* add a test case for quantized training

* add quantized training for data distributed training

* Delete origin.pred

* Delete ifelse.pred

* Delete LightGBM_model.txt

* remove useless changes

* fix lint error

* remove debug logging

* fix mismatch of vector and allocator types

* remove changes in main.cpp

* fix bugs with uninitialized gradient discretizer

* initialize ordered gradients in gradient discretizer

* disable quantized training with gpu and cuda

* fix msvc compilation errors and warnings

* fix bug in data parallel tree learner

* make quantized training test deterministic

* make quantized training in test case more accurate

* refactor test_quantized_training

* fix leaf splits initialization with quantized training

* check distributed quantized training result
parent a97c444b
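The commit message above mentions stochastic rounding, which the new gradient_discretizer.cpp presumably applies when quantizing gradients (the config exposes a stochastic_rounding flag, passed to the GradientDiscretizer constructor below). As background, a minimal sketch of the technique (illustration only, not the PR's exact code):

#include <cmath>
#include <cstdint>
#include <random>

// round a scaled gradient down or up with probability equal to its fractional
// part, so the quantization error is zero in expectation
int8_t StochasticRound(double gradient, double inv_grad_scale, std::mt19937* rng) {
  const double scaled = gradient * inv_grad_scale;  // gradient in bin units
  const double low = std::floor(scaled);
  const double frac = scaled - low;                 // fractional part in [0, 1)
  std::uniform_real_distribution<double> unif(0.0, 1.0);
  return static_cast<int8_t>(low + (unif(*rng) < frac ? 1.0 : 0.0));
}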
......@@ -85,6 +85,38 @@ class LeafSplits {
sum_hessians_ = tmp_sum_hessians;
}
/*!
* \brief Init splits on the current leaf; traverses all data to sum up the results
* \param int_gradients_and_hessians Discretized gradients and hessians
* \param grad_scale Scaling factor to recover original gradients from discretized gradients
* \param hess_scale Scaling factor to recover original hessians from discretized hessians
*/
void Init(const int8_t* int_gradients_and_hessians,
const double grad_scale, const double hess_scale) {
num_data_in_leaf_ = num_data_;
leaf_index_ = 0;
data_indices_ = nullptr;
double tmp_sum_gradients = 0.0;
double tmp_sum_hessians = 0.0;
const int16_t* packed_int_gradients_and_hessians = reinterpret_cast<const int16_t*>(int_gradients_and_hessians);
int64_t tmp_sum_gradients_and_hessians = 0;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
tmp_sum_gradients += int_gradients_and_hessians[2 * i + 1] * grad_scale;
tmp_sum_hessians += int_gradients_and_hessians[2 * i] * hess_scale;
const int16_t packed_int_grad_and_hess = packed_int_gradients_and_hessians[i];
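// per sample the bytes are stored as (hessian, gradient); on the little-endian
// targets this layout assumes, the int16's high byte is the gradient. Sign-extend
// it into the upper 32 bits and keep the non-negative hessian byte in the lower
// 32 bits, so the single 64-bit add below accumulates both integer sums at once.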
const int64_t packed_long_int_grad_and_hess =
(static_cast<int64_t>(static_cast<int8_t>(packed_int_grad_and_hess >> 8)) << 32) |
(static_cast<int64_t>(packed_int_grad_and_hess & 0x00ff));
tmp_sum_gradients_and_hessians += packed_long_int_grad_and_hess;
}
sum_gradients_ = tmp_sum_gradients;
sum_hessians_ = tmp_sum_hessians;
int_sum_gradients_and_hessians_ = tmp_sum_gradients_and_hessians;
}
/*!
* \brief Init splits on current leaf of partial data.
* \param leaf Index of current leaf
......@@ -109,6 +141,40 @@ class LeafSplits {
}
/*!
* \brief Init splits on current leaf of partial data.
* \param leaf Index of current leaf
* \param data_partition current data partition
* \param int_gradients_and_hessians Discretized gradients and hessians
* \param grad_scale Scaling factor to recover original gradients from discretized gradients
* \param hess_scale Scaling factor to recover original hessians from discretized hessians
*/
void Init(int leaf, const DataPartition* data_partition,
const int8_t* int_gradients_and_hessians,
const score_t grad_scale, const score_t hess_scale) {
leaf_index_ = leaf;
data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
double tmp_sum_gradients = 0.0;
double tmp_sum_hessians = 0.0;
const int16_t* packed_int_gradients_and_hessians = reinterpret_cast<const int16_t*>(int_gradients_and_hessians);
int64_t tmp_sum_gradients_and_hessians = 0;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
const data_size_t idx = data_indices_[i];
tmp_sum_gradients += int_gradients_and_hessians[2 * idx + 1] * grad_scale;
tmp_sum_hessians += int_gradients_and_hessians[2 * idx] * hess_scale;
const int16_t packed_int_grad_and_hess = packed_int_gradients_and_hessians[idx];
const int64_t packed_long_int_grad_and_hess =
(static_cast<int64_t>(static_cast<int8_t>(packed_int_grad_and_hess >> 8)) << 32) |
(static_cast<int64_t>(packed_int_grad_and_hess & 0x00ff));
tmp_sum_gradients_and_hessians += packed_long_int_grad_and_hess;
}
sum_gradients_ = tmp_sum_gradients;
sum_hessians_ = tmp_sum_hessians;
int_sum_gradients_and_hessians_ = tmp_sum_gradients_and_hessians;
}
/*!
* \brief Init splits on current leaf, only update sum_gradients and sum_hessians
* \param sum_gradients
......@@ -120,6 +186,19 @@ class LeafSplits {
sum_hessians_ = sum_hessians;
}
/*!
* \brief Init splits on current leaf, only update sum_gradients and sum_hessians
* \param sum_gradients
* \param sum_hessians
* \param int_sum_gradients_and_hessians
*/
void Init(double sum_gradients, double sum_hessians, int64_t int_sum_gradients_and_hessians) {
leaf_index_ = 0;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians;
int_sum_gradients_and_hessians_ = int_sum_gradients_and_hessians;
}
/*!
* \brief Init splits on current leaf
*/
......@@ -142,6 +221,9 @@ class LeafSplits {
/*! \brief Get sum of Hessians of current leaf */
double sum_hessians() const { return sum_hessians_; }
/*! \brief Get sum of discretized gradients and Hessians of current leaf */
int64_t int_sum_gradients_and_hessians() const { return int_sum_gradients_and_hessians_; }
/*! \brief Get indices of data of current leaf */
const data_size_t* data_indices() const { return data_indices_; }
......@@ -162,6 +244,8 @@ class LeafSplits {
double sum_gradients_;
/*! \brief sum of Hessians of current leaf */
double sum_hessians_;
/*! \brief sum of discretized gradients and Hessians of current leaf */
int64_t int_sum_gradients_and_hessians_;
/*! \brief indices of data of current leaf */
const data_size_t* data_indices_;
/*! \brief weight of current leaf */
......
......@@ -71,15 +71,24 @@ class DataParallelTreeLearner: public TREELEARNER_T {
}
}
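/*! \brief Compute reduce-scatter block boundaries and buffer read/write
 *  offsets; takes the per-bin histogram entry size as a parameter so the same
 *  routine can lay out both the default and the int16 quantized histogram
 *  buffers */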
void PrepareBufferPos(
const std::vector<std::vector<int>>& feature_distribution,
std::vector<comm_size_t>* block_start,
std::vector<comm_size_t>* block_len,
std::vector<comm_size_t>* buffer_write_start_pos,
std::vector<comm_size_t>* buffer_read_start_pos,
comm_size_t* reduce_scatter_size,
size_t hist_entry_size);
private:
/*! \brief Rank of local machine */
int rank_;
/*! \brief Number of machines of this parallel task */
int num_machines_;
/*! \brief Buffer for network send */
std::vector<char, Common::AlignmentAllocator<char, 32>> input_buffer_;
/*! \brief Buffer for network receive */
std::vector<char, Common::AlignmentAllocator<char, 32>> output_buffer_;
/*! \brief Different machines aggregate histograms for different features;
 *  marks which features are aggregated locally */
std::vector<bool> is_feature_aggregated_;
......@@ -87,12 +96,22 @@ class DataParallelTreeLearner: public TREELEARNER_T {
std::vector<comm_size_t> block_start_;
/*! \brief Block size for reduce scatter */
std::vector<comm_size_t> block_len_;
/*! \brief Block start index for reduce scatter with int16 histograms */
std::vector<comm_size_t> block_start_int16_;
/*! \brief Block size for reduce scatter with int16 histograms */
std::vector<comm_size_t> block_len_int16_;
/*! \brief Write positions for feature histograms */
std::vector<comm_size_t> buffer_write_start_pos_;
/*! \brief Read positions for local feature histograms */
std::vector<comm_size_t> buffer_read_start_pos_;
/*! \brief Write positions for feature histograms with int16 histograms */
std::vector<comm_size_t> buffer_write_start_pos_int16_;
/*! \brief Read positions for local feature histograms with int16 histograms */
std::vector<comm_size_t> buffer_read_start_pos_int16_;
/*! \brief Size for reduce scatter */
comm_size_t reduce_scatter_size_;
/*! \brief Size for reduce scatter with int16 histograms */
comm_size_t reduce_scatter_size_int16_;
/*! \brief Store global number of data in leaves */
std::vector<data_size_t> global_data_count_in_leaf_;
};
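The duplicated *_int16_ members above exist because every reduce-scatter offset scales with the size of one histogram entry, which differs between the default double-precision histograms and the int16 quantized ones. A hypothetical helper (names are mine, not from the PR) to illustrate why PrepareBufferPos takes hist_entry_size:

#include <cstddef>
#include <vector>

// byte offset of each block = offset in bins * size of one histogram entry,
// so the same bin layout yields two different buffer layouts for the
// double-precision and the int16 quantized histograms
std::vector<size_t> ToByteOffsets(const std::vector<size_t>& bin_offsets,
                                  size_t hist_entry_size) {
  std::vector<size_t> byte_offsets;
  byte_offsets.reserve(bin_offsets.size());
  for (const size_t bin_offset : bin_offsets) {
    byte_offsets.push_back(bin_offset * hist_entry_size);
  }
  return byte_offsets;
}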
......
......@@ -21,6 +21,7 @@ namespace LightGBM {
SerialTreeLearner::SerialTreeLearner(const Config* config)
: config_(config), col_sampler_(config) {
gradient_discretizer_ = nullptr;
}
SerialTreeLearner::~SerialTreeLearner() {
......@@ -60,6 +61,11 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
ordered_gradients_.resize(num_data_);
ordered_hessians_.resize(num_data_);
if (config_->use_quantized_grad) {
gradient_discretizer_.reset(new GradientDiscretizer(config_->num_grad_quant_bins, config_->num_iterations, config_->seed, is_constant_hessian, config_->stochastic_rounding));
gradient_discretizer_->Init(num_data_, config_->num_leaves, num_features_, train_data_);
}
GetShareStates(train_data_, is_constant_hessian, true);
histogram_pool_.DynamicChangeSize(train_data_,
share_state_->num_hist_total_bin(),
......@@ -76,17 +82,31 @@ void SerialTreeLearner::GetShareStates(const Dataset* dataset,
bool is_constant_hessian,
bool is_first_time) {
if (is_first_time) {
if (config_->use_quantized_grad) {
share_state_.reset(dataset->GetShareStates<true, 32>(
reinterpret_cast<score_t*>(gradient_discretizer_->ordered_int_gradients_and_hessians()), nullptr,
col_sampler_.is_feature_used_bytree(), is_constant_hessian,
config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins));
} else {
share_state_.reset(dataset->GetShareStates<false, 0>(
ordered_gradients_.data(), ordered_hessians_.data(),
col_sampler_.is_feature_used_bytree(), is_constant_hessian,
config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins));
}
} else {
CHECK_NOTNULL(share_state_);
// cannot change is_hist_col_wise during training
if (config_->use_quantized_grad) {
share_state_.reset(dataset->GetShareStates<true, 32>(
reinterpret_cast<score_t*>(gradient_discretizer_->ordered_int_gradients_and_hessians()), nullptr,
col_sampler_.is_feature_used_bytree(), is_constant_hessian,
share_state_->is_col_wise, !share_state_->is_col_wise, config_->num_grad_quant_bins));
} else {
share_state_.reset(dataset->GetShareStates<false, 0>(
ordered_gradients_.data(), ordered_hessians_.data(), col_sampler_.is_feature_used_bytree(),
is_constant_hessian, share_state_->is_col_wise,
!share_state_->is_col_wise, config_->num_grad_quant_bins));
}
}
}
CHECK_NOTNULL(share_state_);
}
......@@ -169,6 +189,10 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
}
share_state_->num_threads = num_threads;
if (config_->use_quantized_grad) {
gradient_discretizer_->DiscretizeGradients(num_data_, gradients_, hessians_);
}
// some initial works before training
BeforeTrain();
......@@ -205,6 +229,11 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf));
}
if (config_->use_quantized_grad && config_->quant_train_renew_leaf) {
gradient_discretizer_->RenewIntGradTreeOutput(tree.get(), config_, data_partition_.get(), gradients_, hessians_,
[this] (int leaf_index) { return GetGlobalDataCountInLeaf(leaf_index); });
}
Log::Debug("Trained a tree with leaves = %d and depth = %d", tree->num_leaves(), cur_depth);
return tree.release();
}
......@@ -270,11 +299,25 @@ void SerialTreeLearner::BeforeTrain() {
// Sumup for root
if (data_partition_->leaf_count(0) == num_data_) {
// use all data
if (!config_->use_quantized_grad) {
smaller_leaf_splits_->Init(gradients_, hessians_);
} else {
smaller_leaf_splits_->Init(
gradient_discretizer_->discretized_gradients_and_hessians(),
gradient_discretizer_->grad_scale(),
gradient_discretizer_->hess_scale());
}
} else {
// use bagging, only use part of data
if (!config_->use_quantized_grad) {
smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_);
} else {
smaller_leaf_splits_->Init(
0, data_partition_.get(),
gradient_discretizer_->discretized_gradients_and_hessians(),
gradient_discretizer_->grad_scale(),
gradient_discretizer_->hess_scale());
}
}
larger_leaf_splits_->Init();
......@@ -282,6 +325,10 @@ void SerialTreeLearner::BeforeTrain() {
if (cegb_ != nullptr) {
cegb_->BeforeTrain();
}
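// presumably: pick the histogram accumulator bit width (16 vs 32 bits) for the
// root leaf from its local data count; the data-parallel learner is excluded
// here because it needs to decide bit widths from globally synced data counts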
if (config_->use_quantized_grad && config_->tree_learner != std::string("data")) {
gradient_discretizer_->SetNumBitsInHistogramBin<false>(0, -1, data_partition_->leaf_count(0), 0);
}
}
bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) {
......@@ -353,9 +400,53 @@ void SerialTreeLearner::ConstructHistograms(
Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms",
global_timer);
// construct smaller leaf
if (config_->use_quantized_grad) {
const uint8_t smaller_leaf_num_bits = gradient_discretizer_->GetHistBitsInLeaf<false>(smaller_leaf_splits_->leaf_index());
hist_t* ptr_smaller_leaf_hist_data =
smaller_leaf_num_bits <= 16 ?
reinterpret_cast<hist_t*>(smaller_leaf_histogram_array_[0].RawDataInt16() - kHistOffset) :
reinterpret_cast<hist_t*>(smaller_leaf_histogram_array_[0].RawDataInt32() - kHistOffset);
#define SMALLER_LEAF_ARGS \
is_feature_used, smaller_leaf_splits_->data_indices(), \
smaller_leaf_splits_->num_data_in_leaf(), \
reinterpret_cast<const score_t*>(gradient_discretizer_->discretized_gradients_and_hessians()), \
nullptr, \
reinterpret_cast<score_t*>(gradient_discretizer_->ordered_int_gradients_and_hessians()), \
nullptr, \
share_state_.get(), \
reinterpret_cast<hist_t*>(ptr_smaller_leaf_hist_data)
if (smaller_leaf_num_bits <= 16) {
train_data_->ConstructHistograms<true, 16>(SMALLER_LEAF_ARGS);
} else {
train_data_->ConstructHistograms<true, 32>(SMALLER_LEAF_ARGS);
}
#undef SMALLER_LEAF_ARGS
if (larger_leaf_histogram_array_ && !use_subtract) {
const uint8_t larger_leaf_num_bits = gradient_discretizer_->GetHistBitsInLeaf<false>(larger_leaf_splits_->leaf_index());
hist_t* ptr_larger_leaf_hist_data =
larger_leaf_num_bits <= 16 ?
reinterpret_cast<hist_t*>(larger_leaf_histogram_array_[0].RawDataInt16() - kHistOffset) :
reinterpret_cast<hist_t*>(larger_leaf_histogram_array_[0].RawDataInt32() - kHistOffset);
#define LARGER_LEAF_ARGS \
is_feature_used, larger_leaf_splits_->data_indices(), \
larger_leaf_splits_->num_data_in_leaf(), \
reinterpret_cast<const score_t*>(gradient_discretizer_->discretized_gradients_and_hessians()), \
nullptr, \
reinterpret_cast<score_t*>(gradient_discretizer_->ordered_int_gradients_and_hessians()), \
nullptr, \
share_state_.get(), \
reinterpret_cast<hist_t*>(ptr_larger_leaf_hist_data)
if (larger_leaf_num_bits <= 16) {
train_data_->ConstructHistograms<true, 16>(LARGER_LEAF_ARGS);
} else {
train_data_->ConstructHistograms<true, 32>(LARGER_LEAF_ARGS);
}
#undef LARGER_LEAF_ARGS
}
} else {
hist_t* ptr_smaller_leaf_hist_data =
smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
train_data_->ConstructHistograms<false, 0>(
is_feature_used, smaller_leaf_splits_->data_indices(),
smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(),
......@@ -364,12 +455,13 @@ void SerialTreeLearner::ConstructHistograms(
// construct larger leaf
hist_t* ptr_larger_leaf_hist_data =
larger_leaf_histogram_array_[0].RawData() - kHistOffset;
train_data_->ConstructHistograms<false, 0>(
is_feature_used, larger_leaf_splits_->data_indices(),
larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(),
ptr_larger_leaf_hist_data);
}
}
}
void SerialTreeLearner::FindBestSplitsFromHistograms(
......@@ -388,6 +480,26 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(
if (larger_leaf_splits_->leaf_index() >= 0) {
larger_node_used_features = col_sampler_.GetByNode(tree, larger_leaf_splits_->leaf_index());
}
if (use_subtract && config_->use_quantized_grad) {
const int parent_index = std::min(smaller_leaf_splits_->leaf_index(), larger_leaf_splits_->leaf_index());
const uint8_t parent_hist_bits = gradient_discretizer_->GetHistBitsInNode<false>(parent_index);
const uint8_t larger_hist_bits = gradient_discretizer_->GetHistBitsInLeaf<false>(larger_leaf_splits_->leaf_index());
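// the larger leaf's array still holds the parent's 32-bit histogram, but the
// larger leaf itself only needs 16 bits; stash a per-feature copy so that the
// 32-bit-minus-16-bit subtraction later can emit its result in the narrower
// 16-bit layout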
if (parent_hist_bits > 16 && larger_hist_bits <= 16) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static) num_threads(share_state_->num_threads)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
OMP_LOOP_EX_BEGIN();
if (!is_feature_used[feature_index]) {
continue;
}
larger_leaf_histogram_array_[feature_index].CopyToBuffer(gradient_discretizer_->GetChangeHistBitsBuffer(feature_index));
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
}
}
OMP_INIT_EX();
// find splits
#pragma omp parallel for schedule(static) num_threads(share_state_->num_threads)
......@@ -397,10 +509,24 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(
continue;
}
const int tid = omp_get_thread_num();
if (config_->use_quantized_grad) {
const uint8_t hist_bits_bin = gradient_discretizer_->GetHistBitsInLeaf<false>(smaller_leaf_splits_->leaf_index());
const int64_t int_sum_gradient_and_hessian = smaller_leaf_splits_->int_sum_gradients_and_hessians();
if (hist_bits_bin <= 16) {
train_data_->FixHistogramInt<int32_t, int32_t, 16, 16>(
feature_index, int_sum_gradient_and_hessian,
reinterpret_cast<hist_t*>(smaller_leaf_histogram_array_[feature_index].RawDataInt16()));
} else {
train_data_->FixHistogramInt<int64_t, int64_t, 32, 32>(
feature_index, int_sum_gradient_and_hessian,
reinterpret_cast<hist_t*>(smaller_leaf_histogram_array_[feature_index].RawDataInt32()));
}
} else {
train_data_->FixHistogram(
feature_index, smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
smaller_leaf_histogram_array_[feature_index].RawData());
}
int real_fidx = train_data_->RealFeatureIndex(feature_index);
ComputeBestSplitForFeature(smaller_leaf_histogram_array_, feature_index,
......@@ -417,14 +543,51 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(
}
if (use_subtract) {
if (config_->use_quantized_grad) {
const int parent_index = std::min(smaller_leaf_splits_->leaf_index(), larger_leaf_splits_->leaf_index());
const uint8_t parent_hist_bits = gradient_discretizer_->GetHistBitsInNode<false>(parent_index);
const uint8_t smaller_hist_bits = gradient_discretizer_->GetHistBitsInLeaf<false>(smaller_leaf_splits_->leaf_index());
const uint8_t larger_hist_bits = gradient_discretizer_->GetHistBitsInLeaf<false>(larger_leaf_splits_->leaf_index());
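// four bit-width cases for larger = parent - smaller:
// parent 16-bit: both children fit in 16 bits, subtract in place;
// parent 32-bit but larger 16-bit: subtract from the 32-bit copy stashed in
//   the per-feature buffer earlier, writing the result in 16-bit layout;
// parent 32-bit, smaller 16-bit: widen the smaller histogram on the fly;
// otherwise: plain 32-bit in-place subtraction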
if (parent_hist_bits <= 16) {
CHECK_LE(smaller_hist_bits, 16);
CHECK_LE(larger_hist_bits, 16);
larger_leaf_histogram_array_[feature_index].Subtract<true, int32_t, int32_t, int32_t, 16, 16, 16>(
smaller_leaf_histogram_array_[feature_index]);
} else if (larger_hist_bits <= 16) {
CHECK_LE(smaller_hist_bits, 16);
larger_leaf_histogram_array_[feature_index].Subtract<true, int64_t, int32_t, int32_t, 32, 16, 16>(
smaller_leaf_histogram_array_[feature_index], gradient_discretizer_->GetChangeHistBitsBuffer(feature_index));
} else if (smaller_hist_bits <= 16) {
larger_leaf_histogram_array_[feature_index].Subtract<true, int64_t, int32_t, int64_t, 32, 16, 32>(
smaller_leaf_histogram_array_[feature_index]);
} else {
larger_leaf_histogram_array_[feature_index].Subtract<true, int64_t, int64_t, int64_t, 32, 32, 32>(
smaller_leaf_histogram_array_[feature_index]);
}
} else {
larger_leaf_histogram_array_[feature_index].Subtract<false>(
smaller_leaf_histogram_array_[feature_index]);
}
} else {
if (config_->use_quantized_grad) {
const int64_t int_sum_gradient_and_hessian = larger_leaf_splits_->int_sum_gradients_and_hessians();
const uint8_t hist_bits_bin = gradient_discretizer_->GetHistBitsInLeaf<false>(larger_leaf_splits_->leaf_index());
if (hist_bits_bin <= 16) {
train_data_->FixHistogramInt<int32_t, int32_t, 16, 16>(
feature_index, int_sum_gradient_and_hessian,
reinterpret_cast<hist_t*>(larger_leaf_histogram_array_[feature_index].RawDataInt16()));
} else {
train_data_->FixHistogramInt<int64_t, int64_t, 32, 32>(
feature_index, int_sum_gradient_and_hessian,
reinterpret_cast<hist_t*>(larger_leaf_histogram_array_[feature_index].RawDataInt32()));
}
} else {
train_data_->FixHistogram(
feature_index, larger_leaf_splits_->sum_gradients(),
larger_leaf_splits_->sum_hessians(),
larger_leaf_histogram_array_[feature_index].RawData());
}
}
ComputeBestSplitForFeature(larger_leaf_histogram_array_, feature_index,
real_fidx,
......@@ -699,6 +862,11 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf,
best_split_info.left_sum_hessian,
best_split_info.left_output);
}
if (config_->use_quantized_grad && config_->tree_learner != std::string("data")) {
gradient_discretizer_->SetNumBitsInHistogramBin<false>(*left_leaf, *right_leaf,
data_partition_->leaf_count(*left_leaf),
data_partition_->leaf_count(*right_leaf));
}
auto leaves_need_update = constraints_->Update(
is_numerical_split, *left_leaf, *right_leaf,
best_split_info.monotone_type, best_split_info.right_output,
......@@ -762,9 +930,21 @@ void SerialTreeLearner::ComputeBestSplitForFeature(
train_data_->FeatureNumBin(feature_index));
}
SplitInfo new_split;
if (config_->use_quantized_grad) {
const uint8_t hist_bits_bin = gradient_discretizer_->GetHistBitsInLeaf<false>(leaf_splits->leaf_index());
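// the integer threshold search presumably works on the packed 64-bit
// gradient/hessian sum and uses grad_scale / hess_scale to turn integer
// accumulations back into real-valued split gains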
histogram_array_[feature_index].FindBestThresholdInt(
leaf_splits->int_sum_gradients_and_hessians(),
gradient_discretizer_->grad_scale(),
gradient_discretizer_->hess_scale(),
hist_bits_bin,
hist_bits_bin,
num_data,
constraints_->GetFeatureConstraint(leaf_splits->leaf_index(), feature_index), parent_output, &new_split);
} else {
histogram_array_[feature_index].FindBestThreshold(
leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data,
constraints_->GetFeatureConstraint(leaf_splits->leaf_index(), feature_index), parent_output, &new_split);
}
new_split.feature = real_fidx;
if (cegb_ != nullptr) {
new_split.gain -=
......
......@@ -24,6 +24,7 @@
#include "col_sampler.hpp"
#include "data_partition.hpp"
#include "feature_histogram.hpp"
#include "gradient_discretizer.hpp"
#include "leaf_splits.hpp"
#include "monotone_constraints.hpp"
#include "split_info.hpp"
......@@ -170,6 +171,8 @@ class SerialTreeLearner: public TreeLearner {
std::set<int> FindAllForceFeatures(Json force_split_leaf_setting);
void CheckSplit(const SplitInfo& best_split_info, const int left_leaf_index, const int right_leaf_index);
/*!
* \brief Get the number of data in a leaf
* \param leaf_idx The index of leaf
......@@ -230,6 +233,7 @@ class SerialTreeLearner: public TreeLearner {
const Json* forced_split_json_;
std::unique_ptr<TrainingShareStates> share_state_;
std::unique_ptr<CostEfficientGradientBoosting> cegb_;
std::unique_ptr<GradientDiscretizer> gradient_discretizer_;
};
inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const {
......
......@@ -40,10 +40,14 @@ struct SplitInfo {
double left_sum_gradient = 0;
/*! \brief Left sum hessian after split */
double left_sum_hessian = 0;
/*! \brief Left sum discretized gradient and hessian after split */
int64_t left_sum_gradient_and_hessian = 0;
/*! \brief Right sum gradient after split */
double right_sum_gradient = 0;
/*! \brief Right sum hessian after split */
double right_sum_hessian = 0;
/*! \brief Right sum discretized gradient and hessian after split */
int64_t right_sum_gradient_and_hessian = 0;
std::vector<uint32_t> cat_threshold;
/*! \brief True if default split is left */
bool default_left = true;
......@@ -71,10 +75,14 @@ struct SplitInfo {
buffer += sizeof(left_sum_gradient);
std::memcpy(buffer, &left_sum_hessian, sizeof(left_sum_hessian));
buffer += sizeof(left_sum_hessian);
std::memcpy(buffer, &left_sum_gradient_and_hessian, sizeof(left_sum_gradient_and_hessian));
buffer += sizeof(left_sum_gradient_and_hessian);
std::memcpy(buffer, &right_sum_gradient, sizeof(right_sum_gradient));
buffer += sizeof(right_sum_gradient);
std::memcpy(buffer, &right_sum_hessian, sizeof(right_sum_hessian));
buffer += sizeof(right_sum_hessian);
std::memcpy(buffer, &right_sum_gradient_and_hessian, sizeof(right_sum_gradient_and_hessian));
buffer += sizeof(right_sum_gradient_and_hessian);
std::memcpy(buffer, &default_left, sizeof(default_left));
buffer += sizeof(default_left);
std::memcpy(buffer, &monotone_type, sizeof(monotone_type));
......@@ -103,10 +111,14 @@ struct SplitInfo {
buffer += sizeof(left_sum_gradient);
std::memcpy(&left_sum_hessian, buffer, sizeof(left_sum_hessian));
buffer += sizeof(left_sum_hessian);
std::memcpy(&left_sum_gradient_and_hessian, buffer, sizeof(left_sum_gradient_and_hessian));
buffer += sizeof(left_sum_gradient_and_hessian);
std::memcpy(&right_sum_gradient, buffer, sizeof(right_sum_gradient));
buffer += sizeof(right_sum_gradient);
std::memcpy(&right_sum_hessian, buffer, sizeof(right_sum_hessian));
buffer += sizeof(right_sum_hessian);
std::memcpy(&right_sum_gradient_and_hessian, buffer, sizeof(right_sum_gradient_and_hessian));
buffer += sizeof(right_sum_gradient_and_hessian);
std::memcpy(&default_left, buffer, sizeof(default_left));
buffer += sizeof(default_left);
std::memcpy(&monotone_type, buffer, sizeof(monotone_type));
......
......@@ -1854,3 +1854,44 @@ def test_predict_with_raw_score(task, output, cluster):
if task.endswith('classification'):
pred_proba_raw = model.predict_proba(dX, raw_score=True).compute()
assert_eq(raw_predictions, pred_proba_raw)
def test_distributed_quantized_training(cluster):
with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data(
objective='regression',
output='array'
)
np.savetxt("data_dask.csv", np.hstack([np.array([y]).T, X]), fmt="%f,%f,%f,%f,%f")
params = {
"boosting_type": 'gbdt',
"n_estimators": 50,
"num_leaves": 31,
'use_quantized_grad': True,
'num_grad_quant_bins': 30,
'quant_train_renew_leaf': True,
'verbose': -1,
'force_row_wise': True,
}
quant_dask_regressor = lgb.DaskLGBMRegressor(
client=client,
time_out=5,
**params
)
quant_dask_regressor = quant_dask_regressor.fit(dX, dy, sample_weight=dw)
quant_p1 = quant_dask_regressor.predict(dX)
quant_rmse = np.sqrt(np.mean((quant_p1.compute() - y) ** 2))
params["use_quantized_grad"] = False
dask_regressor = lgb.DaskLGBMRegressor(
client=client,
time_out=5,
**params
)
dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
p1 = dask_regressor.predict(dX)
rmse = np.sqrt(np.mean((p1.compute() - y) ** 2))
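# quantization adds noise, so only require the distributed quantized model to
# land within a loose RMSE margin of the full-precision run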
assert quant_rmse < rmse + 7.0
......@@ -4116,3 +4116,19 @@ def test_train_raises_informative_error_for_params_of_wrong_type():
dtrain = lgb.Dataset(X, label=y)
with pytest.raises(lgb.basic.LightGBMError, match="Parameter early_stopping_round should be of type int, got \"too-many\""):
lgb.train(params, dtrain)
def test_quantized_training():
X, y = make_synthetic_regression()
ds = lgb.Dataset(X, label=y)
bst_params = {'num_leaves': 15, 'verbose': -1, 'seed': 0}
bst = lgb.train(bst_params, ds, num_boost_round=10)
rmse = np.sqrt(np.mean((bst.predict(X) - y) ** 2))
bst_params.update({
'use_quantized_grad': True,
'num_grad_quant_bins': 30,
'quant_train_renew_leaf': True,
})
quant_bst = lgb.train(bst_params, ds, num_boost_round=10)
quant_rmse = np.sqrt(np.mean((quant_bst.predict(X) - y) ** 2))
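# loose tolerance: quantized training should stay close to full precision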
assert quant_rmse < rmse + 6.0
......@@ -306,6 +306,7 @@
<ClInclude Include="..\src\treelearner\parallel_tree_learner.h" />
<ClInclude Include="..\src\treelearner\serial_tree_learner.h" />
<ClInclude Include="..\src\treelearner\split_info.hpp" />
<ClInclude Include="..\src\treelearner\gradient_discretizer.hpp" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\src\application\application.cpp" />
......@@ -341,6 +342,7 @@
<ClCompile Include="..\src\treelearner\serial_tree_learner.cpp" />
<ClCompile Include="..\src\treelearner\tree_learner.cpp" />
<ClCompile Include="..\src\treelearner\voting_parallel_tree_learner.cpp" />
<ClCompile Include="..\src\treelearner\gradient_discretizer.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
......
......@@ -51,6 +51,9 @@
<ClInclude Include="..\src\treelearner\serial_tree_learner.h">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\src\treelearner\gradient_discretizer.hpp">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\src\application\predictor.hpp">
<Filter>src\application</Filter>
</ClInclude>
......@@ -338,5 +341,8 @@
<ClCompile Include="..\src\treelearner\linear_tree_learner.cpp">
<Filter>src\treelearner</Filter>
</ClCompile>
<ClCompile Include="..\src\treelearner\gradient_discretizer.cpp">
<Filter>src\treelearner</Filter>
</ClCompile>
</ItemGroup>
</Project>
\ No newline at end of file