Unverified commit 17ecfab3 authored by shiyu1994, committed by GitHub

Add quantized training (CPU part) (#5800)

* add quantized training (first stage)

* add histogram construction functions for integer gradients

* add stochastic rounding

* update docs

* fix compilation errors by adding template instantiations

* update files for compilation

* fix compilation of gpu version

* initialize gradient discretizer before share states

* add a test case for quantized training

* add quantized training for data distributed training

* Delete origin.pred

* Delete ifelse.pred

* Delete LightGBM_model.txt

* remove useless changes

* fix lint error

* remove debug loggings

* fix mismatch of vector and allocator types

* remove changes in main.cpp

* fix bugs with uninitialized gradient discretizer

* initialize ordered gradients in gradient discretizer

* disable quantized training with gpu and cuda

fix msvc compilation errors and warnings

* fix bug in data parallel tree learner

* make quantized training test deterministic

* make quantized training in test case more accurate

* refactor test_quantized_training

* fix leaf splits initialization with quantized training

* check distributed quantized training result
parent a97c444b
...@@ -48,6 +48,7 @@ OBJECTS = \ ...@@ -48,6 +48,7 @@ OBJECTS = \
treelearner/data_parallel_tree_learner.o \ treelearner/data_parallel_tree_learner.o \
treelearner/feature_parallel_tree_learner.o \ treelearner/feature_parallel_tree_learner.o \
treelearner/gpu_tree_learner.o \ treelearner/gpu_tree_learner.o \
treelearner/gradient_discretizer.o \
treelearner/linear_tree_learner.o \ treelearner/linear_tree_learner.o \
treelearner/serial_tree_learner.o \ treelearner/serial_tree_learner.o \
treelearner/tree_learner.o \ treelearner/tree_learner.o \
......
...@@ -49,6 +49,7 @@ OBJECTS = \ ...@@ -49,6 +49,7 @@ OBJECTS = \
treelearner/data_parallel_tree_learner.o \ treelearner/data_parallel_tree_learner.o \
treelearner/feature_parallel_tree_learner.o \ treelearner/feature_parallel_tree_learner.o \
treelearner/gpu_tree_learner.o \ treelearner/gpu_tree_learner.o \
treelearner/gradient_discretizer.o \
treelearner/linear_tree_learner.o \ treelearner/linear_tree_learner.o \
treelearner/serial_tree_learner.o \ treelearner/serial_tree_learner.o \
treelearner/tree_learner.o \ treelearner/tree_learner.o \
......
...@@ -658,6 +658,38 @@ Learning Control Parameters ...@@ -658,6 +658,38 @@ Learning Control Parameters
- **Note**: can be used only in CLI version - **Note**: can be used only in CLI version
- ``use_quantized_grad`` :raw-html:`<a id="use_quantized_grad" title="Permalink to this parameter" href="#use_quantized_grad">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool
- whether to use gradient quantization when training
- enabling this will discretize (quantize) the gradients and hessians into ``num_grad_quant_bins`` bins
- with quantized training, most arithmetic in the training process will use integer operations
- gradient quantization can accelerate training, with little accuracy drop in most cases
- **Note**: can be used only with ``device_type = cpu``
- ``num_grad_quant_bins`` :raw-html:`<a id="num_grad_quant_bins" title="Permalink to this parameter" href="#num_grad_quant_bins">&#x1F517;&#xFE0E;</a>`, default = ``4``, type = int
- number of bins used to quantize gradients and hessians
- with more bins, quantized training will be closer to full precision training
- **Note**: can be used only with ``device_type = cpu``
- ``quant_train_renew_leaf`` :raw-html:`<a id="quant_train_renew_leaf" title="Permalink to this parameter" href="#quant_train_renew_leaf">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool
- whether to renew the leaf values with the original gradients when using quantized training
- renewing is very helpful for the accuracy of quantized training with ranking objectives
- **Note**: can be used only with ``device_type = cpu``
- ``stochastic_rounding`` :raw-html:`<a id="stochastic_rounding" title="Permalink to this parameter" href="#stochastic_rounding">&#x1F517;&#xFE0E;</a>`, default = ``true``, type = bool
- whether to use stochastic rounding in gradient quantization
IO Parameters IO Parameters
------------- -------------
......
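The new parameters above can be illustrated with a small, self-contained sketch. This is a conceptual reconstruction of gradient quantization with stochastic rounding; the function name, the scaling rule, and the RNG choice are assumptions made for the example, not the implementation added by this commit.

```cpp
#include <cmath>
#include <cstdint>
#include <random>

// Quantize one gradient value into a signed integer bin, assuming
// num_grad_quant_bins >= 2 and max_abs_gradient > 0.
int8_t QuantizeGradient(double gradient, double max_abs_gradient,
                        int num_grad_quant_bins, std::mt19937* rng) {
  // Scale so that the largest observed gradient maps to the outermost bin.
  const double scale = max_abs_gradient / (num_grad_quant_bins / 2);
  const double scaled = gradient / scale;
  // Stochastic rounding: round up with probability equal to the fractional
  // part, so the rounding error is zero in expectation. This is what
  // stochastic_rounding=true enables; plain rounding would use std::round.
  const double low = std::floor(scaled);
  std::uniform_real_distribution<double> unif(0.0, 1.0);
  const double rounded = (unif(*rng) < scaled - low) ? low + 1.0 : low;
  return static_cast<int8_t>(rounded);
}
```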
...@@ -30,11 +30,14 @@ enum MissingType { ...@@ -30,11 +30,14 @@ enum MissingType {
}; };
typedef double hist_t; typedef double hist_t;
typedef int32_t int_hist_t;
typedef uint64_t hist_cnt_t; typedef uint64_t hist_cnt_t;
// check at compile time // check at compile time
static_assert(sizeof(hist_t) == sizeof(hist_cnt_t), "Histogram entry size is not correct"); static_assert(sizeof(hist_t) == sizeof(hist_cnt_t), "Histogram entry size is not correct");
const size_t kHistEntrySize = 2 * sizeof(hist_t); const size_t kHistEntrySize = 2 * sizeof(hist_t);
const size_t kInt32HistEntrySize = 2 * sizeof(int_hist_t);
const size_t kInt16HistEntrySize = 2 * sizeof(int16_t);
const int kHistOffset = 2; const int kHistOffset = 2;
const double kSparseThreshold = 0.7; const double kSparseThreshold = 0.7;
...@@ -56,6 +59,28 @@ inline static void HistogramSumReducer(const char* src, char* dst, int type_size ...@@ -56,6 +59,28 @@ inline static void HistogramSumReducer(const char* src, char* dst, int type_size
} }
} }
inline static void Int32HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
const int64_t* src_ptr = reinterpret_cast<const int64_t*>(src);
int64_t* dst_ptr = reinterpret_cast<int64_t*>(dst);
const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2);
const int num_threads = OMP_NUM_THREADS();
#pragma omp parallel for schedule(static) num_threads(num_threads)
for (comm_size_t i = 0; i < steps; ++i) {
dst_ptr[i] += src_ptr[i];
}
}
inline static void Int16HistogramSumReducer(const char* src, char* dst, int type_size, comm_size_t len) {
const int32_t* src_ptr = reinterpret_cast<const int32_t*>(src);
int32_t* dst_ptr = reinterpret_cast<int32_t*>(dst);
const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2);
const int num_threads = OMP_NUM_THREADS();
#pragma omp parallel for schedule(static) num_threads(num_threads)
for (comm_size_t i = 0; i < steps; ++i) {
dst_ptr[i] += src_ptr[i];
}
}
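The reducers above add packed histogram entries word by word. Below is a minimal sketch of the packed layout they assume (quantized gradient sum in the high half, hessian sum in the low half, so one integer addition accumulates both); the helper names are illustrative and not part of this diff.

```cpp
#include <cstdint>

// Pack a 32-bit gradient sum and a 32-bit (non-negative) hessian sum into one
// 64-bit word. Because the hessian half is non-negative and does not overflow
// 32 bits, adding two packed words adds both halves independently, which is
// why the reducers above can sum whole int64/int32 words directly.
inline int64_t PackGradHess32(int32_t grad, int32_t hess) {
  return (static_cast<int64_t>(grad) << 32) | static_cast<uint32_t>(hess);
}

inline int32_t UnpackGrad32(int64_t packed) {
  return static_cast<int32_t>(packed >> 32);
}

inline int32_t UnpackHess32(int64_t packed) {
  return static_cast<int32_t>(packed & 0xffffffffLL);
}
```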
/*! \brief This class used to convert feature values into bin, /*! \brief This class used to convert feature values into bin,
* and store some meta information for bin*/ * and store some meta information for bin*/
class BinMapper { class BinMapper {
...@@ -332,6 +357,33 @@ class Bin { ...@@ -332,6 +357,33 @@ class Bin {
const score_t* ordered_gradients, const score_t* ordered_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const = 0; hist_t* out) const = 0;
virtual void ConstructHistogramInt8(
const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const = 0;
virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const = 0;
virtual void ConstructHistogramInt16(
const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const = 0;
virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const = 0;
virtual void ConstructHistogramInt32(
const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const = 0;
virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
const score_t* ordered_gradients, const score_t* ordered_hessians,
hist_t* out) const = 0;
/*! /*!
* \brief Construct histogram of this feature, * \brief Construct histogram of this feature,
* Note: We use ordered_gradients and ordered_hessians to improve cache hit chance * Note: We use ordered_gradients and ordered_hessians to improve cache hit chance
...@@ -351,6 +403,24 @@ class Bin { ...@@ -351,6 +403,24 @@ class Bin {
virtual void ConstructHistogram(data_size_t start, data_size_t end, virtual void ConstructHistogram(data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0; const score_t* ordered_gradients, hist_t* out) const = 0;
virtual void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;
virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;
virtual void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;
virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;
virtual void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;
virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
const score_t* ordered_gradients, hist_t* out) const = 0;
virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin,
uint32_t default_bin, uint32_t most_freq_bin, uint32_t default_bin, uint32_t most_freq_bin,
MissingType missing_type, bool default_left, MissingType missing_type, bool default_left,
...@@ -464,6 +534,57 @@ class MultiValBin { ...@@ -464,6 +534,57 @@ class MultiValBin {
const score_t* ordered_hessians, const score_t* ordered_hessians,
hist_t* out) const = 0; hist_t* out) const = 0;
virtual void ConstructHistogramInt32(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* gradients,
const score_t* hessians,
hist_t* out) const = 0;
virtual void ConstructHistogramInt32(data_size_t start, data_size_t end,
const score_t* gradients,
const score_t* hessians,
hist_t* out) const = 0;
virtual void ConstructHistogramOrderedInt32(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* ordered_gradients,
const score_t* ordered_hessians,
hist_t* out) const = 0;
virtual void ConstructHistogramInt16(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* gradients,
const score_t* hessians,
hist_t* out) const = 0;
virtual void ConstructHistogramInt16(data_size_t start, data_size_t end,
const score_t* gradients,
const score_t* hessians,
hist_t* out) const = 0;
virtual void ConstructHistogramOrderedInt16(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* ordered_gradients,
const score_t* ordered_hessians,
hist_t* out) const = 0;
virtual void ConstructHistogramInt8(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* gradients,
const score_t* hessians,
hist_t* out) const = 0;
virtual void ConstructHistogramInt8(data_size_t start, data_size_t end,
const score_t* gradients,
const score_t* hessians,
hist_t* out) const = 0;
virtual void ConstructHistogramOrderedInt8(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* ordered_gradients,
const score_t* ordered_hessians,
hist_t* out) const = 0;
virtual void FinishLoad() = 0; virtual void FinishLoad() = 0;
virtual bool IsSparse() = 0; virtual bool IsSparse() = 0;
......
...@@ -592,6 +592,30 @@ struct Config { ...@@ -592,6 +592,30 @@ struct Config {
// desc = **Note**: can be used only in CLI version // desc = **Note**: can be used only in CLI version
int snapshot_freq = -1; int snapshot_freq = -1;
// [no-save]
// desc = whether to use gradient quantization when training
// desc = enabling this will discretize (quantize) the gradients and hessians into ``num_grad_quant_bins`` bins
// desc = with quantized training, most arithmetic in the training process will use integer operations
// desc = gradient quantization can accelerate training, with little accuracy drop in most cases
// desc = **Note**: can be used only with ``device_type = cpu``
bool use_quantized_grad = false;
// [no-save]
// desc = number of bins used to quantize gradients and hessians
// desc = with more bins, quantized training will be closer to full precision training
// desc = **Note**: can be used only with ``device_type = cpu``
int num_grad_quant_bins = 4;
// [no-save]
// desc = whether to renew the leaf values with the original gradients when using quantized training
// desc = renewing is very helpful for the accuracy of quantized training with ranking objectives
// desc = **Note**: can be used only with ``device_type = cpu``
bool quant_train_renew_leaf = false;
// [no-save]
// desc = whether to use stochastic rounding in gradient quantization
bool stochastic_rounding = true;
#ifndef __NVCC__ #ifndef __NVCC__
#pragma endregion #pragma endregion
......
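Since the new fields are ordinary `Config` members, they can be set through the usual parameter-string path. A minimal sketch, assuming the existing `Config::Str2Map`/`Config::Set` helpers; this usage example is not part of the diff itself.

```cpp
#include <LightGBM/config.h>
#include <iostream>

int main() {
  // Parameter string enabling quantized training with the new options.
  const char* params =
      "use_quantized_grad=true num_grad_quant_bins=4 "
      "quant_train_renew_leaf=true stochastic_rounding=true";
  LightGBM::Config config;
  config.Set(LightGBM::Config::Str2Map(params));
  std::cout << "use_quantized_grad=" << config.use_quantized_grad
            << " num_grad_quant_bins=" << config.num_grad_quant_bins
            << std::endl;
  return 0;
}
```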
...@@ -598,10 +598,11 @@ class Dataset { ...@@ -598,10 +598,11 @@ class Dataset {
MultiValBin* GetMultiBinFromAllFeatures(const std::vector<uint32_t>& offsets) const; MultiValBin* GetMultiBinFromAllFeatures(const std::vector<uint32_t>& offsets) const;
template <bool USE_QUANT_GRAD, int HIST_BITS>
TrainingShareStates* GetShareStates( TrainingShareStates* GetShareStates(
score_t* gradients, score_t* hessians, score_t* gradients, score_t* hessians,
const std::vector<int8_t>& is_feature_used, bool is_constant_hessian, const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
bool force_col_wise, bool force_row_wise) const; bool force_col_wise, bool force_row_wise, const int num_grad_quant_bins) const;
LIGHTGBM_EXPORT void FinishLoad(); LIGHTGBM_EXPORT void FinishLoad();
...@@ -636,7 +637,7 @@ class Dataset { ...@@ -636,7 +637,7 @@ class Dataset {
void InitTrain(const std::vector<int8_t>& is_feature_used, void InitTrain(const std::vector<int8_t>& is_feature_used,
TrainingShareStates* share_state) const; TrainingShareStates* share_state) const;
template <bool USE_INDICES, bool USE_HESSIAN> template <bool USE_INDICES, bool USE_HESSIAN, bool USE_QUANT_GRAD, int HIST_BITS>
void ConstructHistogramsInner(const std::vector<int8_t>& is_feature_used, void ConstructHistogramsInner(const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, data_size_t num_data, const score_t* gradients,
...@@ -646,7 +647,7 @@ class Dataset { ...@@ -646,7 +647,7 @@ class Dataset {
TrainingShareStates* share_state, TrainingShareStates* share_state,
hist_t* hist_data) const; hist_t* hist_data) const;
template <bool USE_INDICES, bool ORDERED> template <bool USE_INDICES, bool ORDERED, bool USE_QUANT_GRAD, int HIST_BITS>
void ConstructHistogramsMultiVal(const data_size_t* data_indices, void ConstructHistogramsMultiVal(const data_size_t* data_indices,
data_size_t num_data, data_size_t num_data,
const score_t* gradients, const score_t* gradients,
...@@ -654,6 +655,7 @@ class Dataset { ...@@ -654,6 +655,7 @@ class Dataset {
TrainingShareStates* share_state, TrainingShareStates* share_state,
hist_t* hist_data) const; hist_t* hist_data) const;
template <bool USE_QUANT_GRAD, int HIST_BITS>
inline void ConstructHistograms( inline void ConstructHistograms(
const std::vector<int8_t>& is_feature_used, const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data, const data_size_t* data_indices, data_size_t num_data,
...@@ -666,21 +668,21 @@ class Dataset { ...@@ -666,21 +668,21 @@ class Dataset {
bool use_indices = data_indices != nullptr && (num_data < num_data_); bool use_indices = data_indices != nullptr && (num_data < num_data_);
if (share_state->is_constant_hessian) { if (share_state->is_constant_hessian) {
if (use_indices) { if (use_indices) {
ConstructHistogramsInner<true, false>( ConstructHistogramsInner<true, false, USE_QUANT_GRAD, HIST_BITS>(
is_feature_used, data_indices, num_data, gradients, hessians, is_feature_used, data_indices, num_data, gradients, hessians,
ordered_gradients, ordered_hessians, share_state, hist_data); ordered_gradients, ordered_hessians, share_state, hist_data);
} else { } else {
ConstructHistogramsInner<false, false>( ConstructHistogramsInner<false, false, USE_QUANT_GRAD, HIST_BITS>(
is_feature_used, data_indices, num_data, gradients, hessians, is_feature_used, data_indices, num_data, gradients, hessians,
ordered_gradients, ordered_hessians, share_state, hist_data); ordered_gradients, ordered_hessians, share_state, hist_data);
} }
} else { } else {
if (use_indices) { if (use_indices) {
ConstructHistogramsInner<true, true>( ConstructHistogramsInner<true, true, USE_QUANT_GRAD, HIST_BITS>(
is_feature_used, data_indices, num_data, gradients, hessians, is_feature_used, data_indices, num_data, gradients, hessians,
ordered_gradients, ordered_hessians, share_state, hist_data); ordered_gradients, ordered_hessians, share_state, hist_data);
} else { } else {
ConstructHistogramsInner<false, true>( ConstructHistogramsInner<false, true, USE_QUANT_GRAD, HIST_BITS>(
is_feature_used, data_indices, num_data, gradients, hessians, is_feature_used, data_indices, num_data, gradients, hessians,
ordered_gradients, ordered_hessians, share_state, hist_data); ordered_gradients, ordered_hessians, share_state, hist_data);
} }
...@@ -689,6 +691,9 @@ class Dataset { ...@@ -689,6 +691,9 @@ class Dataset {
void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const; void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian, hist_t* data) const;
template <typename PACKED_HIST_BIN_T, typename PACKED_HIST_ACC_T, int HIST_BITS_BIN, int HIST_BITS_ACC>
void FixHistogramInt(int feature_idx, int64_t sum_gradient_and_hessian, hist_t* data) const;
inline data_size_t Split(int feature, const uint32_t* threshold, inline data_size_t Split(int feature, const uint32_t* threshold,
int num_threshold, bool default_left, int num_threshold, bool default_left,
const data_size_t* data_indices, const data_size_t* data_indices,
......
...@@ -19,7 +19,7 @@ namespace LightGBM { ...@@ -19,7 +19,7 @@ namespace LightGBM {
class MultiValBinWrapper { class MultiValBinWrapper {
public: public:
MultiValBinWrapper(MultiValBin* bin, data_size_t num_data, MultiValBinWrapper(MultiValBin* bin, data_size_t num_data,
const std::vector<int>& feature_groups_contained); const std::vector<int>& feature_groups_contained, const int num_grad_quant_bins);
bool IsSparse() { bool IsSparse() {
if (multi_val_bin_ != nullptr) { if (multi_val_bin_ != nullptr) {
...@@ -34,15 +34,17 @@ class MultiValBinWrapper { ...@@ -34,15 +34,17 @@ class MultiValBinWrapper {
const data_size_t* bagging_use_indices, const data_size_t* bagging_use_indices,
data_size_t bagging_indices_cnt); data_size_t bagging_indices_cnt);
template <bool USE_QUANT_GRAD, int HIST_BITS, int INNER_HIST_BITS>
void HistMove(const std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>& hist_buf); void HistMove(const std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>& hist_buf);
template <bool USE_QUANT_GRAD, int HIST_BITS, int INNER_HIST_BITS>
void HistMerge(std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf); void HistMerge(std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf);
void ResizeHistBuf(std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf, void ResizeHistBuf(std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf,
MultiValBin* sub_multi_val_bin, MultiValBin* sub_multi_val_bin,
hist_t* origin_hist_data); hist_t* origin_hist_data);
template <bool USE_INDICES, bool ORDERED> template <bool USE_INDICES, bool ORDERED, bool USE_QUANT_GRAD, int HIST_BITS>
void ConstructHistograms(const data_size_t* data_indices, void ConstructHistograms(const data_size_t* data_indices,
data_size_t num_data, data_size_t num_data,
const score_t* gradients, const score_t* gradients,
...@@ -59,55 +61,145 @@ class MultiValBinWrapper { ...@@ -59,55 +61,145 @@ class MultiValBinWrapper {
Threading::BlockInfo<data_size_t>(num_threads_, num_data, min_block_size_, Threading::BlockInfo<data_size_t>(num_threads_, num_data, min_block_size_,
&n_data_block_, &data_block_size_); &n_data_block_, &data_block_size_);
ResizeHistBuf(hist_buf, cur_multi_val_bin, origin_hist_data); ResizeHistBuf(hist_buf, cur_multi_val_bin, origin_hist_data);
const int inner_hist_bits = (data_block_size_ * num_grad_quant_bins_ < 256 && HIST_BITS == 16) ? 8 : HIST_BITS;
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) num_threads(num_threads_) #pragma omp parallel for schedule(static) num_threads(num_threads_)
for (int block_id = 0; block_id < n_data_block_; ++block_id) { for (int block_id = 0; block_id < n_data_block_; ++block_id) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
data_size_t start = block_id * data_block_size_; data_size_t start = block_id * data_block_size_;
data_size_t end = std::min<data_size_t>(start + data_block_size_, num_data); data_size_t end = std::min<data_size_t>(start + data_block_size_, num_data);
ConstructHistogramsForBlock<USE_INDICES, ORDERED>( if (inner_hist_bits == 8) {
cur_multi_val_bin, start, end, data_indices, gradients, hessians, ConstructHistogramsForBlock<USE_INDICES, ORDERED, USE_QUANT_GRAD, 8>(
block_id, hist_buf); cur_multi_val_bin, start, end, data_indices, gradients, hessians,
block_id, hist_buf);
} else {
ConstructHistogramsForBlock<USE_INDICES, ORDERED, USE_QUANT_GRAD, HIST_BITS>(
cur_multi_val_bin, start, end, data_indices, gradients, hessians,
block_id, hist_buf);
}
OMP_LOOP_EX_END(); OMP_LOOP_EX_END();
} }
OMP_THROW_EX(); OMP_THROW_EX();
global_timer.Stop("Dataset::sparse_bin_histogram"); global_timer.Stop("Dataset::sparse_bin_histogram");
global_timer.Start("Dataset::sparse_bin_histogram_merge"); global_timer.Start("Dataset::sparse_bin_histogram_merge");
HistMerge(hist_buf); if (inner_hist_bits == 8) {
HistMerge<USE_QUANT_GRAD, HIST_BITS, 8>(hist_buf);
} else {
HistMerge<USE_QUANT_GRAD, HIST_BITS, HIST_BITS>(hist_buf);
}
global_timer.Stop("Dataset::sparse_bin_histogram_merge"); global_timer.Stop("Dataset::sparse_bin_histogram_merge");
global_timer.Start("Dataset::sparse_bin_histogram_move"); global_timer.Start("Dataset::sparse_bin_histogram_move");
HistMove(*hist_buf); if (inner_hist_bits == 8) {
HistMove<USE_QUANT_GRAD, HIST_BITS, 8>(*hist_buf);
} else {
HistMove<USE_QUANT_GRAD, HIST_BITS, HIST_BITS>(*hist_buf);
}
global_timer.Stop("Dataset::sparse_bin_histogram_move"); global_timer.Stop("Dataset::sparse_bin_histogram_move");
} }
} }
template <bool USE_INDICES, bool ORDERED> template <bool USE_INDICES, bool ORDERED, bool USE_QUANT_GRAD, int HIST_BITS>
void ConstructHistogramsForBlock(const MultiValBin* sub_multi_val_bin, void ConstructHistogramsForBlock(const MultiValBin* sub_multi_val_bin,
data_size_t start, data_size_t end, const data_size_t* data_indices, data_size_t start, data_size_t end, const data_size_t* data_indices,
const score_t* gradients, const score_t* hessians, int block_id, const score_t* gradients, const score_t* hessians, int block_id,
std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf) { std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf) {
hist_t* data_ptr = origin_hist_data_; if (USE_QUANT_GRAD) {
if (block_id == 0) { if (HIST_BITS == 8) {
if (is_use_subcol_) { int8_t* hist_buf_ptr = reinterpret_cast<int8_t*>(hist_buf->data());
data_ptr = hist_buf->data() + hist_buf->size() - 2 * static_cast<size_t>(num_bin_aligned_); int8_t* data_ptr = hist_buf_ptr +
static_cast<size_t>(num_bin_aligned_) * block_id * 2;
std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin_ * kInt8HistBufferEntrySize);
if (USE_INDICES) {
if (ORDERED) {
sub_multi_val_bin->ConstructHistogramOrderedInt8(data_indices, start, end,
gradients, hessians,
reinterpret_cast<hist_t*>(data_ptr));
} else {
sub_multi_val_bin->ConstructHistogramInt8(data_indices, start, end, gradients,
hessians,
reinterpret_cast<hist_t*>(data_ptr));
}
} else {
sub_multi_val_bin->ConstructHistogramInt8(start, end, gradients, hessians,
reinterpret_cast<hist_t*>(data_ptr));
}
} else if (HIST_BITS == 16) {
int16_t* data_ptr = reinterpret_cast<int16_t*>(origin_hist_data_);
int16_t* hist_buf_ptr = reinterpret_cast<int16_t*>(hist_buf->data());
if (block_id == 0) {
if (is_use_subcol_) {
data_ptr = hist_buf_ptr + hist_buf->size() - 2 * static_cast<size_t>(num_bin_aligned_);
}
} else {
data_ptr = hist_buf_ptr +
static_cast<size_t>(num_bin_aligned_) * (block_id - 1) * 2;
}
std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin_ * kInt16HistBufferEntrySize);
if (USE_INDICES) {
if (ORDERED) {
sub_multi_val_bin->ConstructHistogramOrderedInt16(data_indices, start, end,
gradients, hessians,
reinterpret_cast<hist_t*>(data_ptr));
} else {
sub_multi_val_bin->ConstructHistogramInt16(data_indices, start, end, gradients,
hessians,
reinterpret_cast<hist_t*>(data_ptr));
}
} else {
sub_multi_val_bin->ConstructHistogramInt16(start, end, gradients, hessians,
reinterpret_cast<hist_t*>(data_ptr));
}
} else {
int32_t* data_ptr = reinterpret_cast<int32_t*>(origin_hist_data_);
int32_t* hist_buf_ptr = reinterpret_cast<int32_t*>(hist_buf->data());
if (block_id == 0) {
if (is_use_subcol_) {
data_ptr = hist_buf_ptr + hist_buf->size() - 2 * static_cast<size_t>(num_bin_aligned_);
}
} else {
data_ptr = hist_buf_ptr +
static_cast<size_t>(num_bin_aligned_) * (block_id - 1) * 2;
}
std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin_ * kInt32HistBufferEntrySize);
if (USE_INDICES) {
if (ORDERED) {
sub_multi_val_bin->ConstructHistogramOrderedInt32(data_indices, start, end,
gradients, hessians,
reinterpret_cast<hist_t*>(data_ptr));
} else {
sub_multi_val_bin->ConstructHistogramInt32(data_indices, start, end, gradients,
hessians,
reinterpret_cast<hist_t*>(data_ptr));
}
} else {
sub_multi_val_bin->ConstructHistogramInt32(start, end, gradients, hessians,
reinterpret_cast<hist_t*>(data_ptr));
}
} }
} else { } else {
data_ptr = hist_buf->data() + hist_t* data_ptr = origin_hist_data_;
static_cast<size_t>(num_bin_aligned_) * (block_id - 1) * 2; if (block_id == 0) {
} if (is_use_subcol_) {
std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin_ * kHistBufferEntrySize); data_ptr = hist_buf->data() + hist_buf->size() - 2 * static_cast<size_t>(num_bin_aligned_);
if (USE_INDICES) { }
if (ORDERED) {
sub_multi_val_bin->ConstructHistogramOrdered(data_indices, start, end,
gradients, hessians, data_ptr);
} else { } else {
sub_multi_val_bin->ConstructHistogram(data_indices, start, end, gradients, data_ptr = hist_buf->data() +
hessians, data_ptr); static_cast<size_t>(num_bin_aligned_) * (block_id - 1) * 2;
}
std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin_ * kHistBufferEntrySize);
if (USE_INDICES) {
if (ORDERED) {
sub_multi_val_bin->ConstructHistogramOrdered(data_indices, start, end,
gradients, hessians, data_ptr);
} else {
sub_multi_val_bin->ConstructHistogram(data_indices, start, end, gradients,
hessians, data_ptr);
}
} else {
sub_multi_val_bin->ConstructHistogram(start, end, gradients, hessians,
data_ptr);
} }
} else {
sub_multi_val_bin->ConstructHistogram(start, end, gradients, hessians,
data_ptr);
} }
} }
...@@ -162,10 +254,14 @@ class MultiValBinWrapper { ...@@ -162,10 +254,14 @@ class MultiValBinWrapper {
int data_block_size_; int data_block_size_;
int min_block_size_; int min_block_size_;
int num_data_; int num_data_;
int num_grad_quant_bins_;
hist_t* origin_hist_data_; hist_t* origin_hist_data_;
const size_t kHistBufferEntrySize = 2 * sizeof(hist_t); const size_t kHistBufferEntrySize = 2 * sizeof(hist_t);
const size_t kInt32HistBufferEntrySize = 2 * sizeof(int32_t);
const size_t kInt16HistBufferEntrySize = 2 * sizeof(int16_t);
const size_t kInt8HistBufferEntrySize = 2 * sizeof(int8_t);
}; };
struct TrainingShareStates { struct TrainingShareStates {
...@@ -193,7 +289,7 @@ struct TrainingShareStates { ...@@ -193,7 +289,7 @@ struct TrainingShareStates {
void SetMultiValBin(MultiValBin* bin, data_size_t num_data, void SetMultiValBin(MultiValBin* bin, data_size_t num_data,
const std::vector<std::unique_ptr<FeatureGroup>>& feature_groups, const std::vector<std::unique_ptr<FeatureGroup>>& feature_groups,
bool dense_only, bool sparse_only); bool dense_only, bool sparse_only, const int num_grad_quant_bins);
void CalcBinOffsets(const std::vector<std::unique_ptr<FeatureGroup>>& feature_groups, void CalcBinOffsets(const std::vector<std::unique_ptr<FeatureGroup>>& feature_groups,
std::vector<uint32_t>* offsets, bool is_col_wise); std::vector<uint32_t>* offsets, bool is_col_wise);
...@@ -210,14 +306,14 @@ struct TrainingShareStates { ...@@ -210,14 +306,14 @@ struct TrainingShareStates {
} }
} }
template <bool USE_INDICES, bool ORDERED> template <bool USE_INDICES, bool ORDERED, bool USE_QUANT_GRAD, int HIST_BITS>
void ConstructHistograms(const data_size_t* data_indices, void ConstructHistograms(const data_size_t* data_indices,
data_size_t num_data, data_size_t num_data,
const score_t* gradients, const score_t* gradients,
const score_t* hessians, const score_t* hessians,
hist_t* hist_data) { hist_t* hist_data) {
if (multi_val_bin_wrapper_ != nullptr) { if (multi_val_bin_wrapper_ != nullptr) {
multi_val_bin_wrapper_->ConstructHistograms<USE_INDICES, ORDERED>( multi_val_bin_wrapper_->ConstructHistograms<USE_INDICES, ORDERED, USE_QUANT_GRAD, HIST_BITS>(
data_indices, num_data, gradients, hessians, &hist_buf_, hist_data); data_indices, num_data, gradients, hessians, &hist_buf_, hist_data);
} }
} }
......
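One detail worth calling out in the hunk above is the inner_hist_bits selection: when a single data block can only contribute small per-bin sums, the per-thread block histograms are kept in 8-bit packed entries and widened to 16 bits at merge time. Below is a sketch of the selection rule, with the per-sample value range stated as an assumption (it is not spelled out in this hunk).

```cpp
// Mirrors the condition used in ConstructHistograms above. Assuming each
// quantized per-sample value has magnitude at most num_grad_quant_bins / 2,
// data_block_size * num_grad_quant_bins < 256 bounds the per-block, per-bin
// sum magnitude below 128, so it fits a signed 8-bit accumulator.
int ChooseInnerHistBits(int data_block_size, int num_grad_quant_bins,
                        int hist_bits) {
  return (data_block_size * num_grad_quant_bins < 256 && hist_bits == 16)
             ? 8
             : hist_bits;
}
```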
...@@ -378,6 +378,10 @@ void Config::CheckParamConflict() { ...@@ -378,6 +378,10 @@ void Config::CheckParamConflict() {
if (deterministic) { if (deterministic) {
Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic."); Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic.");
} }
if (use_quantized_grad) {
Log::Warning("Quantized training is not supported by GPU tree learner. Switch to full precision training.");
use_quantized_grad = false;
}
} else if (device_type == std::string("cuda")) { } else if (device_type == std::string("cuda")) {
// force row-wise for cuda version // force row-wise for cuda version
force_col_wise = false; force_col_wise = false;
...@@ -385,6 +389,10 @@ void Config::CheckParamConflict() { ...@@ -385,6 +389,10 @@ void Config::CheckParamConflict() {
if (deterministic) { if (deterministic) {
Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic."); Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic.");
} }
if (use_quantized_grad) {
Log::Warning("Quantized training is not supported by CUDA tree learner. Switch to full precision training.");
use_quantized_grad = false;
}
} }
// linear tree learner must be serial type and run on CPU device // linear tree learner must be serial type and run on CPU device
if (linear_tree) { if (linear_tree) {
......
...@@ -251,6 +251,10 @@ const std::unordered_set<std::string>& Config::parameter_set() { ...@@ -251,6 +251,10 @@ const std::unordered_set<std::string>& Config::parameter_set() {
"output_model", "output_model",
"saved_feature_importance_type", "saved_feature_importance_type",
"snapshot_freq", "snapshot_freq",
"use_quantized_grad",
"num_grad_quant_bins",
"quant_train_renew_leaf",
"stochastic_rounding",
"linear_tree", "linear_tree",
"max_bin", "max_bin",
"max_bin_by_feature", "max_bin_by_feature",
...@@ -493,6 +497,14 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str ...@@ -493,6 +497,14 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
GetInt(params, "snapshot_freq", &snapshot_freq); GetInt(params, "snapshot_freq", &snapshot_freq);
GetBool(params, "use_quantized_grad", &use_quantized_grad);
GetInt(params, "num_grad_quant_bins", &num_grad_quant_bins);
GetBool(params, "quant_train_renew_leaf", &quant_train_renew_leaf);
GetBool(params, "stochastic_rounding", &stochastic_rounding);
GetBool(params, "linear_tree", &linear_tree); GetBool(params, "linear_tree", &linear_tree);
GetInt(params, "max_bin", &max_bin); GetInt(params, "max_bin", &max_bin);
...@@ -828,6 +840,10 @@ const std::unordered_map<std::string, std::vector<std::string>>& Config::paramet ...@@ -828,6 +840,10 @@ const std::unordered_map<std::string, std::vector<std::string>>& Config::paramet
{"output_model", {"model_output", "model_out"}}, {"output_model", {"model_output", "model_out"}},
{"saved_feature_importance_type", {}}, {"saved_feature_importance_type", {}},
{"snapshot_freq", {"save_period"}}, {"snapshot_freq", {"save_period"}},
{"use_quantized_grad", {}},
{"num_grad_quant_bins", {}},
{"quant_train_renew_leaf", {}},
{"stochastic_rounding", {}},
{"linear_tree", {"linear_trees"}}, {"linear_tree", {"linear_trees"}},
{"max_bin", {"max_bins"}}, {"max_bin", {"max_bins"}},
{"max_bin_by_feature", {}}, {"max_bin_by_feature", {}},
...@@ -966,6 +982,10 @@ const std::unordered_map<std::string, std::string>& Config::ParameterTypes() { ...@@ -966,6 +982,10 @@ const std::unordered_map<std::string, std::string>& Config::ParameterTypes() {
{"output_model", "string"}, {"output_model", "string"},
{"saved_feature_importance_type", "int"}, {"saved_feature_importance_type", "int"},
{"snapshot_freq", "int"}, {"snapshot_freq", "int"},
{"use_quantized_grad", "bool"},
{"num_grad_quant_bins", "int"},
{"quant_train_renew_leaf", "bool"},
{"stochastic_rounding", "bool"},
{"linear_tree", "bool"}, {"linear_tree", "bool"},
{"max_bin", "int"}, {"max_bin", "int"},
{"max_bin_by_feature", "vector<int>"}, {"max_bin_by_feature", "vector<int>"},
......
...@@ -608,10 +608,12 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector<uint32_t>& of ...@@ -608,10 +608,12 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector<uint32_t>& of
return ret.release(); return ret.release();
} }
template <bool USE_QUANT_GRAD, int HIST_BITS>
TrainingShareStates* Dataset::GetShareStates( TrainingShareStates* Dataset::GetShareStates(
score_t* gradients, score_t* hessians, score_t* gradients, score_t* hessians,
const std::vector<int8_t>& is_feature_used, bool is_constant_hessian, const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
bool force_col_wise, bool force_row_wise) const { bool force_col_wise, bool force_row_wise,
const int num_grad_quant_bins) const {
Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod", Common::FunctionTimer fun_timer("Dataset::TestMultiThreadingMethod",
global_timer); global_timer);
if (force_col_wise && force_row_wise) { if (force_col_wise && force_row_wise) {
...@@ -631,7 +633,7 @@ TrainingShareStates* Dataset::GetShareStates( ...@@ -631,7 +633,7 @@ TrainingShareStates* Dataset::GetShareStates(
share_state->CalcBinOffsets( share_state->CalcBinOffsets(
feature_groups_, &offsets, true); feature_groups_, &offsets, true);
share_state->SetMultiValBin(GetMultiBinFromSparseFeatures(offsets), share_state->SetMultiValBin(GetMultiBinFromSparseFeatures(offsets),
num_data_, feature_groups_, false, true); num_data_, feature_groups_, false, true, num_grad_quant_bins);
share_state->is_col_wise = true; share_state->is_col_wise = true;
share_state->is_constant_hessian = is_constant_hessian; share_state->is_constant_hessian = is_constant_hessian;
return share_state; return share_state;
...@@ -641,7 +643,7 @@ TrainingShareStates* Dataset::GetShareStates( ...@@ -641,7 +643,7 @@ TrainingShareStates* Dataset::GetShareStates(
share_state->CalcBinOffsets( share_state->CalcBinOffsets(
feature_groups_, &offsets, false); feature_groups_, &offsets, false);
share_state->SetMultiValBin(GetMultiBinFromAllFeatures(offsets), num_data_, share_state->SetMultiValBin(GetMultiBinFromAllFeatures(offsets), num_data_,
feature_groups_, false, false); feature_groups_, false, false, num_grad_quant_bins);
share_state->is_col_wise = false; share_state->is_col_wise = false;
share_state->is_constant_hessian = is_constant_hessian; share_state->is_constant_hessian = is_constant_hessian;
return share_state; return share_state;
...@@ -658,14 +660,14 @@ TrainingShareStates* Dataset::GetShareStates( ...@@ -658,14 +660,14 @@ TrainingShareStates* Dataset::GetShareStates(
std::vector<uint32_t> col_wise_offsets; std::vector<uint32_t> col_wise_offsets;
col_wise_state->CalcBinOffsets(feature_groups_, &col_wise_offsets, true); col_wise_state->CalcBinOffsets(feature_groups_, &col_wise_offsets, true);
col_wise_state->SetMultiValBin(GetMultiBinFromSparseFeatures(col_wise_offsets), num_data_, col_wise_state->SetMultiValBin(GetMultiBinFromSparseFeatures(col_wise_offsets), num_data_,
feature_groups_, false, true); feature_groups_, false, true, num_grad_quant_bins);
col_wise_init_time = std::chrono::steady_clock::now() - start_time; col_wise_init_time = std::chrono::steady_clock::now() - start_time;
start_time = std::chrono::steady_clock::now(); start_time = std::chrono::steady_clock::now();
std::vector<uint32_t> row_wise_offsets; std::vector<uint32_t> row_wise_offsets;
row_wise_state->CalcBinOffsets(feature_groups_, &row_wise_offsets, false); row_wise_state->CalcBinOffsets(feature_groups_, &row_wise_offsets, false);
row_wise_state->SetMultiValBin(GetMultiBinFromAllFeatures(row_wise_offsets), num_data_, row_wise_state->SetMultiValBin(GetMultiBinFromAllFeatures(row_wise_offsets), num_data_,
feature_groups_, false, false); feature_groups_, false, false, num_grad_quant_bins);
row_wise_init_time = std::chrono::steady_clock::now() - start_time; row_wise_init_time = std::chrono::steady_clock::now() - start_time;
uint64_t max_total_bin = std::max<uint64_t>(row_wise_state->num_hist_total_bin(), uint64_t max_total_bin = std::max<uint64_t>(row_wise_state->num_hist_total_bin(),
...@@ -685,12 +687,12 @@ TrainingShareStates* Dataset::GetShareStates( ...@@ -685,12 +687,12 @@ TrainingShareStates* Dataset::GetShareStates(
InitTrain(is_feature_used, row_wise_state.get()); InitTrain(is_feature_used, row_wise_state.get());
std::chrono::duration<double, std::milli> col_wise_time, row_wise_time; std::chrono::duration<double, std::milli> col_wise_time, row_wise_time;
start_time = std::chrono::steady_clock::now(); start_time = std::chrono::steady_clock::now();
ConstructHistograms(is_feature_used, nullptr, num_data_, gradients, ConstructHistograms<USE_QUANT_GRAD, HIST_BITS>(is_feature_used, nullptr, num_data_, gradients,
hessians, gradients, hessians, col_wise_state.get(), hessians, gradients, hessians, col_wise_state.get(),
hist_data.data()); hist_data.data());
col_wise_time = std::chrono::steady_clock::now() - start_time; col_wise_time = std::chrono::steady_clock::now() - start_time;
start_time = std::chrono::steady_clock::now(); start_time = std::chrono::steady_clock::now();
ConstructHistograms(is_feature_used, nullptr, num_data_, gradients, ConstructHistograms<USE_QUANT_GRAD, HIST_BITS>(is_feature_used, nullptr, num_data_, gradients,
hessians, gradients, hessians, row_wise_state.get(), hessians, gradients, hessians, row_wise_state.get(),
hist_data.data()); hist_data.data());
row_wise_time = std::chrono::steady_clock::now() - start_time; row_wise_time = std::chrono::steady_clock::now() - start_time;
...@@ -721,6 +723,24 @@ TrainingShareStates* Dataset::GetShareStates( ...@@ -721,6 +723,24 @@ TrainingShareStates* Dataset::GetShareStates(
} }
} }
template TrainingShareStates* Dataset::GetShareStates<false, 0>(
score_t* gradients, score_t* hessians,
const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
bool force_col_wise, bool force_row_wise,
const int num_grad_quant_bins) const;
template TrainingShareStates* Dataset::GetShareStates<true, 16>(
score_t* gradients, score_t* hessians,
const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
bool force_col_wise, bool force_row_wise,
const int num_grad_quant_bins) const;
template TrainingShareStates* Dataset::GetShareStates<true, 32>(
score_t* gradients, score_t* hessians,
const std::vector<int8_t>& is_feature_used, bool is_constant_hessian,
bool force_col_wise, bool force_row_wise,
const int num_grad_quant_bins) const;
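A hypothetical caller-side sketch of how the three instantiations above would be selected. Whether the learner uses 16- or 32-bit accumulators is decided elsewhere in the PR (not in this hunk), so the `use_16bit_hist` flag here is an assumed input, and the helper itself is illustrative rather than the actual call site.

```cpp
#include <LightGBM/config.h>
#include <LightGBM/dataset.h>
#include <vector>

using LightGBM::Config;
using LightGBM::Dataset;
using LightGBM::TrainingShareStates;
using LightGBM::score_t;

TrainingShareStates* GetShareStatesForConfig(
    const Dataset* train_data, const Config& config, bool use_16bit_hist,
    score_t* gradients, score_t* hessians,
    const std::vector<int8_t>& is_feature_used, bool is_constant_hessian) {
  if (!config.use_quantized_grad) {
    // Full-precision training keeps the original double-valued histograms.
    return train_data->GetShareStates<false, 0>(
        gradients, hessians, is_feature_used, is_constant_hessian,
        config.force_col_wise, config.force_row_wise,
        config.num_grad_quant_bins);
  } else if (use_16bit_hist) {
    return train_data->GetShareStates<true, 16>(
        gradients, hessians, is_feature_used, is_constant_hessian,
        config.force_col_wise, config.force_row_wise,
        config.num_grad_quant_bins);
  } else {
    return train_data->GetShareStates<true, 32>(
        gradients, hessians, is_feature_used, is_constant_hessian,
        config.force_col_wise, config.force_row_wise,
        config.num_grad_quant_bins);
  }
}
```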
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) { void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
feature_groups_.clear(); feature_groups_.clear();
num_features_ = dataset->num_features_; num_features_ = dataset->num_features_;
...@@ -1203,7 +1223,7 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used, ...@@ -1203,7 +1223,7 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
is_feature_used); is_feature_used);
} }
template <bool USE_INDICES, bool ORDERED> template <bool USE_INDICES, bool ORDERED, bool USE_QUANT_GRAD, int HIST_BITS>
void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices, void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
data_size_t num_data, data_size_t num_data,
const score_t* gradients, const score_t* gradients,
...@@ -1212,18 +1232,18 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices, ...@@ -1212,18 +1232,18 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
hist_t* hist_data) const { hist_t* hist_data) const {
Common::FunctionTimer fun_time("Dataset::ConstructHistogramsMultiVal", Common::FunctionTimer fun_time("Dataset::ConstructHistogramsMultiVal",
global_timer); global_timer);
share_state->ConstructHistograms<USE_INDICES, ORDERED>( share_state->ConstructHistograms<USE_INDICES, ORDERED, USE_QUANT_GRAD, HIST_BITS>(
data_indices, num_data, gradients, hessians, hist_data); data_indices, num_data, gradients, hessians, hist_data);
} }
template <bool USE_INDICES, bool USE_HESSIAN> template <bool USE_INDICES, bool USE_HESSIAN, bool USE_QUANT_GRAD, int HIST_BITS>
void Dataset::ConstructHistogramsInner( void Dataset::ConstructHistogramsInner(
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices, const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, const score_t* hessians, data_size_t num_data, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians, score_t* ordered_gradients, score_t* ordered_hessians,
TrainingShareStates* share_state, hist_t* hist_data) const { TrainingShareStates* share_state, hist_t* hist_data) const {
if (!share_state->is_col_wise) { if (!share_state->is_col_wise) {
return ConstructHistogramsMultiVal<USE_INDICES, false>( return ConstructHistogramsMultiVal<USE_INDICES, false, USE_QUANT_GRAD, HIST_BITS>(
data_indices, num_data, gradients, hessians, share_state, hist_data); data_indices, num_data, gradients, hessians, share_state, hist_data);
} }
std::vector<int> used_dense_group; std::vector<int> used_dense_group;
...@@ -1275,30 +1295,80 @@ void Dataset::ConstructHistogramsInner( ...@@ -1275,30 +1295,80 @@ void Dataset::ConstructHistogramsInner(
for (int gi = 0; gi < num_used_dense_group; ++gi) { for (int gi = 0; gi < num_used_dense_group; ++gi) {
OMP_LOOP_EX_BEGIN(); OMP_LOOP_EX_BEGIN();
int group = used_dense_group[gi]; int group = used_dense_group[gi];
auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
const int num_bin = feature_groups_[group]->num_total_bin_; const int num_bin = feature_groups_[group]->num_total_bin_;
std::memset(reinterpret_cast<void*>(data_ptr), 0, if (USE_QUANT_GRAD) {
num_bin * kHistEntrySize); if (HIST_BITS == 16) {
if (USE_HESSIAN) { auto data_ptr = reinterpret_cast<hist_t*>(reinterpret_cast<int32_t*>(hist_data) + group_bin_boundaries_[group]);
if (USE_INDICES) { std::memset(reinterpret_cast<void*>(data_ptr), 0,
feature_groups_[group]->bin_data_->ConstructHistogram( num_bin * kInt16HistEntrySize);
data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess, if (USE_HESSIAN) {
data_ptr); if (USE_INDICES) {
feature_groups_[group]->bin_data_->ConstructHistogramInt16(
data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
data_ptr);
} else {
feature_groups_[group]->bin_data_->ConstructHistogramInt16(
0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
}
} else {
if (USE_INDICES) {
feature_groups_[group]->bin_data_->ConstructHistogramInt16(
data_indices, 0, num_data, ptr_ordered_grad,
data_ptr);
} else {
feature_groups_[group]->bin_data_->ConstructHistogramInt16(
0, num_data, ptr_ordered_grad, data_ptr);
}
}
} else { } else {
feature_groups_[group]->bin_data_->ConstructHistogram( auto data_ptr = hist_data + group_bin_boundaries_[group];
0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr); std::memset(reinterpret_cast<void*>(data_ptr), 0,
num_bin * kInt32HistEntrySize);
if (USE_HESSIAN) {
if (USE_INDICES) {
feature_groups_[group]->bin_data_->ConstructHistogramInt32(
data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
data_ptr);
} else {
feature_groups_[group]->bin_data_->ConstructHistogramInt32(
0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
}
} else {
if (USE_INDICES) {
feature_groups_[group]->bin_data_->ConstructHistogramInt32(
data_indices, 0, num_data, ptr_ordered_grad,
data_ptr);
} else {
feature_groups_[group]->bin_data_->ConstructHistogramInt32(
0, num_data, ptr_ordered_grad, data_ptr);
}
}
} }
} else { } else {
if (USE_INDICES) { auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
feature_groups_[group]->bin_data_->ConstructHistogram( std::memset(reinterpret_cast<void*>(data_ptr), 0,
data_indices, 0, num_data, ptr_ordered_grad, data_ptr); num_bin * kHistEntrySize);
if (USE_HESSIAN) {
if (USE_INDICES) {
feature_groups_[group]->bin_data_->ConstructHistogram(
data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
data_ptr);
} else {
feature_groups_[group]->bin_data_->ConstructHistogram(
0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
}
} else { } else {
feature_groups_[group]->bin_data_->ConstructHistogram( if (USE_INDICES) {
0, num_data, ptr_ordered_grad, data_ptr); feature_groups_[group]->bin_data_->ConstructHistogram(
} data_indices, 0, num_data, ptr_ordered_grad, data_ptr);
auto cnt_dst = reinterpret_cast<hist_cnt_t*>(data_ptr + 1); } else {
for (int i = 0; i < num_bin * 2; i += 2) { feature_groups_[group]->bin_data_->ConstructHistogram(
data_ptr[i + 1] = static_cast<double>(cnt_dst[i]) * hessians[0]; 0, num_data, ptr_ordered_grad, data_ptr);
}
auto cnt_dst = reinterpret_cast<hist_cnt_t*>(data_ptr + 1);
for (int i = 0; i < num_bin * 2; i += 2) {
data_ptr[i + 1] = static_cast<double>(cnt_dst[i]) * hessians[0];
}
} }
} }
OMP_LOOP_EX_END(); OMP_LOOP_EX_END();
...@@ -1307,43 +1377,78 @@ void Dataset::ConstructHistogramsInner( ...@@ -1307,43 +1377,78 @@ void Dataset::ConstructHistogramsInner(
} }
global_timer.Stop("Dataset::dense_bin_histogram"); global_timer.Stop("Dataset::dense_bin_histogram");
if (multi_val_groud_id >= 0) { if (multi_val_groud_id >= 0) {
if (num_used_dense_group > 0) { if (USE_QUANT_GRAD) {
ConstructHistogramsMultiVal<USE_INDICES, true>( if (HIST_BITS == 32) {
data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess, int32_t* hist_data_ptr = reinterpret_cast<int32_t*>(hist_data);
share_state, if (num_used_dense_group > 0) {
hist_data + group_bin_boundaries_[multi_val_groud_id] * 2); ConstructHistogramsMultiVal<USE_INDICES, true, USE_QUANT_GRAD, HIST_BITS>(
data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess,
share_state,
reinterpret_cast<hist_t*>(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2));
} else {
ConstructHistogramsMultiVal<USE_INDICES, false, USE_QUANT_GRAD, HIST_BITS>(
data_indices, num_data, gradients, hessians, share_state,
reinterpret_cast<hist_t*>(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2));
}
} else if (HIST_BITS == 16) {
int16_t* hist_data_ptr = reinterpret_cast<int16_t*>(hist_data);
if (num_used_dense_group > 0) {
ConstructHistogramsMultiVal<USE_INDICES, true, USE_QUANT_GRAD, HIST_BITS>(
data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess,
share_state,
reinterpret_cast<hist_t*>(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2));
} else {
ConstructHistogramsMultiVal<USE_INDICES, false, USE_QUANT_GRAD, HIST_BITS>(
data_indices, num_data, gradients, hessians, share_state,
reinterpret_cast<hist_t*>(hist_data_ptr + group_bin_boundaries_[multi_val_groud_id] * 2));
}
}
} else { } else {
ConstructHistogramsMultiVal<USE_INDICES, false>( if (num_used_dense_group > 0) {
data_indices, num_data, gradients, hessians, share_state, ConstructHistogramsMultiVal<USE_INDICES, true, USE_QUANT_GRAD, HIST_BITS>(
hist_data + group_bin_boundaries_[multi_val_groud_id] * 2); data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess,
share_state,
hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
} else {
ConstructHistogramsMultiVal<USE_INDICES, false, USE_QUANT_GRAD, HIST_BITS>(
data_indices, num_data, gradients, hessians, share_state,
hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
}
} }
} }
} }
// explicitly initialize template methods, for cross module call // explicitly initialize template methods, for cross module call
template void Dataset::ConstructHistogramsInner<true, true>( #define CONSTRUCT_HISTOGRAMS_INNER_PARMA \
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices, const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices, \
data_size_t num_data, const score_t* gradients, const score_t* hessians, data_size_t num_data, const score_t* gradients, const score_t* hessians, \
score_t* ordered_gradients, score_t* ordered_hessians, score_t* ordered_gradients, score_t* ordered_hessians, \
TrainingShareStates* share_state, hist_t* hist_data) const; TrainingShareStates* share_state, hist_t* hist_data
template void Dataset::ConstructHistogramsInner<true, false>( // explicitly initialize template methods, for cross module call
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices, template void Dataset::ConstructHistogramsInner<true, true, false, 0>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
data_size_t num_data, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
TrainingShareStates* share_state, hist_t* hist_data) const;
template void Dataset::ConstructHistogramsInner<false, true>( template void Dataset::ConstructHistogramsInner<true, false, false, 0>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, const score_t* hessians,
score_t* ordered_gradients, score_t* ordered_hessians,
TrainingShareStates* share_state, hist_t* hist_data) const;
template void Dataset::ConstructHistogramsInner<false, false>( template void Dataset::ConstructHistogramsInner<false, true, false, 0>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
data_size_t num_data, const score_t* gradients, const score_t* hessians, template void Dataset::ConstructHistogramsInner<false, false, false, 0>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
score_t* ordered_gradients, score_t* ordered_hessians,
TrainingShareStates* share_state, hist_t* hist_data) const; template void Dataset::ConstructHistogramsInner<true, true, true, 16>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
template void Dataset::ConstructHistogramsInner<true, false, true, 16>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
template void Dataset::ConstructHistogramsInner<false, true, true, 16>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
template void Dataset::ConstructHistogramsInner<false, false, true, 16>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
template void Dataset::ConstructHistogramsInner<true, true, true, 32>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
template void Dataset::ConstructHistogramsInner<true, false, true, 32>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
template void Dataset::ConstructHistogramsInner<false, true, true, 32>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
template void Dataset::ConstructHistogramsInner<false, false, true, 32>(CONSTRUCT_HISTOGRAMS_INNER_PARMA) const;
void Dataset::FixHistogram(int feature_idx, double sum_gradient, void Dataset::FixHistogram(int feature_idx, double sum_gradient,
double sum_hessian, hist_t* data) const { double sum_hessian, hist_t* data) const {
...@@ -1365,6 +1470,49 @@ void Dataset::FixHistogram(int feature_idx, double sum_gradient, ...@@ -1365,6 +1470,49 @@ void Dataset::FixHistogram(int feature_idx, double sum_gradient,
} }
} }
template <typename PACKED_HIST_BIN_T, typename PACKED_HIST_ACC_T, int HIST_BITS_BIN, int HIST_BITS_ACC>
void Dataset::FixHistogramInt(int feature_idx, int64_t int_sum_gradient_and_hessian, hist_t* data) const {
const int group = feature2group_[feature_idx];
const int sub_feature = feature2subfeature_[feature_idx];
const BinMapper* bin_mapper =
feature_groups_[group]->bin_mappers_[sub_feature].get();
const int most_freq_bin = bin_mapper->GetMostFreqBin();
PACKED_HIST_BIN_T* data_ptr = reinterpret_cast<PACKED_HIST_BIN_T*>(data);
PACKED_HIST_ACC_T int_sum_gradient_and_hessian_local = HIST_BITS_ACC == 16 ?
((static_cast<int32_t>(int_sum_gradient_and_hessian >> 32) << 16) |
static_cast<int32_t>(int_sum_gradient_and_hessian & 0x0000ffff)) :
int_sum_gradient_and_hessian;
if (most_freq_bin > 0) {
const int num_bin = bin_mapper->num_bin();
if (HIST_BITS_BIN == HIST_BITS_ACC) {
for (int i = 0; i < num_bin; ++i) {
if (i != most_freq_bin) {
int_sum_gradient_and_hessian_local -= data_ptr[i];
}
}
data_ptr[most_freq_bin] = int_sum_gradient_and_hessian_local;
} else {
CHECK_EQ(HIST_BITS_ACC, 32);
CHECK_EQ(HIST_BITS_BIN, 16);
for (int i = 0; i < num_bin; ++i) {
if (i != most_freq_bin) {
const PACKED_HIST_BIN_T packed_hist = data_ptr[i];
const PACKED_HIST_ACC_T packed_hist_acc = (static_cast<int64_t>(static_cast<int16_t>(packed_hist >> 16)) << 32) |
static_cast<int64_t>(packed_hist & 0x0000ffff);
int_sum_gradient_and_hessian_local -= packed_hist_acc;
}
}
PACKED_HIST_BIN_T int_sum_gradient_and_hessian_local_bin =
(static_cast<int32_t>(int_sum_gradient_and_hessian_local >> 32) << 16) | static_cast<int32_t>(int_sum_gradient_and_hessian_local & 0x0000ffff);
data_ptr[most_freq_bin] = int_sum_gradient_and_hessian_local_bin;
}
}
}
template void Dataset::FixHistogramInt<int64_t, int64_t, 32, 32>(int feature_idx, int64_t int_sum_gradient_and_hessian, hist_t* data) const;
template void Dataset::FixHistogramInt<int32_t, int32_t, 16, 16>(int feature_idx, int64_t int_sum_gradient_and_hessian, hist_t* data) const;
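The 16-bit-bin / 32-bit-accumulator branch of FixHistogramInt above relies on one bit trick: widening a packed 16+16-bit entry into a packed 32+32-bit entry. A small standalone sketch of that widening follows; the helper name is illustrative.

```cpp
#include <cstdint>

// Widen one packed histogram entry: the signed 16-bit gradient half is
// sign-extended into the high 32 bits, and the 16-bit hessian half is
// zero-extended into the low 32 bits, matching the cast sequence used in
// FixHistogramInt above.
inline int64_t WidenPacked16To32(int32_t packed16) {
  const int16_t grad16 = static_cast<int16_t>(packed16 >> 16);
  const uint16_t hess16 = static_cast<uint16_t>(packed16 & 0xffff);
  return (static_cast<int64_t>(grad16) << 32) | static_cast<int64_t>(hess16);
}
```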
template <typename T> template <typename T>
void PushVector(std::vector<T>* dest, const std::vector<T>& src) { void PushVector(std::vector<T>* dest, const std::vector<T>& src) {
dest->reserve(dest->size() + src.size()); dest->reserve(dest->size() + src.size());
......
...@@ -171,6 +171,146 @@ class DenseBin : public Bin { ...@@ -171,6 +171,146 @@ class DenseBin : public Bin {
} }
template <bool USE_INDICES, bool USE_PREFETCH, bool USE_HESSIAN, typename PACKED_HIST_T, int HIST_BITS>
void ConstructHistogramIntInner(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* ordered_gradients,
hist_t* out) const {
data_size_t i = start;
PACKED_HIST_T* out_ptr = reinterpret_cast<PACKED_HIST_T*>(out);
const int16_t* gradients_ptr = reinterpret_cast<const int16_t*>(ordered_gradients);
const VAL_T* data_ptr_base = data_.data();
if (USE_PREFETCH) {
const data_size_t pf_offset = 64 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = USE_INDICES ? data_indices[i] : i;
const auto pf_idx =
USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
if (IS_4BIT) {
PREFETCH_T0(data_ptr_base + (pf_idx >> 1));
} else {
PREFETCH_T0(data_ptr_base + pf_idx);
}
const auto ti = static_cast<uint32_t>(data(idx));
const int16_t gradient_16 = gradients_ptr[i];
if (USE_HESSIAN) {
const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 :
(static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff);
out_ptr[ti] += gradient_packed;
} else {
const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 :
(static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (1);
out_ptr[ti] += gradient_packed;
}
}
}
for (; i < end; ++i) {
const auto idx = USE_INDICES ? data_indices[i] : i;
const auto ti = static_cast<uint32_t>(data(idx));
const int16_t gradient_16 = gradients_ptr[i];
if (USE_HESSIAN) {
const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 :
(static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff);
out_ptr[ti] += gradient_packed;
} else {
const PACKED_HIST_T gradient_packed = HIST_BITS == 8 ? gradient_16 :
(static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (1);
out_ptr[ti] += gradient_packed;
}
}
}
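For reference, a minimal standalone sketch (not part of the patch) of the packed update that ConstructHistogramIntInner performs for HIST_BITS == 16: the quantized gradient sits in the high 16 bits and the quantized hessian in the low 16 bits of an int32_t, so a single integer addition accumulates both histogram entries at once.
#include <cstdint>
#include <cstdio>
// pack an int8 gradient and a uint8 hessian the same way the 16-bit path above does
int32_t PackGradHess(int8_t grad, uint8_t hess) {
  const int16_t gradient_16 = static_cast<int16_t>((static_cast<int16_t>(grad) << 8) | hess);
  return (static_cast<int32_t>(static_cast<int8_t>(gradient_16 >> 8)) << 16) |
         (gradient_16 & 0xff);
}
int main() {
  int32_t hist_bin = 0;
  hist_bin += PackGradHess(-3, 2);  // one data point with grad -3, hess 2
  hist_bin += PackGradHess(5, 1);   // another with grad 5, hess 1
  const int16_t sum_grad = static_cast<int16_t>(hist_bin >> 16);
  const int16_t sum_hess = static_cast<int16_t>(hist_bin & 0xffff);
  std::printf("sum_grad = %d, sum_hess = %d\n", sum_grad, sum_hess);  // 2 and 3
  return 0;
}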
void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* ordered_gradients,
const score_t* /*ordered_hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<true, true, true, int16_t, 8>(
data_indices, start, end, ordered_gradients, out);
}
void ConstructHistogramInt8(data_size_t start, data_size_t end,
const score_t* ordered_gradients,
const score_t* /*ordered_hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<false, false, true, int16_t, 8>(
nullptr, start, end, ordered_gradients, out);
}
void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* ordered_gradients,
hist_t* out) const override {
ConstructHistogramIntInner<true, true, false, int16_t, 8>(
data_indices, start, end, ordered_gradients, out);
}
void ConstructHistogramInt8(data_size_t start, data_size_t end,
const score_t* ordered_gradients,
hist_t* out) const override {
ConstructHistogramIntInner<false, false, false, int16_t, 8>(
nullptr, start, end, ordered_gradients, out);
}
void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* ordered_gradients,
const score_t* /*ordered_hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<true, true, true, int32_t, 16>(
data_indices, start, end, ordered_gradients, out);
}
void ConstructHistogramInt16(data_size_t start, data_size_t end,
const score_t* ordered_gradients,
const score_t* /*ordered_hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<false, false, true, int32_t, 16>(
nullptr, start, end, ordered_gradients, out);
}
void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* ordered_gradients,
hist_t* out) const override {
ConstructHistogramIntInner<true, true, false, int32_t, 16>(
data_indices, start, end, ordered_gradients, out);
}
void ConstructHistogramInt16(data_size_t start, data_size_t end,
const score_t* ordered_gradients,
hist_t* out) const override {
ConstructHistogramIntInner<false, false, false, int32_t, 16>(
nullptr, start, end, ordered_gradients, out);
}
void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* ordered_gradients,
const score_t* /*ordered_hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<true, true, true, int64_t, 32>(
data_indices, start, end, ordered_gradients, out);
}
void ConstructHistogramInt32(data_size_t start, data_size_t end,
const score_t* ordered_gradients,
const score_t* /*ordered_hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<false, false, true, int64_t, 32>(
nullptr, start, end, ordered_gradients, out);
}
void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* ordered_gradients,
hist_t* out) const override {
ConstructHistogramIntInner<true, true, false, int64_t, 32>(
data_indices, start, end, ordered_gradients, out);
}
void ConstructHistogramInt32(data_size_t start, data_size_t end,
const score_t* ordered_gradients,
hist_t* out) const override {
ConstructHistogramIntInner<false, false, false, int64_t, 32>(
nullptr, start, end, ordered_gradients, out);
}
template <bool MISS_IS_ZERO, bool MISS_IS_NA, bool MFB_IS_ZERO,
          bool MFB_IS_NA, bool USE_MIN_BIN>
data_size_t SplitInner(uint32_t min_bin, uint32_t max_bin,
......
...@@ -124,6 +124,123 @@ class MultiValDenseBin : public MultiValBin {
    gradients, hessians, out);
  }
template<bool USE_INDICES, bool USE_PREFETCH, bool ORDERED, typename PACKED_HIST_T, int HIST_BITS>
void ConstructHistogramIntInner(const data_size_t* data_indices, data_size_t start, data_size_t end,
const score_t* gradients_and_hessians, hist_t* out) const {
data_size_t i = start;
const VAL_T* data_ptr_base = data_.data();
const int16_t* gradients_and_hessians_ptr = reinterpret_cast<const int16_t*>(gradients_and_hessians);
PACKED_HIST_T* out_ptr = reinterpret_cast<PACKED_HIST_T*>(out);
if (USE_PREFETCH) {
const data_size_t pf_offset = 32 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = USE_INDICES ? data_indices[i] : i;
const auto pf_idx = USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
if (!ORDERED) {
PREFETCH_T0(gradients_and_hessians_ptr + pf_idx);
}
PREFETCH_T0(data_ptr_base + RowPtr(pf_idx));
const auto j_start = RowPtr(idx);
const VAL_T* data_ptr = data_ptr_base + j_start;
const int16_t gradient_16 = gradients_and_hessians_ptr[idx];
const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 :
((static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) |
static_cast<PACKED_HIST_T>(gradient_16 & 0xff));
for (int j = 0; j < num_feature_; ++j) {
const uint32_t bin = static_cast<uint32_t>(data_ptr[j]);
const auto ti = (bin + offsets_[j]);
out_ptr[ti] += gradient_packed;
}
}
}
for (; i < end; ++i) {
const auto idx = USE_INDICES ? data_indices[i] : i;
const auto j_start = RowPtr(idx);
const VAL_T* data_ptr = data_ptr_base + j_start;
const int16_t gradient_16 = gradients_and_hessians_ptr[idx];
const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 :
((static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) |
static_cast<PACKED_HIST_T>(gradient_16 & 0xff));
for (int j = 0; j < num_feature_; ++j) {
const uint32_t bin = static_cast<uint32_t>(data_ptr[j]);
const auto ti = (bin + offsets_[j]);
out_ptr[ti] += gradient_packed;
}
}
}
void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* gradients,
const score_t* /*hessians*/, hist_t* out) const override {
ConstructHistogramIntInner<true, true, false, int64_t, 32>(data_indices, start, end,
gradients, out);
}
void ConstructHistogramInt32(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* /*hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<false, false, false, int64_t, 32>(
nullptr, start, end, gradients, out);
}
void ConstructHistogramOrderedInt32(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* gradients,
const score_t* /*hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<true, true, true, int64_t, 32>(data_indices, start, end,
gradients, out);
}
void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* gradients,
const score_t* /*hessians*/, hist_t* out) const override {
ConstructHistogramIntInner<true, true, false, int32_t, 16>(data_indices, start, end,
gradients, out);
}
void ConstructHistogramInt16(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* /*hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<false, false, false, int32_t, 16>(
nullptr, start, end, gradients, out);
}
void ConstructHistogramOrderedInt16(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* gradients,
const score_t* /*hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<true, true, true, int32_t, 16>(data_indices, start, end,
gradients, out);
}
void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* gradients,
const score_t* /*hessians*/, hist_t* out) const override {
ConstructHistogramIntInner<true, true, false, int16_t, 8>(data_indices, start, end,
gradients, out);
}
void ConstructHistogramInt8(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* /*hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<false, false, false, int16_t, 8>(
nullptr, start, end, gradients, out);
}
void ConstructHistogramOrderedInt8(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* gradients,
const score_t* /*hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<true, true, true, int16_t, 8>(data_indices, start, end,
gradients, out);
}
MultiValBin* CreateLike(data_size_t num_data, int num_bin, int num_feature, double,
                        const std::vector<uint32_t>& offsets) const override {
  return new MultiValDenseBin<VAL_T>(num_data, num_bin, num_feature, offsets);
......
...@@ -180,6 +180,124 @@ class MultiValSparseBin : public MultiValBin {
    gradients, hessians, out);
  }
template <bool USE_INDICES, bool USE_PREFETCH, bool ORDERED, typename PACKED_HIST_T, int HIST_BITS>
void ConstructHistogramIntInner(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* gradients_and_hessians, hist_t* out) const {
data_size_t i = start;
PACKED_HIST_T* out_ptr = reinterpret_cast<PACKED_HIST_T*>(out);
const int16_t* gradients_and_hessians_ptr = reinterpret_cast<const int16_t*>(gradients_and_hessians);
const VAL_T* data_ptr = data_.data();
const INDEX_T* row_ptr_base = row_ptr_.data();
if (USE_PREFETCH) {
const data_size_t pf_offset = 32 / sizeof(VAL_T);
const data_size_t pf_end = end - pf_offset;
for (; i < pf_end; ++i) {
const auto idx = USE_INDICES ? data_indices[i] : i;
const auto pf_idx =
USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
if (!ORDERED) {
PREFETCH_T0(gradients_and_hessians_ptr + pf_idx);
}
PREFETCH_T0(row_ptr_base + pf_idx);
PREFETCH_T0(data_ptr + row_ptr_[pf_idx]);
const auto j_start = RowPtr(idx);
const auto j_end = RowPtr(idx + 1);
const int16_t gradient_16 = ORDERED ? gradients_and_hessians_ptr[i] : gradients_and_hessians_ptr[idx];
const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 :
((static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) |
static_cast<PACKED_HIST_T>(gradient_16 & 0xff));
for (auto j = j_start; j < j_end; ++j) {
const auto ti = static_cast<uint32_t>(data_ptr[j]);
out_ptr[ti] += gradient_packed;
}
}
}
for (; i < end; ++i) {
const auto idx = USE_INDICES ? data_indices[i] : i;
const auto j_start = RowPtr(idx);
const auto j_end = RowPtr(idx + 1);
const int16_t gradient_16 = ORDERED ? gradients_and_hessians_ptr[i] : gradients_and_hessians_ptr[idx];
const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 :
((static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) |
static_cast<PACKED_HIST_T>(gradient_16 & 0xff));
for (auto j = j_start; j < j_end; ++j) {
const auto ti = static_cast<uint32_t>(data_ptr[j]);
out_ptr[ti] += gradient_packed;
}
}
}
void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* gradients,
const score_t* /*hessians*/, hist_t* out) const override {
ConstructHistogramIntInner<true, true, false, int64_t, 32>(data_indices, start, end,
gradients, out);
}
void ConstructHistogramInt32(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* /*hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<false, false, false, int64_t, 32>(
nullptr, start, end, gradients, out);
}
void ConstructHistogramOrderedInt32(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* gradients,
const score_t* /*hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<true, true, true, int64_t, 32>(data_indices, start, end,
gradients, out);
}
void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* gradients,
const score_t* /*hessians*/, hist_t* out) const override {
ConstructHistogramIntInner<true, true, false, int32_t, 16>(data_indices, start, end,
gradients, out);
}
void ConstructHistogramInt16(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* /*hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<false, false, false, int32_t, 16>(
nullptr, start, end, gradients, out);
}
void ConstructHistogramOrderedInt16(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* gradients,
const score_t* /*hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<true, true, true, int32_t, 16>(data_indices, start, end,
gradients, out);
}
void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* gradients,
const score_t* /*hessians*/, hist_t* out) const override {
ConstructHistogramIntInner<true, true, false, int16_t, 8>(data_indices, start, end,
gradients, out);
}
void ConstructHistogramInt8(data_size_t start, data_size_t end,
const score_t* gradients, const score_t* /*hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<false, false, false, int16_t, 8>(
nullptr, start, end, gradients, out);
}
void ConstructHistogramOrderedInt8(const data_size_t* data_indices,
data_size_t start, data_size_t end,
const score_t* gradients,
const score_t* /*hessians*/,
hist_t* out) const override {
ConstructHistogramIntInner<true, true, true, int16_t, 8>(data_indices, start, end,
gradients, out);
}
MultiValBin* CreateLike(data_size_t num_data, int num_bin, int,
                        double estimate_element_per_row,
                        const std::vector<uint32_t>& /*offsets*/) const override {
......
...@@ -203,6 +203,184 @@ class SparseBin : public Bin {
  }
#undef ACC_GH
template <bool USE_HESSIAN, typename PACKED_HIST_T, typename GRAD_HIST_T, typename HESS_HIST_T, int HIST_BITS>
void ConstructIntHistogramInner(data_size_t start, data_size_t end,
const score_t* ordered_gradients_and_hessians,
hist_t* out) const {
data_size_t i_delta, cur_pos;
InitIndex(start, &i_delta, &cur_pos);
if (USE_HESSIAN) {
PACKED_HIST_T* out_ptr = reinterpret_cast<PACKED_HIST_T*>(out);
const int16_t* gradients_and_hessians_ptr = reinterpret_cast<const int16_t*>(ordered_gradients_and_hessians);
while (cur_pos < start && i_delta < num_vals_) {
cur_pos += deltas_[++i_delta];
}
while (cur_pos < end && i_delta < num_vals_) {
const VAL_T bin = vals_[i_delta];
const int16_t gradient_16 = gradients_and_hessians_ptr[cur_pos];
const PACKED_HIST_T gradient_64 = (static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff);
out_ptr[bin] += gradient_64;
cur_pos += deltas_[++i_delta];
}
} else {
GRAD_HIST_T* grad = reinterpret_cast<GRAD_HIST_T*>(out);
HESS_HIST_T* cnt = reinterpret_cast<HESS_HIST_T*>(out) + 1;
const int8_t* gradients_and_hessians_ptr = reinterpret_cast<const int8_t*>(ordered_gradients_and_hessians);
while (cur_pos < start && i_delta < num_vals_) {
cur_pos += deltas_[++i_delta];
}
while (cur_pos < end && i_delta < num_vals_) {
const uint32_t ti = static_cast<uint32_t>(vals_[i_delta]) << 1;
grad[ti] += gradients_and_hessians_ptr[cur_pos];
++cnt[ti];
cur_pos += deltas_[++i_delta];
}
}
}
template <bool USE_HESSIAN, typename PACKED_HIST_T, typename GRAD_HIST_T, typename HESS_HIST_T, int HIST_BITS>
void ConstructIntHistogramInner(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* ordered_gradients_and_hessians,
hist_t* out) const {
data_size_t i_delta, cur_pos;
InitIndex(data_indices[start], &i_delta, &cur_pos);
data_size_t i = start;
if (USE_HESSIAN) {
PACKED_HIST_T* out_ptr = reinterpret_cast<PACKED_HIST_T*>(out);
const int16_t* gradients_and_hessians_ptr = reinterpret_cast<const int16_t*>(ordered_gradients_and_hessians);
for (;;) {
if (cur_pos < data_indices[i]) {
cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) {
break;
}
} else if (cur_pos > data_indices[i]) {
if (++i >= end) {
break;
}
} else {
const VAL_T bin = vals_[i_delta];
const int16_t gradient_16 = gradients_and_hessians_ptr[i];
const PACKED_HIST_T gradient_packed = (HIST_BITS == 8) ? gradient_16 :
(static_cast<PACKED_HIST_T>(static_cast<int8_t>(gradient_16 >> 8)) << HIST_BITS) | (gradient_16 & 0xff);
out_ptr[bin] += gradient_packed;
if (++i >= end) {
break;
}
cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) {
break;
}
}
}
} else {
GRAD_HIST_T* grad = reinterpret_cast<GRAD_HIST_T*>(out);
HESS_HIST_T* cnt = reinterpret_cast<HESS_HIST_T*>(out) + 1;
const int8_t* gradients_and_hessians_ptr = reinterpret_cast<const int8_t*>(ordered_gradients_and_hessians);
for (;;) {
if (cur_pos < data_indices[i]) {
cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) {
break;
}
} else if (cur_pos > data_indices[i]) {
if (++i >= end) {
break;
}
} else {
const uint32_t ti = static_cast<uint32_t>(vals_[i_delta]) << 1;
grad[ti] += gradients_and_hessians_ptr[i << 1];
++cnt[ti];
if (++i >= end) {
break;
}
cur_pos += deltas_[++i_delta];
if (i_delta >= num_vals_) {
break;
}
}
}
}
}
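As a side note, a simplified standalone sketch (an assumption about the encoding, not code from the patch) of the delta traversal both overloads above rely on: deltas_ stores the gap to the next row that has a nonzero bin, so a running sum of the deltas recovers absolute row indices while vals_ supplies the bin for each of those rows.
#include <cstdio>
#include <vector>
int main() {
  // rows 0, 3, 4 and 9 hold nonzero bins; each delta is the gap from the previous one
  const std::vector<int> deltas = {0, 3, 1, 5};
  const std::vector<int> vals = {2, 7, 1, 4};  // bin index stored for each nonzero row
  int cur_pos = 0;
  for (size_t i = 0; i < deltas.size(); ++i) {
    cur_pos += deltas[i];
    std::printf("row %d -> bin %d\n", cur_pos, vals[i]);
  }
  return 0;
}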
void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* ordered_gradients,
const score_t* /*ordered_hessians*/,
hist_t* out) const override {
ConstructIntHistogramInner<true, int64_t, int32_t, uint32_t, 32>(data_indices, start, end, ordered_gradients, out);
}
void ConstructHistogramInt32(data_size_t start, data_size_t end,
const score_t* ordered_gradients,
const score_t* /*ordered_hessians*/,
hist_t* out) const override {
ConstructIntHistogramInner<true, int64_t, int32_t, uint32_t, 32>(start, end, ordered_gradients, out);
}
void ConstructHistogramInt32(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* ordered_gradients,
hist_t* out) const override {
ConstructIntHistogramInner<false, int64_t, int32_t, uint32_t, 32>(data_indices, start, end, ordered_gradients, out);
}
void ConstructHistogramInt32(data_size_t start, data_size_t end,
const score_t* ordered_gradients,
hist_t* out) const override {
ConstructIntHistogramInner<false, int64_t, int32_t, uint32_t, 32>(start, end, ordered_gradients, out);
}
void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* ordered_gradients,
const score_t* /*ordered_hessians*/,
hist_t* out) const override {
ConstructIntHistogramInner<true, int32_t, int16_t, uint16_t, 16>(data_indices, start, end, ordered_gradients, out);
}
void ConstructHistogramInt16(data_size_t start, data_size_t end,
const score_t* ordered_gradients,
const score_t* /*ordered_hessians*/,
hist_t* out) const override {
ConstructIntHistogramInner<true, int32_t, int16_t, uint16_t, 16>(start, end, ordered_gradients, out);
}
void ConstructHistogramInt16(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* ordered_gradients,
hist_t* out) const override {
ConstructIntHistogramInner<false, int32_t, int16_t, uint16_t, 16>(data_indices, start, end, ordered_gradients, out);
}
void ConstructHistogramInt16(data_size_t start, data_size_t end,
const score_t* ordered_gradients,
hist_t* out) const override {
ConstructIntHistogramInner<false, int32_t, int16_t, uint16_t, 16>(start, end, ordered_gradients, out);
}
void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* ordered_gradients,
const score_t* /*ordered_hessians*/,
hist_t* out) const override {
ConstructIntHistogramInner<true, int16_t, uint8_t, uint8_t, 8>(data_indices, start, end, ordered_gradients, out);
}
void ConstructHistogramInt8(data_size_t start, data_size_t end,
const score_t* ordered_gradients,
const score_t* /*ordered_hessians*/,
hist_t* out) const override {
ConstructIntHistogramInner<true, int16_t, uint8_t, uint8_t, 8>(start, end, ordered_gradients, out);
}
void ConstructHistogramInt8(const data_size_t* data_indices, data_size_t start,
data_size_t end, const score_t* ordered_gradients,
hist_t* out) const override {
ConstructIntHistogramInner<false, int16_t, uint8_t, uint8_t, 8>(data_indices, start, end, ordered_gradients, out);
}
void ConstructHistogramInt8(data_size_t start, data_size_t end,
const score_t* ordered_gradients,
hist_t* out) const override {
ConstructIntHistogramInner<false, int16_t, uint8_t, uint8_t, 8>(start, end, ordered_gradients, out);
}
inline void NextNonzeroFast(data_size_t* i_delta,
                            data_size_t* cur_pos) const {
  *cur_pos += deltas_[++(*i_delta)];
......
...@@ -9,7 +9,7 @@
namespace LightGBM {
MultiValBinWrapper::MultiValBinWrapper(MultiValBin* bin, data_size_t num_data,
    const std::vector<int>& feature_groups_contained, const int num_grad_quant_bins):
    feature_groups_contained_(feature_groups_contained) {
  num_threads_ = OMP_NUM_THREADS();
  num_data_ = num_data;
...@@ -19,6 +19,7 @@ MultiValBinWrapper::MultiValBinWrapper(MultiValBin* bin, data_size_t num_data,
  }
  num_bin_ = bin->num_bin();
  num_bin_aligned_ = (num_bin_ + kAlignedSize - 1) / kAlignedSize * kAlignedSize;
  num_grad_quant_bins_ = num_grad_quant_bins;
}
void MultiValBinWrapper::InitTrain(const std::vector<int>& group_feature_start,
...@@ -45,43 +46,161 @@ void MultiValBinWrapper::InitTrain(const std::vector<int>& group_feature_start,
  }
}
template <bool USE_QUANT_GRAD, int HIST_BITS, int INNER_HIST_BITS>
void MultiValBinWrapper::HistMove(const std::vector<hist_t,
    Common::AlignmentAllocator<hist_t, kAlignedSize>>& hist_buf) {
  if (!is_use_subcol_ && INNER_HIST_BITS != 8) {
    return;
  }
  if (USE_QUANT_GRAD) {
    if (HIST_BITS == 32) {
      const int64_t* src = reinterpret_cast<const int64_t*>(hist_buf.data()) + hist_buf.size() / 2 -
        static_cast<size_t>(num_bin_aligned_);
#pragma omp parallel for schedule(static)
      for (int i = 0; i < static_cast<int>(hist_move_src_.size()); ++i) {
std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2,
reinterpret_cast<int64_t*>(origin_hist_data_) + hist_move_dest_[i] / 2);
}
} else if (HIST_BITS == 16) {
const int32_t* src = reinterpret_cast<const int32_t*>(hist_buf.data()) + hist_buf.size() / 2 -
static_cast<size_t>(num_bin_aligned_);
if (is_use_subcol_) {
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(hist_move_src_.size()); ++i) {
std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2,
reinterpret_cast<int32_t*>(origin_hist_data_) + hist_move_dest_[i] / 2);
}
} else {
int32_t* orig_ptr = reinterpret_cast<int32_t*>(origin_hist_data_);
#pragma omp parallel for schedule(static)
for (int i = 0; i < num_bin_; ++i) {
orig_ptr[i] = src[i];
}
}
}
} else {
const hist_t* src = hist_buf.data() + hist_buf.size() -
2 * static_cast<size_t>(num_bin_aligned_);
#pragma omp parallel for schedule(static)
for (int i = 0; i < static_cast<int>(hist_move_src_.size()); ++i) {
std::copy_n(src + hist_move_src_[i], hist_move_size_[i],
origin_hist_data_ + hist_move_dest_[i]);
}
  }
}
template void MultiValBinWrapper::HistMove<false, 0, 0>(const std::vector<hist_t,
Common::AlignmentAllocator<hist_t, kAlignedSize>>& hist_buf);
template void MultiValBinWrapper::HistMove<false, 0, 8>(const std::vector<hist_t,
Common::AlignmentAllocator<hist_t, kAlignedSize>>& hist_buf);
template void MultiValBinWrapper::HistMove<true, 16, 8>(const std::vector<hist_t,
Common::AlignmentAllocator<hist_t, kAlignedSize>>& hist_buf);
template void MultiValBinWrapper::HistMove<true, 16, 16>(const std::vector<hist_t,
Common::AlignmentAllocator<hist_t, kAlignedSize>>& hist_buf);
template void MultiValBinWrapper::HistMove<true, 32, 8>(const std::vector<hist_t,
Common::AlignmentAllocator<hist_t, kAlignedSize>>& hist_buf);
template void MultiValBinWrapper::HistMove<true, 32, 32>(const std::vector<hist_t,
Common::AlignmentAllocator<hist_t, kAlignedSize>>& hist_buf);
template <bool USE_QUANT_GRAD, int HIST_BITS, int INNER_HIST_BITS>
void MultiValBinWrapper::HistMerge(std::vector<hist_t,
    Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf) {
  int n_bin_block = 1;
  int bin_block_size = num_bin_;
  Threading::BlockInfo<data_size_t>(num_threads_, num_bin_, 512, &n_bin_block,
                                    &bin_block_size);
  if (USE_QUANT_GRAD) {
    if (HIST_BITS == 32) {
      int64_t* dst = reinterpret_cast<int64_t*>(origin_hist_data_);
      if (is_use_subcol_) {
        dst = reinterpret_cast<int64_t*>(hist_buf->data()) + hist_buf->size() / 2 - static_cast<size_t>(num_bin_aligned_);
      }
#pragma omp parallel for schedule(static, 1) num_threads(num_threads_)
      for (int t = 0; t < n_bin_block; ++t) {
        const int start = t * bin_block_size;
        const int end = std::min(start + bin_block_size, num_bin_);
        for (int tid = 1; tid < n_data_block_; ++tid) {
          auto src_ptr = reinterpret_cast<const int64_t*>(hist_buf->data()) + static_cast<size_t>(num_bin_aligned_) * (tid - 1);
for (int i = start; i < end; ++i) {
dst[i] += src_ptr[i];
}
}
}
} else if (HIST_BITS == 16 && INNER_HIST_BITS == 16) {
int32_t* dst = reinterpret_cast<int32_t*>(origin_hist_data_);
if (is_use_subcol_) {
dst = reinterpret_cast<int32_t*>(hist_buf->data()) + hist_buf->size() / 2 - static_cast<size_t>(num_bin_aligned_);
}
#pragma omp parallel for schedule(static, 1) num_threads(num_threads_)
for (int t = 0; t < n_bin_block; ++t) {
const int start = t * bin_block_size;
const int end = std::min(start + bin_block_size, num_bin_);
for (int tid = 1; tid < n_data_block_; ++tid) {
auto src_ptr = reinterpret_cast<const int32_t*>(hist_buf->data()) + static_cast<size_t>(num_bin_aligned_) * (tid - 1);
for (int i = start; i < end; ++i) {
dst[i] += src_ptr[i];
}
}
}
} else if (HIST_BITS == 16 && INNER_HIST_BITS == 8) {
int32_t* dst = reinterpret_cast<int32_t*>(hist_buf->data()) + hist_buf->size() / 2 - static_cast<size_t>(num_bin_aligned_);
std::memset(reinterpret_cast<void*>(dst), 0, num_bin_ * kInt16HistBufferEntrySize);
#pragma omp parallel for schedule(static, 1) num_threads(num_threads_)
for (int t = 0; t < n_bin_block; ++t) {
const int start = t * bin_block_size;
const int end = std::min(start + bin_block_size, num_bin_);
for (int tid = 0; tid < n_data_block_; ++tid) {
auto src_ptr = reinterpret_cast<const int16_t*>(hist_buf->data()) + static_cast<size_t>(num_bin_aligned_) * tid;
for (int i = start; i < end; ++i) {
const int16_t packed_hist = src_ptr[i];
const int32_t packed_hist_int32 = (static_cast<int32_t>(static_cast<int8_t>(packed_hist >> 8)) << 16) | static_cast<int32_t>(packed_hist & 0x00ff);
dst[i] += packed_hist_int32;
}
}
}
}
} else {
hist_t* dst = origin_hist_data_;
if (is_use_subcol_) {
dst = hist_buf->data() + hist_buf->size() - 2 * static_cast<size_t>(num_bin_aligned_);
}
#pragma omp parallel for schedule(static, 1) num_threads(num_threads_)
for (int t = 0; t < n_bin_block; ++t) {
const int start = t * bin_block_size;
const int end = std::min(start + bin_block_size, num_bin_);
for (int tid = 1; tid < n_data_block_; ++tid) {
auto src_ptr = hist_buf->data() + static_cast<size_t>(num_bin_aligned_) * 2 * (tid - 1);
for (int i = start * 2; i < end * 2; ++i) {
dst[i] += src_ptr[i];
}
      }
    }
  }
}
template void MultiValBinWrapper::HistMerge<false, 0, 0>(std::vector<hist_t,
Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf);
template void MultiValBinWrapper::HistMerge<false, 0, 8>(std::vector<hist_t,
Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf);
template void MultiValBinWrapper::HistMerge<true, 16, 8>(std::vector<hist_t,
Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf);
template void MultiValBinWrapper::HistMerge<true, 16, 16>(std::vector<hist_t,
Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf);
template void MultiValBinWrapper::HistMerge<true, 32, 8>(std::vector<hist_t,
Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf);
template void MultiValBinWrapper::HistMerge<true, 32, 32>(std::vector<hist_t,
Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf);
void MultiValBinWrapper::ResizeHistBuf(std::vector<hist_t,
    Common::AlignmentAllocator<hist_t, kAlignedSize>>* hist_buf,
    MultiValBin* sub_multi_val_bin,
...@@ -389,7 +508,7 @@ void TrainingShareStates::CalcBinOffsets(const std::vector<std::unique_ptr<Featu
void TrainingShareStates::SetMultiValBin(MultiValBin* bin, data_size_t num_data,
    const std::vector<std::unique_ptr<FeatureGroup>>& feature_groups,
    bool dense_only, bool sparse_only, const int num_grad_quant_bins) {
  num_threads = OMP_NUM_THREADS();
  if (bin == nullptr) {
    return;
...@@ -408,7 +527,7 @@ void TrainingShareStates::SetMultiValBin(MultiValBin* bin, data_size_t num_data,
  num_total_bin_ += bin->num_bin();
  num_elements_per_row_ += bin->num_element_per_row();
  multi_val_bin_wrapper_.reset(new MultiValBinWrapper(
      bin, num_data, feature_groups_contained, num_grad_quant_bins));
}
}  // namespace LightGBM
...@@ -991,7 +991,7 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
      nullptr, nullptr,
      nullptr, nullptr);
  // then construct sparse features on CPU
  train_data_->ConstructHistograms<false, 0>(is_sparse_feature_used,
      smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
      gradients_, hessians_,
      ordered_gradients_.data(), ordered_hessians_.data(),
...@@ -1056,7 +1056,7 @@ void GPUTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_u
      gradients_, hessians_,
      ordered_gradients_.data(), ordered_hessians_.data());
  // then construct sparse features on CPU
  train_data_->ConstructHistograms<false, 0>(is_sparse_feature_used,
      larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
      gradients_, hessians_,
      ordered_gradients_.data(), ordered_hessians_.data(),
......
/*!
* Copyright (c) 2022 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
*/
#include "gradient_discretizer.hpp"
#include <LightGBM/network.h>
#include <algorithm>
#include <string>
#include <vector>
namespace LightGBM {
void GradientDiscretizer::Init(
const data_size_t num_data, const int num_leaves,
const int num_features, const Dataset* train_data) {
discretized_gradients_and_hessians_vector_.resize(num_data * 2);
gradient_random_values_.resize(num_data);
hessian_random_values_.resize(num_data);
random_values_use_start_eng_ = std::mt19937(random_seed_);
random_values_use_start_dist_ = std::uniform_int_distribution<data_size_t>(0, num_data);
const int num_threads = OMP_NUM_THREADS();
int num_blocks = 0;
data_size_t block_size = 0;
Threading::BlockInfo<data_size_t>(num_data, 512, &num_blocks, &block_size);
#pragma omp parallel for schedule(static, 1) num_threads(num_threads)
for (int thread_id = 0; thread_id < num_blocks; ++thread_id) {
const data_size_t start = thread_id * block_size;
const data_size_t end = std::min(start + block_size, num_data);
std::mt19937 gradient_random_values_eng(random_seed_ + thread_id);
std::uniform_real_distribution<double> gradient_random_values_dist(0.0f, 1.0f);
std::mt19937 hessian_random_values_eng(random_seed_ + thread_id + num_threads);
std::uniform_real_distribution<double> hessian_random_values_dist(0.0f, 1.0f);
for (data_size_t i = start; i < end; ++i) {
gradient_random_values_[i] = gradient_random_values_dist(gradient_random_values_eng);
hessian_random_values_[i] = hessian_random_values_dist(hessian_random_values_eng);
}
}
max_gradient_abs_ = 0.0f;
max_hessian_abs_ = 0.0f;
gradient_scale_ = 0.0f;
hessian_scale_ = 0.0f;
inverse_gradient_scale_ = 0.0f;
inverse_hessian_scale_ = 0.0f;
num_leaves_ = num_leaves;
leaf_num_bits_in_histogram_bin_.resize(num_leaves_, 0);
node_num_bits_in_histogram_bin_.resize(num_leaves_, 0);
global_leaf_num_bits_in_histogram_bin_.resize(num_leaves_, 0);
global_node_num_bits_in_histogram_bin_.resize(num_leaves_, 0);
leaf_grad_hess_stats_.resize(num_leaves_ * 2, 0.0);
change_hist_bits_buffer_.resize(num_features);
#pragma omp parallel for schedule(static) num_threads(num_threads)
for (int feature_index = 0; feature_index < num_features; ++feature_index) {
const BinMapper* bin_mapper = train_data->FeatureBinMapper(feature_index);
change_hist_bits_buffer_[feature_index].resize((bin_mapper->num_bin() - static_cast<int>(bin_mapper->GetMostFreqBin() == 0)) * 2);
}
ordered_int_gradients_and_hessians_.resize(2 * num_data);
}
void GradientDiscretizer::DiscretizeGradients(
const data_size_t num_data,
const score_t* input_gradients,
const score_t* input_hessians) {
double max_gradient = std::fabs(input_gradients[0]);
double max_hessian = std::fabs(input_hessians[0]);
const int num_threads = OMP_NUM_THREADS();
std::vector<double> thread_max_gradient(num_threads, max_gradient);
std::vector<double> thread_max_hessian(num_threads, max_hessian);
Threading::For<data_size_t>(0, num_data, 1024,
[input_gradients, input_hessians, &thread_max_gradient, &thread_max_hessian]
(int, data_size_t start, data_size_t end) {
int thread_id = omp_get_thread_num();
for (data_size_t i = start; i < end; ++i) {
double fabs_grad = std::fabs(input_gradients[i]);
double fabs_hess = std::fabs(input_hessians[i]);
if (fabs_grad > thread_max_gradient[thread_id]) {
thread_max_gradient[thread_id] = fabs_grad;
}
if (fabs_hess > thread_max_hessian[thread_id]) {
thread_max_hessian[thread_id] = fabs_hess;
}
}});
max_gradient = thread_max_gradient[0];
max_hessian = thread_max_hessian[0];
for (int thread_id = 1; thread_id < num_threads; ++thread_id) {
if (max_gradient < thread_max_gradient[thread_id]) {
max_gradient = thread_max_gradient[thread_id];
}
if (max_hessian < thread_max_hessian[thread_id]) {
max_hessian = thread_max_hessian[thread_id];
}
}
if (Network::num_machines() > 1) {
max_gradient = Network::GlobalSyncUpByMax(max_gradient);
max_hessian = Network::GlobalSyncUpByMax(max_hessian);
}
max_gradient_abs_ = max_gradient;
max_hessian_abs_ = max_hessian;
gradient_scale_ = max_gradient_abs_ / static_cast<double>(num_grad_quant_bins_ / 2);
if (is_constant_hessian_) {
hessian_scale_ = max_hessian_abs_;
} else {
hessian_scale_ = max_hessian_abs_ / static_cast<double>(num_grad_quant_bins_);
}
inverse_gradient_scale_ = 1.0f / gradient_scale_;
inverse_hessian_scale_ = 1.0f / hessian_scale_;
const int random_values_use_start = random_values_use_start_dist_(random_values_use_start_eng_);
int8_t* discretized_int8 = discretized_gradients_and_hessians_vector_.data();
if (stochastic_rounding_) {
if (is_constant_hessian_) {
#pragma omp parallel for schedule(static) num_threads(num_threads)
for (data_size_t i = 0; i < num_data; ++i) {
const double gradient = input_gradients[i];
const data_size_t random_value_pos = (i + random_values_use_start) % num_data;
discretized_int8[2 * i + 1] = gradient >= 0.0f ?
static_cast<int8_t>(gradient * inverse_gradient_scale_ + gradient_random_values_[random_value_pos]) :
static_cast<int8_t>(gradient * inverse_gradient_scale_ - gradient_random_values_[random_value_pos]);
discretized_int8[2 * i] = static_cast<int8_t>(1);
}
} else {
#pragma omp parallel for schedule(static) num_threads(num_threads)
for (data_size_t i = 0; i < num_data; ++i) {
const double gradient = input_gradients[i];
const data_size_t random_value_pos = (i + random_values_use_start) % num_data;
discretized_int8[2 * i + 1] = gradient >= 0.0f ?
static_cast<int8_t>(gradient * inverse_gradient_scale_ + gradient_random_values_[random_value_pos]) :
static_cast<int8_t>(gradient * inverse_gradient_scale_ - gradient_random_values_[random_value_pos]);
discretized_int8[2 * i] = static_cast<int8_t>(input_hessians[i] * inverse_hessian_scale_ + hessian_random_values_[random_value_pos]);
}
}
} else {
if (is_constant_hessian_) {
#pragma omp parallel for schedule(static) num_threads(num_threads)
for (data_size_t i = 0; i < num_data; ++i) {
const double gradient = input_gradients[i];
discretized_int8[2 * i + 1] = gradient >= 0.0f ?
static_cast<int8_t>(gradient * inverse_gradient_scale_ + 0.5) :
static_cast<int8_t>(gradient * inverse_gradient_scale_ - 0.5);
discretized_int8[2 * i] = static_cast<int8_t>(1);
}
} else {
#pragma omp parallel for schedule(static) num_threads(num_threads)
for (data_size_t i = 0; i < num_data; ++i) {
const double gradient = input_gradients[i];
discretized_int8[2 * i + 1] = gradient >= 0.0f ?
static_cast<int8_t>(gradient * inverse_gradient_scale_ + 0.5) :
static_cast<int8_t>(gradient * inverse_gradient_scale_ - 0.5);
discretized_int8[2 * i] = static_cast<int8_t>(input_hessians[i] * inverse_hessian_scale_ + 0.5);
}
}
}
}
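To make the rounding above concrete, a standalone sketch (not part of the patch) for a single gradient value, assuming the default num_grad_quant_bins = 4:
#include <cstdint>
#include <cstdio>
#include <random>
int main() {
  const int num_grad_quant_bins = 4;
  const double max_gradient_abs = 2.0;  // max |g| over the data
  const double gradient_scale = max_gradient_abs / (num_grad_quant_bins / 2);  // = 1.0
  const double inverse_gradient_scale = 1.0 / gradient_scale;
  const double gradient = -1.3;
  // deterministic branch: adding/subtracting 0.5 before truncation rounds half away from zero
  const int8_t det = gradient >= 0.0
      ? static_cast<int8_t>(gradient * inverse_gradient_scale + 0.5)
      : static_cast<int8_t>(gradient * inverse_gradient_scale - 0.5);
  // stochastic branch: a uniform random value replaces 0.5, so the expected
  // quantized value equals gradient * inverse_gradient_scale
  std::mt19937 eng(42);
  std::uniform_real_distribution<double> dist(0.0, 1.0);
  const int8_t sto = gradient >= 0.0
      ? static_cast<int8_t>(gradient * inverse_gradient_scale + dist(eng))
      : static_cast<int8_t>(gradient * inverse_gradient_scale - dist(eng));
  std::printf("deterministic = %d, stochastic = %d\n", det, sto);
  return 0;
}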
template <bool IS_GLOBAL>
void GradientDiscretizer::SetNumBitsInHistogramBin(
const int left_leaf_index, const int right_leaf_index,
const data_size_t num_data_in_left_leaf, const data_size_t num_data_in_right_leaf) {
std::vector<int8_t>& leaf_num_bits_in_histogram_bin = IS_GLOBAL ?
global_leaf_num_bits_in_histogram_bin_ : leaf_num_bits_in_histogram_bin_;
std::vector<int8_t>& node_num_bits_in_histogram_bin = IS_GLOBAL ?
global_node_num_bits_in_histogram_bin_ : node_num_bits_in_histogram_bin_;
if (right_leaf_index == -1) {
const uint64_t max_stat_per_bin = static_cast<uint64_t>(num_data_in_left_leaf) * static_cast<uint64_t>(num_grad_quant_bins_);
if (max_stat_per_bin < 256) {
leaf_num_bits_in_histogram_bin[left_leaf_index] = 8;
} else if (max_stat_per_bin < 65536) {
leaf_num_bits_in_histogram_bin[left_leaf_index] = 16;
} else {
leaf_num_bits_in_histogram_bin[left_leaf_index] = 32;
}
} else {
const uint64_t max_stat_left_per_bin = static_cast<uint64_t>(num_data_in_left_leaf) * static_cast<uint64_t>(num_grad_quant_bins_);
const uint64_t max_stat_right_per_bin = static_cast<uint64_t>(num_data_in_right_leaf) * static_cast<uint64_t>(num_grad_quant_bins_);
node_num_bits_in_histogram_bin[left_leaf_index] = leaf_num_bits_in_histogram_bin[left_leaf_index];
if (max_stat_left_per_bin < 256) {
leaf_num_bits_in_histogram_bin[left_leaf_index] = 8;
} else if (max_stat_left_per_bin < 65536) {
leaf_num_bits_in_histogram_bin[left_leaf_index] = 16;
} else {
leaf_num_bits_in_histogram_bin[left_leaf_index] = 32;
}
if (max_stat_right_per_bin < 256) {
leaf_num_bits_in_histogram_bin[right_leaf_index] = 8;
} else if (max_stat_right_per_bin < 65536) {
leaf_num_bits_in_histogram_bin[right_leaf_index] = 16;
} else {
leaf_num_bits_in_histogram_bin[right_leaf_index] = 32;
}
}
}
template void GradientDiscretizer::SetNumBitsInHistogramBin<false>(
const int left_leaf_index, const int right_leaf_index,
const data_size_t num_data_in_left_leaf, const data_size_t num_data_in_right_leaf);
template void GradientDiscretizer::SetNumBitsInHistogramBin<true>(
const int left_leaf_index, const int right_leaf_index,
const data_size_t num_data_in_left_leaf, const data_size_t num_data_in_right_leaf);
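A compact restatement of the rule the branches above encode (an illustrative sketch, not code from the patch): the per-leaf histogram bit width is chosen by comparing num_data_in_leaf * num_grad_quant_bins, a bound on the per-bin accumulators, against the 256 and 65536 thresholds.
#include <cstdint>
#include <cstdio>
int8_t HistBitsForLeaf(uint64_t num_data_in_leaf, uint64_t num_grad_quant_bins) {
  const uint64_t max_stat_per_bin = num_data_in_leaf * num_grad_quant_bins;
  if (max_stat_per_bin < 256) return 8;
  if (max_stat_per_bin < 65536) return 16;
  return 32;
}
int main() {
  std::printf("%d %d %d\n",
              HistBitsForLeaf(50, 4),        // 200     -> 8-bit bins
              HistBitsForLeaf(10000, 4),     // 40000   -> 16-bit bins
              HistBitsForLeaf(1000000, 4));  // 4000000 -> 32-bit bins
  return 0;
}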
void GradientDiscretizer::RenewIntGradTreeOutput(
Tree* tree, const Config* config, const DataPartition* data_partition,
const score_t* gradients, const score_t* hessians,
const std::function<data_size_t(int)>& leaf_index_to_global_num_data) {
global_timer.Start("GradientDiscretizer::RenewIntGradTreeOutput");
if (config->tree_learner == std::string("data")) {
for (int leaf_id = 0; leaf_id < tree->num_leaves(); ++leaf_id) {
data_size_t leaf_cnt = 0;
const data_size_t* data_indices = data_partition->GetIndexOnLeaf(leaf_id, &leaf_cnt);
double sum_gradient = 0.0f, sum_hessian = 0.0f;
#pragma omp parallel for schedule(static) reduction(+:sum_gradient, sum_hessian)
for (data_size_t i = 0; i < leaf_cnt; ++i) {
const data_size_t index = data_indices[i];
const score_t grad = gradients[index];
const score_t hess = hessians[index];
sum_gradient += grad;
sum_hessian += hess;
}
leaf_grad_hess_stats_[2 * leaf_id] = sum_gradient;
leaf_grad_hess_stats_[2 * leaf_id + 1] = sum_hessian;
}
std::vector<double> global_leaf_grad_hess_stats = Network::GlobalSum<double>(&leaf_grad_hess_stats_);
for (int leaf_id = 0; leaf_id < tree->num_leaves(); ++leaf_id) {
const double sum_gradient = global_leaf_grad_hess_stats[2 * leaf_id];
const double sum_hessian = global_leaf_grad_hess_stats[2 * leaf_id + 1];
const double leaf_output = FeatureHistogram::CalculateSplittedLeafOutput<true, true, false>(
sum_gradient, sum_hessian,
config->lambda_l1, config->lambda_l2, config->max_delta_step, config->path_smooth,
leaf_index_to_global_num_data(leaf_id), 0.0f);
tree->SetLeafOutput(leaf_id, leaf_output);
}
} else {
for (int leaf_id = 0; leaf_id < tree->num_leaves(); ++leaf_id) {
data_size_t leaf_cnt = 0;
const data_size_t* data_indices = data_partition->GetIndexOnLeaf(leaf_id, &leaf_cnt);
double sum_gradient = 0.0f, sum_hessian = 0.0f;
#pragma omp parallel for schedule(static) reduction(+:sum_gradient, sum_hessian)
for (data_size_t i = 0; i < leaf_cnt; ++i) {
const data_size_t index = data_indices[i];
const score_t grad = gradients[index];
const score_t hess = hessians[index];
sum_gradient += grad;
sum_hessian += hess;
}
const double leaf_output = FeatureHistogram::CalculateSplittedLeafOutput<true, true, false>(sum_gradient, sum_hessian,
config->lambda_l1, config->lambda_l2, config->max_delta_step, config->path_smooth,
leaf_cnt, 0.0f);
tree->SetLeafOutput(leaf_id, leaf_output);
}
}
global_timer.Stop("GradientDiscretizer::RenewIntGradTreeOutput");
}
} // namespace LightGBM
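RenewIntGradTreeOutput above refits each leaf value from full-precision gradient/hessian sums so that quantization error does not accumulate in the leaf outputs. As a rough sketch of the formula it delegates to (an assumption about CalculateSplittedLeafOutput, ignoring max_delta_step and path smoothing):
#include <algorithm>
#include <cmath>
#include <cstdio>
double LeafOutput(double sum_gradient, double sum_hessian,
                  double lambda_l1, double lambda_l2) {
  // soft-threshold the gradient sum by lambda_l1, then take the regularized Newton step
  const double thresholded =
      std::copysign(std::max(0.0, std::fabs(sum_gradient) - lambda_l1), sum_gradient);
  return -thresholded / (sum_hessian + lambda_l2);
}
int main() {
  std::printf("%f\n", LeafOutput(-12.5, 40.0, 0.0, 1.0));  // ~0.3049
  return 0;
}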
/*!
* Copyright (c) 2022 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for
* license information.
*/
#ifndef LIGHTGBM_TREE_LEARNER_GRADIENT_DISCRETIZER_HPP_
#define LIGHTGBM_TREE_LEARNER_GRADIENT_DISCRETIZER_HPP_
#include <LightGBM/bin.h>
#include <LightGBM/meta.h>
#include <LightGBM/tree.h>
#include <LightGBM/utils/threading.h>
#include <random>
#include <vector>
#include "data_partition.hpp"
#include "feature_histogram.hpp"
namespace LightGBM {
class GradientDiscretizer {
public:
GradientDiscretizer(int num_grad_quant_bins, int num_trees, int random_seed, bool is_constant_hessian, const bool stochastic_rounding) {
num_grad_quant_bins_ = num_grad_quant_bins;
iter_ = 0;
num_trees_ = num_trees;
random_seed_ = random_seed;
is_constant_hessian_ = is_constant_hessian;
stochastic_rounding_ = stochastic_rounding;
}
~GradientDiscretizer() {}
virtual void DiscretizeGradients(
const data_size_t num_data,
const score_t* input_gradients,
const score_t* input_hessians);
virtual const int8_t* discretized_gradients_and_hessians() const {
return discretized_gradients_and_hessians_vector_.data();
}
virtual double grad_scale() const {
return gradient_scale_;
}
virtual double hess_scale() const {
return hessian_scale_;
}
virtual void Init(
const data_size_t num_data, const int num_leaves,
const int num_features, const Dataset* train_data);
template <bool IS_GLOBAL>
void SetNumBitsInHistogramBin(
const int left_leaf_index, const int right_leaf_index,
const data_size_t num_data_in_left_leaf, const data_size_t num_data_in_right_leaf);
template <bool IS_GLOBAL>
int8_t GetHistBitsInLeaf(const int leaf_index) {
if (IS_GLOBAL) {
return global_leaf_num_bits_in_histogram_bin_[leaf_index];
} else {
return leaf_num_bits_in_histogram_bin_[leaf_index];
}
}
template <bool IS_GLOBAL>
int8_t GetHistBitsInNode(const int node_index) {
if (IS_GLOBAL) {
return global_node_num_bits_in_histogram_bin_[node_index];
} else {
return node_num_bits_in_histogram_bin_[node_index];
}
}
int8_t* ordered_int_gradients_and_hessians() {
return ordered_int_gradients_and_hessians_.data();
}
void RenewIntGradTreeOutput(
Tree* tree, const Config* config, const DataPartition* data_partition,
const score_t* gradients, const score_t* hessians,
const std::function<data_size_t(int)>& leaf_index_to_global_num_data);
int32_t* GetChangeHistBitsBuffer(const int feature_index) {
return change_hist_bits_buffer_[feature_index].data();
}
protected:
int num_grad_quant_bins_;
int iter_;
int num_trees_;
int random_seed_;
bool stochastic_rounding_;
std::vector<double> gradient_random_values_;
std::vector<double> hessian_random_values_;
std::mt19937 random_values_use_start_eng_;
std::uniform_int_distribution<data_size_t> random_values_use_start_dist_;
std::vector<int8_t> discretized_gradients_and_hessians_vector_;
std::vector<int8_t> ordered_int_gradients_and_hessians_;
double max_gradient_abs_;
double max_hessian_abs_;
double gradient_scale_;
double hessian_scale_;
double inverse_gradient_scale_;
double inverse_hessian_scale_;
bool is_constant_hessian_;
int num_leaves_;
std::vector<int8_t> leaf_num_bits_in_histogram_bin_;
std::vector<int8_t> node_num_bits_in_histogram_bin_;
std::vector<int8_t> global_leaf_num_bits_in_histogram_bin_;
std::vector<int8_t> global_node_num_bits_in_histogram_bin_;
std::vector<double> leaf_grad_hess_stats_;
std::vector<std::vector<int32_t>> change_hist_bits_buffer_;
};
} // namespace LightGBM
#endif // LIGHTGBM_TREE_LEARNER_GRADIENT_DISCRETIZER_HPP_