Unverified commit 17ecfab3 authored by shiyu1994, committed by GitHub

Add quantized training (CPU part) (#5800)

* add quantized training (first stage)

* add histogram construction functions for integer gradients

* add stochastic rounding

* update docs

* fix compilation errors by adding template instantiations

* update files for compilation

* fix compilation of gpu version

* initialize gradient discretizer before share states

* add a test case for quantized training

* add quantized training for data distributed training

* Delete origin.pred

* Delete ifelse.pred

* Delete LightGBM_model.txt

* remove useless changes

* fix lint error

* remove debug logging

* fix mismatch of vector and allocator types

* remove changes in main.cpp

* fix bugs with uninitialized gradient discretizer

* initialize ordered gradients in gradient discretizer

* disable quantized training with gpu and cuda

* fix msvc compilation errors and warnings

* fix bug in data parallel tree learner

* make quantized training test deterministic

* make quantized training in test case more accurate

* refactor test_quantized_training

* fix leaf splits initialization with quantized training

* check distributed quantized training result
parent a97c444b
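The commit message above mentions stochastic rounding, which the new gradient_discretizer.cpp presumably applies when quantizing gradients (the config exposes a stochastic_rounding flag, passed to the GradientDiscretizer constructor below). As background, a minimal sketch of the technique (illustration only, not the PR's exact code):

#include <cmath>
#include <cstdint>
#include <random>

// round a scaled gradient down or up with probability equal to its fractional
// part, so the quantization error is zero in expectation
int8_t StochasticRound(double gradient, double inv_grad_scale, std::mt19937* rng) {
  const double scaled = gradient * inv_grad_scale;  // gradient in bin units
  const double low = std::floor(scaled);
  const double frac = scaled - low;                 // fractional part in [0, 1)
  std::uniform_real_distribution<double> unif(0.0, 1.0);
  return static_cast<int8_t>(low + (unif(*rng) < frac ? 1.0 : 0.0));
}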
......@@ -85,6 +85,38 @@ class LeafSplits {
sum_hessians_ = tmp_sum_hessians;
}
/*!
* \brief Init splits on the current leaf; traverses all data to sum up the results
* \param int_gradients_and_hessians Discretized gradients and hessians
* \param grad_scale Scaling factor to recover original gradients from discretized gradients
* \param hess_scale Scaling factor to recover original hessians from discretized hessians
*/
void Init(const int8_t* int_gradients_and_hessians,
const double grad_scale, const double hess_scale) {
num_data_in_leaf_ = num_data_;
leaf_index_ = 0;
data_indices_ = nullptr;
double tmp_sum_gradients = 0.0;
double tmp_sum_hessians = 0.0;
const int16_t* packed_int_gradients_and_hessians = reinterpret_cast<const int16_t*>(int_gradients_and_hessians);
int64_t tmp_sum_gradients_and_hessians = 0;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
tmp_sum_gradients += int_gradients_and_hessians[2 * i + 1] * grad_scale;
tmp_sum_hessians += int_gradients_and_hessians[2 * i] * hess_scale;
const int16_t packed_int_grad_and_hess = packed_int_gradients_and_hessians[i];
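// per sample the bytes are stored as (hessian, gradient); on the little-endian
// targets this layout assumes, the int16's high byte is the gradient. Sign-extend
// it into the upper 32 bits and keep the non-negative hessian byte in the lower
// 32 bits, so the single 64-bit add below accumulates both integer sums at once.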
const int64_t packed_long_int_grad_and_hess =
(static_cast<int64_t>(static_cast<int8_t>(packed_int_grad_and_hess >> 8)) << 32) |
(static_cast<int64_t>(packed_int_grad_and_hess & 0x00ff));
tmp_sum_gradients_and_hessians += packed_long_int_grad_and_hess;
}
sum_gradients_ = tmp_sum_gradients;
sum_hessians_ = tmp_sum_hessians;
int_sum_gradients_and_hessians_ = tmp_sum_gradients_and_hessians;
}
/*!
* \brief Init splits on current leaf of partial data.
* \param leaf Index of current leaf
......@@ -109,6 +141,40 @@ class LeafSplits {
}
/*!
* \brief Init splits on current leaf of partial data.
* \param leaf Index of current leaf
* \param data_partition current data partition
* \param int_gradients_and_hessians Discretized gradients and hessians
* \param grad_scale Scaling factor to recover original gradients from discretized gradients
* \param hess_scale Scaling factor to recover original hessians from discretized hessians
*/
void Init(int leaf, const DataPartition* data_partition,
const int8_t* int_gradients_and_hessians,
const score_t grad_scale, const score_t hess_scale) {
leaf_index_ = leaf;
data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
double tmp_sum_gradients = 0.0;
double tmp_sum_hessians = 0.0;
const int16_t* packed_int_gradients_and_hessians = reinterpret_cast<const int16_t*>(int_gradients_and_hessians);
int64_t tmp_sum_gradients_and_hessians = 0;
#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_)
for (data_size_t i = 0; i < num_data_in_leaf_; ++i) {
const data_size_t idx = data_indices_[i];
tmp_sum_gradients += int_gradients_and_hessians[2 * idx + 1] * grad_scale;
tmp_sum_hessians += int_gradients_and_hessians[2 * idx] * hess_scale;
const int16_t packed_int_grad_and_hess = packed_int_gradients_and_hessians[idx];
const int64_t packed_long_int_grad_and_hess =
(static_cast<int64_t>(static_cast<int8_t>(packed_int_grad_and_hess >> 8)) << 32) |
(static_cast<int64_t>(packed_int_grad_and_hess & 0x00ff));
tmp_sum_gradients_and_hessians += packed_long_int_grad_and_hess;
}
sum_gradients_ = tmp_sum_gradients;
sum_hessians_ = tmp_sum_hessians;
int_sum_gradients_and_hessians_ = tmp_sum_gradients_and_hessians;
}
/*!
* \brief Init splits on current leaf, only update sum_gradients and sum_hessians
* \param sum_gradients
......@@ -120,6 +186,19 @@ class LeafSplits {
sum_hessians_ = sum_hessians;
}
/*!
* \brief Init splits on current leaf, only update sum_gradients and sum_hessians
* \param sum_gradients
* \param sum_hessians
* \param int_sum_gradients_and_hessians
*/
void Init(double sum_gradients, double sum_hessians, int64_t int_sum_gradients_and_hessians) {
leaf_index_ = 0;
sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians;
int_sum_gradients_and_hessians_ = int_sum_gradients_and_hessians;
}
/*!
* \brief Init splits on current leaf
*/
......@@ -142,6 +221,9 @@ class LeafSplits {
/*! \brief Get sum of Hessians of current leaf */
double sum_hessians() const { return sum_hessians_; }
/*! \brief Get sum of discretized gradients and Hessians of current leaf */
int64_t int_sum_gradients_and_hessians() const { return int_sum_gradients_and_hessians_; }
/*! \brief Get indices of data of current leaf */
const data_size_t* data_indices() const { return data_indices_; }
......@@ -162,6 +244,8 @@ class LeafSplits {
double sum_gradients_;
/*! \brief sum of Hessians of current leaf */
double sum_hessians_;
/*! \brief sum of discretized gradients and Hessians of current leaf */
int64_t int_sum_gradients_and_hessians_;
/*! \brief indices of data of current leaf */
const data_size_t* data_indices_;
/*! \brief weight of current leaf */
......
......@@ -71,15 +71,24 @@ class DataParallelTreeLearner: public TREELEARNER_T {
}
}
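/*! \brief Compute reduce-scatter block boundaries and buffer read/write
 *  offsets; takes the per-bin histogram entry size as a parameter so the same
 *  routine can lay out both the default and the int16 quantized histogram
 *  buffers */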
void PrepareBufferPos(
const std::vector<std::vector<int>>& feature_distribution,
std::vector<comm_size_t>* block_start,
std::vector<comm_size_t>* block_len,
std::vector<comm_size_t>* buffer_write_start_pos,
std::vector<comm_size_t>* buffer_read_start_pos,
comm_size_t* reduce_scatter_size,
size_t hist_entry_size);
private:
/*! \brief Rank of local machine */
int rank_;
/*! \brief Number of machines of this parallel task */
int num_machines_;
/*! \brief Buffer for network send */
std::vector<char, Common::AlignmentAllocator<char, 32>> input_buffer_;
/*! \brief Buffer for network receive */
std::vector<char, Common::AlignmentAllocator<char, 32>> output_buffer_;
/*! \brief Different machines aggregate histograms for different features;
 *  marks which features are aggregated locally */
std::vector<bool> is_feature_aggregated_;
......@@ -87,12 +96,22 @@ class DataParallelTreeLearner: public TREELEARNER_T {
std::vector<comm_size_t> block_start_;
/*! \brief Block size for reduce scatter */
std::vector<comm_size_t> block_len_;
/*! \brief Block start index for reduce scatter with int16 histograms */
std::vector<comm_size_t> block_start_int16_;
/*! \brief Block size for reduce scatter with int16 histograms */
std::vector<comm_size_t> block_len_int16_;
/*! \brief Write positions for feature histograms */
std::vector<comm_size_t> buffer_write_start_pos_;
/*! \brief Read positions for local feature histograms */
std::vector<comm_size_t> buffer_read_start_pos_;
/*! \brief Write positions for feature histograms with int16 histograms */
std::vector<comm_size_t> buffer_write_start_pos_int16_;
/*! \brief Read positions for local feature histograms with int16 histograms */
std::vector<comm_size_t> buffer_read_start_pos_int16_;
/*! \brief Size for reduce scatter */
comm_size_t reduce_scatter_size_;
/*! \brief Size for reduce scatter with int16 histograms */
comm_size_t reduce_scatter_size_int16_;
/*! \brief Store global number of data in leaves */
std::vector<data_size_t> global_data_count_in_leaf_;
};
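The duplicated *_int16_ members above exist because every reduce-scatter offset scales with the size of one histogram entry, which differs between the default double-precision histograms and the int16 quantized ones. A hypothetical helper (names are mine, not from the PR) to illustrate why PrepareBufferPos takes hist_entry_size:

#include <cstddef>
#include <vector>

// byte offset of each block = offset in bins * size of one histogram entry,
// so the same bin layout yields two different buffer layouts for the
// double-precision and the int16 quantized histograms
std::vector<size_t> ToByteOffsets(const std::vector<size_t>& bin_offsets,
                                  size_t hist_entry_size) {
  std::vector<size_t> byte_offsets;
  byte_offsets.reserve(bin_offsets.size());
  for (const size_t bin_offset : bin_offsets) {
    byte_offsets.push_back(bin_offset * hist_entry_size);
  }
  return byte_offsets;
}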
......
......@@ -21,6 +21,7 @@ namespace LightGBM {
SerialTreeLearner::SerialTreeLearner(const Config* config)
: config_(config), col_sampler_(config) {
gradient_discretizer_ = nullptr;
}
SerialTreeLearner::~SerialTreeLearner() {
......@@ -60,6 +61,11 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
ordered_gradients_.resize(num_data_);
ordered_hessians_.resize(num_data_);
if (config_->use_quantized_grad) {
gradient_discretizer_.reset(new GradientDiscretizer(config_->num_grad_quant_bins, config_->num_iterations, config_->seed, is_constant_hessian, config_->stochastic_rounding));
gradient_discretizer_->Init(num_data_, config_->num_leaves, num_features_, train_data_);
}
GetShareStates(train_data_, is_constant_hessian, true);
histogram_pool_.DynamicChangeSize(train_data_,
share_state_->num_hist_total_bin(),
......@@ -76,17 +82,31 @@ void SerialTreeLearner::GetShareStates(const Dataset* dataset,
bool is_constant_hessian,
bool is_first_time) {
if (is_first_time) {
if (config_->use_quantized_grad) {
share_state_.reset(dataset->GetShareStates<true, 32>(
reinterpret_cast<score_t*>(gradient_discretizer_->ordered_int_gradients_and_hessians()), nullptr,
col_sampler_.is_feature_used_bytree(), is_constant_hessian,
config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins));
} else {
share_state_.reset(dataset->GetShareStates<false, 0>(
ordered_gradients_.data(), ordered_hessians_.data(),
col_sampler_.is_feature_used_bytree(), is_constant_hessian,
config_->force_col_wise, config_->force_row_wise, config_->num_grad_quant_bins));
}
} else {
CHECK_NOTNULL(share_state_);
// cannot change is_hist_col_wise during training
if (config_->use_quantized_grad) {
share_state_.reset(dataset->GetShareStates<true, 32>(
reinterpret_cast<score_t*>(gradient_discretizer_->ordered_int_gradients_and_hessians()), nullptr,
col_sampler_.is_feature_used_bytree(), is_constant_hessian,
share_state_->is_col_wise, !share_state_->is_col_wise, config_->num_grad_quant_bins));
} else {
share_state_.reset(dataset->GetShareStates<false, 0>(
ordered_gradients_.data(), ordered_hessians_.data(), col_sampler_.is_feature_used_bytree(),
is_constant_hessian, share_state_->is_col_wise,
!share_state_->is_col_wise, config_->num_grad_quant_bins));
}
}
}
CHECK_NOTNULL(share_state_);
}
......@@ -169,6 +189,10 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
}
share_state_->num_threads = num_threads;
if (config_->use_quantized_grad) {
gradient_discretizer_->DiscretizeGradients(num_data_, gradients_, hessians_);
}
// some initial works before training
BeforeTrain();
......@@ -205,6 +229,11 @@ Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians
cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf));
}
if (config_->use_quantized_grad && config_->quant_train_renew_leaf) {
gradient_discretizer_->RenewIntGradTreeOutput(tree.get(), config_, data_partition_.get(), gradients_, hessians_,
[this] (int leaf_index) { return GetGlobalDataCountInLeaf(leaf_index); });
}
Log::Debug("Trained a tree with leaves = %d and depth = %d", tree->num_leaves(), cur_depth);
return tree.release();
}
......@@ -270,11 +299,25 @@ void SerialTreeLearner::BeforeTrain() {
// Sumup for root
if (data_partition_->leaf_count(0) == num_data_) {
// use all data
if (!config_->use_quantized_grad) {
smaller_leaf_splits_->Init(gradients_, hessians_);
} else {
smaller_leaf_splits_->Init(
gradient_discretizer_->discretized_gradients_and_hessians(),
gradient_discretizer_->grad_scale(),
gradient_discretizer_->hess_scale());
}
} else {
// use bagging, only use part of data
if (!config_->use_quantized_grad) {
smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_);
} else {
smaller_leaf_splits_->Init(
0, data_partition_.get(),
gradient_discretizer_->discretized_gradients_and_hessians(),
gradient_discretizer_->grad_scale(),
gradient_discretizer_->hess_scale());
}
}
larger_leaf_splits_->Init();
......@@ -282,6 +325,10 @@ void SerialTreeLearner::BeforeTrain() {
if (cegb_ != nullptr) {
cegb_->BeforeTrain();
}
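// presumably: pick the histogram accumulator bit width (16 vs 32 bits) for the
// root leaf from its local data count; the data-parallel learner is excluded
// here because it needs to decide bit widths from globally synced data counts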
if (config_->use_quantized_grad && config_->tree_learner != std::string("data")) {
gradient_discretizer_->SetNumBitsInHistogramBin<false>(0, -1, data_partition_->leaf_count(0), 0);
}
}
bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) {
......@@ -353,9 +400,53 @@ void SerialTreeLearner::ConstructHistograms(
Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms",
global_timer);
// construct smaller leaf
if (config_->use_quantized_grad) {
const uint8_t smaller_leaf_num_bits = gradient_discretizer_->GetHistBitsInLeaf<false>(smaller_leaf_splits_->leaf_index());
hist_t* ptr_smaller_leaf_hist_data =
smaller_leaf_num_bits <= 16 ?
reinterpret_cast<hist_t*>(smaller_leaf_histogram_array_[0].RawDataInt16() - kHistOffset) :
reinterpret_cast<hist_t*>(smaller_leaf_histogram_array_[0].RawDataInt32() - kHistOffset);
#define SMALLER_LEAF_ARGS \
is_feature_used, smaller_leaf_splits_->data_indices(), \
smaller_leaf_splits_->num_data_in_leaf(), \
reinterpret_cast<const score_t*>(gradient_discretizer_->discretized_gradients_and_hessians()), \
nullptr, \
reinterpret_cast<score_t*>(gradient_discretizer_->ordered_int_gradients_and_hessians()), \
nullptr, \
share_state_.get(), \
reinterpret_cast<hist_t*>(ptr_smaller_leaf_hist_data)
if (smaller_leaf_num_bits <= 16) {
train_data_->ConstructHistograms<true, 16>(SMALLER_LEAF_ARGS);
} else {
train_data_->ConstructHistograms<true, 32>(SMALLER_LEAF_ARGS);
}
#undef SMALLER_LEAF_ARGS
if (larger_leaf_histogram_array_ && !use_subtract) {
const uint8_t larger_leaf_num_bits = gradient_discretizer_->GetHistBitsInLeaf<false>(larger_leaf_splits_->leaf_index());
hist_t* ptr_larger_leaf_hist_data =
larger_leaf_num_bits <= 16 ?
reinterpret_cast<hist_t*>(larger_leaf_histogram_array_[0].RawDataInt16() - kHistOffset) :
reinterpret_cast<hist_t*>(larger_leaf_histogram_array_[0].RawDataInt32() - kHistOffset);
#define LARGER_LEAF_ARGS \
is_feature_used, larger_leaf_splits_->data_indices(), \
larger_leaf_splits_->num_data_in_leaf(), \
reinterpret_cast<const score_t*>(gradient_discretizer_->discretized_gradients_and_hessians()), \
nullptr, \
reinterpret_cast<score_t*>(gradient_discretizer_->ordered_int_gradients_and_hessians()), \
nullptr, \
share_state_.get(), \
reinterpret_cast<hist_t*>(ptr_larger_leaf_hist_data)
if (larger_leaf_num_bits <= 16) {
train_data_->ConstructHistograms<true, 16>(LARGER_LEAF_ARGS);
} else {
train_data_->ConstructHistograms<true, 32>(LARGER_LEAF_ARGS);
}
#undef LARGER_LEAF_ARGS
}
} else {
hist_t* ptr_smaller_leaf_hist_data =
smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
train_data_->ConstructHistograms<false, 0>(
is_feature_used, smaller_leaf_splits_->data_indices(),
smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(),
......@@ -364,12 +455,13 @@ void SerialTreeLearner::ConstructHistograms(
// construct larger leaf
hist_t* ptr_larger_leaf_hist_data =
larger_leaf_histogram_array_[0].RawData() - kHistOffset;
train_data_->ConstructHistograms<false, 0>(
is_feature_used, larger_leaf_splits_->data_indices(),
larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(),
ptr_larger_leaf_hist_data);
}
}
}
void SerialTreeLearner::FindBestSplitsFromHistograms(
......@@ -388,6 +480,26 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(
if (larger_leaf_splits_->leaf_index() >= 0) {
larger_node_used_features = col_sampler_.GetByNode(tree, larger_leaf_splits_->leaf_index());
}
if (use_subtract && config_->use_quantized_grad) {
const int parent_index = std::min(smaller_leaf_splits_->leaf_index(), larger_leaf_splits_->leaf_index());
const uint8_t parent_hist_bits = gradient_discretizer_->GetHistBitsInNode<false>(parent_index);
const uint8_t larger_hist_bits = gradient_discretizer_->GetHistBitsInLeaf<false>(larger_leaf_splits_->leaf_index());
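// the larger leaf's array still holds the parent's 32-bit histogram, but the
// larger leaf itself only needs 16 bits; stash a per-feature copy so that the
// 32-bit-minus-16-bit subtraction later can emit its result in the narrower
// 16-bit layout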
if (parent_hist_bits > 16 && larger_hist_bits <= 16) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static) num_threads(share_state_->num_threads)
for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
OMP_LOOP_EX_BEGIN();
if (!is_feature_used[feature_index]) {
continue;
}
larger_leaf_histogram_array_[feature_index].CopyToBuffer(gradient_discretizer_->GetChangeHistBitsBuffer(feature_index));
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
}
}
OMP_INIT_EX();
// find splits
#pragma omp parallel for schedule(static) num_threads(share_state_->num_threads)
......@@ -397,10 +509,24 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(
continue;
}
const int tid = omp_get_thread_num();
if (config_->use_quantized_grad) {
const uint8_t hist_bits_bin = gradient_discretizer_->GetHistBitsInLeaf<false>(smaller_leaf_splits_->leaf_index());
const int64_t int_sum_gradient_and_hessian = smaller_leaf_splits_->int_sum_gradients_and_hessians();
if (hist_bits_bin <= 16) {
train_data_->FixHistogramInt<int32_t, int32_t, 16, 16>(
feature_index, int_sum_gradient_and_hessian,
reinterpret_cast<hist_t*>(smaller_leaf_histogram_array_[feature_index].RawDataInt16()));
} else {
train_data_->FixHistogramInt<int64_t, int64_t, 32, 32>(
feature_index, int_sum_gradient_and_hessian,
reinterpret_cast<hist_t*>(smaller_leaf_histogram_array_[feature_index].RawDataInt32()));
}
} else {
train_data_->FixHistogram(
feature_index, smaller_leaf_splits_->sum_gradients(),
smaller_leaf_splits_->sum_hessians(),
smaller_leaf_histogram_array_[feature_index].RawData());
}
int real_fidx = train_data_->RealFeatureIndex(feature_index);
ComputeBestSplitForFeature(smaller_leaf_histogram_array_, feature_index,
......@@ -417,14 +543,51 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(
}
if (use_subtract) {
if (config_->use_quantized_grad) {
const int parent_index = std::min(smaller_leaf_splits_->leaf_index(), larger_leaf_splits_->leaf_index());
const uint8_t parent_hist_bits = gradient_discretizer_->GetHistBitsInNode<false>(parent_index);
const uint8_t smaller_hist_bits = gradient_discretizer_->GetHistBitsInLeaf<false>(smaller_leaf_splits_->leaf_index());
const uint8_t larger_hist_bits = gradient_discretizer_->GetHistBitsInLeaf<false>(larger_leaf_splits_->leaf_index());
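// four bit-width cases for larger = parent - smaller:
// parent 16-bit: both children fit in 16 bits, subtract in place;
// parent 32-bit but larger 16-bit: subtract from the 32-bit copy stashed in
//   the per-feature buffer earlier, writing the result in 16-bit layout;
// parent 32-bit, smaller 16-bit: widen the smaller histogram on the fly;
// otherwise: plain 32-bit in-place subtraction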
if (parent_hist_bits <= 16) {
CHECK_LE(smaller_hist_bits, 16);
CHECK_LE(larger_hist_bits, 16);
larger_leaf_histogram_array_[feature_index].Subtract<true, int32_t, int32_t, int32_t, 16, 16, 16>(
smaller_leaf_histogram_array_[feature_index]);
} else if (larger_hist_bits <= 16) {
CHECK_LE(smaller_hist_bits, 16);
larger_leaf_histogram_array_[feature_index].Subtract<true, int64_t, int32_t, int32_t, 32, 16, 16>(
smaller_leaf_histogram_array_[feature_index], gradient_discretizer_->GetChangeHistBitsBuffer(feature_index));
} else if (smaller_hist_bits <= 16) {
larger_leaf_histogram_array_[feature_index].Subtract<true, int64_t, int32_t, int64_t, 32, 16, 32>(
smaller_leaf_histogram_array_[feature_index]);
} else {
larger_leaf_histogram_array_[feature_index].Subtract<true, int64_t, int64_t, int64_t, 32, 32, 32>(
smaller_leaf_histogram_array_[feature_index]);
}
} else {
larger_leaf_histogram_array_[feature_index].Subtract<false>(
smaller_leaf_histogram_array_[feature_index]);
}
} else {
if (config_->use_quantized_grad) {
const int64_t int_sum_gradient_and_hessian = larger_leaf_splits_->int_sum_gradients_and_hessians();
const uint8_t hist_bits_bin = gradient_discretizer_->GetHistBitsInLeaf<false>(larger_leaf_splits_->leaf_index());
if (hist_bits_bin <= 16) {
train_data_->FixHistogramInt<int32_t, int32_t, 16, 16>(
feature_index, int_sum_gradient_and_hessian,
reinterpret_cast<hist_t*>(larger_leaf_histogram_array_[feature_index].RawDataInt16()));
} else {
train_data_->FixHistogramInt<int64_t, int64_t, 32, 32>(
feature_index, int_sum_gradient_and_hessian,
reinterpret_cast<hist_t*>(larger_leaf_histogram_array_[feature_index].RawDataInt32()));
}
} else {
train_data_->FixHistogram(
feature_index, larger_leaf_splits_->sum_gradients(),
larger_leaf_splits_->sum_hessians(),
larger_leaf_histogram_array_[feature_index].RawData());
}
}
ComputeBestSplitForFeature(larger_leaf_histogram_array_, feature_index,
real_fidx,
......@@ -699,6 +862,11 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf,
best_split_info.left_sum_hessian,
best_split_info.left_output);
}
if (config_->use_quantized_grad && config_->tree_learner != std::string("data")) {
gradient_discretizer_->SetNumBitsInHistogramBin<false>(*left_leaf, *right_leaf,
data_partition_->leaf_count(*left_leaf),
data_partition_->leaf_count(*right_leaf));
}
auto leaves_need_update = constraints_->Update(
is_numerical_split, *left_leaf, *right_leaf,
best_split_info.monotone_type, best_split_info.right_output,
......@@ -762,9 +930,21 @@ void SerialTreeLearner::ComputeBestSplitForFeature(
train_data_->FeatureNumBin(feature_index));
}
SplitInfo new_split;
if (config_->use_quantized_grad) {
const uint8_t hist_bits_bin = gradient_discretizer_->GetHistBitsInLeaf<false>(leaf_splits->leaf_index());
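// the integer threshold search presumably works on the packed 64-bit
// gradient/hessian sum and uses grad_scale / hess_scale to turn integer
// accumulations back into real-valued split gains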
histogram_array_[feature_index].FindBestThresholdInt(
leaf_splits->int_sum_gradients_and_hessians(),
gradient_discretizer_->grad_scale(),
gradient_discretizer_->hess_scale(),
hist_bits_bin,
hist_bits_bin,
num_data,
constraints_->GetFeatureConstraint(leaf_splits->leaf_index(), feature_index), parent_output, &new_split);
} else {
histogram_array_[feature_index].FindBestThreshold(
leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data,
constraints_->GetFeatureConstraint(leaf_splits->leaf_index(), feature_index), parent_output, &new_split);
}
new_split.feature = real_fidx;
if (cegb_ != nullptr) {
new_split.gain -=
......
......@@ -24,6 +24,7 @@
#include "col_sampler.hpp"
#include "data_partition.hpp"
#include "feature_histogram.hpp"
#include "gradient_discretizer.hpp"
#include "leaf_splits.hpp"
#include "monotone_constraints.hpp"
#include "split_info.hpp"
......@@ -170,6 +171,8 @@ class SerialTreeLearner: public TreeLearner {
std::set<int> FindAllForceFeatures(Json force_split_leaf_setting);
void CheckSplit(const SplitInfo& best_split_info, const int left_leaf_index, const int right_leaf_index);
/*!
* \brief Get the number of data in a leaf
* \param leaf_idx The index of leaf
......@@ -230,6 +233,7 @@ class SerialTreeLearner: public TreeLearner {
const Json* forced_split_json_;
std::unique_ptr<TrainingShareStates> share_state_;
std::unique_ptr<CostEfficientGradientBoosting> cegb_;
std::unique_ptr<GradientDiscretizer> gradient_discretizer_;
};
inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const {
......
......@@ -40,10 +40,14 @@ struct SplitInfo {
double left_sum_gradient = 0;
/*! \brief Left sum hessian after split */
double left_sum_hessian = 0;
/*! \brief Left sum discretized gradient and hessian after split */
int64_t left_sum_gradient_and_hessian = 0;
/*! \brief Right sum gradient after split */
double right_sum_gradient = 0;
/*! \brief Right sum hessian after split */
double right_sum_hessian = 0;
/*! \brief Right sum discretized gradient and hessian after split */
int64_t right_sum_gradient_and_hessian = 0;
std::vector<uint32_t> cat_threshold;
/*! \brief True if default split is left */
bool default_left = true;
......@@ -71,10 +75,14 @@ struct SplitInfo {
buffer += sizeof(left_sum_gradient);
std::memcpy(buffer, &left_sum_hessian, sizeof(left_sum_hessian));
buffer += sizeof(left_sum_hessian);
std::memcpy(buffer, &left_sum_gradient_and_hessian, sizeof(left_sum_gradient_and_hessian));
buffer += sizeof(left_sum_gradient_and_hessian);
std::memcpy(buffer, &right_sum_gradient, sizeof(right_sum_gradient));
buffer += sizeof(right_sum_gradient);
std::memcpy(buffer, &right_sum_hessian, sizeof(right_sum_hessian));
buffer += sizeof(right_sum_hessian);
std::memcpy(buffer, &right_sum_gradient_and_hessian, sizeof(right_sum_gradient_and_hessian));
buffer += sizeof(right_sum_gradient_and_hessian);
std::memcpy(buffer, &default_left, sizeof(default_left));
buffer += sizeof(default_left);
std::memcpy(buffer, &monotone_type, sizeof(monotone_type));
......@@ -103,10 +111,14 @@ struct SplitInfo {
buffer += sizeof(left_sum_gradient);
std::memcpy(&left_sum_hessian, buffer, sizeof(left_sum_hessian));
buffer += sizeof(left_sum_hessian);
std::memcpy(&left_sum_gradient_and_hessian, buffer, sizeof(left_sum_gradient_and_hessian));
buffer += sizeof(left_sum_gradient_and_hessian);
std::memcpy(&right_sum_gradient, buffer, sizeof(right_sum_gradient));
buffer += sizeof(right_sum_gradient);
std::memcpy(&right_sum_hessian, buffer, sizeof(right_sum_hessian));
buffer += sizeof(right_sum_hessian);
std::memcpy(&right_sum_gradient_and_hessian, buffer, sizeof(right_sum_gradient_and_hessian));
buffer += sizeof(right_sum_gradient_and_hessian);
std::memcpy(&default_left, buffer, sizeof(default_left));
buffer += sizeof(default_left);
std::memcpy(&monotone_type, buffer, sizeof(monotone_type));
......
......@@ -1854,3 +1854,44 @@ def test_predict_with_raw_score(task, output, cluster):
if task.endswith('classification'):
pred_proba_raw = model.predict_proba(dX, raw_score=True).compute()
assert_eq(raw_predictions, pred_proba_raw)
def test_distributed_quantized_training(cluster):
with Client(cluster) as client:
X, y, w, _, dX, dy, dw, _ = _create_data(
objective='regression',
output='array'
)
np.savetxt("data_dask.csv", np.hstack([np.array([y]).T, X]), fmt="%f,%f,%f,%f,%f")
params = {
"boosting_type": 'gbdt',
"n_estimators": 50,
"num_leaves": 31,
'use_quantized_grad': True,
'num_grad_quant_bins': 30,
'quant_train_renew_leaf': True,
'verbose': -1,
'force_row_wise': True,
}
quant_dask_regressor = lgb.DaskLGBMRegressor(
client=client,
time_out=5,
**params
)
quant_dask_regressor = quant_dask_regressor.fit(dX, dy, sample_weight=dw)
quant_p1 = quant_dask_regressor.predict(dX)
quant_rmse = np.sqrt(np.mean((quant_p1.compute() - y) ** 2))
params["use_quantized_grad"] = False
dask_regressor = lgb.DaskLGBMRegressor(
client=client,
time_out=5,
**params
)
dask_regressor = dask_regressor.fit(dX, dy, sample_weight=dw)
p1 = dask_regressor.predict(dX)
rmse = np.sqrt(np.mean((p1.compute() - y) ** 2))
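# quantization adds noise, so only require the distributed quantized model to
# land within a loose RMSE margin of the full-precision run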
assert quant_rmse < rmse + 7.0
......@@ -4116,3 +4116,19 @@ def test_train_raises_informative_error_for_params_of_wrong_type():
dtrain = lgb.Dataset(X, label=y)
with pytest.raises(lgb.basic.LightGBMError, match="Parameter early_stopping_round should be of type int, got \"too-many\""):
lgb.train(params, dtrain)
def test_quantized_training():
X, y = make_synthetic_regression()
ds = lgb.Dataset(X, label=y)
bst_params = {'num_leaves': 15, 'verbose': -1, 'seed': 0}
bst = lgb.train(bst_params, ds, num_boost_round=10)
rmse = np.sqrt(np.mean((bst.predict(X) - y) ** 2))
bst_params.update({
'use_quantized_grad': True,
'num_grad_quant_bins': 30,
'quant_train_renew_leaf': True,
})
quant_bst = lgb.train(bst_params, ds, num_boost_round=10)
quant_rmse = np.sqrt(np.mean((quant_bst.predict(X) - y) ** 2))
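# loose tolerance: quantized training should stay close to full precision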
assert quant_rmse < rmse + 6.0
......@@ -306,6 +306,7 @@
<ClInclude Include="..\src\treelearner\parallel_tree_learner.h" />
<ClInclude Include="..\src\treelearner\serial_tree_learner.h" />
<ClInclude Include="..\src\treelearner\split_info.hpp" />
<ClInclude Include="..\src\treelearner\gradient_discretizer.hpp" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\src\application\application.cpp" />
......@@ -341,6 +342,7 @@
<ClCompile Include="..\src\treelearner\serial_tree_learner.cpp" />
<ClCompile Include="..\src\treelearner\tree_learner.cpp" />
<ClCompile Include="..\src\treelearner\voting_parallel_tree_learner.cpp" />
<ClCompile Include="..\src\treelearner\gradient_discretizer.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
......
......@@ -51,6 +51,9 @@
<ClInclude Include="..\src\treelearner\serial_tree_learner.h">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\src\treelearner\gradient_discretizer.hpp">
<Filter>src\treelearner</Filter>
</ClInclude>
<ClInclude Include="..\src\application\predictor.hpp">
<Filter>src\application</Filter>
</ClInclude>
......@@ -338,5 +341,8 @@
<ClCompile Include="..\src\treelearner\linear_tree_learner.cpp">
<Filter>src\treelearner</Filter>
</ClCompile>
<ClCompile Include="..\src\treelearner\gradient_discretizer.cpp">
<Filter>src\treelearner</Filter>
</ClCompile>
</ItemGroup>
</Project>
\ No newline at end of file