[CUDA] CUDA Quantized Training (fixes #5606) (#5933)

* add quantized training (first stage)
* add histogram construction functions for integer gradients
* add stochastic rounding
* update docs
* fix compilation errors by adding template instantiations
* update files for compilation
* fix compilation of gpu version
* initialize gradient discretizer before share states
* add a test case for quantized training
* add quantized training for data distributed training
* Delete origin.pred
* Delete ifelse.pred
* Delete LightGBM_model.txt
* remove useless changes
* fix lint error
* remove debug logging
* fix mismatch of vector and allocator types
* remove changes in main.cpp
* fix bugs with uninitialized gradient discretizer
* initialize ordered gradients in gradient discretizer
* disable quantized training with gpu and cuda; fix msvc compilation errors and warnings
* fix bug in data parallel tree learner
* make quantized training test deterministic
* make quantized training in test case more accurate
* refactor test_quantized_training
* fix leaf splits initialization with quantized training
* check distributed quantized training result
* add cuda gradient discretizer
* add quantized training for CUDA version in tree learner
* remove CUDA compute capability 6.1 and 6.2
* fix parts of gpu quantized training errors and warnings
* fix build-python.sh to install locally built version
* fix memory access bugs
* fix lint errors
* mark cuda quantized training on cuda with categorical features as unsupported
* rename cuda_utils.h to cuda_utils.hu
* enable quantized training with cuda
* fix cuda quantized training with sparse row data
* allow using global memory buffer in histogram construction with cuda quantized training
* recover build-python.sh; enlarge allowed package size to 100M

[CUDA] CUDA Quantized Training (fixes #5606) (#5933)
* add quantized training (first stage)
* add histogram construction functions for integer gradients
* add stochastic rounding
* update docs
* fix compilation errors by adding template instantiations
* update files for compilation
* fix compilation of gpu version
* initialize gradient discretizer before share states
* add a test case for quantized training
* add quantized training for data distributed training
* Delete origin.pred
* Delete ifelse.pred
* Delete LightGBM_model.txt
* remove useless changes
* fix lint error
* remove debug logging
* fix mismatch of vector and allocator types
* remove changes in main.cpp
* fix bugs with uninitialized gradient discretizer
* initialize ordered gradients in gradient discretizer
* disable quantized training with gpu and cuda; fix msvc compilation errors and warnings
* fix bug in data parallel tree learner
* make quantized training test deterministic
* make quantized training in test case more accurate
* refactor test_quantized_training
* fix leaf splits initialization with quantized training
* check distributed quantized training result
* add cuda gradient discretizer
* add quantized training for CUDA version in tree learner
* remove CUDA compute capability 6.1 and 6.2
* fix parts of gpu quantized training errors and warnings
* fix build-python.sh to install locally built version
* fix memory access bugs
* fix lint errors
* mark cuda quantized training on cuda with categorical features as unsupported
* rename cuda_utils.h to cuda_utils.hu
* enable quantized training with cuda
* fix cuda quantized training with sparse row data
* allow using global memory buffer in histogram construction with cuda quantized training
* recover build-python.sh; enlarge allowed package size to 100M
f901f471 · shiyu1994 · GitHub · 3d9ada76 · f901f471 · f901f471
Unverified Commit f901f471 authored Oct 08, 2023 by shiyu1994 Committed by GitHub Oct 08, 2023
20 changed files
--- a/.ci/check_python_dists.sh
+++ b/.ci/check_python_dists.sh
@@ -25,7 +25,7 @@ if [ $PY_MINOR_VER -gt 7 ]; then
        pydistcheck \
            --inspect \
            --ignore 'compiled-objects-have-debug-symbols,distro-too-large-compressed' \
-            --max-allowed-size-uncompressed '70M' \
+            --max-allowed-size-uncompressed '100M' \
            --max-allowed-files 800 \
            ${DIST_DIR}/* || exit -1
    elif { test $(uname -m) = "aarch64"; }; then

--- a/include/LightGBM/cuda/cuda_algorithms.hpp
+++ b/include/LightGBM/cuda/cuda_algorithms.hpp
@@ -13,7 +13,7 @@
 #include <stdio.h>
 #include <LightGBM/bin.h>
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include <LightGBM/utils/log.h>
 #include <algorithm>

--- a/include/LightGBM/cuda/cuda_column_data.hpp
+++ b/include/LightGBM/cuda/cuda_column_data.hpp
@@ -9,7 +9,7 @@
 #define LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_
 #include <LightGBM/config.h>
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include <LightGBM/bin.h>
 #include <LightGBM/utils/openmp_wrapper.h>

--- a/include/LightGBM/cuda/cuda_metadata.hpp
+++ b/include/LightGBM/cuda/cuda_metadata.hpp
@@ -8,7 +8,7 @@
 #ifndef LIGHTGBM_CUDA_CUDA_METADATA_HPP_
 #define LIGHTGBM_CUDA_CUDA_METADATA_HPP_
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include <LightGBM/meta.h>
 #include <vector>

--- a/include/LightGBM/cuda/cuda_metric.hpp
+++ b/include/LightGBM/cuda/cuda_metric.hpp
@@ -9,7 +9,7 @@
 #ifdef USE_CUDA
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include <LightGBM/metric.h>
 namespace LightGBM {

--- a/include/LightGBM/cuda/cuda_objective_function.hpp
+++ b/include/LightGBM/cuda/cuda_objective_function.hpp
@@ -9,7 +9,7 @@
 #ifdef USE_CUDA
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include <LightGBM/objective_function.h>
 #include <LightGBM/meta.h>

--- a/include/LightGBM/cuda/cuda_row_data.hpp
+++ b/include/LightGBM/cuda/cuda_row_data.hpp
@@ -10,7 +10,7 @@
 #include <LightGBM/bin.h>
 #include <LightGBM/config.h>
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include <LightGBM/dataset.h>
 #include <LightGBM/train_share_states.h>
 #include <LightGBM/utils/openmp_wrapper.h>

--- a/include/LightGBM/cuda/cuda_split_info.hpp
+++ b/include/LightGBM/cuda/cuda_split_info.hpp
@@ -24,12 +24,14 @@ class CUDASplitInfo {
  double left_sum_gradients;
  double left_sum_hessians;
+  int64_t left_sum_of_gradients_hessians;
  data_size_t left_count;
  double left_gain;
  double left_value;
  double right_sum_gradients;
  double right_sum_hessians;
+  int64_t right_sum_of_gradients_hessians;
  data_size_t right_count;
  double right_gain;
  double right_value;

--- a/include/LightGBM/cuda/cuda_utils.h
+++ b/include/LightGBM/cuda/cuda_utils.h
@@ -7,15 +7,21 @@
 #define LIGHTGBM_CUDA_CUDA_UTILS_H_
 #ifdef USE_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <stdio.h>
 #include <LightGBM/utils/log.h>
+#include <algorithm>
 #include <vector>
 #include <cmath>
 namespace LightGBM {
+typedef unsigned long long atomic_add_long_t;
 #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
  if (code != cudaSuccess) {
@@ -125,13 +131,19 @@ class CUDAVector {
    T* new_data = nullptr;
    AllocateCUDAMemory<T>(&new_data, size, __FILE__, __LINE__);
    if (size_ > 0 && data_ != nullptr) {
-      CopyFromCUDADeviceToCUDADevice<T>(new_data, data_, size, __FILE__, __LINE__);
+      const size_t size_for_old_content = std::min<size_t>(size_, size);
+      CopyFromCUDADeviceToCUDADevice<T>(new_data, data_, size_for_old_content, __FILE__, __LINE__);
    }
    DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
    data_ = new_data;
    size_ = size;
  }
+  void InitFromHostVector(const std::vector<T>& host_vector) {
+    Resize(host_vector.size());
+    CopyFromHostToCUDADevice(data_, host_vector.data(), host_vector.size(), __FILE__, __LINE__);
+  }
  void Clear() {
    if (size_ > 0 && data_ != nullptr) {
      DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
@@ -171,6 +183,10 @@ class CUDAVector {
    return data_;
  }
+  void SetValue(int value) {
+    SetCUDAMemory<T>(data_, value, size_, __FILE__, __LINE__);
+  }
  const T* RawDataReadOnly() const {
    return data_;
  }

--- a/include/LightGBM/sample_strategy.h
+++ b/include/LightGBM/sample_strategy.h
@@ -6,7 +6,7 @@
 #ifndef LIGHTGBM_SAMPLE_STRATEGY_H_
 #define LIGHTGBM_SAMPLE_STRATEGY_H_
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include <LightGBM/utils/random.h>
 #include <LightGBM/utils/common.h>
 #include <LightGBM/utils/threading.h>

--- a/src/boosting/cuda/cuda_score_updater.hpp
+++ b/src/boosting/cuda/cuda_score_updater.hpp
@@ -8,7 +8,7 @@
 #ifdef USE_CUDA
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include "../score_updater.hpp"

--- a/src/cuda/cuda_utils.cpp
+++ b/src/cuda/cuda_utils.cpp
@@ -5,7 +5,7 @@
 #ifdef USE_CUDA
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 namespace LightGBM {

--- a/src/io/config.cpp
+++ b/src/io/config.cpp
@@ -389,10 +389,6 @@ void Config::CheckParamConflict() {
    if (deterministic) {
      Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic.");
    }
-    if (use_quantized_grad) {
-      Log::Warning("Quantized training is not supported by CUDA tree learner. Switch to full precision training.");
-      use_quantized_grad = false;
-    }
  }
  // linear tree learner must be serial type and run on CPU device
  if (linear_tree) {

--- a/src/metric/cuda/cuda_binary_metric.hpp
+++ b/src/metric/cuda/cuda_binary_metric.hpp
@@ -10,7 +10,7 @@
 #ifdef USE_CUDA
 #include <LightGBM/cuda/cuda_metric.hpp>
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include <vector>

--- a/src/metric/cuda/cuda_pointwise_metric.hpp
+++ b/src/metric/cuda/cuda_pointwise_metric.hpp
@@ -10,7 +10,7 @@
 #ifdef USE_CUDA
 #include <LightGBM/cuda/cuda_metric.hpp>
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include <vector>

--- a/src/metric/cuda/cuda_regression_metric.hpp
+++ b/src/metric/cuda/cuda_regression_metric.hpp
@@ -10,7 +10,7 @@
 #ifdef USE_CUDA
 #include <LightGBM/cuda/cuda_metric.hpp>
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include <vector>

--- a/src/treelearner/cuda/cuda_best_split_finder.cpp
+++ b/src/treelearner/cuda/cuda_best_split_finder.cpp
@@ -40,6 +40,9 @@ CUDABestSplitFinder::CUDABestSplitFinder(
  select_features_by_node_(select_features_by_node),
  cuda_hist_(cuda_hist) {
  InitFeatureMetaInfo(train_data);
+  if (has_categorical_feature_ && config->use_quantized_grad) {
+    Log::Fatal("Quantized training on GPU with categorical features is not supported yet.");
+  }
  cuda_leaf_best_split_info_ = nullptr;
  cuda_best_split_info_ = nullptr;
  cuda_best_split_info_buffer_ = nullptr;
@@ -326,13 +329,23 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(
  const data_size_t num_data_in_smaller_leaf,
  const data_size_t num_data_in_larger_leaf,
  const double sum_hessians_in_smaller_leaf,
-  const double sum_hessians_in_larger_leaf) {
+  const double sum_hessians_in_larger_leaf,
+  const score_t* grad_scale,
+  const score_t* hess_scale,
+  const uint8_t smaller_num_bits_in_histogram_bins,
+  const uint8_t larger_num_bits_in_histogram_bins) {
  const bool is_smaller_leaf_valid = (num_data_in_smaller_leaf > min_data_in_leaf_ &&
    sum_hessians_in_smaller_leaf > min_sum_hessian_in_leaf_);
  const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ &&
    sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_ && larger_leaf_index >= 0);
-  LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits,
+  if (grad_scale != nullptr && hess_scale != nullptr) {
-    smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
+    LaunchFindBestSplitsDiscretizedForLeafKernel(smaller_leaf_splits, larger_leaf_splits,
+      smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid,
+      grad_scale, hess_scale, smaller_num_bits_in_histogram_bins, larger_num_bits_in_histogram_bins);
+  } else {
+    LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits,
+      smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
+  }
  global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel");
  LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
  SynchronizeCUDADevice(__FILE__, __LINE__);

--- a/src/treelearner/cuda/cuda_best_split_finder.cu
+++ b/src/treelearner/cuda/cuda_best_split_finder.cu
--- a/src/treelearner/cuda/cuda_best_split_finder.hpp
+++ b/src/treelearner/cuda/cuda_best_split_finder.hpp
@@ -67,7 +67,11 @@ class CUDABestSplitFinder {
    const data_size_t num_data_in_smaller_leaf,
    const data_size_t num_data_in_larger_leaf,
    const double sum_hessians_in_smaller_leaf,
-    const double sum_hessians_in_larger_leaf);
+    const double sum_hessians_in_larger_leaf,
+    const score_t* grad_scale,
+    const score_t* hess_scale,
+    const uint8_t smaller_num_bits_in_histogram_bins,
+    const uint8_t larger_num_bits_in_histogram_bins);
  const CUDASplitInfo* FindBestFromAllSplits(
    const int cur_num_leaves,
@@ -114,6 +118,31 @@ class CUDABestSplitFinder {
  #undef LaunchFindBestSplitsForLeafKernel_PARAMS
+  #define LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS \
+  const CUDALeafSplitsStruct* smaller_leaf_splits, \
+  const CUDALeafSplitsStruct* larger_leaf_splits, \
+  const int smaller_leaf_index, \
+  const int larger_leaf_index, \
+  const bool is_smaller_leaf_valid, \
+  const bool is_larger_leaf_valid, \
+  const score_t* grad_scale, \
+  const score_t* hess_scale, \
+  const uint8_t smaller_num_bits_in_histogram_bins, \
+  const uint8_t larger_num_bits_in_histogram_bins
+  void LaunchFindBestSplitsDiscretizedForLeafKernel(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS);
+  template <bool USE_RAND>
+  void LaunchFindBestSplitsDiscretizedForLeafKernelInner0(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS);
+  template <bool USE_RAND, bool USE_L1>
+  void LaunchFindBestSplitsDiscretizedForLeafKernelInner1(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS);
+  template <bool USE_RAND, bool USE_L1, bool USE_SMOOTHING>
+  void LaunchFindBestSplitsDiscretizedForLeafKernelInner2(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS);
+  #undef LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS
  void LaunchSyncBestSplitForLeafKernel(
    const int host_smaller_leaf_index,
    const int host_larger_leaf_index,

--- a/src/treelearner/cuda/cuda_data_partition.cpp
+++ b/src/treelearner/cuda/cuda_data_partition.cpp
@@ -368,6 +368,12 @@ void CUDADataPartition::ResetByLeafPred(const std::vector<int>& leaf_pred, int n
  cur_num_leaves_ = num_leaves;
 }
+void CUDADataPartition::ReduceLeafGradStat(
+  const score_t* gradients, const score_t* hessians,
+  CUDATree* tree, double* leaf_grad_stat_buffer, double* leaf_hess_state_buffer) const {
+  LaunchReduceLeafGradStat(gradients, hessians, tree, leaf_grad_stat_buffer, leaf_hess_state_buffer);
+}
 }  // namespace LightGBM
 #endif  // USE_CUDA