[CUDA] consolidate CUDA versions (#5677)

* [ci] speed up if-else, swig, and lint conda setup * add 'source activate' * python constraint * start removing cuda v1 * comment out CI * remove more references * revert some unnecessary changes * revert a few more mistakes * revert another change that ignored params * sigh * remove CUDATreeLearner * fix tests, docs * fix quoting in setup.py * restore all CI * Apply suggestions from code review Co-authored-by: shiyu1994 <shiyu_k1994@qq.com> * Apply suggestions from code review * completely remove cuda_exp, update docs --------- Co-authored-by: shiyu1994 <shiyu_k1994@qq.com>

[CUDA] consolidate CUDA versions (#5677)
* [ci] speed up if-else, swig, and lint conda setup * add 'source activate' * python constraint * start removing cuda v1 * comment out CI * remove more references * revert some unnecessary changes * revert a few more mistakes * revert another change that ignored params * sigh * remove CUDATreeLearner * fix tests, docs * fix quoting in setup.py * restore all CI * Apply suggestions from code review Co-authored-by: shiyu1994 <shiyu_k1994@qq.com> * Apply suggestions from code review * completely remove cuda_exp, update docs --------- Co-authored-by: shiyu1994 <shiyu_k1994@qq.com>
4f47547c · James Lamb · GitHub · 5ffd7571 · 4f47547c · 4f47547c
Unverified Commit 4f47547c authored Jan 31, 2023 by James Lamb Committed by GitHub Feb 01, 2023
20 changed files
--- a/include/LightGBM/objective_function.h
+++ b/include/LightGBM/objective_function.h
@@ -97,7 +97,7 @@ class ObjectiveFunction {
  */
  virtual bool IsCUDAObjective() const { return false; }
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  /*!
  * \brief Convert output for CUDA version
  */
@@ -107,7 +107,7 @@ class ObjectiveFunction {
  virtual bool NeedConvertOutputCUDA () const { return false; }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 };
 }  // namespace LightGBM

--- a/include/LightGBM/sample_strategy.h
+++ b/include/LightGBM/sample_strategy.h
@@ -38,9 +38,9 @@ class SampleStrategy {
  std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>>& bag_data_indices() { return bag_data_indices_; }
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  CUDAVector<data_size_t>& cuda_bag_data_indices() { return cuda_bag_data_indices_; }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
  void UpdateObjectiveFunction(const ObjectiveFunction* objective_function) {
    objective_function_ = objective_function;
@@ -72,10 +72,10 @@ class SampleStrategy {
  /*! \brief whether need to resize the gradient vectors */
  bool need_resize_gradients_;
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
-  /*! \brief Buffer for bag_data_indices_ on GPU, used only with cuda_exp */
+  /*! \brief Buffer for bag_data_indices_ on GPU, used only with cuda */
  CUDAVector<data_size_t> cuda_bag_data_indices_;
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 };
 }  // namespace LightGBM

--- a/include/LightGBM/train_share_states.h
+++ b/include/LightGBM/train_share_states.h
@@ -126,7 +126,7 @@ class MultiValBinWrapper {
  }
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  const void* GetRowWiseData(
    uint8_t* bit_type,
    size_t* total_size,
@@ -142,7 +142,7 @@ class MultiValBinWrapper {
      return multi_val_bin_->GetRowWiseData(bit_type, total_size, is_sparse, out_data_ptr, data_ptr_bit_type);
    }
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 private:
  bool is_use_subcol_ = false;
@@ -183,9 +183,9 @@ struct TrainingShareStates {
  const std::vector<uint32_t>& feature_hist_offsets() const { return feature_hist_offsets_; }
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  const std::vector<uint32_t>& column_hist_offsets() const { return column_hist_offsets_; }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
  bool IsSparseRowwise() {
    return (multi_val_bin_wrapper_ != nullptr && multi_val_bin_wrapper_->IsSparse());
@@ -235,7 +235,7 @@ struct TrainingShareStates {
  }
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  const void* GetRowWiseData(uint8_t* bit_type,
    size_t* total_size,
    bool* is_sparse,
@@ -250,13 +250,13 @@ struct TrainingShareStates {
      return nullptr;
    }
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 private:
  std::vector<uint32_t> feature_hist_offsets_;
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  std::vector<uint32_t> column_hist_offsets_;
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
  int num_hist_total_bin_ = 0;
  std::unique_ptr<MultiValBinWrapper> multi_val_bin_wrapper_;
  std::vector<hist_t, Common::AlignmentAllocator<hist_t, kAlignedSize>> hist_buf_;

--- a/include/LightGBM/tree.h
+++ b/include/LightGBM/tree.h
@@ -319,9 +319,9 @@ class Tree {
  inline bool is_linear() const { return is_linear_; }
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  inline bool is_cuda_tree() const { return is_cuda_tree_; }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
  inline void SetIsLinear(bool is_linear) {
    is_linear_ = is_linear;
@@ -532,10 +532,10 @@ class Tree {
  std::vector<std::vector<int>> leaf_features_;
  /* \brief features used in leaf linear models; indexing is relative to used_features_ */
  std::vector<std::vector<int>> leaf_features_inner_;
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  /*! \brief Marks whether this tree is a CUDATree */
  bool is_cuda_tree_;
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 };
 inline void Tree::Split(int leaf, int feature, int real_feature,

--- a/python-package/README.rst
+++ b/python-package/README.rst
@@ -121,11 +121,9 @@ Build CUDA Version
 All requirements from `Build from Sources section <#build-from-sources>`__ apply for this installation option as well, and `CMake`_ (version 3.16 or higher) is strongly required.
-**CUDA** library (version 9.0 or higher) is needed: details for installation can be found in `Installation Guide <https://github.com/microsoft/LightGBM/blob/master/docs/Installation-Guide.rst#build-cuda-version-experimental>`__.
+**CUDA** library (version 10.0 or higher) is needed: details for installation can be found in `Installation Guide <https://github.com/microsoft/LightGBM/blob/master/docs/Installation-Guide.rst#build-cuda-version-experimental>`__.
-Recently, a new CUDA version with better efficiency is implemented as an experimental feature. To build the new CUDA version, replace ``--cuda`` with ``--cuda-exp`` in the above commands. Please note that new version requires **CUDA** 10.0 or later libraries. Note that this new version uses twice the memory, since it stores data row-wise as well as column-wise in memory to improve performance (see this `issue <https://github.com/microsoft/LightGBM/issues/5318>`__ for discussion). 
+To use the CUDA version within Python, pass ``{"device": "cuda"}`` in parameters.
-To use the regular or experimental CUDA versions within Python, pass ``{"device": "cuda"}`` or ``{"device": "cuda_exp"}`` respectively as parameters.
 Build HDFS Version
 ~~~~~~~~~~~~~~~~~~
@@ -211,8 +209,6 @@ Run ``python setup.py install --gpu`` to enable GPU support. All requirements fr
 Run ``python setup.py install --cuda`` to enable CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well.
-Run ``python setup.py install --cuda-exp`` to enable the new experimental version of CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well.
 Run ``python setup.py install --hdfs`` to enable HDFS support. All requirements from `Build HDFS Version section <#build-hdfs-version>`__ apply for this installation option as well.
 Run ``python setup.py install --bit32``, if you want to use 32-bit version. All requirements from `Build 32-bit Version with 32-bit Python section <#build-32-bit-version-with-32-bit-python>`__ apply for this installation option as well.

--- a/python-package/setup.py
+++ b/python-package/setup.py
@@ -21,7 +21,6 @@ LIGHTGBM_OPTIONS = [
    ('integrated-opencl', None, 'Compile integrated OpenCL version'),
    ('gpu', 'g', 'Compile GPU version'),
    ('cuda', None, 'Compile CUDA version'),
-    ('cuda-exp', None, 'Compile CUDA Experimental version'),
    ('mpi', None, 'Compile MPI version'),
    ('nomp', None, 'Compile version without OpenMP support'),
    ('hdfs', 'h', 'Compile HDFS version'),
@@ -106,7 +105,6 @@ def compile_cpp(
    use_mingw: bool = False,
    use_gpu: bool = False,
    use_cuda: bool = False,
-    use_cuda_exp: bool = False,
    use_mpi: bool = False,
    use_hdfs: bool = False,
    boost_root: Optional[str] = None,
@@ -148,8 +146,6 @@ def compile_cpp(
            cmake_cmd.append(f"-DOpenCL_LIBRARY={opencl_library}")
    elif use_cuda:
        cmake_cmd.append("-DUSE_CUDA=ON")
-    elif use_cuda_exp:
-        cmake_cmd.append("-DUSE_CUDA_EXP=ON")
    if use_mpi:
        cmake_cmd.append("-DUSE_MPI=ON")
    if nomp:
@@ -171,7 +167,7 @@ def compile_cpp(
        else:
            status = 1
            lib_path = CURRENT_DIR / "compile" / "windows" / "x64" / "DLL" / "lib_lightgbm.dll"
-            if not any((use_gpu, use_cuda, use_cuda_exp, use_mpi, use_hdfs, nomp, bit32, integrated_opencl)):
+            if not any((use_gpu, use_cuda, use_mpi, use_hdfs, nomp, bit32, integrated_opencl)):
                logger.info("Starting to compile with MSBuild from existing solution file.")
                platform_toolsets = ("v143", "v142", "v141", "v140")
                for pt in platform_toolsets:
@@ -235,7 +231,6 @@ class CustomInstall(install):
        self.integrated_opencl = False
        self.gpu = False
        self.cuda = False
-        self.cuda_exp = False
        self.boost_root = None
        self.boost_dir = None
        self.boost_include_dir = None
@@ -260,7 +255,7 @@ class CustomInstall(install):
        LOG_PATH.touch()
        if not self.precompile:
            copy_files(integrated_opencl=self.integrated_opencl, use_gpu=self.gpu)
-            compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_cuda=self.cuda, use_cuda_exp=self.cuda_exp, use_mpi=self.mpi,
+            compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_cuda=self.cuda, use_mpi=self.mpi,
                        use_hdfs=self.hdfs, boost_root=self.boost_root, boost_dir=self.boost_dir,
                        boost_include_dir=self.boost_include_dir, boost_librarydir=self.boost_librarydir,
                        opencl_include_dir=self.opencl_include_dir, opencl_library=self.opencl_library,
@@ -281,7 +276,6 @@ class CustomBdistWheel(bdist_wheel):
        self.integrated_opencl = False
        self.gpu = False
        self.cuda = False
-        self.cuda_exp = False
        self.boost_root = None
        self.boost_dir = None
        self.boost_include_dir = None
@@ -304,7 +298,6 @@ class CustomBdistWheel(bdist_wheel):
        install.integrated_opencl = self.integrated_opencl
        install.gpu = self.gpu
        install.cuda = self.cuda
-        install.cuda_exp = self.cuda_exp
        install.boost_root = self.boost_root
        install.boost_dir = self.boost_dir
        install.boost_include_dir = self.boost_include_dir

--- a/src/application/application.cpp
+++ b/src/application/application.cpp
@@ -36,7 +36,7 @@ Application::Application(int argc, char** argv) {
    Log::Fatal("No training/prediction data, application quit");
  }
-  if (config_.device_type == std::string("cuda") || config_.device_type == std::string("cuda_exp")) {
+  if (config_.device_type == std::string("cuda")) {
      LGBM_config_::current_device = lgbm_device_cuda;
  }
 }

--- a/src/boosting/bagging.hpp
+++ b/src/boosting/bagging.hpp
@@ -47,33 +47,33 @@ class BaggingSampleStrategy : public SampleStrategy {
      Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
      // set bagging data to tree learner
      if (!is_use_subset_) {
-        #ifdef USE_CUDA_EXP
+        #ifdef USE_CUDA
-        if (config_->device_type == std::string("cuda_exp")) {
+        if (config_->device_type == std::string("cuda")) {
          CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
          tree_learner->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_);
        } else {
-        #endif  // USE_CUDA_EXP
+        #endif  // USE_CUDA
          tree_learner->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_);
-        #ifdef USE_CUDA_EXP
+        #ifdef USE_CUDA
        }
-        #endif  // USE_CUDA_EXP
+        #endif  // USE_CUDA
      } else {
        // get subset
        tmp_subset_->ReSize(bag_data_cnt_);
        tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(),
                                bag_data_cnt_, false);
-        #ifdef USE_CUDA_EXP
+        #ifdef USE_CUDA
-        if (config_->device_type == std::string("cuda_exp")) {
+        if (config_->device_type == std::string("cuda")) {
          CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
          tree_learner->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(),
                                       bag_data_cnt_);
        } else {
-        #endif  // USE_CUDA_EXP
+        #endif  // USE_CUDA
          tree_learner->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(),
                                       bag_data_cnt_);
-        #ifdef USE_CUDA_EXP
+        #ifdef USE_CUDA
        }
-        #endif  // USE_CUDA_EXP
+        #endif  // USE_CUDA
      }
    }
  }
@@ -103,11 +103,11 @@ class BaggingSampleStrategy : public SampleStrategy {
        bag_data_cnt_ = static_cast<data_size_t>(config_->bagging_fraction * num_data_);
      }
      bag_data_indices_.resize(num_data_);
-      #ifdef USE_CUDA_EXP
+      #ifdef USE_CUDA
-      if (config_->device_type == std::string("cuda_exp")) {
+      if (config_->device_type == std::string("cuda")) {
        cuda_bag_data_indices_.Resize(num_data_);
      }
-      #endif  // USE_CUDA_EXP
+      #endif  // USE_CUDA
      bagging_runner_.ReSize(num_data_);
      bagging_rands_.clear();
      for (int i = 0;
@@ -118,7 +118,7 @@ class BaggingSampleStrategy : public SampleStrategy {
      double average_bag_rate =
          (static_cast<double>(bag_data_cnt_) / num_data_) / config_->bagging_freq;
      is_use_subset_ = false;
-      if (config_->device_type != std::string("cuda_exp")) {
+      if (config_->device_type != std::string("cuda")) {
        const int group_threshold_usesubset = 100;
        const double average_bag_rate_threshold = 0.5;
        if (average_bag_rate <= average_bag_rate_threshold
@@ -141,9 +141,9 @@ class BaggingSampleStrategy : public SampleStrategy {
    } else {
      bag_data_cnt_ = num_data_;
      bag_data_indices_.clear();
-      #ifdef USE_CUDA_EXP
+      #ifdef USE_CUDA
      cuda_bag_data_indices_.Clear();
-      #endif  // USE_CUDA_EXP
+      #endif  // USE_CUDA
      bagging_runner_.ReSize(0);
      is_use_subset_ = false;
    }

--- a/src/boosting/cuda/cuda_score_updater.cpp
+++ b/src/boosting/cuda/cuda_score_updater.cpp
@@ -5,7 +5,7 @@
 #include "cuda_score_updater.hpp"
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 namespace LightGBM {
@@ -91,4 +91,4 @@ inline void CUDAScoreUpdater::MultiplyScore(double val, int cur_tree_id) {
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/boosting/cuda/cuda_score_updater.cu
+++ b/src/boosting/cuda/cuda_score_updater.cu
@@ -5,7 +5,7 @@
 #include "cuda_score_updater.hpp"
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 namespace LightGBM {
@@ -42,4 +42,4 @@ void CUDAScoreUpdater::LaunchMultiplyScoreConstantKernel(const double val, const
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/boosting/cuda/cuda_score_updater.hpp
+++ b/src/boosting/cuda/cuda_score_updater.hpp
@@ -6,7 +6,7 @@
 #ifndef LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_
 #define LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include <LightGBM/cuda/cuda_utils.h>
@@ -60,6 +60,6 @@ class CUDAScoreUpdater: public ScoreUpdater {
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_
--- a/src/boosting/gbdt.cpp
+++ b/src/boosting/gbdt.cpp
@@ -68,14 +68,14 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective
  es_first_metric_only_ = config_->first_metric_only;
  shrinkage_rate_ = config_->learning_rate;
-  if (config_->device_type == std::string("cuda") || config_->device_type == std::string("cuda_exp")) {
+  if (config_->device_type == std::string("cuda")) {
    LGBM_config_::current_learner = use_cuda_learner;
-    #ifdef USE_CUDA_EXP
+    #ifdef USE_CUDA
-    if (config_->device_type == std::string("cuda_exp")) {
+    if (config_->device_type == std::string("cuda")) {
      const int gpu_device_id = config_->gpu_device_id >= 0 ? config_->gpu_device_id : 0;
      CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id));
    }
-    #endif  // USE_CUDA_EXP
+    #endif  // USE_CUDA
  }
  // load forced_splits file
@@ -116,15 +116,15 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective
  }
  training_metrics_.shrink_to_fit();
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
-  if (config_->device_type == std::string("cuda_exp")) {
+  if (config_->device_type == std::string("cuda")) {
    train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_, boosting_on_gpu_));
  } else {
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
    train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_));
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
  num_data_ = train_data_->num_data();
@@ -186,11 +186,11 @@ void GBDT::AddValidDataset(const Dataset* valid_data,
  }
  // for a validation dataset, we need its score and metric
  auto new_score_updater =
-    #ifdef USE_CUDA_EXP
+    #ifdef USE_CUDA
-    config_->device_type == std::string("cuda_exp") ?
+    config_->device_type == std::string("cuda") ?
    std::unique_ptr<CUDAScoreUpdater>(new CUDAScoreUpdater(valid_data, num_tree_per_iteration_,
      objective_function_ != nullptr && objective_function_->IsCUDAObjective())) :
-    #endif  // USE_CUDA_EXP
+    #endif  // USE_CUDA
    std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_tree_per_iteration_));
  // update score
  for (int i = 0; i < iter_; ++i) {
@@ -481,15 +481,15 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
    const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt();
    // we need to predict out-of-bag scores of data for boosting
    if (num_data_ - bag_data_cnt > 0) {
-      #ifdef USE_CUDA_EXP
+      #ifdef USE_CUDA
-      if (config_->device_type == std::string("cuda_exp")) {
+      if (config_->device_type == std::string("cuda")) {
        train_score_updater_->AddScore(tree, data_sample_strategy_->cuda_bag_data_indices().RawData() + bag_data_cnt, num_data_ - bag_data_cnt, cur_tree_id);
      } else {
-      #endif  // USE_CUDA_EXP
+      #endif  // USE_CUDA
        train_score_updater_->AddScore(tree, data_sample_strategy_->bag_data_indices().data() + bag_data_cnt, num_data_ - bag_data_cnt, cur_tree_id);
-      #ifdef USE_CUDA_EXP
+      #ifdef USE_CUDA
      }
-      #endif  // USE_CUDA_EXP
+      #endif  // USE_CUDA
    }
  } else {
@@ -503,17 +503,17 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
  }
 }
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 std::vector<double> GBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const {
 #else
 std::vector<double> GBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t /*num_data*/) const {
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  const bool evaluation_on_cuda = metric->IsCUDAMetric();
  if ((boosting_on_gpu_ && evaluation_on_cuda) || (!boosting_on_gpu_ && !evaluation_on_cuda)) {
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
    return metric->Eval(score, objective_function_);
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  } else if (boosting_on_gpu_ && !evaluation_on_cuda) {
    const size_t total_size = static_cast<size_t>(num_data) * static_cast<size_t>(num_tree_per_iteration_);
    if (total_size > host_score_.size()) {
@@ -529,7 +529,7 @@ std::vector<double> GBDT::EvalOneMetric(const Metric* metric, const double* scor
    CopyFromHostToCUDADevice<double>(cuda_score_.RawData(), score, total_size, __FILE__, __LINE__);
    return metric->Eval(cuda_score_.RawData(), objective_function_);
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 }
 std::string GBDT::OutputMetric(int iter) {
@@ -660,14 +660,14 @@ void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) {
    num_data = valid_score_updater_[used_idx]->num_data();
    *out_len = static_cast<int64_t>(num_data) * num_class_;
  }
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  std::vector<double> host_raw_scores;
  if (boosting_on_gpu_) {
    host_raw_scores.resize(static_cast<size_t>(*out_len), 0.0);
    CopyFromCUDADeviceToHost<double>(host_raw_scores.data(), raw_scores, static_cast<size_t>(*out_len), __FILE__, __LINE__);
    raw_scores = host_raw_scores.data();
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
  if (objective_function_ != nullptr) {
    #pragma omp parallel for schedule(static)
    for (data_size_t i = 0; i < num_data; ++i) {
@@ -730,26 +730,26 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction*
  }
  training_metrics_.shrink_to_fit();
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  boosting_on_gpu_ = objective_function_ != nullptr && objective_function_->IsCUDAObjective() &&
                    !data_sample_strategy_->IsHessianChange();  // for sample strategy with Hessian change, fall back to boosting on CPU
  tree_learner_->ResetBoostingOnGPU(boosting_on_gpu_);
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
  if (train_data != train_data_) {
    train_data_ = train_data;
    data_sample_strategy_->UpdateTrainingData(train_data);
    // not same training data, need reset score and others
    // create score tracker
-    #ifdef USE_CUDA_EXP
+    #ifdef USE_CUDA
-    if (config_->device_type == std::string("cuda_exp")) {
+    if (config_->device_type == std::string("cuda")) {
      train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_, boosting_on_gpu_));
    } else {
-    #endif  // USE_CUDA_EXP
+    #endif  // USE_CUDA
      train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_));
-    #ifdef USE_CUDA_EXP
+    #ifdef USE_CUDA
    }
-    #endif  // USE_CUDA_EXP
+    #endif  // USE_CUDA
    // update score
    for (int i = 0; i < iter_; ++i) {
@@ -827,8 +827,8 @@ void GBDT::ResetGradientBuffers() {
  const bool is_use_subset = data_sample_strategy_->is_use_subset();
  const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt();
  if (objective_function_ != nullptr) {
-    #ifdef USE_CUDA_EXP
+    #ifdef USE_CUDA
-    if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu_) {
+    if (config_->device_type == std::string("cuda") && boosting_on_gpu_) {
      if (cuda_gradients_.Size() < total_size) {
        cuda_gradients_.Resize(total_size);
        cuda_hessians_.Resize(total_size);
@@ -836,16 +836,16 @@ void GBDT::ResetGradientBuffers() {
      gradients_pointer_ = cuda_gradients_.RawData();
      hessians_pointer_ = cuda_hessians_.RawData();
    } else {
-    #endif  // USE_CUDA_EXP
+    #endif  // USE_CUDA
      if (gradients_.size() < total_size) {
        gradients_.resize(total_size);
        hessians_.resize(total_size);
      }
      gradients_pointer_ = gradients_.data();
      hessians_pointer_ = hessians_.data();
-    #ifdef USE_CUDA_EXP
+    #ifdef USE_CUDA
    }
-    #endif  // USE_CUDA_EXP
+    #endif  // USE_CUDA
  } else if (data_sample_strategy_->IsHessianChange() || (is_use_subset && bag_data_cnt < num_data_ && !boosting_on_gpu_)) {
    if (gradients_.size() < total_size) {
      gradients_.resize(total_size);

--- a/src/boosting/gbdt.h
+++ b/src/boosting/gbdt.h
@@ -542,7 +542,7 @@ class GBDT : public GBDTBase {
  /*! \brief Parser config file content */
  std::string parser_config_str_ = "";
-#if defined(USE_CUDA) || defined(USE_CUDA_EXP)
+#ifdef USE_CUDA
  /*! \brief First order derivative of training data */
  std::vector<score_t, CHAllocator<score_t>> gradients_;
  /*! \brief Second order derivative of training data */
@@ -557,18 +557,18 @@ class GBDT : public GBDTBase {
  score_t* gradients_pointer_;
  /*! \brief Pointer to hessian vector, can be on CPU or GPU */
  score_t* hessians_pointer_;
-  /*! \brief Whether boosting is done on GPU, used for cuda_exp */
+  /*! \brief Whether boosting is done on GPU, used for device_type=cuda */
  bool boosting_on_gpu_;
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  /*! \brief Gradient vector on GPU */
  CUDAVector<score_t> cuda_gradients_;
  /*! \brief Hessian vector on GPU */
  CUDAVector<score_t> cuda_hessians_;
-  /*! \brief Buffer for scores when boosting is on GPU but evaluation is not, used only with cuda_exp */
+  /*! \brief Buffer for scores when boosting is on GPU but evaluation is not, used only with device_type=cuda */
  mutable std::vector<double> host_score_;
-  /*! \brief Buffer for scores when boosting is not on GPU but evaluation is, used only with cuda_exp */
+  /*! \brief Buffer for scores when boosting is not on GPU but evaluation is, used only with device_type=cuda */
  mutable CUDAVector<double> cuda_score_;
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
  /*! \brief Number of training data */
  data_size_t num_data_;

--- a/src/boosting/goss.hpp
+++ b/src/boosting/goss.hpp
@@ -43,33 +43,33 @@ class GOSSStrategy : public SampleStrategy {
    bag_data_cnt_ = left_cnt;
    // set bagging data to tree learner
    if (!is_use_subset_) {
-      #ifdef USE_CUDA_EXP
+      #ifdef USE_CUDA
-      if (config_->device_type == std::string("cuda_exp")) {
+      if (config_->device_type == std::string("cuda")) {
        CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
        tree_learner->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_);
      } else {
-      #endif  // USE_CUDA_EXP
+      #endif  // USE_CUDA
        tree_learner->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_);
-      #ifdef USE_CUDA_EXP
+      #ifdef USE_CUDA
      }
-      #endif  // USE_CUDA_EXP
+      #endif  // USE_CUDA
    } else {
      // get subset
      tmp_subset_->ReSize(bag_data_cnt_);
      tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(),
                              bag_data_cnt_, false);
-      #ifdef USE_CUDA_EXP
+      #ifdef USE_CUDA
-      if (config_->device_type == std::string("cuda_exp")) {
+      if (config_->device_type == std::string("cuda")) {
        CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
        tree_learner->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(),
                                      bag_data_cnt_);
      } else {
-      #endif  // USE_CUDA_EXP
+      #endif  // USE_CUDA
        tree_learner->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(),
                                     bag_data_cnt_);
-      #ifdef USE_CUDA_EXP
+      #ifdef USE_CUDA
      }
-      #endif  // USE_CUDA_EXP
+      #endif  // USE_CUDA
    }
  }

--- a/src/cuda/cuda_algorithms.cu
+++ b/src/cuda/cuda_algorithms.cu
@@ -3,7 +3,7 @@
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include <LightGBM/cuda/cuda_algorithms.hpp>
@@ -509,4 +509,4 @@ template __device__ double PercentileDevice<double, data_size_t, label_t, double
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/cuda/cuda_utils.cpp
+++ b/src/cuda/cuda_utils.cpp
@@ -3,7 +3,7 @@
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include <LightGBM/cuda/cuda_utils.h>
@@ -28,4 +28,4 @@ void SetCUDADevice(int gpu_device_id, const char* file, int line) {
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -886,7 +886,7 @@ namespace LightGBM {
    return nullptr;
  }
-  #ifdef USE_CUDA_EXP
+  #ifdef USE_CUDA
  template <>
  const void* MultiValDenseBin<uint8_t>::GetRowWiseData(uint8_t* bit_type,
      size_t* total_size,
@@ -1081,6 +1081,6 @@ namespace LightGBM {
    return to_return;
  }
-  #endif  // USE_CUDA_EXP
+  #endif  // USE_CUDA
 }  // namespace LightGBM
--- a/src/io/config.cpp
+++ b/src/io/config.cpp
@@ -177,8 +177,6 @@ void GetDeviceType(const std::unordered_map<std::string, std::string>& params, s
      *device_type = "gpu";
    } else if (value == std::string("cuda")) {
      *device_type = "cuda";
-    } else if (value == std::string("cuda_exp")) {
-      *device_type = "cuda_exp";
    } else {
      Log::Fatal("Unknown device type %s", value.c_str());
    }
@@ -260,7 +258,7 @@ void Config::Set(const std::unordered_map<std::string, std::string>& params) {
  GetObjectiveType(params, &objective);
  GetMetricType(params, objective, &metric);
  GetDeviceType(params, &device_type);
-  if (device_type == std::string("cuda") || device_type == std::string("cuda_exp")) {
+  if (device_type == std::string("cuda")) {
    LGBM_config_::current_device = lgbm_device_cuda;
  }
  GetTreeLearnerType(params, &tree_learner);
@@ -373,26 +371,21 @@ void Config::CheckParamConflict() {
      num_leaves = static_cast<int>(full_num_leaves);
    }
  }
-  if (device_type == std::string("gpu") || device_type == std::string("cuda")) {
+  if (device_type == std::string("gpu")) {
    // force col-wise for gpu version
    force_col_wise = true;
    force_row_wise = false;
    if (deterministic) {
      Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic.");
    }
-  } else if (device_type == std::string("cuda_exp")) {
+  } else if (device_type == std::string("cuda")) {
-    // force row-wise for cuda_exp version
+    // force row-wise for cuda version
    force_col_wise = false;
    force_row_wise = true;
    if (deterministic) {
      Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic.");
    }
  }
-  // force gpu_use_dp for CUDA
-  if (device_type == std::string("cuda") && !gpu_use_dp) {
-    Log::Warning("CUDA currently requires double precision calculations.");
-    gpu_use_dp = true;
-  }
  // linear tree learner must be serial type and run on CPU device
  if (linear_tree) {
    if (device_type != std::string("cpu")) {

--- a/src/io/cuda/cuda_column_data.cpp
+++ b/src/io/cuda/cuda_column_data.cpp
@@ -3,7 +3,7 @@
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include <LightGBM/cuda/cuda_column_data.hpp>
@@ -308,4 +308,4 @@ void CUDAColumnData::InitColumnMetaInfo() {
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/io/cuda/cuda_column_data.cu
+++ b/src/io/cuda/cuda_column_data.cu
@@ -4,7 +4,7 @@
 */
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include <LightGBM/cuda/cuda_column_data.hpp>
@@ -58,4 +58,4 @@ void CUDAColumnData::LaunchCopySubrowKernel(void* const* in_cuda_data_by_column)
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA