Unverified commit 4f47547c authored by James Lamb, committed by GitHub

[CUDA] consolidate CUDA versions (#5677)



* [ci] speed up if-else, swig, and lint conda setup

* add 'source activate'

* python constraint

* start removing cuda v1

* comment out CI

* remove more references

* revert some unnecessary changes

* revert a few more mistakes

* revert another change that ignored params

* sigh

* remove CUDATreeLearner

* fix tests, docs

* fix quoting in setup.py

* restore all CI

* Apply suggestions from code review
Co-authored-by: shiyu1994 <shiyu_k1994@qq.com>

* Apply suggestions from code review

* completely remove cuda_exp, update docs

---------
Co-authored-by: shiyu1994 <shiyu_k1994@qq.com>
parent 5ffd7571
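For context, a minimal usage sketch (not part of this commit): after this change, device_type=cuda selects the consolidated CUDASingleGPUTreeLearner and the old cuda_exp device type is gone. The sketch assumes LightGBM was built with cmake -DUSE_CUDA=1; the data shapes and parameter values are placeholders, and error-code checks are omitted.

// Hypothetical example via the C API; placeholder data, no error handling.
#include <LightGBM/c_api.h>
#include <vector>

int main() {
  std::vector<float> features(100 * 4, 1.0f);  // 100 rows x 4 features
  std::vector<float> labels(100, 0.0f);

  DatasetHandle dataset = nullptr;
  LGBM_DatasetCreateFromMat(features.data(), C_API_DTYPE_FLOAT32,
                            100, 4, /*is_row_major=*/1, "", nullptr, &dataset);
  LGBM_DatasetSetField(dataset, "label", labels.data(), 100, C_API_DTYPE_FLOAT32);

  // "device_type=cuda" now maps to the single consolidated CUDA learner.
  BoosterHandle booster = nullptr;
  LGBM_BoosterCreate(dataset, "objective=regression device_type=cuda", &booster);

  int is_finished = 0;
  for (int i = 0; i < 10; ++i) {
    LGBM_BoosterUpdateOneIter(booster, &is_finished);
  }

  LGBM_BoosterFree(booster);
  LGBM_DatasetFree(dataset);
  return 0;
}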
@@ -4,7 +4,7 @@
  * license information.
  */
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include "cuda_histogram_constructor.hpp"
@@ -429,4 +429,4 @@ void CUDAHistogramConstructor::LaunchSubtractHistogramKernel(
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
@@ -6,7 +6,7 @@
 #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_
 #define LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include <LightGBM/cuda/cuda_row_data.hpp>
 #include <LightGBM/feature_group.h>
@@ -165,5 +165,5 @@ class CUDAHistogramConstructor {
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_
@@ -4,7 +4,7 @@
  * license information.
  */
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include "cuda_leaf_splits.hpp"
@@ -68,4 +68,4 @@ void CUDALeafSplits::Resize(const data_size_t num_data) {
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
@@ -5,7 +5,7 @@
  */
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include "cuda_leaf_splits.hpp"
 #include <LightGBM/cuda/cuda_algorithms.hpp>
@@ -126,4 +126,4 @@ void CUDALeafSplits::LaunchInitValuesKernal(
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
@@ -6,7 +6,7 @@
 #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_
 #define LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include <LightGBM/cuda/cuda_utils.h>
 #include <LightGBM/bin.h>
@@ -156,5 +156,5 @@ class CUDALeafSplits {
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_
@@ -4,7 +4,7 @@
  * license information.
  */
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include "cuda_single_gpu_tree_learner.hpp"
@@ -515,4 +515,4 @@ void CUDASingleGPUTreeLearner::CheckSplitValid(
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
@@ -4,7 +4,7 @@
  * license information.
  */
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include <LightGBM/cuda/cuda_algorithms.hpp>
@@ -258,4 +258,4 @@ void CUDASingleGPUTreeLearner::LaunchConstructBitsetForCategoricalSplitKernel(
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
@@ -9,7 +9,7 @@
 #include <memory>
 #include <vector>
-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA
 #include "cuda_leaf_splits.hpp"
 #include "cuda_histogram_constructor.hpp"
@@ -137,7 +137,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner {
 }  // namespace LightGBM
-#else  // USE_CUDA_EXP
+#else  // USE_CUDA
 // When GPU support is not compiled in, quit with an error message
@@ -147,12 +147,12 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner {
  public:
   #pragma warning(disable : 4702)
   explicit CUDASingleGPUTreeLearner(const Config* tree_config, const bool /*boosting_on_cuda*/) : SerialTreeLearner(tree_config) {
-    Log::Fatal("CUDA Tree Learner experimental version was not enabled in this build.\n"
-               "Please recompile with CMake option -DUSE_CUDA_EXP=1");
+    Log::Fatal("CUDA Tree Learner was not enabled in this build.\n"
+               "Please recompile with CMake option -DUSE_CUDA=1");
   }
 };
 }  // namespace LightGBM
-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_TREELEARNER_CUDA_CUDA_SINGLE_GPU_TREE_LEARNER_HPP_
/*!
* Copyright (c) 2020 IBM Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifdef USE_CUDA
#include "cuda_kernel_launcher.h"
#include <LightGBM/utils/log.h>
#include <cuda_runtime.h>
#include <cstdio>
namespace LightGBM {
void cuda_histogram(
int histogram_size,
data_size_t leaf_num_data,
data_size_t num_data,
bool use_all_features,
bool is_constant_hessian,
int num_workgroups,
cudaStream_t stream,
uint8_t* arg0,
uint8_t* arg1,
data_size_t arg2,
data_size_t* arg3,
data_size_t arg4,
score_t* arg5,
score_t* arg6,
score_t arg6_const,
char* arg7,
volatile int* arg8,
void* arg9,
size_t exp_workgroups_per_feature) {
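  // Dispatch: one kernel family per histogram size (16, 64, or 256 bins), each
  // launched with a matching thread-block size. Within a family, the *_fulldata
  // variant is used when the leaf spans the whole dataset but feature masks are
  // still needed; the hessian is passed either as a per-row array (arg6) or as
  // a single constant (arg6_const) when it is identical for every row.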
if (histogram_size == 16) {
if (leaf_num_data == num_data) {
if (use_all_features) {
if (!is_constant_hessian)
histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
} else {
if (!is_constant_hessian)
histogram16_fulldata<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram16_fulldata<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
}
} else {
if (use_all_features) {
// seems all features are always enabled, so this should be the same as fulldata
if (!is_constant_hessian)
histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
} else {
if (!is_constant_hessian)
histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
}
}
} else if (histogram_size == 64) {
if (leaf_num_data == num_data) {
if (use_all_features) {
if (!is_constant_hessian)
histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
} else {
if (!is_constant_hessian)
histogram64_fulldata<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram64_fulldata<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
}
} else {
if (use_all_features) {
// seems all features are always enabled, so this should be the same as fulldata
if (!is_constant_hessian)
histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
} else {
if (!is_constant_hessian)
histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
}
}
} else {
if (leaf_num_data == num_data) {
if (use_all_features) {
if (!is_constant_hessian)
histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
} else {
if (!is_constant_hessian)
histogram256_fulldata<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram256_fulldata<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
}
} else {
if (use_all_features) {
// seems all features are always enabled, so this should be the same as fulldata
if (!is_constant_hessian)
histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
} else {
if (!is_constant_hessian)
histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
else
histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
arg3, arg4, arg5,
arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
}
}
}
}
} // namespace LightGBM
#endif // USE_CUDA
/*!
* Copyright (c) 2020 IBM Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_
#define LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_
#ifdef USE_CUDA
#include <chrono>
#include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t
namespace LightGBM {
struct ThreadData {
// device id
int device_id;
// parameters for cuda_histogram
int histogram_size;
data_size_t leaf_num_data;
data_size_t num_data;
bool use_all_features;
bool is_constant_hessian;
int num_workgroups;
cudaStream_t stream;
uint8_t* device_features;
uint8_t* device_feature_masks;
data_size_t* device_data_indices;
score_t* device_gradients;
score_t* device_hessians;
score_t hessians_const;
char* device_subhistograms;
volatile int* sync_counters;
void* device_histogram_outputs;
size_t exp_workgroups_per_feature;
// cuda events
cudaEvent_t* kernel_start;
cudaEvent_t* kernel_wait_obj;
std::chrono::duration<double, std::milli>* kernel_input_wait_time;
// copy histogram
size_t output_size;
char* host_histogram_output;
cudaEvent_t* histograms_wait_obj;
};
void cuda_histogram(
int histogram_size,
data_size_t leaf_num_data,
data_size_t num_data,
bool use_all_features,
bool is_constant_hessian,
int num_workgroups,
cudaStream_t stream,
uint8_t* arg0,
uint8_t* arg1,
data_size_t arg2,
data_size_t* arg3,
data_size_t arg4,
score_t* arg5,
score_t* arg6,
score_t arg6_const,
char* arg7,
volatile int* arg8,
void* arg9,
size_t exp_workgroups_per_feature);
} // namespace LightGBM
#endif // USE_CUDA
#endif // LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_
This diff is collapsed.
/*!
* Copyright (c) 2020 IBM Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_
#define LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/dataset.h>
#include <LightGBM/feature_group.h>
#include <LightGBM/tree.h>
#include <string>
#include <cmath>
#include <cstdio>
#include <memory>
#include <random>
#include <vector>
#ifdef USE_CUDA
#include <cuda_runtime.h>
#endif
#include "feature_histogram.hpp"
#include "serial_tree_learner.h"
#include "data_partition.hpp"
#include "split_info.hpp"
#include "leaf_splits.hpp"
#ifdef USE_CUDA
#include <LightGBM/cuda/vector_cudahost.h>
#include "cuda_kernel_launcher.h"
using json11::Json;
namespace LightGBM {
/*!
* \brief CUDA-based parallel learning algorithm.
*/
class CUDATreeLearner: public SerialTreeLearner {
public:
explicit CUDATreeLearner(const Config* tree_config);
~CUDATreeLearner();
void Init(const Dataset* train_data, bool is_constant_hessian) override;
void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override;
Tree* Train(const score_t* gradients, const score_t *hessians, bool is_first_tree) override;
void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override {
SerialTreeLearner::SetBaggingData(subset, used_indices, num_data);
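    // bagging is only considered in effect when training on a row-index subset
    // of the full dataset (no materialized subset, fewer rows than num_data_)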
if (subset == nullptr && used_indices != nullptr) {
if (num_data != num_data_) {
use_bagging_ = true;
return;
}
}
use_bagging_ = false;
}
protected:
void BeforeTrain() override;
bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override;
void FindBestSplits(const Tree* tree) override;
void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;
void ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) override;
private:
typedef float gpu_hist_t;
/*!
* \brief Find the number of workgroups per feature that maximizes efficiency
* \param leaf_num_data The number of data examples on the current leaf being processed
* \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature
*/
int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data);
/*!
* \brief Initialize GPU device
* \param num_gpu Maximum number of GPUs to use
*/
void InitGPU(int num_gpu);
/*!
* \brief Allocate memory for GPU computation // alloc only
*/
void CountDenseFeatureGroups(); // compute num_dense_feature_group
void prevAllocateGPUMemory(); // CPU-side parameter calculation & host-memory pinning
void AllocateGPUMemory();
/*!
 * \brief Reset GPU memory
 */
void ResetGPUMemory();
/*!
 * \brief Copy dense features from CPU to GPU
 */
void copyDenseFeature();
/*!
* \brief Compute GPU feature histogram for the current leaf.
* Indices, gradients and Hessians have been copied to the device.
* \param leaf_num_data Number of data on current leaf
* \param use_all_features Set to true to not use feature masks, with a faster kernel
*/
void GPUHistogram(data_size_t leaf_num_data, bool use_all_features);
void SetThreadData(ThreadData* thread_data, int device_id, int histogram_size,
int leaf_num_data, bool use_all_features,
int num_workgroups, int exp_workgroups_per_feature) {
ThreadData* td = &thread_data[device_id];
td->device_id = device_id;
td->histogram_size = histogram_size;
td->leaf_num_data = leaf_num_data;
td->num_data = num_data_;
td->use_all_features = use_all_features;
td->is_constant_hessian = share_state_->is_constant_hessian;
td->num_workgroups = num_workgroups;
td->stream = stream_[device_id];
td->device_features = device_features_[device_id];
td->device_feature_masks = reinterpret_cast<uint8_t *>(device_feature_masks_[device_id]);
td->device_data_indices = device_data_indices_[device_id];
td->device_gradients = device_gradients_[device_id];
td->device_hessians = device_hessians_[device_id];
td->hessians_const = hessians_[0];
td->device_subhistograms = device_subhistograms_[device_id];
td->sync_counters = sync_counters_[device_id];
td->device_histogram_outputs = device_histogram_outputs_[device_id];
td->exp_workgroups_per_feature = exp_workgroups_per_feature;
td->kernel_start = &(kernel_start_[device_id]);
td->kernel_wait_obj = &(kernel_wait_obj_[device_id]);
td->kernel_input_wait_time = &(kernel_input_wait_time_[device_id]);
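    // bytes of histogram output this device produces: its feature groups x
    // features packed per DWORD x bins per feature x bytes per bin entry;
    // host_output_offset skips the output of all earlier devices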
size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_;
size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_;
td->output_size = output_size;
td->host_histogram_output = reinterpret_cast<char*>(host_histogram_outputs_) + host_output_offset;
td->histograms_wait_obj = &(histograms_wait_obj_[device_id]);
}
/*!
* \brief Wait for GPU kernel execution and read histogram
* \param histograms Destination of histogram results from GPU.
*/
template <typename HistType>
void WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array);
/*!
* \brief Construct GPU histogram asynchronously.
* Interface is similar to Dataset::ConstructHistograms().
* \param is_feature_used A predicate vector for enabling each feature
* \param data_indices Array of data example IDs to be included in histogram, will be copied to GPU.
* Set to nullptr to skip copy to GPU.
* \param num_data Number of data examples to be included in histogram
* \return true if GPU kernel is launched, false if GPU is not used
*/
bool ConstructGPUHistogramsAsync(
const std::vector<int8_t>& is_feature_used,
const data_size_t* data_indices, data_size_t num_data);
/*! \brief Log2 of max number of workgroups per feature */
const int kMaxLogWorkgroupsPerFeature = 10;  // 2^10
/*! \brief Max total number of workgroups with preallocated workspace.
 *  If we use more than this number of workgroups, we have to reallocate subhistograms */
std::vector<int> preallocd_max_num_wg_;
/*! \brief True if bagging is used */
bool use_bagging_;
/*! \brief GPU command queue object */
std::vector<cudaStream_t> stream_;
/*! \brief total number of feature-groups */
int num_feature_groups_;
/*! \brief total number of dense feature-groups, which will be processed on GPU */
int num_dense_feature_groups_;
std::vector<int> num_gpu_feature_groups_;
std::vector<int> offset_gpu_feature_groups_;
/*! \brief On GPU we read one DWORD (4-byte) of features of one example once.
* With bin size > 16, there are 4 features per DWORD.
* With bin size <=16, there are 8 features per DWORD.
*/
int dword_features_;
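  // e.g. with 256 bins a feature occupies one byte, so dword_features_ = 4;
  // with bin size <= 16 a feature occupies 4 bits, so dword_features_ = 8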
/*! \brief Max number of bins of training data, used to determine
* which GPU kernel to use */
int max_num_bin_;
/*! \brief Used GPU kernel bin size (64, 256) */
int histogram_size_;
int device_bin_size_;
/*! \brief Size of histogram bin entry, depending if single or double precision is used */
size_t hist_bin_entry_sz_;
/*! \brief Indices of all dense feature-groups */
std::vector<int> dense_feature_group_map_;
/*! \brief Indices of all sparse feature-groups */
std::vector<int> sparse_feature_group_map_;
/*! \brief GPU memory object holding the training data */
std::vector<uint8_t*> device_features_;
/*! \brief GPU memory object holding the ordered gradient */
std::vector<score_t*> device_gradients_;
/*! \brief GPU memory object holding the ordered hessian */
std::vector<score_t*> device_hessians_;
/*! \brief A vector of feature mask. 1 = feature used, 0 = feature not used */
std::vector<char> feature_masks_;
/*! \brief GPU memory object holding the feature masks */
std::vector<char*> device_feature_masks_;
/*! \brief Pointer to pinned memory of feature masks */
char* ptr_pinned_feature_masks_ = nullptr;
/*! \brief GPU memory object holding indices of the leaf being processed */
std::vector<data_size_t*> device_data_indices_;
/*! \brief GPU memory object holding counters for workgroup coordination */
std::vector<int*> sync_counters_;
/*! \brief GPU memory object holding temporary sub-histograms per workgroup */
std::vector<char*> device_subhistograms_;
/*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */
std::vector<void*> device_histogram_outputs_;
/*! \brief Host memory pointer for histogram outputs */
void *host_histogram_outputs_;
/*! CUDA waitlist object for waiting for data transfer before kernel execution */
std::vector<cudaEvent_t> kernel_wait_obj_;
/*! CUDA waitlist object for reading output histograms after kernel execution */
std::vector<cudaEvent_t> histograms_wait_obj_;
/*! CUDA Asynchronous waiting object for copying indices */
std::vector<cudaEvent_t> indices_future_;
/*! Asynchronous waiting object for copying gradients */
std::vector<cudaEvent_t> gradients_future_;
/*! Asynchronous waiting object for copying Hessians */
std::vector<cudaEvent_t> hessians_future_;
/*! Asynchronous waiting object for copying dense features */
std::vector<cudaEvent_t> features_future_;
// host-side buffer for converting feature data into feature4 data
int nthreads_;  // number of Feature4* vectors in host4_vecs_
std::vector<cudaEvent_t> kernel_start_;
std::vector<float> kernel_time_; // measure histogram kernel time
std::vector<std::chrono::duration<double, std::milli>> kernel_input_wait_time_;
int num_gpu_;
int allocated_num_data_; // allocated data instances
pthread_t **cpu_threads_; // pthread, 1 cpu thread / gpu
};
} // namespace LightGBM
#else // USE_CUDA
// When GPU support is not compiled in, quit with an error message
namespace LightGBM {
class CUDATreeLearner: public SerialTreeLearner {
public:
#pragma warning(disable : 4702)
explicit CUDATreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) {
Log::Fatal("CUDA Tree Learner was not enabled in this build.\n"
"Please recompile with CMake option -DUSE_CUDA=1");
}
};
} // namespace LightGBM
#endif // USE_CUDA
#endif // LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_
@@ -276,7 +276,6 @@ void DataParallelTreeLearner<TREELEARNER_T>::Split(Tree* tree, int best_Leaf, in
 }
 // instantiate template classes, otherwise linker cannot find the code
-template class DataParallelTreeLearner<CUDATreeLearner>;
 template class DataParallelTreeLearner<GPUTreeLearner>;
 template class DataParallelTreeLearner<SerialTreeLearner>;
@@ -77,7 +77,6 @@ void FeatureParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(
 }
 // instantiate template classes, otherwise linker cannot find the code
-template class FeatureParallelTreeLearner<CUDATreeLearner>;
 template class FeatureParallelTreeLearner<GPUTreeLearner>;
 template class FeatureParallelTreeLearner<SerialTreeLearner>;
 }  // namespace LightGBM
@@ -12,7 +12,6 @@
 #include <memory>
 #include <vector>
-#include "cuda_tree_learner.h"
 #include "gpu_tree_learner.h"
 #include "serial_tree_learner.h"
@@ -344,15 +344,7 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree, const std::set<int>* fo
   }
   bool use_subtract = parent_leaf_histogram_array_ != nullptr;
-#ifdef USE_CUDA
-  if (LGBM_config_::current_learner == use_cpu_learner) {
-    SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract);
-  } else {
-    ConstructHistograms(is_feature_used, use_subtract);
-  }
-#else
   ConstructHistograms(is_feature_used, use_subtract);
-#endif
   FindBestSplitsFromHistograms(is_feature_used, use_subtract, tree);
 }
@@ -211,7 +211,7 @@ class SerialTreeLearner: public TreeLearner {
   std::vector<score_t, boost::alignment::aligned_allocator<score_t, 4096>> ordered_gradients_;
   /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */
   std::vector<score_t, boost::alignment::aligned_allocator<score_t, 4096>> ordered_hessians_;
-#elif defined(USE_CUDA) || defined(USE_CUDA_EXP)
+#elif defined(USE_CUDA)
   /*! \brief gradients of current iteration, ordered for cache optimized */
   std::vector<score_t, CHAllocator<score_t>> ordered_gradients_;
   /*! \brief hessians of current iteration, ordered for cache optimized */
@@ -4,7 +4,6 @@
  */
 #include <LightGBM/tree_learner.h>
-#include "cuda_tree_learner.h"
 #include "gpu_tree_learner.h"
 #include "linear_tree_learner.h"
 #include "parallel_tree_learner.h"
@@ -40,24 +39,14 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con
       return new VotingParallelTreeLearner<GPUTreeLearner>(config);
     }
   } else if (device_type == std::string("cuda")) {
-    if (learner_type == std::string("serial")) {
-      return new CUDATreeLearner(config);
-    } else if (learner_type == std::string("feature")) {
-      return new FeatureParallelTreeLearner<CUDATreeLearner>(config);
-    } else if (learner_type == std::string("data")) {
-      return new DataParallelTreeLearner<CUDATreeLearner>(config);
-    } else if (learner_type == std::string("voting")) {
-      return new VotingParallelTreeLearner<CUDATreeLearner>(config);
-    }
-  } else if (device_type == std::string("cuda_exp")) {
     if (learner_type == std::string("serial")) {
       if (config->num_gpu == 1) {
         return new CUDASingleGPUTreeLearner(config, boosting_on_cuda);
       } else {
-        Log::Fatal("cuda_exp only supports training on a single GPU.");
+        Log::Fatal("Currently cuda version only supports training on a single GPU.");
       }
     } else {
-      Log::Fatal("cuda_exp only supports training on a single machine.");
+      Log::Fatal("Currently cuda version only supports training on a single machine.");
     }
   }
   return nullptr;
@@ -501,7 +501,6 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Split(Tree* tree, int best_Leaf,
 }
 // instantiate template classes, otherwise linker cannot find the code
-template class VotingParallelTreeLearner<CUDATreeLearner>;
 template class VotingParallelTreeLearner<GPUTreeLearner>;
 template class VotingParallelTreeLearner<SerialTreeLearner>;
 }  // namespace LightGBM
@@ -48,7 +48,7 @@ def test_basic(tmp_path):
     assert bst.current_iteration() == 20
     assert bst.num_trees() == 20
     assert bst.num_model_per_iteration() == 1
-    if getenv('TASK', '') != 'cuda_exp':
+    if getenv('TASK', '') != 'cuda':
         assert bst.lower_bound() == pytest.approx(-2.9040190126976606)
         assert bst.upper_bound() == pytest.approx(3.3182142872462883)