[CUDA] consolidate CUDA versions (#5677)

* [ci] speed up if-else, swig, and lint conda setup * add 'source activate' * python constraint * start removing cuda v1 * comment out CI * remove more references * revert some unnecessaary changes * revert a few more mistakes * revert another change that ignored params * sigh * remove CUDATreeLearner * fix tests, docs * fix quoting in setup.py * restore all CI * Apply suggestions from code review Co-authored-by: shiyu1994 <shiyu_k1994@qq.com> * Apply suggestions from code review * completely remove cuda_exp, update docs --------- Co-authored-by: shiyu1994 <shiyu_k1994@qq.com>

[CUDA] consolidate CUDA versions (#5677)
* [ci] speed up if-else, swig, and lint conda setup * add 'source activate' * python constraint * start removing cuda v1 * comment out CI * remove more references * revert some unnecessaary changes * revert a few more mistakes * revert another change that ignored params * sigh * remove CUDATreeLearner * fix tests, docs * fix quoting in setup.py * restore all CI * Apply suggestions from code review Co-authored-by: shiyu1994 <shiyu_k1994@qq.com> * Apply suggestions from code review * completely remove cuda_exp, update docs --------- Co-authored-by: shiyu1994 <shiyu_k1994@qq.com>
4f47547c · James Lamb · GitHub · 5ffd7571 · 4f47547c · 4f47547c
Unverified Commit 4f47547c authored Jan 31, 2023 by James Lamb Committed by GitHub Feb 01, 2023
20 changed files
--- a/src/treelearner/cuda/cuda_histogram_constructor.cu
+++ b/src/treelearner/cuda/cuda_histogram_constructor.cu
@@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_histogram_constructor.hpp"

@@ -429,4 +429,4 @@ void CUDAHistogramConstructor::LaunchSubtractHistogramKernel(

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/treelearner/cuda/cuda_histogram_constructor.hpp
+++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp
@@ -6,7 +6,7 @@
 #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_
 #define LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_row_data.hpp>
 #include <LightGBM/feature_group.h>
@@ -165,5 +165,5 @@ class CUDAHistogramConstructor {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_
--- a/src/treelearner/cuda/cuda_leaf_splits.cpp
+++ b/src/treelearner/cuda/cuda_leaf_splits.cpp
@@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_leaf_splits.hpp"

@@ -68,4 +68,4 @@ void CUDALeafSplits::Resize(const data_size_t num_data) {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/treelearner/cuda/cuda_leaf_splits.cu
+++ b/src/treelearner/cuda/cuda_leaf_splits.cu
@@ -5,7 +5,7 @@
 */


-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_leaf_splits.hpp"
 #include <LightGBM/cuda/cuda_algorithms.hpp>
@@ -126,4 +126,4 @@ void CUDALeafSplits::LaunchInitValuesKernal(

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/treelearner/cuda/cuda_leaf_splits.hpp
+++ b/src/treelearner/cuda/cuda_leaf_splits.hpp
@@ -6,7 +6,7 @@
 #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_
 #define LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_utils.h>
 #include <LightGBM/bin.h>
@@ -156,5 +156,5 @@ class CUDALeafSplits {

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_
--- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp
+++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp
@@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_single_gpu_tree_learner.hpp"

@@ -515,4 +515,4 @@ void CUDASingleGPUTreeLearner::CheckSplitValid(

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu
+++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu
@@ -4,7 +4,7 @@
 * license information.
 */

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include <LightGBM/cuda/cuda_algorithms.hpp>

@@ -258,4 +258,4 @@ void CUDASingleGPUTreeLearner::LaunchConstructBitsetForCategoricalSplitKernel(

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
--- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp
+++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp
@@ -9,7 +9,7 @@
 #include <memory>
 #include <vector>

-#ifdef USE_CUDA_EXP
+#ifdef USE_CUDA

 #include "cuda_leaf_splits.hpp"
 #include "cuda_histogram_constructor.hpp"
@@ -137,7 +137,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner {

 }  // namespace LightGBM

-#else  // USE_CUDA_EXP
+#else  // USE_CUDA

 // When GPU support is not compiled in, quit with an error message

@@ -147,12 +147,12 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner {
 public:
    #pragma warning(disable : 4702)
    explicit CUDASingleGPUTreeLearner(const Config* tree_config, const bool /*boosting_on_cuda*/) : SerialTreeLearner(tree_config) {
-      Log::Fatal("CUDA Tree Learner experimental version was not enabled in this build.\n"
-                 "Please recompile with CMake option -DUSE_CUDA_EXP=1");
+      Log::Fatal("CUDA Tree Learner was not enabled in this build.\n"
+                 "Please recompile with CMake option -DUSE_CUDAP=1");
    }
 };

 }  // namespace LightGBM

-#endif  // USE_CUDA_EXP
+#endif  // USE_CUDA
 #endif  // LIGHTGBM_TREELEARNER_CUDA_CUDA_SINGLE_GPU_TREE_LEARNER_HPP_
--- a/src/treelearner/cuda_kernel_launcher.cu
+++ b/src/treelearner/cuda_kernel_launcher.cu
-/*!
- * Copyright (c) 2020 IBM Corporation. All rights reserved.
- * Licensed under the MIT License. See LICENSE file in the project root for license information.
- */
-#ifdef USE_CUDA
-
-#include "cuda_kernel_launcher.h"
-
-#include <LightGBM/utils/log.h>
-
-#include <cuda_runtime.h>
-
-#include <cstdio>
-
-namespace LightGBM {
-
-void cuda_histogram(
-                int             histogram_size,
-                data_size_t     leaf_num_data,
-                data_size_t     num_data,
-                bool            use_all_features,
-                bool            is_constant_hessian,
-                int             num_workgroups,
-                cudaStream_t    stream,
-                uint8_t*        arg0,
-                uint8_t*        arg1,
-                data_size_t     arg2,
-                data_size_t*    arg3,
-                data_size_t     arg4,
-                score_t*        arg5,
-                score_t*        arg6,
-                score_t         arg6_const,
-                char*           arg7,
-                volatile int*   arg8,
-                void*           arg9,
-                size_t          exp_workgroups_per_feature) {
-  if (histogram_size == 16) {
-    if (leaf_num_data == num_data) {
-      if (use_all_features) {
-        if (!is_constant_hessian)
-          histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-           histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      } else {
-        if (!is_constant_hessian)
-           histogram16_fulldata<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-           histogram16_fulldata<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      }
-    } else {
-      if (use_all_features) {
-        // seems all features is always enabled, so this should be the same as fulldata
-        if (!is_constant_hessian)
-          histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      } else {
-        if (!is_constant_hessian)
-          histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram16<<<num_workgroups, 16, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      }
-    }
-  } else if (histogram_size == 64) {
-    if (leaf_num_data == num_data) {
-      if (use_all_features) {
-        if (!is_constant_hessian)
-          histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      } else {
-        if (!is_constant_hessian)
-          histogram64_fulldata<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram64_fulldata<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      }
-    } else {
-      if (use_all_features) {
-        // seems all features is always enabled, so this should be the same as fulldata
-        if (!is_constant_hessian)
-          histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      } else {
-        if (!is_constant_hessian)
-          histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram64<<<num_workgroups, 64, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      }
-    }
-  } else {
-    if (leaf_num_data == num_data) {
-      if (use_all_features) {
-        if (!is_constant_hessian)
-          histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      } else {
-        if (!is_constant_hessian)
-          histogram256_fulldata<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram256_fulldata<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      }
-    } else {
-      if (use_all_features) {
-        // seems all features is always enabled, so this should be the same as fulldata
-        if (!is_constant_hessian)
-          histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      } else {
-        if (!is_constant_hessian)
-          histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-        else
-          histogram256<<<num_workgroups, 256, 0, stream>>>(arg0, arg1, arg2,
-                  arg3, arg4, arg5,
-                  arg6_const, arg7, arg8, static_cast<acc_type*>(arg9), exp_workgroups_per_feature);
-      }
-    }
-  }
-}
-
-}  // namespace LightGBM
-
-#endif  // USE_CUDA
--- a/src/treelearner/cuda_kernel_launcher.h
+++ b/src/treelearner/cuda_kernel_launcher.h
-/*!
- * Copyright (c) 2020 IBM Corporation. All rights reserved.
- * Licensed under the MIT License. See LICENSE file in the project root for license information.
- */
-#ifndef LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_
-#define LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_
-
-#ifdef USE_CUDA
-#include <chrono>
-#include "kernels/histogram_16_64_256.hu"  // kernel, acc_type, data_size_t, uchar, score_t
-
-namespace LightGBM {
-
-struct ThreadData {
-          // device id
-          int             device_id;
-          // parameters for cuda_histogram
-          int             histogram_size;
-          data_size_t     leaf_num_data;
-          data_size_t     num_data;
-          bool            use_all_features;
-          bool            is_constant_hessian;
-          int             num_workgroups;
-          cudaStream_t    stream;
-          uint8_t*        device_features;
-          uint8_t*        device_feature_masks;
-          data_size_t*    device_data_indices;
-          score_t*        device_gradients;
-          score_t*        device_hessians;
-          score_t         hessians_const;
-          char*           device_subhistograms;
-          volatile int*   sync_counters;
-          void*           device_histogram_outputs;
-          size_t          exp_workgroups_per_feature;
-          // cuda events
-          cudaEvent_t*    kernel_start;
-          cudaEvent_t*    kernel_wait_obj;
-          std::chrono::duration<double, std::milli>* kernel_input_wait_time;
-          // copy histogram
-          size_t        output_size;
-          char*                 host_histogram_output;
-          cudaEvent_t*          histograms_wait_obj;
-};
-
-
-void cuda_histogram(
-                int             histogram_size,
-                data_size_t     leaf_num_data,
-                data_size_t     num_data,
-                bool            use_all_features,
-                bool            is_constant_hessian,
-                int             num_workgroups,
-                cudaStream_t    stream,
-                uint8_t*        arg0,
-                uint8_t*        arg1,
-                data_size_t     arg2,
-                data_size_t*    arg3,
-                data_size_t     arg4,
-                score_t*        arg5,
-                score_t*        arg6,
-                score_t         arg6_const,
-                char*           arg7,
-                volatile int*   arg8,
-                void*           arg9,
-                size_t          exp_workgroups_per_feature);
-
-}  // namespace LightGBM
-
-#endif  // USE_CUDA
-#endif  // LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_
--- a/src/treelearner/cuda_tree_learner.cpp
+++ b/src/treelearner/cuda_tree_learner.cpp
-/*!
- * Copyright (c) 2020 IBM Corporation. All rights reserved.
- * Licensed under the MIT License. See LICENSE file in the project root for license information.
- */
-#ifdef USE_CUDA
-#include "cuda_tree_learner.h"
-
-#include <LightGBM/bin.h>
-#include <LightGBM/network.h>
-#include <LightGBM/cuda/cuda_utils.h>
-#include <LightGBM/utils/array_args.h>
-#include <LightGBM/utils/common.h>
-
-#include <pthread.h>
-
-#include <algorithm>
-#include <cinttypes>
-#include <vector>
-
-#include "../io/dense_bin.hpp"
-
-namespace LightGBM {
-
-#define cudaMemcpy_DEBUG 0  // 1: DEBUG cudaMemcpy
-#define ResetTrainingData_DEBUG 0  // 1: Debug ResetTrainingData
-
-#define CUDA_DEBUG 0
-
-static void *launch_cuda_histogram(void *thread_data) {
-  ThreadData td = *(reinterpret_cast<ThreadData*>(thread_data));
-  int device_id = td.device_id;
-  CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id));
-
-  // launch cuda kernel
-  cuda_histogram(td.histogram_size,
-                 td.leaf_num_data, td.num_data, td.use_all_features,
-                 td.is_constant_hessian, td.num_workgroups, td.stream,
-                 td.device_features,
-                 td.device_feature_masks,
-                 td.num_data,
-                 td.device_data_indices,
-                 td.leaf_num_data,
-                 td.device_gradients,
-                 td.device_hessians, td.hessians_const,
-                 td.device_subhistograms, td.sync_counters,
-                 td.device_histogram_outputs,
-                 td.exp_workgroups_per_feature);
-
-  CUDASUCCESS_OR_FATAL(cudaGetLastError());
-
-  return NULL;
-}
-
-CUDATreeLearner::CUDATreeLearner(const Config* config)
-  :SerialTreeLearner(config) {
-  use_bagging_ = false;
-  nthreads_ = 0;
-  if (config->gpu_use_dp && USE_DP_FLOAT) {
-    Log::Info("LightGBM using CUDA trainer with DP float!!");
-  } else {
-    Log::Info("LightGBM using CUDA trainer with SP float!!");
-  }
-}
-
-CUDATreeLearner::~CUDATreeLearner() {
-  #pragma omp parallel for schedule(static, num_gpu_)
-
-  for (int device_id = 0; device_id < num_gpu_; ++device_id) {
-    CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id));
-
-    if (device_features_[device_id] != NULL) {
-      CUDASUCCESS_OR_FATAL(cudaFree(device_features_[device_id]));
-    }
-
-    if (device_gradients_[device_id] != NULL) {
-      CUDASUCCESS_OR_FATAL(cudaFree(device_gradients_[device_id]));
-    }
-
-    if (device_hessians_[device_id] != NULL) {
-      CUDASUCCESS_OR_FATAL(cudaFree(device_hessians_[device_id]));
-    }
-
-    if (device_feature_masks_[device_id] != NULL) {
-      CUDASUCCESS_OR_FATAL(cudaFree(device_feature_masks_[device_id]));
-    }
-
-    if (device_data_indices_[device_id] != NULL) {
-      CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id]));
-    }
-
-    if (sync_counters_[device_id] != NULL) {
-      CUDASUCCESS_OR_FATAL(cudaFree(sync_counters_[device_id]));
-    }
-
-    if (device_subhistograms_[device_id] != NULL) {
-      CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id]));
-    }
-
-    if (device_histogram_outputs_[device_id] != NULL) {
-      CUDASUCCESS_OR_FATAL(cudaFree(device_histogram_outputs_[device_id]));
-    }
-  }
-}
-
-
-void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) {
-  // initialize SerialTreeLearner
-  SerialTreeLearner::Init(train_data, is_constant_hessian);
-
-  // some additional variables needed for GPU trainer
-  num_feature_groups_ = train_data_->num_feature_groups();
-
-  // Initialize GPU buffers and kernels: get device info
-  InitGPU(config_->num_gpu);
-}
-
-// some functions used for debugging the GPU histogram construction
-#if CUDA_DEBUG > 0
-
-void PrintHistograms(hist_t* h, size_t size) {
-  double total_hess = 0;
-  for (size_t i = 0; i < size; ++i) {
-    printf("%03lu=%9.3g,%9.3g\t", i, GET_GRAD(h, i), GET_HESS(h, i));
-    if ((i & 3) == 3)
-        printf("\n");
-    total_hess += GET_HESS(h, i);
-  }
-  printf("\nSum hessians: %9.3g\n", total_hess);
-}
-
-union Float_t {
-    int64_t i;
-    double f;
-    static int64_t ulp_diff(Float_t a, Float_t b) {
-      return abs(a.i - b.i);
-    }
-};
-
-int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag, int const_flag) {
-  int i;
-  int retval = 0;
-  printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, static_cast<int>(size));
-  if (dp_flag) {  // double precision
-    double af, bf;
-    int64_t ai, bi;
-    for (i = 0; i < static_cast<int>(size); ++i) {
-      af = GET_GRAD(h1, i);
-      bf = GET_GRAD(h2, i);
-      if ((((std::fabs(af - bf))/af) >= 1e-6) && ((std::fabs(af - bf)) >= 1e-6)) {
-        printf("i = %5d, h1.grad %13.6lf, h2.grad %13.6lf\n", i, af, bf);
-        ++retval;
-      }
-      if (const_flag) {
-        ai = GET_HESS((reinterpret_cast<int64_t *>(h1)), i);
-        bi = GET_HESS((reinterpret_cast<int64_t *>(h2)), i);
-        if (ai != bi) {
-          printf("i = %5d, h1.hess %" PRId64 ", h2.hess %" PRId64 "\n", i, ai, bi);
-          ++retval;
-        }
-      } else {
-        af = GET_HESS(h1, i);
-        bf = GET_HESS(h2, i);
-        if (((std::fabs(af - bf))/af) >= 1e-6) {
-          printf("i = %5d, h1.hess %13.6lf, h2.hess %13.6lf\n", i, af, bf);
-          ++retval;
-        }
-      }
-    }
-  } else {  // single precision
-    float af, bf;
-    int ai, bi;
-    for (i = 0; i < static_cast<int>(size); ++i) {
-      af = GET_GRAD(h1, i);
-      bf = GET_GRAD(h2, i);
-      if ((((std::fabs(af - bf))/af) >= 1e-6) && ((std::fabs(af - bf)) >= 1e-6)) {
-        printf("i = %5d, h1.grad %13.6f, h2.grad %13.6f\n", i, af, bf);
-        ++retval;
-      }
-      if (const_flag) {
-        ai = GET_HESS(h1, i);
-        bi = GET_HESS(h2, i);
-        if (ai != bi) {
-          printf("i = %5d, h1.hess %d, h2.hess %d\n", i, ai, bi);
-          ++retval;
-        }
-      } else {
-        af = GET_HESS(h1, i);
-        bf = GET_HESS(h2, i);
-        if (((std::fabs(af - bf))/af) >= 1e-5) {
-          printf("i = %5d, h1.hess %13.6f, h2.hess %13.6f\n", i, af, bf);
-          ++retval;
-        }
-      }
-    }
-  }
-  printf("DONE Comparing Histograms...\n");
-  return retval;
-}
-#endif
-
-int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) {
-  // we roughly want 256 workgroups per device, and we have num_dense_feature4_ feature tuples.
-  // also guarantee that there are at least 2K examples per workgroup
-  double x = 256.0 / num_dense_feature_groups_;
-
-  int exp_workgroups_per_feature = static_cast<int>(ceil(log2(x)));
-  double t = leaf_num_data / 1024.0;
-
-  Log::Debug("We can have at most %d workgroups per feature4 for efficiency reasons\n"
-         "Best workgroup size per feature for full utilization is %d\n", static_cast<int>(ceil(t)), (1 << exp_workgroups_per_feature));
-
-  exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, static_cast<int>(ceil(log(static_cast<double>(t))/log(2.0))));
-  if (exp_workgroups_per_feature < 0)
-      exp_workgroups_per_feature = 0;
-  if (exp_workgroups_per_feature > kMaxLogWorkgroupsPerFeature)
-      exp_workgroups_per_feature = kMaxLogWorkgroupsPerFeature;
-
-  return exp_workgroups_per_feature;
-}
-
-void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_features) {
-  // we have already copied ordered gradients, ordered hessians and indices to GPU
-  // decide the best number of workgroups working on one feature4 tuple
-  // set work group size based on feature size
-  // each 2^exp_workgroups_per_feature workgroups work on a feature4 tuple
-  int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(leaf_num_data);
-  std::vector<int> num_gpu_workgroups;
-  ThreadData *thread_data = reinterpret_cast<ThreadData*>(_mm_malloc(sizeof(ThreadData) * num_gpu_, 16));
-
-  for (int device_id = 0; device_id < num_gpu_; ++device_id) {
-    int num_gpu_feature_groups = num_gpu_feature_groups_[device_id];
-    int num_workgroups = (1 << exp_workgroups_per_feature) * num_gpu_feature_groups;
-    num_gpu_workgroups.push_back(num_workgroups);
-    if (num_workgroups > preallocd_max_num_wg_[device_id]) {
-      preallocd_max_num_wg_.at(device_id) = num_workgroups;
-      CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id]));
-      CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), static_cast<size_t>(num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))));
-    }
-    // set thread_data
-    SetThreadData(thread_data, device_id, histogram_size_, leaf_num_data, use_all_features,
-                  num_workgroups, exp_workgroups_per_feature);
-  }
-
-  for (int device_id = 0; device_id < num_gpu_; ++device_id) {
-    if (pthread_create(cpu_threads_[device_id], NULL, launch_cuda_histogram, reinterpret_cast<void *>(&thread_data[device_id]))) {
-      Log::Fatal("Error in creating threads.");
-    }
-  }
-
-  /* Wait for the threads to finish */
-  for (int device_id = 0; device_id < num_gpu_; ++device_id) {
-    if (pthread_join(*(cpu_threads_[device_id]), NULL)) {
-      Log::Fatal("Error in joining threads.");
-    }
-  }
-
-  for (int device_id = 0; device_id < num_gpu_; ++device_id) {
-    // copy the results asynchronously. Size depends on if double precision is used
-
-    size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_;
-    size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_;
-
-    CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(reinterpret_cast<char*>(host_histogram_outputs_) + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id]));
-    CUDASUCCESS_OR_FATAL(cudaEventRecord(histograms_wait_obj_[device_id], stream_[device_id]));
-  }
-}
-
-
-template <typename HistType>
-void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array) {
-  HistType* hist_outputs = reinterpret_cast<HistType*>(host_histogram_outputs_);
-
-  #pragma omp parallel for schedule(static, num_gpu_)
-  for (int device_id = 0; device_id < num_gpu_; ++device_id) {
-    // when the output is ready, the computation is done
-    CUDASUCCESS_OR_FATAL(cudaEventSynchronize(histograms_wait_obj_[device_id]));
-  }
-
-  HistType* histograms = reinterpret_cast<HistType*>(leaf_histogram_array[0].RawData() - kHistOffset);
-  #pragma omp parallel for schedule(static)
-  for (int i = 0; i < num_dense_feature_groups_; ++i) {
-    if (!feature_masks_[i]) {
-      continue;
-    }
-    int dense_group_index = dense_feature_group_map_[i];
-    auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2;
-    int bin_size = train_data_->FeatureGroupNumBin(dense_group_index);
-
-    for (int j = 0; j < bin_size; ++j) {
-      GET_GRAD(old_histogram_array, j) = GET_GRAD(hist_outputs, i * device_bin_size_+ j);
-      GET_HESS(old_histogram_array, j) = GET_HESS(hist_outputs, i * device_bin_size_+ j);
-    }
-  }
-}
-
-void CUDATreeLearner::CountDenseFeatureGroups() {
-  num_dense_feature_groups_ = 0;
-
-  for (int i = 0; i < num_feature_groups_; ++i) {
-    if (!train_data_->IsMultiGroup(i)) {
-      num_dense_feature_groups_++;
-    }
-  }
-  if (!num_dense_feature_groups_) {
-    Log::Warning("GPU acceleration is disabled because no non-trivial dense features can be found");
-  }
-}
-
-void CUDATreeLearner::prevAllocateGPUMemory() {
-  // how many feature-group tuples we have
-  // leave some safe margin for prefetching
-  // 256 work-items per workgroup. Each work-item prefetches one tuple for that feature
-
-  allocated_num_data_ = std::max(num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature), allocated_num_data_);
-
-  // clear sparse/dense maps
-  dense_feature_group_map_.clear();
-  sparse_feature_group_map_.clear();
-
-  // do nothing it there is no dense feature
-  if (!num_dense_feature_groups_) {
-    return;
-  }
-
-  // calculate number of feature groups per gpu
-  num_gpu_feature_groups_.resize(num_gpu_);
-  offset_gpu_feature_groups_.resize(num_gpu_);
-  int num_features_per_gpu = num_dense_feature_groups_ / num_gpu_;
-  int remain_features = num_dense_feature_groups_ - num_features_per_gpu * num_gpu_;
-
-  int offset = 0;
-
-  for (int i = 0; i < num_gpu_; ++i) {
-    offset_gpu_feature_groups_.at(i) = offset;
-    num_gpu_feature_groups_.at(i) = (i < remain_features) ? num_features_per_gpu + 1 : num_features_per_gpu;
-    offset += num_gpu_feature_groups_.at(i);
-  }
-
-  feature_masks_.resize(num_dense_feature_groups_);
-  Log::Debug("Resized feature masks");
-
-  ptr_pinned_feature_masks_ = feature_masks_.data();
-  Log::Debug("Memset pinned_feature_masks_");
-  memset(ptr_pinned_feature_masks_, 0, num_dense_feature_groups_);
-
-  // histogram bin entry size depends on the precision (single/double)
-  hist_bin_entry_sz_ = 2 * (config_->gpu_use_dp ? sizeof(hist_t) : sizeof(gpu_hist_t));  // two elements in this "size"
-
-  CUDASUCCESS_OR_FATAL(cudaHostAlloc(reinterpret_cast<void **>(&host_histogram_outputs_), static_cast<size_t>(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_), cudaHostAllocPortable));
-
-  nthreads_ = std::min(omp_get_max_threads(), num_dense_feature_groups_ / dword_features_);
-  nthreads_ = std::max(nthreads_, 1);
-}
-
-// allocate GPU memory for each GPU
-void CUDATreeLearner::AllocateGPUMemory() {
-  #pragma omp parallel for schedule(static, num_gpu_)
-
-  for (int device_id = 0; device_id < num_gpu_; ++device_id) {
-    // do nothing it there is no gpu feature
-    int num_gpu_feature_groups = num_gpu_feature_groups_[device_id];
-    if (num_gpu_feature_groups) {
-      CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id));
-
-      // allocate memory for all features
-      if (device_features_[device_id] != NULL) {
-        CUDASUCCESS_OR_FATAL(cudaFree(device_features_[device_id]));
-      }
-
-      CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_features_[device_id]),  static_cast<size_t>(num_gpu_feature_groups * num_data_ * sizeof(uint8_t))));
-      Log::Debug("Allocated device_features_ addr=%p sz=%lu", device_features_[device_id], num_gpu_feature_groups * num_data_);
-
-      // allocate space for gradients and hessians on device
-      // we will copy gradients and hessians in after ordered_gradients_ and ordered_hessians_ are constructed
-      if (device_gradients_[device_id] != NULL) {
-        CUDASUCCESS_OR_FATAL(cudaFree(device_gradients_[device_id]));
-      }
-
-      if (device_hessians_[device_id] != NULL) {
-        CUDASUCCESS_OR_FATAL(cudaFree(device_hessians_[device_id]));
-      }
-
-      if (device_feature_masks_[device_id] != NULL) {
-         CUDASUCCESS_OR_FATAL(cudaFree(device_feature_masks_[device_id]));
-      }
-
-      CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_gradients_[device_id]), static_cast<size_t>(allocated_num_data_ * sizeof(score_t))));
-      CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_hessians_[device_id]),  static_cast<size_t>(allocated_num_data_ * sizeof(score_t))));
-
-      CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_feature_masks_[device_id]), static_cast<size_t>(num_gpu_feature_groups)));
-
-      // copy indices to the device
-      if (device_data_indices_[device_id] != NULL) {
-        CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id]));
-      }
-
-      CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_data_indices_[device_id]), static_cast<size_t>(allocated_num_data_ * sizeof(data_size_t))));
-      CUDASUCCESS_OR_FATAL(cudaMemsetAsync(device_data_indices_[device_id], 0, allocated_num_data_ * sizeof(data_size_t), stream_[device_id]));
-
-      Log::Debug("Memset device_data_indices_");
-
-      // create output buffer, each feature has a histogram with device_bin_size_ bins,
-      // each work group generates a sub-histogram of dword_features_ features.
-      if (!device_subhistograms_[device_id]) {
-        // only initialize once here, as this will not need to change when ResetTrainingData() is called
-        CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), static_cast<size_t>(preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2))));
-
-        Log::Debug("created device_subhistograms_: %p", device_subhistograms_[device_id]);
-      }
-
-      // create atomic counters for inter-group coordination
-      CUDASUCCESS_OR_FATAL(cudaFree(sync_counters_[device_id]));
-      CUDASUCCESS_OR_FATAL(cudaMalloc(&(sync_counters_[device_id]), static_cast<size_t>(num_gpu_feature_groups * sizeof(int))));
-      CUDASUCCESS_OR_FATAL(cudaMemsetAsync(sync_counters_[device_id], 0, num_gpu_feature_groups * sizeof(int), stream_[device_id]));
-
-      // The output buffer is allocated to host directly, to overlap compute and data transfer
-      CUDASUCCESS_OR_FATAL(cudaFree(device_histogram_outputs_[device_id]));
-      CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_histogram_outputs_[device_id]), static_cast<size_t>(num_gpu_feature_groups * device_bin_size_ * hist_bin_entry_sz_)));
-    }
-  }
-}
-
-void CUDATreeLearner::ResetGPUMemory() {
-  // clear sparse/dense maps
-  dense_feature_group_map_.clear();
-  sparse_feature_group_map_.clear();
-}
-
-void CUDATreeLearner::copyDenseFeature() {
-  if (num_feature_groups_ == 0) {
-    LGBM_config_::current_learner = use_cpu_learner;
-    return;
-  }
-
-  Log::Debug("Started copying dense features from CPU to GPU");
-  // find the dense feature-groups and group then into Feature4 data structure (several feature-groups packed into 4 bytes)
-  size_t copied_feature = 0;
-  // set device info
-  int device_id = 0;
-  uint8_t* device_features = device_features_[device_id];
-  CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id));
-  Log::Debug("Started copying dense features from CPU to GPU - 1");
-
-  for (int i = 0; i < num_feature_groups_; ++i) {
-    // looking for dword_features_ non-sparse feature-groups
-    if (!train_data_->IsMultiGroup(i)) {
-      dense_feature_group_map_.push_back(i);
-      auto sizes_in_byte = std::min(train_data_->FeatureGroupSizesInByte(i), static_cast<size_t>(num_data_));
-      void* tmp_data = train_data_->FeatureGroupData(i);
-      Log::Debug("Started copying dense features from CPU to GPU - 2");
-      CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(&device_features[copied_feature * num_data_], tmp_data, sizes_in_byte, cudaMemcpyHostToDevice, stream_[device_id]));
-      Log::Debug("Started copying dense features from CPU to GPU - 3");
-      copied_feature++;
-      // reset device info
-      if (copied_feature == static_cast<size_t>(num_gpu_feature_groups_[device_id])) {
-         CUDASUCCESS_OR_FATAL(cudaEventRecord(features_future_[device_id], stream_[device_id]));
-         device_id += 1;
-         copied_feature = 0;
-         if (device_id < num_gpu_) {
-           device_features = device_features_[device_id];
-           CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id));
-         }
-      }
-    } else {
-      sparse_feature_group_map_.push_back(i);
-    }
-  }
-}
-
-
-
-// InitGPU w/ num_gpu
-void CUDATreeLearner::InitGPU(int num_gpu) {
-  // Get the max bin size, used for selecting best GPU kernel
-  max_num_bin_ = 0;
-
-  #if CUDA_DEBUG >= 1
-  printf("bin_size: ");
-  #endif
-  for (int i = 0; i < num_feature_groups_; ++i) {
-    if (train_data_->IsMultiGroup(i)) {
-      continue;
-    }
-    #if CUDA_DEBUG >= 1
-    printf("%d, ", train_data_->FeatureGroupNumBin(i));
-    #endif
-    max_num_bin_ = std::max(max_num_bin_, train_data_->FeatureGroupNumBin(i));
-  }
-  #if CUDA_DEBUG >= 1
-  printf("\n");
-  #endif
-
-  if (max_num_bin_ <= 16) {
-    device_bin_size_ = 16;
-    histogram_size_ = 16;
-    dword_features_ = 1;
-  } else if (max_num_bin_ <= 64) {
-    device_bin_size_ = 64;
-    histogram_size_ = 64;
-    dword_features_ = 1;
-  } else if (max_num_bin_ <= 256) {
-    Log::Debug("device_bin_size_ = 256");
-    device_bin_size_ = 256;
-    histogram_size_ = 256;
-    dword_features_ = 1;
-  } else {
-    Log::Fatal("bin size %d cannot run on GPU", max_num_bin_);
-  }
-
-  // ignore the feature groups that contain categorical features when producing warnings about max_bin.
-  // these groups may contain larger number of bins due to categorical features, but not due to the setting of max_bin.
-  int max_num_bin_no_categorical = 0;
-  int cur_feature_group = 0;
-  bool categorical_feature_found = false;
-  for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) {
-    const int feature_group = train_data_->Feature2Group(inner_feature_index);
-    const BinMapper* feature_bin_mapper = train_data_->FeatureBinMapper(inner_feature_index);
-    if (feature_bin_mapper->bin_type() == BinType::CategoricalBin) {
-      categorical_feature_found = true;
-    }
-    if (feature_group != cur_feature_group || inner_feature_index == num_features_ - 1) {
-      if (!categorical_feature_found) {
-        max_num_bin_no_categorical = std::max(max_num_bin_no_categorical, train_data_->FeatureGroupNumBin(cur_feature_group));
-      }
-      categorical_feature_found = false;
-      cur_feature_group = feature_group;
-    }
-  }
-  if (max_num_bin_no_categorical == 65) {
-    Log::Warning("Setting max_bin to 63 is suggested for best performance");
-  }
-  if (max_num_bin_no_categorical == 17) {
-    Log::Warning("Setting max_bin to 15 is suggested for best performance");
-  }
-
-  // get num_dense_feature_groups_
-  CountDenseFeatureGroups();
-
-  if (num_gpu > num_dense_feature_groups_) num_gpu = num_dense_feature_groups_;
-
-  // initialize GPU
-  int gpu_count;
-
-  CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&gpu_count));
-  num_gpu_ = (gpu_count < num_gpu) ? gpu_count : num_gpu;
-
-  // set cpu threads
-  cpu_threads_ = reinterpret_cast<pthread_t **>(_mm_malloc(sizeof(pthread_t *)*num_gpu_, 16));
-  for (int device_id = 0; device_id < num_gpu_; ++device_id) {
-    cpu_threads_[device_id] = reinterpret_cast<pthread_t *>(_mm_malloc(sizeof(pthread_t), 16));
-  }
-
-  // resize device memory pointers
-  device_features_.resize(num_gpu_);
-  device_gradients_.resize(num_gpu_);
-  device_hessians_.resize(num_gpu_);
-  device_feature_masks_.resize(num_gpu_);
-  device_data_indices_.resize(num_gpu_);
-  sync_counters_.resize(num_gpu_);
-  device_subhistograms_.resize(num_gpu_);
-  device_histogram_outputs_.resize(num_gpu_);
-
-  // create stream & events to handle multiple GPUs
-  preallocd_max_num_wg_.resize(num_gpu_, 1024);
-  stream_.resize(num_gpu_);
-  hessians_future_.resize(num_gpu_);
-  gradients_future_.resize(num_gpu_);
-  indices_future_.resize(num_gpu_);
-  features_future_.resize(num_gpu_);
-  kernel_start_.resize(num_gpu_);
-  kernel_wait_obj_.resize(num_gpu_);
-  histograms_wait_obj_.resize(num_gpu_);
-
-  for (int i = 0; i < num_gpu_; ++i) {
-    CUDASUCCESS_OR_FATAL(cudaSetDevice(i));
-    CUDASUCCESS_OR_FATAL(cudaStreamCreate(&(stream_[i])));
-    CUDASUCCESS_OR_FATAL(cudaEventCreate(&(hessians_future_[i])));
-    CUDASUCCESS_OR_FATAL(cudaEventCreate(&(gradients_future_[i])));
-    CUDASUCCESS_OR_FATAL(cudaEventCreate(&(indices_future_[i])));
-    CUDASUCCESS_OR_FATAL(cudaEventCreate(&(features_future_[i])));
-    CUDASUCCESS_OR_FATAL(cudaEventCreate(&(kernel_start_[i])));
-    CUDASUCCESS_OR_FATAL(cudaEventCreate(&(kernel_wait_obj_[i])));
-    CUDASUCCESS_OR_FATAL(cudaEventCreate(&(histograms_wait_obj_[i])));
-  }
-
-  allocated_num_data_ = 0;
-  prevAllocateGPUMemory();
-
-  AllocateGPUMemory();
-
-  copyDenseFeature();
-}
-
-Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_first_tree) {
-  Tree *ret = SerialTreeLearner::Train(gradients, hessians, is_first_tree);
-  return ret;
-}
-
-void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) {
-  // check data size
-  data_size_t old_allocated_num_data = allocated_num_data_;
-
-  SerialTreeLearner::ResetTrainingDataInner(train_data, is_constant_hessian, reset_multi_val_bin);
-
-  #if ResetTrainingData_DEBUG == 1
-  serial_time = std::chrono::steady_clock::now() - start_serial_time;
-  #endif
-
-  num_feature_groups_ = train_data_->num_feature_groups();
-
-  // GPU memory has to been reallocated because data may have been changed
-  #if ResetTrainingData_DEBUG == 1
-  auto start_alloc_gpu_time = std::chrono::steady_clock::now();
-  #endif
-
-  // AllocateGPUMemory only when the number of data increased
-  int old_num_feature_groups = num_dense_feature_groups_;
-  CountDenseFeatureGroups();
-  if ((old_allocated_num_data < (num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature))) || (old_num_feature_groups < num_dense_feature_groups_)) {
-    prevAllocateGPUMemory();
-    AllocateGPUMemory();
-  } else {
-    ResetGPUMemory();
-  }
-
-  copyDenseFeature();
-
-  #if ResetTrainingData_DEBUG == 1
-  alloc_gpu_time = std::chrono::steady_clock::now() - start_alloc_gpu_time;
-  #endif
-
-  // setup GPU kernel arguments after we allocating all the buffers
-  #if ResetTrainingData_DEBUG == 1
-  auto start_set_arg_time = std::chrono::steady_clock::now();
-  #endif
-
-  #if ResetTrainingData_DEBUG == 1
-  set_arg_time = std::chrono::steady_clock::now() - start_set_arg_time;
-  reset_training_data_time = std::chrono::steady_clock::now() - start_reset_training_data_time;
-  Log::Info("reset_training_data_time: %f secs.", reset_training_data_time.count() * 1e-3);
-  Log::Info("serial_time: %f secs.", serial_time.count() * 1e-3);
-  Log::Info("alloc_gpu_time: %f secs.", alloc_gpu_time.count() * 1e-3);
-  Log::Info("set_arg_time: %f secs.", set_arg_time.count() * 1e-3);
-  #endif
-}
-
-void CUDATreeLearner::BeforeTrain() {
-  #if cudaMemcpy_DEBUG == 1
-  std::chrono::duration<double, std::milli> device_hessians_time = std::chrono::milliseconds(0);
-  std::chrono::duration<double, std::milli> device_gradients_time = std::chrono::milliseconds(0);
-  #endif
-
-  SerialTreeLearner::BeforeTrain();
-
-  #if CUDA_DEBUG >= 2
-  printf("CUDATreeLearner::BeforeTrain() Copying initial full gradients and hessians to device\n");
-  #endif
-
-  // Copy initial full hessians and gradients to GPU.
-  // We start copying as early as possible, instead of at ConstructHistogram().
-  if ((hessians_ != NULL) && (gradients_ != NULL)) {
-    if (!use_bagging_ && num_dense_feature_groups_) {
-      Log::Debug("CudaTreeLearner::BeforeTrain() No baggings, dense_feature_groups_=%d", num_dense_feature_groups_);
-
-      for (int device_id = 0; device_id < num_gpu_; ++device_id) {
-        if (!(share_state_->is_constant_hessian)) {
-          Log::Debug("CUDATreeLearner::BeforeTrain(): Starting hessians_ -> device_hessians_");
-
-          #if cudaMemcpy_DEBUG == 1
-          auto start_device_hessians_time = std::chrono::steady_clock::now();
-          #endif
-
-          CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], hessians_, num_data_*sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id]));
-
-          CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id]));
-
-          #if cudaMemcpy_DEBUG == 1
-          device_hessians_time = std::chrono::steady_clock::now() - start_device_hessians_time;
-          #endif
-
-          Log::Debug("queued copy of device_hessians_");
-        }
-
-        #if cudaMemcpy_DEBUG == 1
-        auto start_device_gradients_time = std::chrono::steady_clock::now();
-        #endif
-
-        CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], gradients_, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id]));
-        CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id]));
-
-        #if cudaMemcpy_DEBUG == 1
-        device_gradients_time = std::chrono::steady_clock::now() - start_device_gradients_time;
-        #endif
-
-        Log::Debug("CUDATreeLearner::BeforeTrain: issued gradients_ -> device_gradients_");
-      }
-    }
-  }
-
-  // use bagging
-  if ((hessians_ != NULL) && (gradients_ != NULL)) {
-    if (data_partition_->leaf_count(0) != num_data_ && num_dense_feature_groups_) {
-      // On GPU, we start copying indices, gradients and hessians now, instead at ConstructHistogram()
-      // copy used gradients and hessians to ordered buffer
-      const data_size_t* indices = data_partition_->indices();
-      data_size_t cnt = data_partition_->leaf_count(0);
-
-      // transfer the indices to GPU
-      for (int device_id = 0; device_id < num_gpu_; ++device_id) {
-        CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], indices, cnt * sizeof(*indices), cudaMemcpyHostToDevice, stream_[device_id]));
-        CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id]));
-
-        if (!(share_state_->is_constant_hessian)) {
-          CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], const_cast<void*>(reinterpret_cast<const void*>(&(hessians_[0]))), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id]));
-          CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id]));
-        }
-
-        CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], const_cast<void*>(reinterpret_cast<const void*>(&(gradients_[0]))), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id]));
-        CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id]));
-      }
-    }
-  }
-}
-
-bool CUDATreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) {
-  int smaller_leaf;
-
-  data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf);
-  data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
-
-  // only have root
-  if (right_leaf < 0) {
-    smaller_leaf = -1;
-  } else if (num_data_in_left_child < num_data_in_right_child) {
-    smaller_leaf = left_leaf;
-  } else {
-    smaller_leaf = right_leaf;
-  }
-
-  // Copy indices, gradients and hessians as early as possible
-  if (smaller_leaf >= 0 && num_dense_feature_groups_) {
-    // only need to initialize for smaller leaf
-    // Get leaf boundary
-    const data_size_t* indices = data_partition_->indices();
-    data_size_t begin = data_partition_->leaf_begin(smaller_leaf);
-    data_size_t end = begin + data_partition_->leaf_count(smaller_leaf);
-
-    for (int device_id = 0; device_id < num_gpu_; ++device_id) {
-      CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], &indices[begin], (end-begin) * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id]));
-      CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id]));
-    }
-  }
-
-  const bool ret = SerialTreeLearner::BeforeFindBestSplit(tree, left_leaf, right_leaf);
-
-  return ret;
-}
-
-bool CUDATreeLearner::ConstructGPUHistogramsAsync(
-  const std::vector<int8_t>& is_feature_used,
-  const data_size_t* data_indices, data_size_t num_data) {
-  if (num_data <= 0) {
-    return false;
-  }
-
-  // do nothing if no features can be processed on GPU
-  if (!num_dense_feature_groups_) {
-    Log::Debug("no dense feature groups, returning");
-    return false;
-  }
-
-  // copy data indices if it is not null
-  if (data_indices != nullptr && num_data != num_data_) {
-    for (int device_id = 0; device_id < num_gpu_; ++device_id) {
-      CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], data_indices, num_data * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id]));
-      CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id]));
-    }
-  }
-
-  // converted indices in is_feature_used to feature-group indices
-  std::vector<int8_t> is_feature_group_used(num_feature_groups_, 0);
-
-  #pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048)
-  for (int i = 0; i < num_features_; ++i) {
-    if (is_feature_used[i]) {
-      int feature_group = train_data_->Feature2Group(i);
-      is_feature_group_used[feature_group] = (train_data_->FeatureGroupNumBin(feature_group) <= 16) ? 2 : 1;
-    }
-  }
-
-  // construct the feature masks for dense feature-groups
-  int used_dense_feature_groups = 0;
-  #pragma omp parallel for schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048)
-  for (int i = 0; i < num_dense_feature_groups_; ++i) {
-    if (is_feature_group_used[dense_feature_group_map_[i]]) {
-      feature_masks_[i] = is_feature_group_used[dense_feature_group_map_[i]];
-      ++used_dense_feature_groups;
-    } else {
-      feature_masks_[i] = 0;
-    }
-  }
-  bool use_all_features = ((used_dense_feature_groups == num_dense_feature_groups_) && (data_indices != nullptr));
-  // if no feature group is used, just return and do not use GPU
-  if (used_dense_feature_groups == 0) {
-    return false;
-  }
-
-  // if not all feature groups are used, we need to transfer the feature mask to GPU
-  // otherwise, we will use a specialized GPU kernel with all feature groups enabled
-
-  // We now copy even if all features are used.
-  #pragma omp parallel for schedule(static, num_gpu_)
-  for (int device_id = 0; device_id < num_gpu_; ++device_id) {
-    int offset = offset_gpu_feature_groups_[device_id];
-    CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice, stream_[device_id]));
-  }
-
-  // All data have been prepared, now run the GPU kernel
-  GPUHistogram(num_data, use_all_features);
-
-  return true;
-}
-
-void CUDATreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
-  std::vector<int8_t> is_sparse_feature_used(num_features_, 0);
-  std::vector<int8_t> is_dense_feature_used(num_features_, 0);
-  int num_dense_features = 0, num_sparse_features = 0;
-
-  #pragma omp parallel for schedule(static)
-  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
-    if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue;
-    if (!is_feature_used[feature_index]) continue;
-    if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) {
-      is_sparse_feature_used[feature_index] = 1;
-      num_sparse_features++;
-    } else {
-      is_dense_feature_used[feature_index] = 1;
-      num_dense_features++;
-    }
-  }
-
-  // construct smaller leaf
-  hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
-
-  // Check workgroups per feature4 tuple..
-  int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf());
-
-  // if the workgroup per feature is 1 (2^0), return as the work is too small for a GPU
-  if (exp_workgroups_per_feature == 0) {
-    return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract);
-  }
-
-  // ConstructGPUHistogramsAsync will return true if there are availabe feature groups dispatched to GPU
-  bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
-    nullptr, smaller_leaf_splits_->num_data_in_leaf());
-
-  // then construct sparse features on CPU
-  // We set data_indices to null to avoid rebuilding ordered gradients/hessians
-  if (num_sparse_features > 0) {
-    train_data_->ConstructHistograms(is_sparse_feature_used,
-    smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(),
-    gradients_, hessians_,
-    ordered_gradients_.data(), ordered_hessians_.data(),
-    share_state_.get(),
-    ptr_smaller_leaf_hist_data);
-  }
-
-  // wait for GPU to finish, only if GPU is actually used
-  if (is_gpu_used) {
-    if (config_->gpu_use_dp) {
-      // use double precision
-      WaitAndGetHistograms<hist_t>(smaller_leaf_histogram_array_);
-    } else {
-      // use single precision
-      WaitAndGetHistograms<gpu_hist_t>(smaller_leaf_histogram_array_);
-    }
-  }
-
-  // Compare GPU histogram with CPU histogram, useful for debuggin GPU code problem
-  // #define CUDA_DEBUG_COMPARE
-#ifdef CUDA_DEBUG_COMPARE
-  printf("Start Comparing_Histogram between GPU and CPU, num_dense_feature_groups_ = %d\n", num_dense_feature_groups_);
-  bool compare = true;
-  for (int i = 0; i < num_dense_feature_groups_; ++i) {
-    if (!feature_masks_[i])
-      continue;
-    int dense_feature_group_index = dense_feature_group_map_[i];
-    size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index);
-    hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
-    hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2;
-    hist_t* gpu_histogram = new hist_t[size * 2];
-    data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf();
-    printf("Comparing histogram for feature %d, num_data %d, num_data_ = %d, %lu bins\n", dense_feature_group_index, num_data, num_data_, size);
-    std::copy(current_histogram, current_histogram + size * 2, gpu_histogram);
-    std::memset(current_histogram, 0, size * sizeof(hist_t) * 2);
-    if (train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr) {
-      continue;
-    }
-    if (num_data == num_data_) {
-      if (share_state_->is_constant_hessian) {
-        printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian\n");
-        train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
-            0,
-            num_data,
-            gradients_,
-            current_histogram);
-      } else {
-        printf("ConstructHistogram(): num_data == num_data_\n");
-        train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
-            0,
-            num_data,
-            gradients_, hessians_,
-            current_histogram);
-      }
-    } else {
-      if (share_state_->is_constant_hessian) {
-        printf("ConstructHistogram(): is_constant_hessian\n");
-        train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
-            smaller_leaf_splits_->data_indices(),
-            0,
-            num_data,
-            gradients_,
-            current_histogram);
-      } else {
-        printf("ConstructHistogram(): 4, num_data = %d, num_data_ = %d\n", num_data, num_data_);
-        train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram(
-            smaller_leaf_splits_->data_indices(),
-            0,
-            num_data,
-            gradients_, hessians_,
-            current_histogram);
-      }
-    }
-    int retval;
-    if ((num_data != num_data_) && compare) {
-        retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, share_state_->is_constant_hessian);
-        printf("CompareHistograms reports %d errors\n", retval);
-        compare = false;
-    }
-    retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, share_state_->is_constant_hessian);
-    if (num_data == num_data_) {
-        printf("CompareHistograms reports %d errors\n", retval);
-    } else {
-        printf("CompareHistograms reports %d errors\n", retval);
-    }
-    std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram);
-    delete [] gpu_histogram;
-  }
-  printf("End Comparing Histogram between GPU and CPU\n");
-  fflush(stderr);
-  fflush(stdout);
-#endif
-
-  if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
-    // construct larger leaf
-    hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset;
-
-    is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used,
-      larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf());
-
-    // then construct sparse features on CPU
-    // We set data_indices to null to avoid rebuilding ordered gradients/hessians
-    if (num_sparse_features > 0) {
-    train_data_->ConstructHistograms(is_sparse_feature_used,
-      larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(),
-      gradients_, hessians_,
-      ordered_gradients_.data(), ordered_hessians_.data(),
-      share_state_.get(),
-      ptr_larger_leaf_hist_data);
-    }
-
-    // wait for GPU to finish, only if GPU is actually used
-    if (is_gpu_used) {
-      if (config_->gpu_use_dp) {
-        // use double precision
-        WaitAndGetHistograms<hist_t>(larger_leaf_histogram_array_);
-      } else {
-        // use single precision
-        WaitAndGetHistograms<gpu_hist_t>(larger_leaf_histogram_array_);
-      }
-    }
-  }
-}
-
-void CUDATreeLearner::FindBestSplits(const Tree* tree) {
-  SerialTreeLearner::FindBestSplits(tree);
-
-#if CUDA_DEBUG >= 3
-  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
-    if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue;
-    if (parent_leaf_histogram_array_ != nullptr
-        && !parent_leaf_histogram_array_[feature_index].is_splittable()) {
-      smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
-      continue;
-    }
-    size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1;
-    printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd smaller leaf:\n", feature_index, bin_size);
-    PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size);
-    if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->leaf_index() < 0) { continue; }
-    printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd larger leaf:\n", feature_index, bin_size);
-
-    PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size);
-  }
-#endif
-}
-
-void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
-  const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf];
-#if CUDA_DEBUG >= 2
-  printf("Splitting leaf %d with feature %d thresh %d gain %f stat %f %f %f %f\n", best_Leaf, best_split_info.feature, best_split_info.threshold, best_split_info.gain, best_split_info.left_sum_gradient, best_split_info.right_sum_gradient, best_split_info.left_sum_hessian, best_split_info.right_sum_hessian);
-#endif
-  SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf);
-  if (Network::num_machines() == 1) {
-    // do some sanity check for the GPU algorithm
-    if (best_split_info.left_count < best_split_info.right_count) {
-      if ((best_split_info.left_count != smaller_leaf_splits_->num_data_in_leaf()) ||
-          (best_split_info.right_count!= larger_leaf_splits_->num_data_in_leaf())) {
-        Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf());
-      }
-    } else {
-      if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) ||
-          (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) {
-        Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf());
-      }
-    }
-  }
-}
-
-}   // namespace LightGBM
-#undef cudaMemcpy_DEBUG
-#endif  // USE_CUDA
--- a/src/treelearner/cuda_tree_learner.h
+++ b/src/treelearner/cuda_tree_learner.h
-/*!
- * Copyright (c) 2020 IBM Corporation. All rights reserved.
- * Licensed under the MIT License. See LICENSE file in the project root for license information.
- */
-#ifndef LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_
-#define LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_
-
-#include <LightGBM/utils/random.h>
-#include <LightGBM/utils/array_args.h>
-#include <LightGBM/dataset.h>
-#include <LightGBM/feature_group.h>
-#include <LightGBM/tree.h>
-
-#include <string>
-#include <cmath>
-#include <cstdio>
-#include <memory>
-#include <random>
-#include <vector>
-#ifdef USE_CUDA
-#include <cuda_runtime.h>
-#endif
-
-#include "feature_histogram.hpp"
-#include "serial_tree_learner.h"
-#include "data_partition.hpp"
-#include "split_info.hpp"
-#include "leaf_splits.hpp"
-
-#ifdef USE_CUDA
-#include <LightGBM/cuda/vector_cudahost.h>
-#include "cuda_kernel_launcher.h"
-
-
-using json11::Json;
-
-namespace LightGBM {
-
-/*!
-* \brief CUDA-based parallel learning algorithm.
-*/
-class CUDATreeLearner: public SerialTreeLearner {
- public:
-    explicit CUDATreeLearner(const Config* tree_config);
-    ~CUDATreeLearner();
-    void Init(const Dataset* train_data, bool is_constant_hessian) override;
-    void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override;
-    Tree* Train(const score_t* gradients, const score_t *hessians, bool is_first_tree) override;
-    void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override {
-      SerialTreeLearner::SetBaggingData(subset, used_indices, num_data);
-      if (subset == nullptr && used_indices != nullptr) {
-        if (num_data != num_data_) {
-          use_bagging_ = true;
-          return;
-        }
-      }
-      use_bagging_ = false;
-    }
-
- protected:
-    void BeforeTrain() override;
-    bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override;
-    void FindBestSplits(const Tree* tree) override;
-    void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;
-    void ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) override;
-
- private:
-    typedef float gpu_hist_t;
-
-    /*!
-     * \brief Find the best number of workgroups processing one feature for maximizing efficiency
-     * \param leaf_num_data The number of data examples on the current leaf being processed
-     * \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature
-     */
-    int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data);
-
-    /*!
-     * \brief Initialize GPU device
-     * \param num_gpu: number of maximum gpus
-     */
-    void InitGPU(int num_gpu);
-
-    /*!
-     * \brief Allocate memory for GPU computation // alloc only
-     */
-    void CountDenseFeatureGroups();  // compute num_dense_feature_group
-    void prevAllocateGPUMemory();  // compute CPU-side param calculation & Pin HostMemory
-    void AllocateGPUMemory();
-
-    /*!
-     * \ ResetGPUMemory
-     */
-    void ResetGPUMemory();
-
-    /*!
-     * \ copy dense feature from CPU to GPU
-     */
-    void copyDenseFeature();
-
-    /*! 
-     * \brief Compute GPU feature histogram for the current leaf.
-     *        Indices, gradients and Hessians have been copied to the device.
-     * \param leaf_num_data Number of data on current leaf
-     * \param use_all_features Set to true to not use feature masks, with a faster kernel
-     */
-    void GPUHistogram(data_size_t leaf_num_data, bool use_all_features);
-
-    void SetThreadData(ThreadData* thread_data, int device_id, int histogram_size,
-                int leaf_num_data, bool use_all_features,
-                int num_workgroups, int exp_workgroups_per_feature) {
-      ThreadData* td = &thread_data[device_id];
-      td->device_id             = device_id;
-      td->histogram_size        = histogram_size;
-      td->leaf_num_data         = leaf_num_data;
-      td->num_data              = num_data_;
-      td->use_all_features      = use_all_features;
-      td->is_constant_hessian   = share_state_->is_constant_hessian;
-      td->num_workgroups        = num_workgroups;
-      td->stream                = stream_[device_id];
-      td->device_features       = device_features_[device_id];
-      td->device_feature_masks  = reinterpret_cast<uint8_t *>(device_feature_masks_[device_id]);
-      td->device_data_indices   = device_data_indices_[device_id];
-      td->device_gradients      = device_gradients_[device_id];
-      td->device_hessians       = device_hessians_[device_id];
-      td->hessians_const        = hessians_[0];
-      td->device_subhistograms  = device_subhistograms_[device_id];
-      td->sync_counters         = sync_counters_[device_id];
-      td->device_histogram_outputs   = device_histogram_outputs_[device_id];
-      td->exp_workgroups_per_feature = exp_workgroups_per_feature;
-
-      td->kernel_start           = &(kernel_start_[device_id]);
-      td->kernel_wait_obj        = &(kernel_wait_obj_[device_id]);
-      td->kernel_input_wait_time = &(kernel_input_wait_time_[device_id]);
-
-      size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_;
-      size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_;
-      td->output_size           = output_size;
-      td->host_histogram_output = reinterpret_cast<char*>(host_histogram_outputs_) + host_output_offset;
-      td->histograms_wait_obj   = &(histograms_wait_obj_[device_id]);
-    }
-
-    /*!
-     * \brief Wait for GPU kernel execution and read histogram
-     * \param histograms Destination of histogram results from GPU.
-     */
-    template <typename HistType>
-    void WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array);
-
-    /*!
-     * \brief Construct GPU histogram asynchronously. 
-     *        Interface is similar to Dataset::ConstructHistograms().
-     * \param is_feature_used A predicate vector for enabling each feature
-     * \param data_indices Array of data example IDs to be included in histogram, will be copied to GPU.
-     *                     Set to nullptr to skip copy to GPU.
-     * \param num_data Number of data examples to be included in histogram
-     * \return true if GPU kernel is launched, false if GPU is not used
-    */
-    bool ConstructGPUHistogramsAsync(
-      const std::vector<int8_t>& is_feature_used,
-      const data_size_t* data_indices, data_size_t num_data);
-
-    /*! brief Log2 of max number of workgroups per feature*/
-    const int kMaxLogWorkgroupsPerFeature = 10;  // 2^10
-    /*! brief Max total number of workgroups with preallocated workspace.
-     *        If we use more than this number of workgroups, we have to reallocate subhistograms */
-    std::vector<int> preallocd_max_num_wg_;
-
-    /*! \brief True if bagging is used */
-    bool use_bagging_;
-
-    /*! \brief GPU command queue object */
-    std::vector<cudaStream_t> stream_;
-
-    /*! \brief total number of feature-groups */
-    int num_feature_groups_;
-    /*! \brief total number of dense feature-groups, which will be processed on GPU */
-    int num_dense_feature_groups_;
-    std::vector<int> num_gpu_feature_groups_;
-    std::vector<int> offset_gpu_feature_groups_;
-    /*! \brief On GPU we read one DWORD (4-byte) of features of one example once.
-     *  With bin size > 16, there are 4 features per DWORD.
-     *  With bin size <=16, there are 8 features per DWORD.
-     */
-    int dword_features_;
-    /*! \brief Max number of bins of training data, used to determine 
-     * which GPU kernel to use */
-    int max_num_bin_;
-    /*! \brief Used GPU kernel bin size (64, 256) */
-    int histogram_size_;
-    int device_bin_size_;
-    /*! \brief Size of histogram bin entry, depending if single or double precision is used */
-    size_t hist_bin_entry_sz_;
-    /*! \brief Indices of all dense feature-groups */
-    std::vector<int> dense_feature_group_map_;
-    /*! \brief Indices of all sparse feature-groups */
-    std::vector<int> sparse_feature_group_map_;
-    /*! \brief GPU memory object holding the training data */
-    std::vector<uint8_t*> device_features_;
-    /*! \brief GPU memory object holding the ordered gradient */
-    std::vector<score_t*> device_gradients_;
-    /*! \brief GPU memory object holding the ordered hessian */
-    std::vector<score_t*> device_hessians_;
-    /*! \brief A vector of feature mask. 1 = feature used, 0 = feature not used */
-    std::vector<char> feature_masks_;
-    /*! \brief GPU memory object holding the feature masks */
-    std::vector<char*> device_feature_masks_;
-    /*! \brief Pointer to pinned memory of feature masks */
-    char* ptr_pinned_feature_masks_ = nullptr;
-    /*! \brief GPU memory object holding indices of the leaf being processed */
-    std::vector<data_size_t*> device_data_indices_;
-    /*! \brief GPU memory object holding counters for workgroup coordination */
-    std::vector<int*> sync_counters_;
-    /*! \brief GPU memory object holding temporary sub-histograms per workgroup */
-    std::vector<char*> device_subhistograms_;
-    /*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */
-    std::vector<void*> device_histogram_outputs_;
-    /*! \brief Host memory pointer for histogram outputs */
-    void *host_histogram_outputs_;
-    /*! CUDA waitlist object for waiting for data transfer before kernel execution */
-    std::vector<cudaEvent_t> kernel_wait_obj_;
-    /*! CUDA waitlist object for reading output histograms after kernel execution */
-    std::vector<cudaEvent_t> histograms_wait_obj_;
-    /*! CUDA Asynchronous waiting object for copying indices */
-    std::vector<cudaEvent_t> indices_future_;
-    /*! Asynchronous waiting object for copying gradients */
-    std::vector<cudaEvent_t> gradients_future_;
-    /*! Asynchronous waiting object for copying Hessians */
-    std::vector<cudaEvent_t> hessians_future_;
-    /*! Asynchronous waiting object for copying dense features */
-    std::vector<cudaEvent_t> features_future_;
-
-    // host-side buffer for converting feature data into featre4 data
-    int nthreads_;  // number of Feature4* vector on host4_vecs_
-    std::vector<cudaEvent_t> kernel_start_;
-    std::vector<float> kernel_time_;  // measure histogram kernel time
-    std::vector<std::chrono::duration<double, std::milli>> kernel_input_wait_time_;
-    int num_gpu_;
-    int allocated_num_data_;  // allocated data instances
-    pthread_t **cpu_threads_;  // pthread, 1 cpu thread / gpu
-};
-
-}  // namespace LightGBM
-#else  // USE_CUDA
-
-// When GPU support is not compiled in, quit with an error message
-
-namespace LightGBM {
-
-class CUDATreeLearner: public SerialTreeLearner {
- public:
-    #pragma warning(disable : 4702)
-    explicit CUDATreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) {
-      Log::Fatal("CUDA Tree Learner was not enabled in this build.\n"
-                 "Please recompile with CMake option -DUSE_CUDA=1");
-    }
-};
-
-}  // namespace LightGBM
-
-#endif  // USE_CUDA
-#endif  // LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_
--- a/src/treelearner/data_parallel_tree_learner.cpp
+++ b/src/treelearner/data_parallel_tree_learner.cpp
@@ -276,7 +276,6 @@ void DataParallelTreeLearner<TREELEARNER_T>::Split(Tree* tree, int best_Leaf, in
 }

 // instantiate template classes, otherwise linker cannot find the code
-template class DataParallelTreeLearner<CUDATreeLearner>;
 template class DataParallelTreeLearner<GPUTreeLearner>;
 template class DataParallelTreeLearner<SerialTreeLearner>;


--- a/src/treelearner/feature_parallel_tree_learner.cpp
+++ b/src/treelearner/feature_parallel_tree_learner.cpp
@@ -77,7 +77,6 @@ void FeatureParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(
 }

 // instantiate template classes, otherwise linker cannot find the code
-template class FeatureParallelTreeLearner<CUDATreeLearner>;
 template class FeatureParallelTreeLearner<GPUTreeLearner>;
 template class FeatureParallelTreeLearner<SerialTreeLearner>;
 }  // namespace LightGBM
--- a/src/treelearner/parallel_tree_learner.h
+++ b/src/treelearner/parallel_tree_learner.h
@@ -12,7 +12,6 @@
 #include <memory>
 #include <vector>

-#include "cuda_tree_learner.h"
 #include "gpu_tree_learner.h"
 #include "serial_tree_learner.h"


--- a/src/treelearner/serial_tree_learner.cpp
+++ b/src/treelearner/serial_tree_learner.cpp
@@ -344,15 +344,7 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree, const std::set<int>* fo
  }
  bool use_subtract = parent_leaf_histogram_array_ != nullptr;

-#ifdef USE_CUDA
-  if (LGBM_config_::current_learner == use_cpu_learner) {
-    SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract);
-  } else {
-    ConstructHistograms(is_feature_used, use_subtract);
-  }
-#else
  ConstructHistograms(is_feature_used, use_subtract);
-#endif
  FindBestSplitsFromHistograms(is_feature_used, use_subtract, tree);
 }


--- a/src/treelearner/serial_tree_learner.h
+++ b/src/treelearner/serial_tree_learner.h
@@ -211,7 +211,7 @@ class SerialTreeLearner: public TreeLearner {
  std::vector<score_t, boost::alignment::aligned_allocator<score_t, 4096>> ordered_gradients_;
  /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */
  std::vector<score_t, boost::alignment::aligned_allocator<score_t, 4096>> ordered_hessians_;
-#elif defined(USE_CUDA) || defined(USE_CUDA_EXP)
+#elif defined(USE_CUDA)
  /*! \brief gradients of current iteration, ordered for cache optimized */
  std::vector<score_t, CHAllocator<score_t>> ordered_gradients_;
  /*! \brief hessians of current iteration, ordered for cache optimized */

--- a/src/treelearner/tree_learner.cpp
+++ b/src/treelearner/tree_learner.cpp
@@ -4,7 +4,6 @@
 */
 #include <LightGBM/tree_learner.h>

-#include "cuda_tree_learner.h"
 #include "gpu_tree_learner.h"
 #include "linear_tree_learner.h"
 #include "parallel_tree_learner.h"
@@ -40,24 +39,14 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con
      return new VotingParallelTreeLearner<GPUTreeLearner>(config);
    }
  } else if (device_type == std::string("cuda")) {
-    if (learner_type == std::string("serial")) {
-      return new CUDATreeLearner(config);
-    } else if (learner_type == std::string("feature")) {
-      return new FeatureParallelTreeLearner<CUDATreeLearner>(config);
-    } else if (learner_type == std::string("data")) {
-      return new DataParallelTreeLearner<CUDATreeLearner>(config);
-    } else if (learner_type == std::string("voting")) {
-      return new VotingParallelTreeLearner<CUDATreeLearner>(config);
-    }
-  } else if (device_type == std::string("cuda_exp")) {
    if (learner_type == std::string("serial")) {
      if (config->num_gpu == 1) {
        return new CUDASingleGPUTreeLearner(config, boosting_on_cuda);
      } else {
-        Log::Fatal("cuda_exp only supports training on a single GPU.");
+        Log::Fatal("Currently cuda version only supports training on a single GPU.");
      }
    } else {
-      Log::Fatal("cuda_exp only supports training on a single machine.");
+      Log::Fatal("Currently cuda version only supports training on a single machine.");
    }
  }
  return nullptr;

--- a/src/treelearner/voting_parallel_tree_learner.cpp
+++ b/src/treelearner/voting_parallel_tree_learner.cpp
@@ -501,7 +501,6 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Split(Tree* tree, int best_Leaf,
 }

 // instantiate template classes, otherwise linker cannot find the code
-template class VotingParallelTreeLearner<CUDATreeLearner>;
 template class VotingParallelTreeLearner<GPUTreeLearner>;
 template class VotingParallelTreeLearner<SerialTreeLearner>;
 }  // namespace LightGBM
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -48,7 +48,7 @@ def test_basic(tmp_path):
    assert bst.current_iteration() == 20
    assert bst.num_trees() == 20
    assert bst.num_model_per_iteration() == 1
-    if getenv('TASK', '') != 'cuda_exp':
+    if getenv('TASK', '') != 'cuda':
        assert bst.lower_bound() == pytest.approx(-2.9040190126976606)
        assert bst.upper_bound() == pytest.approx(3.3182142872462883)