Unverified commit e0af160a, authored by shiyu1994, committed by GitHub
Browse files

[CUDA] Initial work for boosting and evaluation with CUDA (#5279)

* initial work for boosting and evaluation with CUDA

* fix compatibility with CPU code

* fix creating objective without USE_CUDA_EXP

* fix static analysis errors

* fix static analysis errors
parent da3b4c19
......@@ -395,6 +395,8 @@ if(USE_CUDA OR USE_CUDA_EXP)
src/treelearner/*.cu
endif()
if(USE_CUDA_EXP)
src/boosting/cuda/*.cpp
src/boosting/cuda/*.cu
src/treelearner/cuda/*.cpp
src/treelearner/cuda/*.cu
src/io/cuda/*.cu
......
......@@ -56,6 +56,27 @@ class CUDATree : public Tree {
uint32_t* cuda_bitset_inner,
size_t cuda_bitset_inner_len);
/*!
* \brief Adding prediction value of this tree model to scores
* \param data The dataset
* \param num_data Number of total data
* \param score Will add prediction to score
*/
void AddPredictionToScore(const Dataset* data,
data_size_t num_data,
double* score) const override;
/*!
* \brief Adding prediction value of this tree model to scores
* \param data The dataset
* \param used_data_indices Indices of used data
* \param num_data Number of total data
* \param score Will add prediction to score
*/
void AddPredictionToScore(const Dataset* data,
const data_size_t* used_data_indices,
data_size_t num_data, double* score) const override;
const int* cuda_leaf_parent() const { return cuda_leaf_parent_; }
const int* cuda_left_child() const { return cuda_left_child_; }
......@@ -105,6 +126,10 @@ class CUDATree : public Tree {
size_t cuda_bitset_len,
size_t cuda_bitset_inner_len);
void LaunchAddPredictionToScoreKernel(const Dataset* data,
const data_size_t* used_data_indices,
data_size_t num_data, double* score) const;
void LaunchShrinkageKernel(const double rate);
void LaunchAddBiasKernel(const double val);
......
......@@ -55,6 +55,11 @@ class Metric {
* \param config Config for metric
*/
LIGHTGBM_EXPORT static Metric* CreateMetric(const std::string& type, const Config& config);
/*!
* \brief Whether boosting is done on CUDA
*/
virtual bool IsCUDAMetric() const { return false; }
};
/*!
......
......@@ -88,6 +88,11 @@ class ObjectiveFunction {
* \brief Load objective function from string object
*/
LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunction(const std::string& str);
/*!
* \brief Whether boosting is done on CUDA
*/
virtual bool IsCUDAObjective() const { return false; }
};
} // namespace LightGBM
......
......@@ -103,7 +103,8 @@ class TreeLearner {
*/
static TreeLearner* CreateTreeLearner(const std::string& learner_type,
const std::string& device_type,
const Config* config);
const Config* config,
const bool boosting_on_cuda);
};
} // namespace LightGBM
......
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#include "cuda_score_updater.hpp"
#ifdef USE_CUDA_EXP
namespace LightGBM {
/*!
* \brief Construct a score updater whose score buffer lives in CUDA device memory
* \param data Training/validation dataset (provides num_data and optional init scores)
* \param num_tree_per_iteration Number of trees per boosting iteration (number of classes for multiclass)
* \param boosting_on_cuda Whether gradients/hessians are also computed on the GPU;
*        if true the host-side score buffer inherited from ScoreUpdater is released
*/
CUDAScoreUpdater::CUDAScoreUpdater(const Dataset* data, int num_tree_per_iteration, const bool boosting_on_cuda):
  ScoreUpdater(data, num_tree_per_iteration), num_threads_per_block_(1024), boosting_on_cuda_(boosting_on_cuda) {
  num_data_ = data->num_data();
  int64_t total_size = static_cast<int64_t>(num_data_) * num_tree_per_iteration;
  InitCUDA(total_size);
  has_init_score_ = false;
  const double* init_score = data->metadata().init_score();
  // if exists initial score, will start from it
  if (init_score != nullptr) {
    if ((data->metadata().num_init_score() % num_data_) != 0
        || (data->metadata().num_init_score() / num_data_) != num_tree_per_iteration) {
      Log::Fatal("Number of class for initial score error");
    }
    has_init_score_ = true;
    CopyFromHostToCUDADevice<double>(cuda_score_, init_score, total_size, __FILE__, __LINE__);
  } else {
    // BUG FIX: zero the WHOLE device buffer (total_size = num_data_ * num_tree_per_iteration),
    // not just the first num_data_ entries; otherwise, for multiclass training, the
    // score slices of trees > 0 start from uninitialized device memory
    SetCUDAMemory<double>(cuda_score_, 0, static_cast<size_t>(total_size), __FILE__, __LINE__);
  }
  SynchronizeCUDADevice(__FILE__, __LINE__);
  if (boosting_on_cuda_) {
    // scores are consumed on the device only; release the host buffer
    // allocated by the ScoreUpdater base constructor
    score_.clear();
    score_.shrink_to_fit();
  }
}
/*!
* \brief Allocate the device-side score buffer
* \param total_size Number of doubles to allocate (num_data * num_tree_per_iteration)
*/
void CUDAScoreUpdater::InitCUDA(const size_t total_size) {
  AllocateCUDAMemory<double>(&cuda_score_, total_size, __FILE__, __LINE__);
}
/*! \brief Release the device-side score buffer allocated in InitCUDA */
CUDAScoreUpdater::~CUDAScoreUpdater() {
  DeallocateCUDAMemory<double>(&cuda_score_, __FILE__, __LINE__);
}
inline void CUDAScoreUpdater::AddScore(double val, int cur_tree_id) {
Common::FunctionTimer fun_timer("CUDAScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
LaunchAddScoreConstantKernel(val, offset);
if (!boosting_on_cuda_) {
CopyFromCUDADeviceToHost<double>(score_.data() + offset, cuda_score_ + offset, static_cast<size_t>(num_data_), __FILE__, __LINE__);
}
}
/*!
* \brief Add the predictions of a trained tree to its score slice (all data)
* \param tree Trained tree (expected to be a CUDATree writing into device memory)
* \param cur_tree_id Index of the tree within the iteration (selects the score slice)
*/
inline void CUDAScoreUpdater::AddScore(const Tree* tree, int cur_tree_id) {
  // CONSISTENCY FIX: timer label was "ScoreUpdater::AddScore" (copy-paste from the
  // CPU updater); use the class's own label so CUDA timings aggregate under one key,
  // matching AddScore(double, int) and MultiplyScore above/below
  Common::FunctionTimer fun_timer("CUDAScoreUpdater::AddScore", global_timer);
  const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
  tree->AddPredictionToScore(data_, num_data_, cuda_score_ + offset);
  if (!boosting_on_cuda_) {
    // boosting runs on the host: mirror the updated device slice back into score_
    CopyFromCUDADeviceToHost<double>(score_.data() + offset, cuda_score_ + offset, static_cast<size_t>(num_data_), __FILE__, __LINE__);
  }
}
/*!
* \brief Add tree predictions to the score slice using the tree learner's cached per-leaf data partition
* \param tree_learner Tree learner holding the data partition of the just-trained tree
* \param tree Trained tree
* \param cur_tree_id Index of the tree within the iteration (selects the score slice)
*/
inline void CUDAScoreUpdater::AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) {
  // CONSISTENCY FIX: timer label was "ScoreUpdater::AddScore"; use the class's own
  // label so all CUDAScoreUpdater timings aggregate under one key
  Common::FunctionTimer fun_timer("CUDAScoreUpdater::AddScore", global_timer);
  const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
  tree_learner->AddPredictionToScore(tree, cuda_score_ + offset);
  if (!boosting_on_cuda_) {
    // boosting runs on the host: mirror the updated device slice back into score_
    CopyFromCUDADeviceToHost<double>(score_.data() + offset, cuda_score_ + offset, static_cast<size_t>(num_data_), __FILE__, __LINE__);
  }
}
/*!
* \brief Add tree predictions to the score slice for a subset of data (e.g. out-of-bag rows)
* \param tree Trained tree (expected to be a CUDATree)
* \param data_indices Device pointer to the indices of the rows to score
* \param data_cnt Number of rows referenced by data_indices
* \param cur_tree_id Index of the tree within the iteration (selects the score slice)
*/
inline void CUDAScoreUpdater::AddScore(const Tree* tree, const data_size_t* data_indices,
                                       data_size_t data_cnt, int cur_tree_id) {
  // CONSISTENCY FIX: timer label was "ScoreUpdater::AddScore"; use the class's own
  // label so all CUDAScoreUpdater timings aggregate under one key
  Common::FunctionTimer fun_timer("CUDAScoreUpdater::AddScore", global_timer);
  const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
  tree->AddPredictionToScore(data_, data_indices, data_cnt, cuda_score_ + offset);
  if (!boosting_on_cuda_) {
    // boosting runs on the host: mirror the updated device slice back into score_
    CopyFromCUDADeviceToHost<double>(score_.data() + offset, cuda_score_ + offset, static_cast<size_t>(num_data_), __FILE__, __LINE__);
  }
}
inline void CUDAScoreUpdater::MultiplyScore(double val, int cur_tree_id) {
Common::FunctionTimer fun_timer("CUDAScoreUpdater::MultiplyScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
LaunchMultiplyScoreConstantKernel(val, offset);
if (!boosting_on_cuda_) {
CopyFromCUDADeviceToHost<double>(score_.data() + offset, cuda_score_ + offset, static_cast<size_t>(num_data_), __FILE__, __LINE__);
}
}
} // namespace LightGBM
#endif // USE_CUDA_EXP
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#include "cuda_score_updater.hpp"
#ifdef USE_CUDA_EXP
namespace LightGBM {
/*!
* \brief Kernel: add a constant to score[offset .. offset + num_data).
*        One thread per data point; 1-D grid, tail guarded.
*/
__global__ void AddScoreConstantKernel(
  const double val,
  const size_t offset,
  const data_size_t num_data,
  double* score) {
  const data_size_t index = static_cast<data_size_t>(blockIdx.x * blockDim.x + threadIdx.x);
  // guard the grid tail: the last block may have threads past num_data
  if (index < num_data) {
    score[offset + index] += val;
  }
}
/*!
* \brief Launch AddScoreConstantKernel over num_data_ points starting at offset
* \param val Constant to add
* \param offset Start of the target score slice within cuda_score_
*/
void CUDAScoreUpdater::LaunchAddScoreConstantKernel(const double val, const size_t offset) {
  // ceil-division: the previous formula (num_data_ + block) / block launched one
  // superfluous empty block whenever num_data_ was an exact multiple of the block size
  const int num_blocks = (num_data_ + num_threads_per_block_ - 1) / num_threads_per_block_;
  // demoted from Log::Warning: this fires on every call and is diagnostic
  // information, not a user-facing warning (leftover debug output)
  Log::Debug("adding init score = %f", val);
  AddScoreConstantKernel<<<num_blocks, num_threads_per_block_>>>(val, offset, num_data_, cuda_score_);
}
/*!
* \brief Kernel: multiply score[offset .. offset + num_data) by a constant.
*        One thread per data point; 1-D grid, tail guarded.
*/
__global__ void MultiplyScoreConstantKernel(
  const double val,
  const size_t offset,
  const data_size_t num_data,
  double* score) {
  const data_size_t data_index = static_cast<data_size_t>(threadIdx.x + blockIdx.x * blockDim.x);
  if (data_index < num_data) {
    // BUG FIX: honor `offset` (it was accepted but ignored). Without it, for
    // multiclass training (cur_tree_id > 0) this kernel scaled tree 0's score
    // slice instead of the current tree's — mirror AddScoreConstantKernel above.
    score[data_index + offset] *= val;
  }
}
/*!
* \brief Launch MultiplyScoreConstantKernel over num_data_ points starting at offset
* \param val Multiplier
* \param offset Start of the target score slice within cuda_score_
*/
void CUDAScoreUpdater::LaunchMultiplyScoreConstantKernel(const double val, const size_t offset) {
  // ceil-division: the previous formula launched one superfluous empty block
  // whenever num_data_ divided evenly by num_threads_per_block_
  const int num_blocks = (num_data_ + num_threads_per_block_ - 1) / num_threads_per_block_;
  MultiplyScoreConstantKernel<<<num_blocks, num_threads_per_block_>>>(val, offset, num_data_, cuda_score_);
}
} // namespace LightGBM
#endif // USE_CUDA_EXP
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_
#define LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_
#ifdef USE_CUDA_EXP
#include <LightGBM/cuda/cuda_utils.h>
#include "../score_updater.hpp"
namespace LightGBM {
/*!
* \brief Score updater that keeps the model scores in CUDA device memory.
*        When boosting also runs on the GPU, the host-side score buffer inherited
*        from ScoreUpdater is released and score() returns a device pointer;
*        otherwise the device scores are mirrored back to the host after each update.
*/
class CUDAScoreUpdater: public ScoreUpdater {
 public:
  /*!
  * \brief Constructor; allocates the device score buffer and applies init scores if present
  * \param data Dataset to score
  * \param num_tree_per_iteration Number of trees per boosting iteration
  * \param boosting_on_cuda Whether gradients are computed on the GPU (host buffer released if true)
  */
  CUDAScoreUpdater(const Dataset* data, int num_tree_per_iteration, const bool boosting_on_cuda);
  /*! \brief Destructor; frees the device score buffer */
  ~CUDAScoreUpdater();
  /*! \brief Add a constant to one tree's score slice on the device */
  inline void AddScore(double val, int cur_tree_id) override;
  /*! \brief Add a tree's predictions (all data) to its score slice on the device */
  inline void AddScore(const Tree* tree, int cur_tree_id) override;
  /*! \brief Add a tree's predictions using the learner's cached data partition */
  inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) override;
  /*! \brief Add a tree's predictions for a subset of data (device index pointer) */
  inline void AddScore(const Tree* tree, const data_size_t* data_indices,
                       data_size_t data_cnt, int cur_tree_id) override;
  /*! \brief Multiply one tree's score slice by a constant on the device */
  inline void MultiplyScore(double val, int cur_tree_id) override;
  /*!
  * \brief Pointer to the current scores: a DEVICE pointer when boosting is on CUDA,
  *        otherwise the host mirror kept in sync by the AddScore/MultiplyScore overrides
  */
  inline const double* score() const override {
    if (boosting_on_cuda_) {
      return cuda_score_;
    } else {
      return score_.data();
    }
  }
  /*! \brief Disable copy */
  CUDAScoreUpdater& operator=(const CUDAScoreUpdater&) = delete;
  CUDAScoreUpdater(const CUDAScoreUpdater&) = delete;

 private:
  /*! \brief Allocate the device score buffer of total_size doubles */
  void InitCUDA(const size_t total_size);
  /*! \brief Launch the kernel adding a constant to the slice starting at offset */
  void LaunchAddScoreConstantKernel(const double val, const size_t offset);
  /*! \brief Launch the kernel multiplying the slice starting at offset by a constant */
  void LaunchMultiplyScoreConstantKernel(const double val, const size_t offset);
  /*! \brief Device-side score buffer (num_data * num_tree_per_iteration doubles) */
  double* cuda_score_;
  /*! \brief Threads per block for the constant add/multiply kernels */
  const int num_threads_per_block_;
  /*! \brief Whether boosting (gradient computation) runs on the GPU */
  const bool boosting_on_cuda_;
};
} // namespace LightGBM
#endif // USE_CUDA_EXP
#endif // LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_
......@@ -67,6 +67,12 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective
if (config_->device_type == std::string("cuda") || config_->device_type == std::string("cuda_exp")) {
LGBM_config_::current_learner = use_cuda_learner;
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp")) {
const int gpu_device_id = config_->gpu_device_id >= 0 ? config_->gpu_device_id : 0;
CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id));
}
#endif // USE_CUDA_EXP
}
// load forced_splits file
......@@ -89,8 +95,9 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective
is_constant_hessian_ = GetIsConstHessian(objective_function);
const bool boosting_on_gpu = objective_function_ != nullptr && objective_function_->IsCUDAObjective();
tree_learner_ = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type,
config_.get()));
config_.get(), boosting_on_gpu));
// init tree learner
tree_learner_->Init(train_data_, is_constant_hessian_);
......@@ -103,15 +110,44 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective
}
training_metrics_.shrink_to_fit();
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp")) {
train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_, boosting_on_gpu));
} else {
#endif // USE_CUDA_EXP
train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_));
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
num_data_ = train_data_->num_data();
// create buffer for gradients and Hessians
if (objective_function_ != nullptr) {
size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu) {
AllocateCUDAMemory<score_t>(&gradients_pointer_, total_size, __FILE__, __LINE__);
AllocateCUDAMemory<score_t>(&hessians_pointer_, total_size, __FILE__, __LINE__);
} else {
#endif // USE_CUDA_EXP
gradients_.resize(total_size);
hessians_.resize(total_size);
gradients_pointer_ = gradients_.data();
hessians_pointer_ = hessians_.data();
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
#ifndef USE_CUDA_EXP
}
#else // USE_CUDA_EXP
} else {
if (config_->device_type == std::string("cuda_exp")) {
size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
AllocateCUDAMemory<score_t>(&gradients_pointer_, total_size, __FILE__, __LINE__);
AllocateCUDAMemory<score_t>(&hessians_pointer_, total_size, __FILE__, __LINE__);
}
}
#endif // USE_CUDA_EXP
// get max feature index
max_feature_idx_ = train_data_->num_total_features() - 1;
// get label index
......@@ -145,7 +181,13 @@ void GBDT::AddValidDataset(const Dataset* valid_data,
Log::Fatal("Cannot add validation data, since it has different bin mappers with training data");
}
// for a validation dataset, we need its score and metric
auto new_score_updater = std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_tree_per_iteration_));
auto new_score_updater =
#ifdef USE_CUDA_EXP
config_->device_type == std::string("cuda_exp") ?
std::unique_ptr<CUDAScoreUpdater>(new CUDAScoreUpdater(valid_data, num_tree_per_iteration_,
objective_function_ != nullptr && objective_function_->IsCUDAObjective())) :
#endif // USE_CUDA_EXP
std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_tree_per_iteration_));
// update score
for (int i = 0; i < iter_; ++i) {
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
......@@ -177,7 +219,7 @@ void GBDT::Boosting() {
// objective function will calculate gradients and hessians
int64_t num_score = 0;
objective_function_->
GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
GetGradients(GetTrainingScore(&num_score), gradients_pointer_, hessians_pointer_);
}
data_size_t GBDT::BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) {
......@@ -251,14 +293,33 @@ void GBDT::Bagging(int iter) {
Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
// set bagging data to tree learner
if (!is_use_subset_) {
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp")) {
CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
tree_learner_->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_);
} else {
#endif // USE_CUDA_EXP
tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_);
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
} else {
// get subset
tmp_subset_->ReSize(bag_data_cnt_);
tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(),
bag_data_cnt_, false);
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp")) {
CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
tree_learner_->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(),
bag_data_cnt_);
} else {
#endif // USE_CUDA_EXP
tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(),
bag_data_cnt_);
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
}
}
}
......@@ -313,8 +374,8 @@ void GBDT::RefitTree(const std::vector<std::vector<int>>& tree_leaf_prediction)
CHECK_LT(leaf_pred[i], models_[model_index]->num_leaves());
}
size_t offset = static_cast<size_t>(tree_id) * num_data_;
auto grad = gradients_.data() + offset;
auto hess = hessians_.data() + offset;
auto grad = gradients_pointer_ + offset;
auto hess = hessians_pointer_ + offset;
auto new_tree = tree_learner_->FitByExistingTree(models_[model_index].get(), leaf_pred, grad, hess);
train_score_updater_->AddScore(tree_learner_.get(), new_tree, tree_id);
models_[model_index].reset(new_tree);
......@@ -377,9 +438,22 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true);
}
Boosting();
gradients = gradients_.data();
hessians = hessians_.data();
gradients = gradients_pointer_;
hessians = hessians_pointer_;
#ifndef USE_CUDA_EXP
}
#else // USE_CUDA_EXP
} else {
if (config_->device_type == std::string("cuda_exp")) {
const size_t total_size = static_cast<size_t>(num_data_ * num_class_);
CopyFromHostToCUDADevice<score_t>(gradients_pointer_, gradients, total_size, __FILE__, __LINE__);
CopyFromHostToCUDADevice<score_t>(hessians_pointer_, hessians, total_size, __FILE__, __LINE__);
gradients = gradients_pointer_;
hessians = hessians_pointer_;
}
}
#endif // USE_CUDA_EXP
// bagging logic
Bagging(iter_);
......@@ -393,11 +467,11 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
// need to copy gradients for bagging subset.
if (is_use_subset_ && bag_data_cnt_ < num_data_ && config_->device_type != std::string("cuda_exp")) {
for (int i = 0; i < bag_data_cnt_; ++i) {
gradients_[offset + i] = grad[bag_data_indices_[i]];
hessians_[offset + i] = hess[bag_data_indices_[i]];
gradients_pointer_[offset + i] = grad[bag_data_indices_[i]];
hessians_pointer_[offset + i] = hess[bag_data_indices_[i]];
}
grad = gradients_.data() + offset;
hess = hessians_.data() + offset;
grad = gradients_pointer_ + offset;
hess = hessians_pointer_ + offset;
}
bool is_first_tree = models_.size() < static_cast<size_t>(num_tree_per_iteration_);
new_tree.reset(tree_learner_->Train(grad, hess, is_first_tree));
......@@ -493,7 +567,15 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
// we need to predict out-of-bag scores of data for boosting
if (num_data_ - bag_data_cnt_ > 0) {
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp")) {
train_score_updater_->AddScore(tree, cuda_bag_data_indices_.RawData() + bag_data_cnt_, num_data_ - bag_data_cnt_, cur_tree_id);
} else {
#endif // USE_CUDA_EXP
train_score_updater_->AddScore(tree, bag_data_indices_.data() + bag_data_cnt_, num_data_ - bag_data_cnt_, cur_tree_id);
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
}
} else {
......@@ -508,7 +590,29 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
}
std::vector<double> GBDT::EvalOneMetric(const Metric* metric, const double* score) const {
#ifdef USE_CUDA_EXP
const bool boosting_on_cuda = objective_function_ != nullptr && objective_function_->IsCUDAObjective();
const bool evaluation_on_cuda = metric->IsCUDAMetric();
if ((boosting_on_cuda && evaluation_on_cuda) || (!boosting_on_cuda && !evaluation_on_cuda)) {
#endif // USE_CUDA_EXP
return metric->Eval(score, objective_function_);
#ifdef USE_CUDA_EXP
} else if (boosting_on_cuda && !evaluation_on_cuda) {
const size_t total_size = static_cast<size_t>(num_data_) * static_cast<size_t>(num_tree_per_iteration_);
if (total_size > host_score_.size()) {
host_score_.resize(total_size, 0.0f);
}
CopyFromCUDADeviceToHost<double>(host_score_.data(), score, total_size, __FILE__, __LINE__);
return metric->Eval(host_score_.data(), objective_function_);
} else {
const size_t total_size = static_cast<size_t>(num_data_) * static_cast<size_t>(num_tree_per_iteration_);
if (total_size > cuda_score_.Size()) {
cuda_score_.Resize(total_size);
}
CopyFromHostToCUDADevice<double>(cuda_score_.RawData(), score, total_size, __FILE__, __LINE__);
return metric->Eval(cuda_score_.RawData(), objective_function_);
}
#endif // USE_CUDA_EXP
}
std::string GBDT::OutputMetric(int iter) {
......@@ -700,11 +804,23 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction*
}
training_metrics_.shrink_to_fit();
#ifdef USE_CUDA_EXP
const bool boosting_on_gpu = objective_function_ != nullptr && objective_function_->IsCUDAObjective();
#endif // USE_CUDA_EXP
if (train_data != train_data_) {
train_data_ = train_data;
// not same training data, need reset score and others
// create score tracker
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp")) {
train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_, boosting_on_gpu));
} else {
#endif // USE_CUDA_EXP
train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_));
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
// update score
for (int i = 0; i < iter_; ++i) {
......@@ -719,8 +835,19 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction*
// create buffer for gradients and hessians
if (objective_function_ != nullptr) {
size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu) {
AllocateCUDAMemory<score_t>(&gradients_pointer_, total_size, __FILE__, __LINE__);
AllocateCUDAMemory<score_t>(&hessians_pointer_, total_size, __FILE__, __LINE__);
} else {
#endif // USE_CUDA_EXP
gradients_.resize(total_size);
hessians_.resize(total_size);
gradients_pointer_ = gradients_.data();
hessians_pointer_ = hessians_.data();
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
}
max_feature_idx_ = train_data_->num_total_features() - 1;
......@@ -795,6 +922,11 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) {
bag_data_cnt_ = static_cast<data_size_t>(config->bagging_fraction * num_data_);
}
bag_data_indices_.resize(num_data_);
#ifdef USE_CUDA_EXP
if (config->device_type == std::string("cuda_exp")) {
cuda_bag_data_indices_.Resize(num_data_);
}
#endif // USE_CUDA_EXP
bagging_runner_.ReSize(num_data_);
bagging_rands_.clear();
for (int i = 0;
......@@ -823,13 +955,27 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) {
if (is_use_subset_ && bag_data_cnt_ < num_data_) {
if (objective_function_ == nullptr) {
size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp") && objective_function_ != nullptr && objective_function_->IsCUDAObjective()) {
AllocateCUDAMemory<score_t>(&gradients_pointer_, total_size, __FILE__, __LINE__);
AllocateCUDAMemory<score_t>(&hessians_pointer_, total_size, __FILE__, __LINE__);
} else {
#endif // USE_CUDA_EXP
gradients_.resize(total_size);
hessians_.resize(total_size);
gradients_pointer_ = gradients_.data();
hessians_pointer_ = hessians_.data();
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
}
}
} else {
bag_data_cnt_ = num_data_;
bag_data_indices_.clear();
#ifdef USE_CUDA_EXP
cuda_bag_data_indices_.Clear();
#endif // USE_CUDA_EXP
bagging_runner_.ReSize(0);
is_use_subset_ = false;
}
......
......@@ -23,6 +23,7 @@
#include <utility>
#include <vector>
#include "cuda/cuda_score_updater.hpp"
#include "score_updater.hpp"
namespace LightGBM {
......@@ -499,6 +500,18 @@ class GBDT : public GBDTBase {
/*! \brief Second order derivative of training data */
std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> hessians_;
#endif
/*! \brief Pointer to gradient vector, can be on CPU or GPU */
score_t* gradients_pointer_;
/*! \brief Pointer to hessian vector, can be on CPU or GPU */
score_t* hessians_pointer_;
#ifdef USE_CUDA_EXP
/*! \brief Buffer for scores when boosting is on GPU but evaluation is not, used only with cuda_exp */
mutable std::vector<double> host_score_;
/*! \brief Buffer for scores when boosting is not on GPU but evaluation is, used only with cuda_exp */
mutable CUDAVector<double> cuda_score_;
/*! \brief Buffer for bag_data_indices_ on GPU, used only with cuda_exp */
CUDAVector<data_size_t> cuda_bag_data_indices_;
#endif // USE_CUDA_EXP
/*! \brief Store the indices of in-bag data */
std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> bag_data_indices_;
......
......@@ -167,14 +167,33 @@ class GOSS: public GBDT {
bag_data_cnt_ = left_cnt;
// set bagging data to tree learner
if (!is_use_subset_) {
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp")) {
CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
tree_learner_->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_);
} else {
#endif // USE_CUDA_EXP
tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_);
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
} else {
// get subset
tmp_subset_->ReSize(bag_data_cnt_);
tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(),
bag_data_cnt_, false);
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp")) {
CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
tree_learner_->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(),
bag_data_cnt_);
} else {
#endif // USE_CUDA_EXP
tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(),
bag_data_cnt_);
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
}
}
......
......@@ -51,7 +51,7 @@ class ScoreUpdater {
inline bool has_init_score() const { return has_init_score_; }
inline void AddScore(double val, int cur_tree_id) {
virtual inline void AddScore(double val, int cur_tree_id) {
Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
#pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024)
......@@ -60,7 +60,7 @@ class ScoreUpdater {
}
}
inline void MultiplyScore(double val, int cur_tree_id) {
virtual inline void MultiplyScore(double val, int cur_tree_id) {
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
#pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024)
for (int i = 0; i < num_data_; ++i) {
......@@ -73,7 +73,7 @@ class ScoreUpdater {
* \param tree Trained tree model
* \param cur_tree_id Current tree for multiclass training
*/
inline void AddScore(const Tree* tree, int cur_tree_id) {
virtual inline void AddScore(const Tree* tree, int cur_tree_id) {
Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
tree->AddPredictionToScore(data_, num_data_, score_.data() + offset);
......@@ -85,7 +85,7 @@ class ScoreUpdater {
* \param tree_learner
* \param cur_tree_id Current tree for multiclass training
*/
inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) {
virtual inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) {
Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
tree_learner->AddPredictionToScore(tree, score_.data() + offset);
......@@ -98,14 +98,14 @@ class ScoreUpdater {
* \param data_cnt Number of data that will be processed
* \param cur_tree_id Current tree for multiclass training
*/
inline void AddScore(const Tree* tree, const data_size_t* data_indices,
virtual inline void AddScore(const Tree* tree, const data_size_t* data_indices,
data_size_t data_cnt, int cur_tree_id) {
Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
tree->AddPredictionToScore(data_, data_indices, data_cnt, score_.data() + offset);
}
/*! \brief Pointer of score */
inline const double* score() const { return score_.data(); }
virtual inline const double* score() const { return score_.data(); }
inline data_size_t num_data() const { return num_data_; }
......@@ -114,7 +114,7 @@ class ScoreUpdater {
/*! \brief Disable copy */
ScoreUpdater(const ScoreUpdater&) = delete;
private:
protected:
/*! \brief Number of total data */
data_size_t num_data_;
/*! \brief Pointer of data set */
......
......@@ -238,6 +238,20 @@ int CUDATree::SplitCategorical(const int leaf_index,
return num_leaves_ - 1;
}
/*!
* \brief Add this tree's predictions for ALL rows of the dataset to `score`,
*        computed on the device; blocks until the kernel finishes.
* \param data Dataset to predict on (its CUDA column store is read by the kernel)
* \param num_data Number of rows to process
* \param score Device score buffer to accumulate into
*/
void CUDATree::AddPredictionToScore(const Dataset* data,
                                    data_size_t num_data,
                                    double* score) const {
  // nullptr index set means "score every row" in the launch helper
  LaunchAddPredictionToScoreKernel(data, nullptr, num_data, score);
  SynchronizeCUDADevice(__FILE__, __LINE__);
}
/*!
* \brief Add this tree's predictions for a SUBSET of rows to `score`,
*        computed on the device; blocks until the kernel finishes.
* \param data Dataset to predict on
* \param used_data_indices Device pointer to the row indices to score
* \param num_data Number of indices in used_data_indices
* \param score Device score buffer to accumulate into
*/
void CUDATree::AddPredictionToScore(const Dataset* data,
                                    const data_size_t* used_data_indices,
                                    data_size_t num_data, double* score) const {
  LaunchAddPredictionToScoreKernel(data, used_data_indices, num_data, score);
  SynchronizeCUDADevice(__FILE__, __LINE__);
}
inline void CUDATree::Shrinkage(double rate) {
Tree::Shrinkage(rate);
LaunchShrinkageKernel(rate);
......
......@@ -303,6 +303,131 @@ void CUDATree::LaunchAddBiasKernel(const double val) {
AddBiasKernel<<<num_blocks, num_threads_per_block>>>(val, cuda_leaf_value_, num_leaves_);
}
/*!
* \brief Kernel: route each (used) data point through the tree and add the reached
*        leaf value into `score[data_index]`. One thread per row, 1-D grid.
* \tparam USE_INDICES When true, thread i processes row cuda_used_indices[i];
*         when false, thread i processes row i directly.
* \param num_data Number of rows to process (== length of cuda_used_indices when USE_INDICES)
*/
template <bool USE_INDICES>
__global__ void AddPredictionToScoreKernel(
  // dataset information
  const data_size_t num_data,
  void* const* cuda_data_by_column,
  const uint8_t* cuda_column_bit_type,
  const uint32_t* cuda_feature_min_bin,
  const uint32_t* cuda_feature_max_bin,
  const uint32_t* cuda_feature_offset,
  const uint32_t* cuda_feature_default_bin,
  const uint32_t* cuda_feature_most_freq_bin,
  const int* cuda_feature_to_column,
  const data_size_t* cuda_used_indices,
  // tree information
  const uint32_t* cuda_threshold_in_bin,
  const int8_t* cuda_decision_type,
  const int* cuda_split_feature_inner,
  const int* cuda_left_child,
  const int* cuda_right_child,
  const double* cuda_leaf_value,
  // output
  double* score) {
  const data_size_t inner_data_index = static_cast<data_size_t>(threadIdx.x + blockIdx.x * blockDim.x);
  // BUG FIX: bound-check the THREAD index before it is used. The old code read
  // cuda_used_indices[inner_data_index] unguarded (out-of-bounds for tail threads)
  // and then compared the resulting *global* row index against num_data — which,
  // with USE_INDICES, both read past the index array and wrongly skipped valid
  // rows whose global index happened to be >= num_data (the count of used rows).
  if (inner_data_index < num_data) {
    const data_size_t data_index = USE_INDICES ? cuda_used_indices[inner_data_index] : inner_data_index;
    int node = 0;
    // walk from the root; leaves are encoded as negative ids (~node == leaf index)
    while (node >= 0) {
      const int split_feature_inner = cuda_split_feature_inner[node];
      const int column = cuda_feature_to_column[split_feature_inner];
      const uint32_t default_bin = cuda_feature_default_bin[split_feature_inner];
      const uint32_t most_freq_bin = cuda_feature_most_freq_bin[split_feature_inner];
      const uint32_t max_bin = cuda_feature_max_bin[split_feature_inner];
      const uint32_t min_bin = cuda_feature_min_bin[split_feature_inner];
      const uint32_t offset = cuda_feature_offset[split_feature_inner];
      const uint8_t column_bit_type = cuda_column_bit_type[column];
      // fetch the raw bin for this row from the column-major device store
      uint32_t bin = 0;
      if (column_bit_type == 8) {
        bin = static_cast<uint32_t>((reinterpret_cast<const uint8_t*>(cuda_data_by_column[column]))[data_index]);
      } else if (column_bit_type == 16) {
        bin = static_cast<uint32_t>((reinterpret_cast<const uint16_t*>(cuda_data_by_column[column]))[data_index]);
      } else if (column_bit_type == 32) {
        bin = static_cast<uint32_t>((reinterpret_cast<const uint32_t*>(cuda_data_by_column[column]))[data_index]);
      }
      // map the column-local bin into the feature's own bin space; out-of-range
      // values fall back to the feature's most frequent bin
      if (bin >= min_bin && bin <= max_bin) {
        bin = bin - min_bin + offset;
      } else {
        bin = most_freq_bin;
      }
      const int8_t decision_type = cuda_decision_type[node];
      const uint32_t threshold_in_bin = cuda_threshold_in_bin[node];
      // decision_type packs the missing-value handling in bits 2-3 and the
      // default direction in kDefaultLeftMask (mirrors CPU Tree decision logic)
      const int8_t missing_type = ((decision_type >> 2) & 3);
      const bool default_left = ((decision_type & kDefaultLeftMask) > 0);
      if ((missing_type == 1 && bin == default_bin) || (missing_type == 2 && bin == max_bin)) {
        // missing value: follow the split's default direction
        node = default_left ? cuda_left_child[node] : cuda_right_child[node];
      } else {
        node = (bin <= threshold_in_bin) ? cuda_left_child[node] : cuda_right_child[node];
      }
    }
    score[data_index] += cuda_leaf_value[~node];
  }
}
/*!
* \brief Launch AddPredictionToScoreKernel over the dataset's CUDA column store.
*        One thread per row; the USE_INDICES template flag selects whether a
*        subset of rows (used_data_indices) or all rows are processed.
* \param data Dataset providing the device column data and bin metadata
* \param used_data_indices Device pointer to row indices, or nullptr to score every row
* \param num_data Number of rows (or indices) to process
* \param score Device score buffer to accumulate into
*/
void CUDATree::LaunchAddPredictionToScoreKernel(
  const Dataset* data,
  const data_size_t* used_data_indices,
  data_size_t num_data,
  double* score) const {
  const CUDAColumnData* cuda_column_data = data->cuda_column_data();
  // ceil-division so the last (partial) block covers the tail rows
  const int num_blocks = (num_data + num_threads_per_block_add_prediction_to_score_ - 1) / num_threads_per_block_add_prediction_to_score_;
  if (used_data_indices == nullptr) {
    // score every row: USE_INDICES = false, no index array needed
    AddPredictionToScoreKernel<false><<<num_blocks, num_threads_per_block_add_prediction_to_score_>>>(
      // dataset information
      num_data,
      cuda_column_data->cuda_data_by_column(),
      cuda_column_data->cuda_column_bit_type(),
      cuda_column_data->cuda_feature_min_bin(),
      cuda_column_data->cuda_feature_max_bin(),
      cuda_column_data->cuda_feature_offset(),
      cuda_column_data->cuda_feature_default_bin(),
      cuda_column_data->cuda_feature_most_freq_bin(),
      cuda_column_data->cuda_feature_to_column(),
      nullptr,
      // tree information
      cuda_threshold_in_bin_,
      cuda_decision_type_,
      cuda_split_feature_inner_,
      cuda_left_child_,
      cuda_right_child_,
      cuda_leaf_value_,
      // output
      score);
  } else {
    // score only the rows named in used_data_indices: USE_INDICES = true
    AddPredictionToScoreKernel<true><<<num_blocks, num_threads_per_block_add_prediction_to_score_>>>(
      // dataset information
      num_data,
      cuda_column_data->cuda_data_by_column(),
      cuda_column_data->cuda_column_bit_type(),
      cuda_column_data->cuda_feature_min_bin(),
      cuda_column_data->cuda_feature_max_bin(),
      cuda_column_data->cuda_feature_offset(),
      cuda_column_data->cuda_feature_default_bin(),
      cuda_column_data->cuda_feature_most_freq_bin(),
      cuda_column_data->cuda_feature_to_column(),
      used_data_indices,
      // tree information
      cuda_threshold_in_bin_,
      cuda_decision_type_,
      cuda_split_feature_inner_,
      cuda_left_child_,
      cuda_right_child_,
      cuda_leaf_value_,
      // output
      score);
  }
  SynchronizeCUDADevice(__FILE__, __LINE__);
}
} // namespace LightGBM
#endif // USE_CUDA_EXP
......@@ -14,53 +14,130 @@
namespace LightGBM {
// Creates the CPU implementation of the metric named `type`.
// \param type metric name as it appears in the config (e.g. "l2", "auc")
// \param config configuration forwarded to the metric's constructor
// \return newly allocated metric (caller owns it), or nullptr for an
//         unknown metric name
static Metric* CreateMetricCPU(const std::string& type, const Config& config) {
  if (type == std::string("l2")) {
    return new L2Metric(config);
  } else if (type == std::string("rmse")) {
    return new RMSEMetric(config);
  } else if (type == std::string("l1")) {
    return new L1Metric(config);
  } else if (type == std::string("quantile")) {
    return new QuantileMetric(config);
  } else if (type == std::string("huber")) {
    return new HuberLossMetric(config);
  } else if (type == std::string("fair")) {
    return new FairLossMetric(config);
  } else if (type == std::string("poisson")) {
    return new PoissonMetric(config);
  } else if (type == std::string("binary_logloss")) {
    return new BinaryLoglossMetric(config);
  } else if (type == std::string("binary_error")) {
    return new BinaryErrorMetric(config);
  } else if (type == std::string("auc")) {
    return new AUCMetric(config);
  } else if (type == std::string("average_precision")) {
    return new AveragePrecisionMetric(config);
  } else if (type == std::string("auc_mu")) {
    return new AucMuMetric(config);
  } else if (type == std::string("ndcg")) {
    return new NDCGMetric(config);
  } else if (type == std::string("map")) {
    return new MapMetric(config);
  } else if (type == std::string("multi_logloss")) {
    return new MultiSoftmaxLoglossMetric(config);
  } else if (type == std::string("multi_error")) {
    return new MultiErrorMetric(config);
  } else if (type == std::string("cross_entropy")) {
    return new CrossEntropyMetric(config);
  } else if (type == std::string("cross_entropy_lambda")) {
    return new CrossEntropyLambdaMetric(config);
  } else if (type == std::string("kullback_leibler")) {
    return new KullbackLeiblerDivergence(config);
  } else if (type == std::string("mape")) {
    return new MAPEMetric(config);
  } else if (type == std::string("gamma")) {
    return new GammaMetric(config);
  } else if (type == std::string("gamma_deviance")) {
    return new GammaDevianceMetric(config);
  } else if (type == std::string("tweedie")) {
    return new TweedieMetric(config);
  }
  return nullptr;
}

Metric* Metric::CreateMetric(const std::string& type, const Config& config) {
  // Previously the whole type -> class dispatch chain was duplicated for the
  // cuda_exp branch (differing only in a per-type warning), so every new
  // metric had to be registered twice. A single dispatch table is used now.
#ifdef USE_CUDA_EXP
  if (config.device_type == std::string("cuda_exp")) {
    // No CUDA metric implementations exist yet: fall back to the CPU metric
    // and warn. Unknown metric names return nullptr without a warning,
    // matching the CPU path.
    Metric* metric = CreateMetricCPU(type, config);
    if (metric != nullptr) {
      Log::Warning("Metric %s is not implemented in cuda_exp version. Fall back to evaluation on CPU.", type.c_str());
    }
    return metric;
  }
#endif  // USE_CUDA_EXP
  return CreateMetricCPU(type, config);
}
......
......@@ -13,41 +13,100 @@
namespace LightGBM {
// Creates the CPU implementation of the objective named `type`.
// \param type objective name as it appears in the config (e.g. "regression")
// \param config configuration forwarded to the objective's constructor
// \return newly allocated objective (caller owns it); nullptr for "custom"
//         (the caller supplies gradients itself). Unknown names are fatal.
static ObjectiveFunction* CreateObjectiveFunctionCPU(const std::string& type, const Config& config) {
  if (type == std::string("regression")) {
    return new RegressionL2loss(config);
  } else if (type == std::string("regression_l1")) {
    return new RegressionL1loss(config);
  } else if (type == std::string("quantile")) {
    return new RegressionQuantileloss(config);
  } else if (type == std::string("huber")) {
    return new RegressionHuberLoss(config);
  } else if (type == std::string("fair")) {
    return new RegressionFairLoss(config);
  } else if (type == std::string("poisson")) {
    return new RegressionPoissonLoss(config);
  } else if (type == std::string("binary")) {
    return new BinaryLogloss(config);
  } else if (type == std::string("lambdarank")) {
    return new LambdarankNDCG(config);
  } else if (type == std::string("rank_xendcg")) {
    return new RankXENDCG(config);
  } else if (type == std::string("multiclass")) {
    return new MulticlassSoftmax(config);
  } else if (type == std::string("multiclassova")) {
    return new MulticlassOVA(config);
  } else if (type == std::string("cross_entropy")) {
    return new CrossEntropy(config);
  } else if (type == std::string("cross_entropy_lambda")) {
    return new CrossEntropyLambda(config);
  } else if (type == std::string("mape")) {
    return new RegressionMAPELOSS(config);
  } else if (type == std::string("gamma")) {
    return new RegressionGammaLoss(config);
  } else if (type == std::string("tweedie")) {
    return new RegressionTweedieLoss(config);
  } else if (type == std::string("custom")) {
    // Customized objective: gradients are provided by the caller.
    return nullptr;
  }
  Log::Fatal("Unknown objective type name: %s", type.c_str());
  return nullptr;  // unreachable: Log::Fatal does not return
}

ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& type, const Config& config) {
  // Previously the whole type -> class dispatch chain was duplicated for the
  // cuda_exp branch (differing only in a per-type warning), so every new
  // objective had to be registered twice. A single dispatch table is used now.
#ifdef USE_CUDA_EXP
  if (config.device_type == std::string("cuda_exp")) {
    if (type == std::string("custom")) {
      Log::Warning("Using customized objective with cuda_exp. This requires copying gradients from CPU to GPU, which can be slow.");
      return nullptr;
    }
    // No CUDA objective implementations exist yet: fall back to the CPU
    // objective and warn. Unknown names already died in Log::Fatal above.
    ObjectiveFunction* objective = CreateObjectiveFunctionCPU(type, config);
    if (objective != nullptr) {
      Log::Warning("Objective %s is not implemented in cuda_exp version. Fall back to boosting on CPU.", type.c_str());
    }
    return objective;
  }
#endif  // USE_CUDA_EXP
  return CreateObjectiveFunctionCPU(type, config);
}
......
......@@ -61,7 +61,6 @@ CUDADataPartition::CUDADataPartition(
cuda_out_data_indices_in_leaf_ = nullptr;
cuda_split_info_buffer_ = nullptr;
cuda_num_data_ = nullptr;
cuda_add_train_score_ = nullptr;
}
CUDADataPartition::~CUDADataPartition() {
......@@ -78,7 +77,6 @@ CUDADataPartition::~CUDADataPartition() {
DeallocateCUDAMemory<data_size_t>(&cuda_out_data_indices_in_leaf_, __FILE__, __LINE__);
DeallocateCUDAMemory<int>(&cuda_split_info_buffer_, __FILE__, __LINE__);
DeallocateCUDAMemory<data_size_t>(&cuda_num_data_, __FILE__, __LINE__);
DeallocateCUDAMemory<double>(&cuda_add_train_score_, __FILE__, __LINE__);
CUDASUCCESS_OR_FATAL(cudaStreamDestroy(cuda_streams_[0]));
CUDASUCCESS_OR_FATAL(cudaStreamDestroy(cuda_streams_[1]));
CUDASUCCESS_OR_FATAL(cudaStreamDestroy(cuda_streams_[2]));
......@@ -115,8 +113,6 @@ void CUDADataPartition::Init() {
gpuAssert(cudaStreamCreate(&cuda_streams_[3]), __FILE__, __LINE__);
InitCUDAMemoryFromHostMemory<data_size_t>(&cuda_num_data_, &num_data_, 1, __FILE__, __LINE__);
add_train_score_.resize(num_data_, 0.0f);
AllocateCUDAMemory<double>(&cuda_add_train_score_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
use_bagging_ = false;
used_indices_ = nullptr;
}
......@@ -270,33 +266,11 @@ void CUDADataPartition::UpdateTrainScore(const Tree* tree, double* scores) {
cuda_tree_ptr.reset(new CUDATree(tree));
cuda_tree = cuda_tree_ptr.get();
}
const data_size_t num_data_in_root = root_num_data();
if (use_bagging_) {
// we need restore the order of indices in cuda_data_indices_
CopyFromHostToCUDADevice<data_size_t>(cuda_data_indices_, used_indices_, static_cast<size_t>(num_used_indices_), __FILE__, __LINE__);
}
LaunchAddPredictionToScoreKernel(cuda_tree->cuda_leaf_value(), cuda_add_train_score_);
CopyFromCUDADeviceToHost<double>(add_train_score_.data(),
cuda_add_train_score_, static_cast<size_t>(num_data_in_root), __FILE__, __LINE__);
if (!use_bagging_) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static) num_threads(num_threads_)
for (data_size_t data_index = 0; data_index < num_data_in_root; ++data_index) {
OMP_LOOP_EX_BEGIN();
scores[data_index] += add_train_score_[data_index];
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static) num_threads(num_threads_)
for (data_size_t data_index = 0; data_index < num_data_in_root; ++data_index) {
OMP_LOOP_EX_BEGIN();
scores[used_indices_[data_index]] += add_train_score_[data_index];
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
CopyFromCUDADeviceToCUDADevice<data_size_t>(cuda_data_indices_, used_indices_, static_cast<size_t>(num_used_indices_), __FILE__, __LINE__);
}
LaunchAddPredictionToScoreKernel(cuda_tree->cuda_leaf_value(), scores);
}
void CUDADataPartition::CalcBlockDim(const data_size_t num_data_in_leaf) {
......@@ -318,7 +292,7 @@ void CUDADataPartition::SetUsedDataIndices(const data_size_t* used_indices, cons
use_bagging_ = true;
num_used_indices_ = num_used_indices;
used_indices_ = used_indices;
CopyFromHostToCUDADevice<data_size_t>(cuda_data_indices_, used_indices, static_cast<size_t>(num_used_indices), __FILE__, __LINE__);
CopyFromCUDADeviceToCUDADevice<data_size_t>(cuda_data_indices_, used_indices, static_cast<size_t>(num_used_indices), __FILE__, __LINE__);
LaunchFillDataIndexToLeafIndex();
}
......@@ -347,14 +321,11 @@ void CUDADataPartition::ResetTrainingData(const Dataset* train_data, const int n
DeallocateCUDAMemory<uint16_t>(&cuda_block_to_left_offset_, __FILE__, __LINE__);
DeallocateCUDAMemory<int>(&cuda_data_index_to_leaf_index_, __FILE__, __LINE__);
DeallocateCUDAMemory<data_size_t>(&cuda_out_data_indices_in_leaf_, __FILE__, __LINE__);
DeallocateCUDAMemory<double>(&cuda_add_train_score_, __FILE__, __LINE__);
add_train_score_.resize(num_data_, 0.0f);
AllocateCUDAMemory<data_size_t>(&cuda_data_indices_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
AllocateCUDAMemory<uint16_t>(&cuda_block_to_left_offset_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
AllocateCUDAMemory<int>(&cuda_data_index_to_leaf_index_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
AllocateCUDAMemory<data_size_t>(&cuda_out_data_indices_in_leaf_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
AllocateCUDAMemory<double>(&cuda_add_train_score_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
}
used_indices_ = nullptr;
use_bagging_ = false;
......
......@@ -1045,11 +1045,11 @@ __global__ void AddPredictionToScoreKernel(
const data_size_t global_data_index = data_indices_in_leaf[local_data_index];
const int leaf_index = cuda_data_index_to_leaf_index[global_data_index];
const double leaf_prediction_value = leaf_value[leaf_index];
cuda_scores[local_data_index] = leaf_prediction_value;
cuda_scores[global_data_index] += leaf_prediction_value;
} else {
const int leaf_index = cuda_data_index_to_leaf_index[local_data_index];
const double leaf_prediction_value = leaf_value[leaf_index];
cuda_scores[local_data_index] = leaf_prediction_value;
cuda_scores[local_data_index] += leaf_prediction_value;
}
}
}
......
......@@ -307,8 +307,6 @@ class CUDADataPartition {
int grid_dim_;
/*! \brief block dimension when splitting one leaf */
int block_dim_;
/*! \brief add train score buffer in host */
mutable std::vector<double> add_train_score_;
/*! \brief data indices used in this iteration */
const data_size_t* used_indices_;
/*! \brief marks whether a feature is a categorical feature */
......@@ -376,10 +374,6 @@ class CUDADataPartition {
/*! \brief number of data in training set, for intialization of cuda_leaf_num_data_ and cuda_leaf_data_end_ */
data_size_t* cuda_num_data_;
// for train score update
/*! \brief added train score buffer in CUDA */
double* cuda_add_train_score_;
// CUDA memory, held by other object
......
......@@ -19,14 +19,16 @@
namespace LightGBM {
CUDASingleGPUTreeLearner::CUDASingleGPUTreeLearner(const Config* config): SerialTreeLearner(config) {
CUDASingleGPUTreeLearner::CUDASingleGPUTreeLearner(const Config* config, const bool boosting_on_cuda): SerialTreeLearner(config), boosting_on_cuda_(boosting_on_cuda) {
cuda_gradients_ = nullptr;
cuda_hessians_ = nullptr;
}
CUDASingleGPUTreeLearner::~CUDASingleGPUTreeLearner() {
if (!boosting_on_cuda_) {
DeallocateCUDAMemory<score_t>(&cuda_gradients_, __FILE__, __LINE__);
DeallocateCUDAMemory<score_t>(&cuda_hessians_, __FILE__, __LINE__);
}
}
void CUDASingleGPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) {
......@@ -64,28 +66,45 @@ void CUDASingleGPUTreeLearner::Init(const Dataset* train_data, bool is_constant_
leaf_data_start_.resize(config_->num_leaves, 0);
leaf_sum_hessians_.resize(config_->num_leaves, 0.0f);
if (!boosting_on_cuda_) {
AllocateCUDAMemory<score_t>(&cuda_gradients_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
AllocateCUDAMemory<score_t>(&cuda_hessians_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
}
AllocateBitset();
cuda_leaf_gradient_stat_buffer_ = nullptr;
cuda_leaf_hessian_stat_buffer_ = nullptr;
leaf_stat_buffer_size_ = 0;
num_cat_threshold_ = 0;
#ifdef DEBUG
host_gradients_.resize(num_data_, 0.0f);
host_hessians_.resize(num_data_, 0.0f);
#endif // DEBUG
}
void CUDASingleGPUTreeLearner::BeforeTrain() {
const data_size_t root_num_data = cuda_data_partition_->root_num_data();
if (!boosting_on_cuda_) {
CopyFromHostToCUDADevice<score_t>(cuda_gradients_, gradients_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
CopyFromHostToCUDADevice<score_t>(cuda_hessians_, hessians_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
gradients_ = cuda_gradients_;
hessians_ = cuda_hessians_;
}
#ifdef DEBUG
CopyFromCUDADeviceToHost<score_t>(host_gradients.data(), gradients_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
CopyFromCUDADeviceToHost<score_t>(host_hessians.data(), hessians_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
#endif // DEBUG
const data_size_t* leaf_splits_init_indices =
cuda_data_partition_->use_bagging() ? cuda_data_partition_->cuda_data_indices() : nullptr;
cuda_data_partition_->BeforeTrain();
cuda_smaller_leaf_splits_->InitValues(
config_->lambda_l1,
config_->lambda_l2,
cuda_gradients_,
cuda_hessians_,
gradients_,
hessians_,
leaf_splits_init_indices,
cuda_data_partition_->cuda_data_indices(),
root_num_data,
......@@ -93,7 +112,7 @@ void CUDASingleGPUTreeLearner::BeforeTrain() {
&leaf_sum_hessians_[0]);
leaf_num_data_[0] = root_num_data;
cuda_larger_leaf_splits_->InitValues();
cuda_histogram_constructor_->BeforeTrain(cuda_gradients_, cuda_hessians_);
cuda_histogram_constructor_->BeforeTrain(gradients_, hessians_);
col_sampler_.ResetByTree();
cuda_best_split_finder_->BeforeTrain(col_sampler_.is_feature_used_bytree());
leaf_data_start_[0] = 0;
......@@ -247,10 +266,12 @@ void CUDASingleGPUTreeLearner::ResetTrainingData(
cuda_smaller_leaf_splits_->Resize(num_data_);
cuda_larger_leaf_splits_->Resize(num_data_);
CHECK_EQ(is_constant_hessian, share_state_->is_constant_hessian);
if (!boosting_on_cuda_) {
DeallocateCUDAMemory<score_t>(&cuda_gradients_, __FILE__, __LINE__);
DeallocateCUDAMemory<score_t>(&cuda_hessians_, __FILE__, __LINE__);
AllocateCUDAMemory<score_t>(&cuda_gradients_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
AllocateCUDAMemory<score_t>(&cuda_hessians_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
}
}
void CUDASingleGPUTreeLearner::ResetConfig(const Config* config) {
......@@ -444,13 +465,13 @@ void CUDASingleGPUTreeLearner::CheckSplitValid(
double sum_right_gradients = 0.0f, sum_right_hessians = 0.0f;
for (size_t i = 0; i < left_data_indices.size(); ++i) {
const data_size_t index = left_data_indices[i];
sum_left_gradients += gradients_[index];
sum_left_hessians += hessians_[index];
sum_left_gradients += host_gradients_[index];
sum_left_hessians += host_hessians_[index];
}
for (size_t i = 0; i < right_data_indices.size(); ++i) {
const data_size_t index = right_data_indices[i];
sum_right_gradients += gradients_[index];
sum_right_hessians += hessians_[index];
sum_right_gradients += host_gradients_[index];
sum_right_hessians += host_hessians_[index];
}
CHECK_LE(std::fabs(sum_left_gradients - split_sum_left_gradients), 1e-6f);
CHECK_LE(std::fabs(sum_left_hessians - leaf_sum_hessians_[left_leaf]), 1e-6f);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment