Unverified commit e0af160a, authored by shiyu1994, committed by GitHub
Browse files

[CUDA] Initial work for boosting and evaluation with CUDA (#5279)

* initial work for boosting and evaluation with CUDA

* fix compatibility with CPU code

* fix creating objective without USE_CUDA_EXP

* fix static analysis errors

* fix static analysis errors
parent da3b4c19
......@@ -395,6 +395,8 @@ if(USE_CUDA OR USE_CUDA_EXP)
src/treelearner/*.cu
endif()
if(USE_CUDA_EXP)
src/boosting/cuda/*.cpp
src/boosting/cuda/*.cu
src/treelearner/cuda/*.cpp
src/treelearner/cuda/*.cu
src/io/cuda/*.cu
......
......@@ -56,6 +56,27 @@ class CUDATree : public Tree {
uint32_t* cuda_bitset_inner,
size_t cuda_bitset_inner_len);
/*!
* \brief Adding prediction value of this tree model to scores
* \param data The dataset
* \param num_data Number of total data
* \param score Will add prediction to score
*/
void AddPredictionToScore(const Dataset* data,
data_size_t num_data,
double* score) const override;
/*!
* \brief Adding prediction value of this tree model to scores
* \param data The dataset
* \param used_data_indices Indices of used data
* \param num_data Number of total data
* \param score Will add prediction to score
*/
void AddPredictionToScore(const Dataset* data,
const data_size_t* used_data_indices,
data_size_t num_data, double* score) const override;
const int* cuda_leaf_parent() const { return cuda_leaf_parent_; }
const int* cuda_left_child() const { return cuda_left_child_; }
......@@ -105,6 +126,10 @@ class CUDATree : public Tree {
size_t cuda_bitset_len,
size_t cuda_bitset_inner_len);
void LaunchAddPredictionToScoreKernel(const Dataset* data,
const data_size_t* used_data_indices,
data_size_t num_data, double* score) const;
void LaunchShrinkageKernel(const double rate);
void LaunchAddBiasKernel(const double val);
......
......@@ -55,6 +55,11 @@ class Metric {
* \param config Config for metric
*/
LIGHTGBM_EXPORT static Metric* CreateMetric(const std::string& type, const Config& config);
/*!
* \brief Whether boosting is done on CUDA
*/
virtual bool IsCUDAMetric() const { return false; }
};
/*!
......
......@@ -88,6 +88,11 @@ class ObjectiveFunction {
* \brief Load objective function from string object
*/
LIGHTGBM_EXPORT static ObjectiveFunction* CreateObjectiveFunction(const std::string& str);
/*!
* \brief Whether boosting is done on CUDA
*/
virtual bool IsCUDAObjective() const { return false; }
};
} // namespace LightGBM
......
......@@ -103,7 +103,8 @@ class TreeLearner {
*/
static TreeLearner* CreateTreeLearner(const std::string& learner_type,
const std::string& device_type,
const Config* config);
const Config* config,
const bool boosting_on_cuda);
};
} // namespace LightGBM
......
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#include "cuda_score_updater.hpp"
#ifdef USE_CUDA_EXP
namespace LightGBM {
/*!
* \brief Construct a score updater whose score buffer lives in CUDA device memory
* \param data Training/validation dataset (provides num_data and optional init scores)
* \param num_tree_per_iteration Number of trees per boosting iteration (number of classes for multiclass)
* \param boosting_on_cuda Whether gradients/hessians are also computed on the GPU;
*        if true the host-side score buffer inherited from ScoreUpdater is released
*/
CUDAScoreUpdater::CUDAScoreUpdater(const Dataset* data, int num_tree_per_iteration, const bool boosting_on_cuda):
  ScoreUpdater(data, num_tree_per_iteration), num_threads_per_block_(1024), boosting_on_cuda_(boosting_on_cuda) {
  num_data_ = data->num_data();
  int64_t total_size = static_cast<int64_t>(num_data_) * num_tree_per_iteration;
  InitCUDA(total_size);
  has_init_score_ = false;
  const double* init_score = data->metadata().init_score();
  // if exists initial score, will start from it
  if (init_score != nullptr) {
    if ((data->metadata().num_init_score() % num_data_) != 0
        || (data->metadata().num_init_score() / num_data_) != num_tree_per_iteration) {
      Log::Fatal("Number of class for initial score error");
    }
    has_init_score_ = true;
    CopyFromHostToCUDADevice<double>(cuda_score_, init_score, total_size, __FILE__, __LINE__);
  } else {
    // BUG FIX: zero the WHOLE device buffer (total_size = num_data_ * num_tree_per_iteration),
    // not just the first num_data_ entries; otherwise, for multiclass training, the
    // score slices of trees > 0 start from uninitialized device memory
    SetCUDAMemory<double>(cuda_score_, 0, static_cast<size_t>(total_size), __FILE__, __LINE__);
  }
  SynchronizeCUDADevice(__FILE__, __LINE__);
  if (boosting_on_cuda_) {
    // scores are consumed on the device only; release the host buffer
    // allocated by the ScoreUpdater base constructor
    score_.clear();
    score_.shrink_to_fit();
  }
}
/*!
* \brief Allocate the device-side score buffer
* \param total_size Number of doubles to allocate (num_data * num_tree_per_iteration)
*/
void CUDAScoreUpdater::InitCUDA(const size_t total_size) {
  AllocateCUDAMemory<double>(&cuda_score_, total_size, __FILE__, __LINE__);
}
/*! \brief Release the device-side score buffer allocated in InitCUDA */
CUDAScoreUpdater::~CUDAScoreUpdater() {
  DeallocateCUDAMemory<double>(&cuda_score_, __FILE__, __LINE__);
}
inline void CUDAScoreUpdater::AddScore(double val, int cur_tree_id) {
Common::FunctionTimer fun_timer("CUDAScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
LaunchAddScoreConstantKernel(val, offset);
if (!boosting_on_cuda_) {
CopyFromCUDADeviceToHost<double>(score_.data() + offset, cuda_score_ + offset, static_cast<size_t>(num_data_), __FILE__, __LINE__);
}
}
/*!
* \brief Add the predictions of a trained tree to its score slice (all data)
* \param tree Trained tree (expected to be a CUDATree writing into device memory)
* \param cur_tree_id Index of the tree within the iteration (selects the score slice)
*/
inline void CUDAScoreUpdater::AddScore(const Tree* tree, int cur_tree_id) {
  // CONSISTENCY FIX: timer label was "ScoreUpdater::AddScore" (copy-paste from the
  // CPU updater); use the class's own label so CUDA timings aggregate under one key,
  // matching AddScore(double, int) and MultiplyScore above/below
  Common::FunctionTimer fun_timer("CUDAScoreUpdater::AddScore", global_timer);
  const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
  tree->AddPredictionToScore(data_, num_data_, cuda_score_ + offset);
  if (!boosting_on_cuda_) {
    // boosting runs on the host: mirror the updated device slice back into score_
    CopyFromCUDADeviceToHost<double>(score_.data() + offset, cuda_score_ + offset, static_cast<size_t>(num_data_), __FILE__, __LINE__);
  }
}
/*!
* \brief Add tree predictions to the score slice using the tree learner's cached per-leaf data partition
* \param tree_learner Tree learner holding the data partition of the just-trained tree
* \param tree Trained tree
* \param cur_tree_id Index of the tree within the iteration (selects the score slice)
*/
inline void CUDAScoreUpdater::AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) {
  // CONSISTENCY FIX: timer label was "ScoreUpdater::AddScore"; use the class's own
  // label so all CUDAScoreUpdater timings aggregate under one key
  Common::FunctionTimer fun_timer("CUDAScoreUpdater::AddScore", global_timer);
  const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
  tree_learner->AddPredictionToScore(tree, cuda_score_ + offset);
  if (!boosting_on_cuda_) {
    // boosting runs on the host: mirror the updated device slice back into score_
    CopyFromCUDADeviceToHost<double>(score_.data() + offset, cuda_score_ + offset, static_cast<size_t>(num_data_), __FILE__, __LINE__);
  }
}
/*!
* \brief Add tree predictions to the score slice for a subset of data (e.g. out-of-bag rows)
* \param tree Trained tree (expected to be a CUDATree)
* \param data_indices Device pointer to the indices of the rows to score
* \param data_cnt Number of rows referenced by data_indices
* \param cur_tree_id Index of the tree within the iteration (selects the score slice)
*/
inline void CUDAScoreUpdater::AddScore(const Tree* tree, const data_size_t* data_indices,
                                       data_size_t data_cnt, int cur_tree_id) {
  // CONSISTENCY FIX: timer label was "ScoreUpdater::AddScore"; use the class's own
  // label so all CUDAScoreUpdater timings aggregate under one key
  Common::FunctionTimer fun_timer("CUDAScoreUpdater::AddScore", global_timer);
  const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
  tree->AddPredictionToScore(data_, data_indices, data_cnt, cuda_score_ + offset);
  if (!boosting_on_cuda_) {
    // boosting runs on the host: mirror the updated device slice back into score_
    CopyFromCUDADeviceToHost<double>(score_.data() + offset, cuda_score_ + offset, static_cast<size_t>(num_data_), __FILE__, __LINE__);
  }
}
inline void CUDAScoreUpdater::MultiplyScore(double val, int cur_tree_id) {
Common::FunctionTimer fun_timer("CUDAScoreUpdater::MultiplyScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
LaunchMultiplyScoreConstantKernel(val, offset);
if (!boosting_on_cuda_) {
CopyFromCUDADeviceToHost<double>(score_.data() + offset, cuda_score_ + offset, static_cast<size_t>(num_data_), __FILE__, __LINE__);
}
}
} // namespace LightGBM
#endif // USE_CUDA_EXP
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#include "cuda_score_updater.hpp"
#ifdef USE_CUDA_EXP
namespace LightGBM {
/*!
* \brief Kernel: add a constant to score[offset .. offset + num_data).
*        One thread per data point; 1-D grid, tail guarded.
*/
__global__ void AddScoreConstantKernel(
  const double val,
  const size_t offset,
  const data_size_t num_data,
  double* score) {
  const data_size_t index = static_cast<data_size_t>(blockIdx.x * blockDim.x + threadIdx.x);
  // guard the grid tail: the last block may have threads past num_data
  if (index < num_data) {
    score[offset + index] += val;
  }
}
/*!
* \brief Launch AddScoreConstantKernel over num_data_ points starting at offset
* \param val Constant to add
* \param offset Start of the target score slice within cuda_score_
*/
void CUDAScoreUpdater::LaunchAddScoreConstantKernel(const double val, const size_t offset) {
  // ceil-division: the previous formula (num_data_ + block) / block launched one
  // superfluous empty block whenever num_data_ was an exact multiple of the block size
  const int num_blocks = (num_data_ + num_threads_per_block_ - 1) / num_threads_per_block_;
  // demoted from Log::Warning: this fires on every call and is diagnostic
  // information, not a user-facing warning (leftover debug output)
  Log::Debug("adding init score = %f", val);
  AddScoreConstantKernel<<<num_blocks, num_threads_per_block_>>>(val, offset, num_data_, cuda_score_);
}
/*!
* \brief Kernel: multiply score[offset .. offset + num_data) by a constant.
*        One thread per data point; 1-D grid, tail guarded.
*/
__global__ void MultiplyScoreConstantKernel(
  const double val,
  const size_t offset,
  const data_size_t num_data,
  double* score) {
  const data_size_t data_index = static_cast<data_size_t>(threadIdx.x + blockIdx.x * blockDim.x);
  if (data_index < num_data) {
    // BUG FIX: honor `offset` (it was accepted but ignored). Without it, for
    // multiclass training (cur_tree_id > 0) this kernel scaled tree 0's score
    // slice instead of the current tree's — mirror AddScoreConstantKernel above.
    score[data_index + offset] *= val;
  }
}
/*!
* \brief Launch MultiplyScoreConstantKernel over num_data_ points starting at offset
* \param val Multiplier
* \param offset Start of the target score slice within cuda_score_
*/
void CUDAScoreUpdater::LaunchMultiplyScoreConstantKernel(const double val, const size_t offset) {
  // ceil-division: the previous formula launched one superfluous empty block
  // whenever num_data_ divided evenly by num_threads_per_block_
  const int num_blocks = (num_data_ + num_threads_per_block_ - 1) / num_threads_per_block_;
  MultiplyScoreConstantKernel<<<num_blocks, num_threads_per_block_>>>(val, offset, num_data_, cuda_score_);
}
} // namespace LightGBM
#endif // USE_CUDA_EXP
/*!
* Copyright (c) 2021 Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE file in the project root for license information.
*/
#ifndef LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_
#define LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_
#ifdef USE_CUDA_EXP
#include <LightGBM/cuda/cuda_utils.h>
#include "../score_updater.hpp"
namespace LightGBM {
/*!
* \brief Score updater that keeps the model scores in CUDA device memory.
*        When boosting also runs on the GPU, the host-side score buffer inherited
*        from ScoreUpdater is released and score() returns a device pointer;
*        otherwise the device scores are mirrored back to the host after each update.
*/
class CUDAScoreUpdater: public ScoreUpdater {
 public:
  /*!
  * \brief Constructor; allocates the device score buffer and applies init scores if present
  * \param data Dataset to score
  * \param num_tree_per_iteration Number of trees per boosting iteration
  * \param boosting_on_cuda Whether gradients are computed on the GPU (host buffer released if true)
  */
  CUDAScoreUpdater(const Dataset* data, int num_tree_per_iteration, const bool boosting_on_cuda);
  /*! \brief Destructor; frees the device score buffer */
  ~CUDAScoreUpdater();
  /*! \brief Add a constant to one tree's score slice on the device */
  inline void AddScore(double val, int cur_tree_id) override;
  /*! \brief Add a tree's predictions (all data) to its score slice on the device */
  inline void AddScore(const Tree* tree, int cur_tree_id) override;
  /*! \brief Add a tree's predictions using the learner's cached data partition */
  inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) override;
  /*! \brief Add a tree's predictions for a subset of data (device index pointer) */
  inline void AddScore(const Tree* tree, const data_size_t* data_indices,
                       data_size_t data_cnt, int cur_tree_id) override;
  /*! \brief Multiply one tree's score slice by a constant on the device */
  inline void MultiplyScore(double val, int cur_tree_id) override;
  /*!
  * \brief Pointer to the current scores: a DEVICE pointer when boosting is on CUDA,
  *        otherwise the host mirror kept in sync by the AddScore/MultiplyScore overrides
  */
  inline const double* score() const override {
    if (boosting_on_cuda_) {
      return cuda_score_;
    } else {
      return score_.data();
    }
  }
  /*! \brief Disable copy */
  CUDAScoreUpdater& operator=(const CUDAScoreUpdater&) = delete;
  CUDAScoreUpdater(const CUDAScoreUpdater&) = delete;

 private:
  /*! \brief Allocate the device score buffer of total_size doubles */
  void InitCUDA(const size_t total_size);
  /*! \brief Launch the kernel adding a constant to the slice starting at offset */
  void LaunchAddScoreConstantKernel(const double val, const size_t offset);
  /*! \brief Launch the kernel multiplying the slice starting at offset by a constant */
  void LaunchMultiplyScoreConstantKernel(const double val, const size_t offset);
  /*! \brief Device-side score buffer (num_data * num_tree_per_iteration doubles) */
  double* cuda_score_;
  /*! \brief Threads per block for the constant add/multiply kernels */
  const int num_threads_per_block_;
  /*! \brief Whether boosting (gradient computation) runs on the GPU */
  const bool boosting_on_cuda_;
};
} // namespace LightGBM
#endif // USE_CUDA_EXP
#endif // LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_
......@@ -67,6 +67,12 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective
if (config_->device_type == std::string("cuda") || config_->device_type == std::string("cuda_exp")) {
LGBM_config_::current_learner = use_cuda_learner;
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp")) {
const int gpu_device_id = config_->gpu_device_id >= 0 ? config_->gpu_device_id : 0;
CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id));
}
#endif // USE_CUDA_EXP
}
// load forced_splits file
......@@ -89,8 +95,9 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective
is_constant_hessian_ = GetIsConstHessian(objective_function);
const bool boosting_on_gpu = objective_function_ != nullptr && objective_function_->IsCUDAObjective();
tree_learner_ = std::unique_ptr<TreeLearner>(TreeLearner::CreateTreeLearner(config_->tree_learner, config_->device_type,
config_.get()));
config_.get(), boosting_on_gpu));
// init tree learner
tree_learner_->Init(train_data_, is_constant_hessian_);
......@@ -103,15 +110,44 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective
}
training_metrics_.shrink_to_fit();
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp")) {
train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_, boosting_on_gpu));
} else {
#endif // USE_CUDA_EXP
train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_));
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
num_data_ = train_data_->num_data();
// create buffer for gradients and Hessians
if (objective_function_ != nullptr) {
size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu) {
AllocateCUDAMemory<score_t>(&gradients_pointer_, total_size, __FILE__, __LINE__);
AllocateCUDAMemory<score_t>(&hessians_pointer_, total_size, __FILE__, __LINE__);
} else {
#endif // USE_CUDA_EXP
gradients_.resize(total_size);
hessians_.resize(total_size);
gradients_pointer_ = gradients_.data();
hessians_pointer_ = hessians_.data();
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
#ifndef USE_CUDA_EXP
}
#else // USE_CUDA_EXP
} else {
if (config_->device_type == std::string("cuda_exp")) {
size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
AllocateCUDAMemory<score_t>(&gradients_pointer_, total_size, __FILE__, __LINE__);
AllocateCUDAMemory<score_t>(&hessians_pointer_, total_size, __FILE__, __LINE__);
}
}
#endif // USE_CUDA_EXP
// get max feature index
max_feature_idx_ = train_data_->num_total_features() - 1;
// get label index
......@@ -145,7 +181,13 @@ void GBDT::AddValidDataset(const Dataset* valid_data,
Log::Fatal("Cannot add validation data, since it has different bin mappers with training data");
}
// for a validation dataset, we need its score and metric
auto new_score_updater = std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_tree_per_iteration_));
auto new_score_updater =
#ifdef USE_CUDA_EXP
config_->device_type == std::string("cuda_exp") ?
std::unique_ptr<CUDAScoreUpdater>(new CUDAScoreUpdater(valid_data, num_tree_per_iteration_,
objective_function_ != nullptr && objective_function_->IsCUDAObjective())) :
#endif // USE_CUDA_EXP
std::unique_ptr<ScoreUpdater>(new ScoreUpdater(valid_data, num_tree_per_iteration_));
// update score
for (int i = 0; i < iter_; ++i) {
for (int cur_tree_id = 0; cur_tree_id < num_tree_per_iteration_; ++cur_tree_id) {
......@@ -177,7 +219,7 @@ void GBDT::Boosting() {
// objective function will calculate gradients and hessians
int64_t num_score = 0;
objective_function_->
GetGradients(GetTrainingScore(&num_score), gradients_.data(), hessians_.data());
GetGradients(GetTrainingScore(&num_score), gradients_pointer_, hessians_pointer_);
}
data_size_t GBDT::BaggingHelper(data_size_t start, data_size_t cnt, data_size_t* buffer) {
......@@ -251,14 +293,33 @@ void GBDT::Bagging(int iter) {
Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_);
// set bagging data to tree learner
if (!is_use_subset_) {
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp")) {
CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
tree_learner_->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_);
} else {
#endif // USE_CUDA_EXP
tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_);
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
} else {
// get subset
tmp_subset_->ReSize(bag_data_cnt_);
tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(),
bag_data_cnt_, false);
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp")) {
CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
tree_learner_->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(),
bag_data_cnt_);
} else {
#endif // USE_CUDA_EXP
tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(),
bag_data_cnt_);
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
}
}
}
......@@ -313,8 +374,8 @@ void GBDT::RefitTree(const std::vector<std::vector<int>>& tree_leaf_prediction)
CHECK_LT(leaf_pred[i], models_[model_index]->num_leaves());
}
size_t offset = static_cast<size_t>(tree_id) * num_data_;
auto grad = gradients_.data() + offset;
auto hess = hessians_.data() + offset;
auto grad = gradients_pointer_ + offset;
auto hess = hessians_pointer_ + offset;
auto new_tree = tree_learner_->FitByExistingTree(models_[model_index].get(), leaf_pred, grad, hess);
train_score_updater_->AddScore(tree_learner_.get(), new_tree, tree_id);
models_[model_index].reset(new_tree);
......@@ -377,9 +438,22 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
init_scores[cur_tree_id] = BoostFromAverage(cur_tree_id, true);
}
Boosting();
gradients = gradients_.data();
hessians = hessians_.data();
gradients = gradients_pointer_;
hessians = hessians_pointer_;
#ifndef USE_CUDA_EXP
}
#else // USE_CUDA_EXP
} else {
if (config_->device_type == std::string("cuda_exp")) {
const size_t total_size = static_cast<size_t>(num_data_ * num_class_);
CopyFromHostToCUDADevice<score_t>(gradients_pointer_, gradients, total_size, __FILE__, __LINE__);
CopyFromHostToCUDADevice<score_t>(hessians_pointer_, hessians, total_size, __FILE__, __LINE__);
gradients = gradients_pointer_;
hessians = hessians_pointer_;
}
}
#endif // USE_CUDA_EXP
// bagging logic
Bagging(iter_);
......@@ -393,11 +467,11 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) {
// need to copy gradients for bagging subset.
if (is_use_subset_ && bag_data_cnt_ < num_data_ && config_->device_type != std::string("cuda_exp")) {
for (int i = 0; i < bag_data_cnt_; ++i) {
gradients_[offset + i] = grad[bag_data_indices_[i]];
hessians_[offset + i] = hess[bag_data_indices_[i]];
gradients_pointer_[offset + i] = grad[bag_data_indices_[i]];
hessians_pointer_[offset + i] = hess[bag_data_indices_[i]];
}
grad = gradients_.data() + offset;
hess = hessians_.data() + offset;
grad = gradients_pointer_ + offset;
hess = hessians_pointer_ + offset;
}
bool is_first_tree = models_.size() < static_cast<size_t>(num_tree_per_iteration_);
new_tree.reset(tree_learner_->Train(grad, hess, is_first_tree));
......@@ -493,7 +567,15 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
// we need to predict out-of-bag scores of data for boosting
if (num_data_ - bag_data_cnt_ > 0) {
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp")) {
train_score_updater_->AddScore(tree, cuda_bag_data_indices_.RawData() + bag_data_cnt_, num_data_ - bag_data_cnt_, cur_tree_id);
} else {
#endif // USE_CUDA_EXP
train_score_updater_->AddScore(tree, bag_data_indices_.data() + bag_data_cnt_, num_data_ - bag_data_cnt_, cur_tree_id);
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
}
} else {
......@@ -508,7 +590,29 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) {
}
std::vector<double> GBDT::EvalOneMetric(const Metric* metric, const double* score) const {
#ifdef USE_CUDA_EXP
const bool boosting_on_cuda = objective_function_ != nullptr && objective_function_->IsCUDAObjective();
const bool evaluation_on_cuda = metric->IsCUDAMetric();
if ((boosting_on_cuda && evaluation_on_cuda) || (!boosting_on_cuda && !evaluation_on_cuda)) {
#endif // USE_CUDA_EXP
return metric->Eval(score, objective_function_);
#ifdef USE_CUDA_EXP
} else if (boosting_on_cuda && !evaluation_on_cuda) {
const size_t total_size = static_cast<size_t>(num_data_) * static_cast<size_t>(num_tree_per_iteration_);
if (total_size > host_score_.size()) {
host_score_.resize(total_size, 0.0f);
}
CopyFromCUDADeviceToHost<double>(host_score_.data(), score, total_size, __FILE__, __LINE__);
return metric->Eval(host_score_.data(), objective_function_);
} else {
const size_t total_size = static_cast<size_t>(num_data_) * static_cast<size_t>(num_tree_per_iteration_);
if (total_size > cuda_score_.Size()) {
cuda_score_.Resize(total_size);
}
CopyFromHostToCUDADevice<double>(cuda_score_.RawData(), score, total_size, __FILE__, __LINE__);
return metric->Eval(cuda_score_.RawData(), objective_function_);
}
#endif // USE_CUDA_EXP
}
std::string GBDT::OutputMetric(int iter) {
......@@ -700,11 +804,23 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction*
}
training_metrics_.shrink_to_fit();
#ifdef USE_CUDA_EXP
const bool boosting_on_gpu = objective_function_ != nullptr && objective_function_->IsCUDAObjective();
#endif // USE_CUDA_EXP
if (train_data != train_data_) {
train_data_ = train_data;
// not same training data, need reset score and others
// create score tracker
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp")) {
train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_, boosting_on_gpu));
} else {
#endif // USE_CUDA_EXP
train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_));
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
// update score
for (int i = 0; i < iter_; ++i) {
......@@ -719,8 +835,19 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction*
// create buffer for gradients and hessians
if (objective_function_ != nullptr) {
size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu) {
AllocateCUDAMemory<score_t>(&gradients_pointer_, total_size, __FILE__, __LINE__);
AllocateCUDAMemory<score_t>(&hessians_pointer_, total_size, __FILE__, __LINE__);
} else {
#endif // USE_CUDA_EXP
gradients_.resize(total_size);
hessians_.resize(total_size);
gradients_pointer_ = gradients_.data();
hessians_pointer_ = hessians_.data();
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
}
max_feature_idx_ = train_data_->num_total_features() - 1;
......@@ -795,6 +922,11 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) {
bag_data_cnt_ = static_cast<data_size_t>(config->bagging_fraction * num_data_);
}
bag_data_indices_.resize(num_data_);
#ifdef USE_CUDA_EXP
if (config->device_type == std::string("cuda_exp")) {
cuda_bag_data_indices_.Resize(num_data_);
}
#endif // USE_CUDA_EXP
bagging_runner_.ReSize(num_data_);
bagging_rands_.clear();
for (int i = 0;
......@@ -823,13 +955,27 @@ void GBDT::ResetBaggingConfig(const Config* config, bool is_change_dataset) {
if (is_use_subset_ && bag_data_cnt_ < num_data_) {
if (objective_function_ == nullptr) {
size_t total_size = static_cast<size_t>(num_data_) * num_tree_per_iteration_;
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp") && objective_function_ != nullptr && objective_function_->IsCUDAObjective()) {
AllocateCUDAMemory<score_t>(&gradients_pointer_, total_size, __FILE__, __LINE__);
AllocateCUDAMemory<score_t>(&hessians_pointer_, total_size, __FILE__, __LINE__);
} else {
#endif // USE_CUDA_EXP
gradients_.resize(total_size);
hessians_.resize(total_size);
gradients_pointer_ = gradients_.data();
hessians_pointer_ = hessians_.data();
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
}
}
} else {
bag_data_cnt_ = num_data_;
bag_data_indices_.clear();
#ifdef USE_CUDA_EXP
cuda_bag_data_indices_.Clear();
#endif // USE_CUDA_EXP
bagging_runner_.ReSize(0);
is_use_subset_ = false;
}
......
......@@ -23,6 +23,7 @@
#include <utility>
#include <vector>
#include "cuda/cuda_score_updater.hpp"
#include "score_updater.hpp"
namespace LightGBM {
......@@ -499,6 +500,18 @@ class GBDT : public GBDTBase {
/*! \brief Second order derivative of training data */
std::vector<score_t, Common::AlignmentAllocator<score_t, kAlignedSize>> hessians_;
#endif
/*! \brief Pointer to gradient vector, can be on CPU or GPU */
score_t* gradients_pointer_;
/*! \brief Pointer to hessian vector, can be on CPU or GPU */
score_t* hessians_pointer_;
#ifdef USE_CUDA_EXP
/*! \brief Buffer for scores when boosting is on GPU but evaluation is not, used only with cuda_exp */
mutable std::vector<double> host_score_;
/*! \brief Buffer for scores when boosting is not on GPU but evaluation is, used only with cuda_exp */
mutable CUDAVector<double> cuda_score_;
/*! \brief Buffer for bag_data_indices_ on GPU, used only with cuda_exp */
CUDAVector<data_size_t> cuda_bag_data_indices_;
#endif // USE_CUDA_EXP
/*! \brief Store the indices of in-bag data */
std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> bag_data_indices_;
......
......@@ -167,14 +167,33 @@ class GOSS: public GBDT {
bag_data_cnt_ = left_cnt;
// set bagging data to tree learner
if (!is_use_subset_) {
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp")) {
CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
tree_learner_->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_);
} else {
#endif // USE_CUDA_EXP
tree_learner_->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_);
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
} else {
// get subset
tmp_subset_->ReSize(bag_data_cnt_);
tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(),
bag_data_cnt_, false);
#ifdef USE_CUDA_EXP
if (config_->device_type == std::string("cuda_exp")) {
CopyFromHostToCUDADevice<data_size_t>(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast<size_t>(num_data_), __FILE__, __LINE__);
tree_learner_->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(),
bag_data_cnt_);
} else {
#endif // USE_CUDA_EXP
tree_learner_->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(),
bag_data_cnt_);
#ifdef USE_CUDA_EXP
}
#endif // USE_CUDA_EXP
}
}
......
......@@ -51,7 +51,7 @@ class ScoreUpdater {
inline bool has_init_score() const { return has_init_score_; }
inline void AddScore(double val, int cur_tree_id) {
virtual inline void AddScore(double val, int cur_tree_id) {
Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
#pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024)
......@@ -60,7 +60,7 @@ class ScoreUpdater {
}
}
inline void MultiplyScore(double val, int cur_tree_id) {
virtual inline void MultiplyScore(double val, int cur_tree_id) {
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
#pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024)
for (int i = 0; i < num_data_; ++i) {
......@@ -73,7 +73,7 @@ class ScoreUpdater {
* \param tree Trained tree model
* \param cur_tree_id Current tree for multiclass training
*/
inline void AddScore(const Tree* tree, int cur_tree_id) {
virtual inline void AddScore(const Tree* tree, int cur_tree_id) {
Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
tree->AddPredictionToScore(data_, num_data_, score_.data() + offset);
......@@ -85,7 +85,7 @@ class ScoreUpdater {
* \param tree_learner
* \param cur_tree_id Current tree for multiclass training
*/
inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) {
virtual inline void AddScore(const TreeLearner* tree_learner, const Tree* tree, int cur_tree_id) {
Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
tree_learner->AddPredictionToScore(tree, score_.data() + offset);
......@@ -98,14 +98,14 @@ class ScoreUpdater {
* \param data_cnt Number of data that will be processed
* \param cur_tree_id Current tree for multiclass training
*/
inline void AddScore(const Tree* tree, const data_size_t* data_indices,
virtual inline void AddScore(const Tree* tree, const data_size_t* data_indices,
data_size_t data_cnt, int cur_tree_id) {
Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer);
const size_t offset = static_cast<size_t>(num_data_) * cur_tree_id;
tree->AddPredictionToScore(data_, data_indices, data_cnt, score_.data() + offset);
}
/*! \brief Pointer of score */
inline const double* score() const { return score_.data(); }
virtual inline const double* score() const { return score_.data(); }
inline data_size_t num_data() const { return num_data_; }
......@@ -114,7 +114,7 @@ class ScoreUpdater {
/*! \brief Disable copy */
ScoreUpdater(const ScoreUpdater&) = delete;
private:
protected:
/*! \brief Number of total data */
data_size_t num_data_;
/*! \brief Pointer of data set */
......
......@@ -238,6 +238,20 @@ int CUDATree::SplitCategorical(const int leaf_index,
return num_leaves_ - 1;
}
/*!
* \brief Add this tree's predictions for ALL rows of the dataset to `score`,
*        computed on the device; blocks until the kernel finishes.
* \param data Dataset to predict on (its CUDA column store is read by the kernel)
* \param num_data Number of rows to process
* \param score Device score buffer to accumulate into
*/
void CUDATree::AddPredictionToScore(const Dataset* data,
                                    data_size_t num_data,
                                    double* score) const {
  // nullptr index set means "score every row" in the launch helper
  LaunchAddPredictionToScoreKernel(data, nullptr, num_data, score);
  SynchronizeCUDADevice(__FILE__, __LINE__);
}
/*!
* \brief Add this tree's predictions for a SUBSET of rows to `score`,
*        computed on the device; blocks until the kernel finishes.
* \param data Dataset to predict on
* \param used_data_indices Device pointer to the row indices to score
* \param num_data Number of indices in used_data_indices
* \param score Device score buffer to accumulate into
*/
void CUDATree::AddPredictionToScore(const Dataset* data,
                                    const data_size_t* used_data_indices,
                                    data_size_t num_data, double* score) const {
  LaunchAddPredictionToScoreKernel(data, used_data_indices, num_data, score);
  SynchronizeCUDADevice(__FILE__, __LINE__);
}
inline void CUDATree::Shrinkage(double rate) {
Tree::Shrinkage(rate);
LaunchShrinkageKernel(rate);
......
......@@ -303,6 +303,131 @@ void CUDATree::LaunchAddBiasKernel(const double val) {
AddBiasKernel<<<num_blocks, num_threads_per_block>>>(val, cuda_leaf_value_, num_leaves_);
}
/*!
* \brief Kernel: route each (used) data point through the tree and add the reached
*        leaf value into `score[data_index]`. One thread per row, 1-D grid.
* \tparam USE_INDICES When true, thread i processes row cuda_used_indices[i];
*         when false, thread i processes row i directly.
* \param num_data Number of rows to process (== length of cuda_used_indices when USE_INDICES)
*/
template <bool USE_INDICES>
__global__ void AddPredictionToScoreKernel(
  // dataset information
  const data_size_t num_data,
  void* const* cuda_data_by_column,
  const uint8_t* cuda_column_bit_type,
  const uint32_t* cuda_feature_min_bin,
  const uint32_t* cuda_feature_max_bin,
  const uint32_t* cuda_feature_offset,
  const uint32_t* cuda_feature_default_bin,
  const uint32_t* cuda_feature_most_freq_bin,
  const int* cuda_feature_to_column,
  const data_size_t* cuda_used_indices,
  // tree information
  const uint32_t* cuda_threshold_in_bin,
  const int8_t* cuda_decision_type,
  const int* cuda_split_feature_inner,
  const int* cuda_left_child,
  const int* cuda_right_child,
  const double* cuda_leaf_value,
  // output
  double* score) {
  const data_size_t inner_data_index = static_cast<data_size_t>(threadIdx.x + blockIdx.x * blockDim.x);
  // BUG FIX: bound-check the THREAD index before it is used. The old code read
  // cuda_used_indices[inner_data_index] unguarded (out-of-bounds for tail threads)
  // and then compared the resulting *global* row index against num_data — which,
  // with USE_INDICES, both read past the index array and wrongly skipped valid
  // rows whose global index happened to be >= num_data (the count of used rows).
  if (inner_data_index < num_data) {
    const data_size_t data_index = USE_INDICES ? cuda_used_indices[inner_data_index] : inner_data_index;
    int node = 0;
    // walk from the root; leaves are encoded as negative ids (~node == leaf index)
    while (node >= 0) {
      const int split_feature_inner = cuda_split_feature_inner[node];
      const int column = cuda_feature_to_column[split_feature_inner];
      const uint32_t default_bin = cuda_feature_default_bin[split_feature_inner];
      const uint32_t most_freq_bin = cuda_feature_most_freq_bin[split_feature_inner];
      const uint32_t max_bin = cuda_feature_max_bin[split_feature_inner];
      const uint32_t min_bin = cuda_feature_min_bin[split_feature_inner];
      const uint32_t offset = cuda_feature_offset[split_feature_inner];
      const uint8_t column_bit_type = cuda_column_bit_type[column];
      // fetch the raw bin for this row from the column-major device store
      uint32_t bin = 0;
      if (column_bit_type == 8) {
        bin = static_cast<uint32_t>((reinterpret_cast<const uint8_t*>(cuda_data_by_column[column]))[data_index]);
      } else if (column_bit_type == 16) {
        bin = static_cast<uint32_t>((reinterpret_cast<const uint16_t*>(cuda_data_by_column[column]))[data_index]);
      } else if (column_bit_type == 32) {
        bin = static_cast<uint32_t>((reinterpret_cast<const uint32_t*>(cuda_data_by_column[column]))[data_index]);
      }
      // map the column-local bin into the feature's own bin space; out-of-range
      // values fall back to the feature's most frequent bin
      if (bin >= min_bin && bin <= max_bin) {
        bin = bin - min_bin + offset;
      } else {
        bin = most_freq_bin;
      }
      const int8_t decision_type = cuda_decision_type[node];
      const uint32_t threshold_in_bin = cuda_threshold_in_bin[node];
      // decision_type packs the missing-value handling in bits 2-3 and the
      // default direction in kDefaultLeftMask (mirrors CPU Tree decision logic)
      const int8_t missing_type = ((decision_type >> 2) & 3);
      const bool default_left = ((decision_type & kDefaultLeftMask) > 0);
      if ((missing_type == 1 && bin == default_bin) || (missing_type == 2 && bin == max_bin)) {
        // missing value: follow the split's default direction
        node = default_left ? cuda_left_child[node] : cuda_right_child[node];
      } else {
        node = (bin <= threshold_in_bin) ? cuda_left_child[node] : cuda_right_child[node];
      }
    }
    score[data_index] += cuda_leaf_value[~node];
  }
}
/*!
* \brief Launch AddPredictionToScoreKernel over the dataset's CUDA column store.
*        One thread per row; the USE_INDICES template flag selects whether a
*        subset of rows (used_data_indices) or all rows are processed.
* \param data Dataset providing the device column data and bin metadata
* \param used_data_indices Device pointer to row indices, or nullptr to score every row
* \param num_data Number of rows (or indices) to process
* \param score Device score buffer to accumulate into
*/
void CUDATree::LaunchAddPredictionToScoreKernel(
  const Dataset* data,
  const data_size_t* used_data_indices,
  data_size_t num_data,
  double* score) const {
  const CUDAColumnData* cuda_column_data = data->cuda_column_data();
  // ceil-division so the last (partial) block covers the tail rows
  const int num_blocks = (num_data + num_threads_per_block_add_prediction_to_score_ - 1) / num_threads_per_block_add_prediction_to_score_;
  if (used_data_indices == nullptr) {
    // score every row: USE_INDICES = false, no index array needed
    AddPredictionToScoreKernel<false><<<num_blocks, num_threads_per_block_add_prediction_to_score_>>>(
      // dataset information
      num_data,
      cuda_column_data->cuda_data_by_column(),
      cuda_column_data->cuda_column_bit_type(),
      cuda_column_data->cuda_feature_min_bin(),
      cuda_column_data->cuda_feature_max_bin(),
      cuda_column_data->cuda_feature_offset(),
      cuda_column_data->cuda_feature_default_bin(),
      cuda_column_data->cuda_feature_most_freq_bin(),
      cuda_column_data->cuda_feature_to_column(),
      nullptr,
      // tree information
      cuda_threshold_in_bin_,
      cuda_decision_type_,
      cuda_split_feature_inner_,
      cuda_left_child_,
      cuda_right_child_,
      cuda_leaf_value_,
      // output
      score);
  } else {
    // score only the rows named in used_data_indices: USE_INDICES = true
    AddPredictionToScoreKernel<true><<<num_blocks, num_threads_per_block_add_prediction_to_score_>>>(
      // dataset information
      num_data,
      cuda_column_data->cuda_data_by_column(),
      cuda_column_data->cuda_column_bit_type(),
      cuda_column_data->cuda_feature_min_bin(),
      cuda_column_data->cuda_feature_max_bin(),
      cuda_column_data->cuda_feature_offset(),
      cuda_column_data->cuda_feature_default_bin(),
      cuda_column_data->cuda_feature_most_freq_bin(),
      cuda_column_data->cuda_feature_to_column(),
      used_data_indices,
      // tree information
      cuda_threshold_in_bin_,
      cuda_decision_type_,
      cuda_split_feature_inner_,
      cuda_left_child_,
      cuda_right_child_,
      cuda_leaf_value_,
      // output
      score);
  }
  SynchronizeCUDADevice(__FILE__, __LINE__);
}
} // namespace LightGBM
#endif // USE_CUDA_EXP
......@@ -14,53 +14,130 @@
namespace LightGBM {
// Creates the CPU implementation of the metric named `type`.
// \param type metric name as it appears in the config (e.g. "l2", "auc")
// \param config configuration forwarded to the metric's constructor
// \return newly allocated metric (caller owns it), or nullptr for an
//         unknown metric name
static Metric* CreateMetricCPU(const std::string& type, const Config& config) {
  if (type == std::string("l2")) {
    return new L2Metric(config);
  } else if (type == std::string("rmse")) {
    return new RMSEMetric(config);
  } else if (type == std::string("l1")) {
    return new L1Metric(config);
  } else if (type == std::string("quantile")) {
    return new QuantileMetric(config);
  } else if (type == std::string("huber")) {
    return new HuberLossMetric(config);
  } else if (type == std::string("fair")) {
    return new FairLossMetric(config);
  } else if (type == std::string("poisson")) {
    return new PoissonMetric(config);
  } else if (type == std::string("binary_logloss")) {
    return new BinaryLoglossMetric(config);
  } else if (type == std::string("binary_error")) {
    return new BinaryErrorMetric(config);
  } else if (type == std::string("auc")) {
    return new AUCMetric(config);
  } else if (type == std::string("average_precision")) {
    return new AveragePrecisionMetric(config);
  } else if (type == std::string("auc_mu")) {
    return new AucMuMetric(config);
  } else if (type == std::string("ndcg")) {
    return new NDCGMetric(config);
  } else if (type == std::string("map")) {
    return new MapMetric(config);
  } else if (type == std::string("multi_logloss")) {
    return new MultiSoftmaxLoglossMetric(config);
  } else if (type == std::string("multi_error")) {
    return new MultiErrorMetric(config);
  } else if (type == std::string("cross_entropy")) {
    return new CrossEntropyMetric(config);
  } else if (type == std::string("cross_entropy_lambda")) {
    return new CrossEntropyLambdaMetric(config);
  } else if (type == std::string("kullback_leibler")) {
    return new KullbackLeiblerDivergence(config);
  } else if (type == std::string("mape")) {
    return new MAPEMetric(config);
  } else if (type == std::string("gamma")) {
    return new GammaMetric(config);
  } else if (type == std::string("gamma_deviance")) {
    return new GammaDevianceMetric(config);
  } else if (type == std::string("tweedie")) {
    return new TweedieMetric(config);
  }
  return nullptr;
}

Metric* Metric::CreateMetric(const std::string& type, const Config& config) {
  // Previously the whole type -> class dispatch chain was duplicated for the
  // cuda_exp branch (differing only in a per-type warning), so every new
  // metric had to be registered twice. A single dispatch table is used now.
#ifdef USE_CUDA_EXP
  if (config.device_type == std::string("cuda_exp")) {
    // No CUDA metric implementations exist yet: fall back to the CPU metric
    // and warn. Unknown metric names return nullptr without a warning,
    // matching the CPU path.
    Metric* metric = CreateMetricCPU(type, config);
    if (metric != nullptr) {
      Log::Warning("Metric %s is not implemented in cuda_exp version. Fall back to evaluation on CPU.", type.c_str());
    }
    return metric;
  }
#endif  // USE_CUDA_EXP
  return CreateMetricCPU(type, config);
}
......
......@@ -13,41 +13,100 @@
namespace LightGBM {
// Creates the CPU implementation of the objective named `type`.
// \param type objective name as it appears in the config (e.g. "regression")
// \param config configuration forwarded to the objective's constructor
// \return newly allocated objective (caller owns it); nullptr for "custom"
//         (the caller supplies gradients itself). Unknown names are fatal.
static ObjectiveFunction* CreateObjectiveFunctionCPU(const std::string& type, const Config& config) {
  if (type == std::string("regression")) {
    return new RegressionL2loss(config);
  } else if (type == std::string("regression_l1")) {
    return new RegressionL1loss(config);
  } else if (type == std::string("quantile")) {
    return new RegressionQuantileloss(config);
  } else if (type == std::string("huber")) {
    return new RegressionHuberLoss(config);
  } else if (type == std::string("fair")) {
    return new RegressionFairLoss(config);
  } else if (type == std::string("poisson")) {
    return new RegressionPoissonLoss(config);
  } else if (type == std::string("binary")) {
    return new BinaryLogloss(config);
  } else if (type == std::string("lambdarank")) {
    return new LambdarankNDCG(config);
  } else if (type == std::string("rank_xendcg")) {
    return new RankXENDCG(config);
  } else if (type == std::string("multiclass")) {
    return new MulticlassSoftmax(config);
  } else if (type == std::string("multiclassova")) {
    return new MulticlassOVA(config);
  } else if (type == std::string("cross_entropy")) {
    return new CrossEntropy(config);
  } else if (type == std::string("cross_entropy_lambda")) {
    return new CrossEntropyLambda(config);
  } else if (type == std::string("mape")) {
    return new RegressionMAPELOSS(config);
  } else if (type == std::string("gamma")) {
    return new RegressionGammaLoss(config);
  } else if (type == std::string("tweedie")) {
    return new RegressionTweedieLoss(config);
  } else if (type == std::string("custom")) {
    // Customized objective: gradients are provided by the caller.
    return nullptr;
  }
  Log::Fatal("Unknown objective type name: %s", type.c_str());
  return nullptr;  // unreachable: Log::Fatal does not return
}

ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& type, const Config& config) {
  // Previously the whole type -> class dispatch chain was duplicated for the
  // cuda_exp branch (differing only in a per-type warning), so every new
  // objective had to be registered twice. A single dispatch table is used now.
#ifdef USE_CUDA_EXP
  if (config.device_type == std::string("cuda_exp")) {
    if (type == std::string("custom")) {
      Log::Warning("Using customized objective with cuda_exp. This requires copying gradients from CPU to GPU, which can be slow.");
      return nullptr;
    }
    // No CUDA objective implementations exist yet: fall back to the CPU
    // objective and warn. Unknown names already died in Log::Fatal above.
    ObjectiveFunction* objective = CreateObjectiveFunctionCPU(type, config);
    if (objective != nullptr) {
      Log::Warning("Objective %s is not implemented in cuda_exp version. Fall back to boosting on CPU.", type.c_str());
    }
    return objective;
  }
#endif  // USE_CUDA_EXP
  return CreateObjectiveFunctionCPU(type, config);
}
......
......@@ -61,7 +61,6 @@ CUDADataPartition::CUDADataPartition(
cuda_out_data_indices_in_leaf_ = nullptr;
cuda_split_info_buffer_ = nullptr;
cuda_num_data_ = nullptr;
cuda_add_train_score_ = nullptr;
}
CUDADataPartition::~CUDADataPartition() {
......@@ -78,7 +77,6 @@ CUDADataPartition::~CUDADataPartition() {
DeallocateCUDAMemory<data_size_t>(&cuda_out_data_indices_in_leaf_, __FILE__, __LINE__);
DeallocateCUDAMemory<int>(&cuda_split_info_buffer_, __FILE__, __LINE__);
DeallocateCUDAMemory<data_size_t>(&cuda_num_data_, __FILE__, __LINE__);
DeallocateCUDAMemory<double>(&cuda_add_train_score_, __FILE__, __LINE__);
CUDASUCCESS_OR_FATAL(cudaStreamDestroy(cuda_streams_[0]));
CUDASUCCESS_OR_FATAL(cudaStreamDestroy(cuda_streams_[1]));
CUDASUCCESS_OR_FATAL(cudaStreamDestroy(cuda_streams_[2]));
......@@ -115,8 +113,6 @@ void CUDADataPartition::Init() {
gpuAssert(cudaStreamCreate(&cuda_streams_[3]), __FILE__, __LINE__);
InitCUDAMemoryFromHostMemory<data_size_t>(&cuda_num_data_, &num_data_, 1, __FILE__, __LINE__);
add_train_score_.resize(num_data_, 0.0f);
AllocateCUDAMemory<double>(&cuda_add_train_score_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
use_bagging_ = false;
used_indices_ = nullptr;
}
......@@ -270,33 +266,11 @@ void CUDADataPartition::UpdateTrainScore(const Tree* tree, double* scores) {
cuda_tree_ptr.reset(new CUDATree(tree));
cuda_tree = cuda_tree_ptr.get();
}
const data_size_t num_data_in_root = root_num_data();
if (use_bagging_) {
// we need restore the order of indices in cuda_data_indices_
CopyFromHostToCUDADevice<data_size_t>(cuda_data_indices_, used_indices_, static_cast<size_t>(num_used_indices_), __FILE__, __LINE__);
}
LaunchAddPredictionToScoreKernel(cuda_tree->cuda_leaf_value(), cuda_add_train_score_);
CopyFromCUDADeviceToHost<double>(add_train_score_.data(),
cuda_add_train_score_, static_cast<size_t>(num_data_in_root), __FILE__, __LINE__);
if (!use_bagging_) {
OMP_INIT_EX();
#pragma omp parallel for schedule(static) num_threads(num_threads_)
for (data_size_t data_index = 0; data_index < num_data_in_root; ++data_index) {
OMP_LOOP_EX_BEGIN();
scores[data_index] += add_train_score_[data_index];
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
} else {
OMP_INIT_EX();
#pragma omp parallel for schedule(static) num_threads(num_threads_)
for (data_size_t data_index = 0; data_index < num_data_in_root; ++data_index) {
OMP_LOOP_EX_BEGIN();
scores[used_indices_[data_index]] += add_train_score_[data_index];
OMP_LOOP_EX_END();
}
OMP_THROW_EX();
CopyFromCUDADeviceToCUDADevice<data_size_t>(cuda_data_indices_, used_indices_, static_cast<size_t>(num_used_indices_), __FILE__, __LINE__);
}
LaunchAddPredictionToScoreKernel(cuda_tree->cuda_leaf_value(), scores);
}
void CUDADataPartition::CalcBlockDim(const data_size_t num_data_in_leaf) {
......@@ -318,7 +292,7 @@ void CUDADataPartition::SetUsedDataIndices(const data_size_t* used_indices, cons
use_bagging_ = true;
num_used_indices_ = num_used_indices;
used_indices_ = used_indices;
CopyFromHostToCUDADevice<data_size_t>(cuda_data_indices_, used_indices, static_cast<size_t>(num_used_indices), __FILE__, __LINE__);
CopyFromCUDADeviceToCUDADevice<data_size_t>(cuda_data_indices_, used_indices, static_cast<size_t>(num_used_indices), __FILE__, __LINE__);
LaunchFillDataIndexToLeafIndex();
}
......@@ -347,14 +321,11 @@ void CUDADataPartition::ResetTrainingData(const Dataset* train_data, const int n
DeallocateCUDAMemory<uint16_t>(&cuda_block_to_left_offset_, __FILE__, __LINE__);
DeallocateCUDAMemory<int>(&cuda_data_index_to_leaf_index_, __FILE__, __LINE__);
DeallocateCUDAMemory<data_size_t>(&cuda_out_data_indices_in_leaf_, __FILE__, __LINE__);
DeallocateCUDAMemory<double>(&cuda_add_train_score_, __FILE__, __LINE__);
add_train_score_.resize(num_data_, 0.0f);
AllocateCUDAMemory<data_size_t>(&cuda_data_indices_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
AllocateCUDAMemory<uint16_t>(&cuda_block_to_left_offset_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
AllocateCUDAMemory<int>(&cuda_data_index_to_leaf_index_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
AllocateCUDAMemory<data_size_t>(&cuda_out_data_indices_in_leaf_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
AllocateCUDAMemory<double>(&cuda_add_train_score_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
}
used_indices_ = nullptr;
use_bagging_ = false;
......
......@@ -1045,11 +1045,11 @@ __global__ void AddPredictionToScoreKernel(
const data_size_t global_data_index = data_indices_in_leaf[local_data_index];
const int leaf_index = cuda_data_index_to_leaf_index[global_data_index];
const double leaf_prediction_value = leaf_value[leaf_index];
cuda_scores[local_data_index] = leaf_prediction_value;
cuda_scores[global_data_index] += leaf_prediction_value;
} else {
const int leaf_index = cuda_data_index_to_leaf_index[local_data_index];
const double leaf_prediction_value = leaf_value[leaf_index];
cuda_scores[local_data_index] = leaf_prediction_value;
cuda_scores[local_data_index] += leaf_prediction_value;
}
}
}
......
......@@ -307,8 +307,6 @@ class CUDADataPartition {
int grid_dim_;
/*! \brief block dimension when splitting one leaf */
int block_dim_;
/*! \brief add train score buffer in host */
mutable std::vector<double> add_train_score_;
/*! \brief data indices used in this iteration */
const data_size_t* used_indices_;
/*! \brief marks whether a feature is a categorical feature */
......@@ -376,10 +374,6 @@ class CUDADataPartition {
/*! \brief number of data in training set, for intialization of cuda_leaf_num_data_ and cuda_leaf_data_end_ */
data_size_t* cuda_num_data_;
// for train score update
/*! \brief added train score buffer in CUDA */
double* cuda_add_train_score_;
// CUDA memory, held by other object
......
......@@ -19,14 +19,16 @@
namespace LightGBM {
CUDASingleGPUTreeLearner::CUDASingleGPUTreeLearner(const Config* config): SerialTreeLearner(config) {
CUDASingleGPUTreeLearner::CUDASingleGPUTreeLearner(const Config* config, const bool boosting_on_cuda): SerialTreeLearner(config), boosting_on_cuda_(boosting_on_cuda) {
cuda_gradients_ = nullptr;
cuda_hessians_ = nullptr;
}
CUDASingleGPUTreeLearner::~CUDASingleGPUTreeLearner() {
if (!boosting_on_cuda_) {
DeallocateCUDAMemory<score_t>(&cuda_gradients_, __FILE__, __LINE__);
DeallocateCUDAMemory<score_t>(&cuda_hessians_, __FILE__, __LINE__);
}
}
void CUDASingleGPUTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) {
......@@ -64,28 +66,45 @@ void CUDASingleGPUTreeLearner::Init(const Dataset* train_data, bool is_constant_
leaf_data_start_.resize(config_->num_leaves, 0);
leaf_sum_hessians_.resize(config_->num_leaves, 0.0f);
if (!boosting_on_cuda_) {
AllocateCUDAMemory<score_t>(&cuda_gradients_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
AllocateCUDAMemory<score_t>(&cuda_hessians_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
}
AllocateBitset();
cuda_leaf_gradient_stat_buffer_ = nullptr;
cuda_leaf_hessian_stat_buffer_ = nullptr;
leaf_stat_buffer_size_ = 0;
num_cat_threshold_ = 0;
#ifdef DEBUG
host_gradients_.resize(num_data_, 0.0f);
host_hessians_.resize(num_data_, 0.0f);
#endif // DEBUG
}
void CUDASingleGPUTreeLearner::BeforeTrain() {
const data_size_t root_num_data = cuda_data_partition_->root_num_data();
if (!boosting_on_cuda_) {
CopyFromHostToCUDADevice<score_t>(cuda_gradients_, gradients_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
CopyFromHostToCUDADevice<score_t>(cuda_hessians_, hessians_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
gradients_ = cuda_gradients_;
hessians_ = cuda_hessians_;
}
#ifdef DEBUG
CopyFromCUDADeviceToHost<score_t>(host_gradients.data(), gradients_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
CopyFromCUDADeviceToHost<score_t>(host_hessians.data(), hessians_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
#endif // DEBUG
const data_size_t* leaf_splits_init_indices =
cuda_data_partition_->use_bagging() ? cuda_data_partition_->cuda_data_indices() : nullptr;
cuda_data_partition_->BeforeTrain();
cuda_smaller_leaf_splits_->InitValues(
config_->lambda_l1,
config_->lambda_l2,
cuda_gradients_,
cuda_hessians_,
gradients_,
hessians_,
leaf_splits_init_indices,
cuda_data_partition_->cuda_data_indices(),
root_num_data,
......@@ -93,7 +112,7 @@ void CUDASingleGPUTreeLearner::BeforeTrain() {
&leaf_sum_hessians_[0]);
leaf_num_data_[0] = root_num_data;
cuda_larger_leaf_splits_->InitValues();
cuda_histogram_constructor_->BeforeTrain(cuda_gradients_, cuda_hessians_);
cuda_histogram_constructor_->BeforeTrain(gradients_, hessians_);
col_sampler_.ResetByTree();
cuda_best_split_finder_->BeforeTrain(col_sampler_.is_feature_used_bytree());
leaf_data_start_[0] = 0;
......@@ -247,10 +266,12 @@ void CUDASingleGPUTreeLearner::ResetTrainingData(
cuda_smaller_leaf_splits_->Resize(num_data_);
cuda_larger_leaf_splits_->Resize(num_data_);
CHECK_EQ(is_constant_hessian, share_state_->is_constant_hessian);
if (!boosting_on_cuda_) {
DeallocateCUDAMemory<score_t>(&cuda_gradients_, __FILE__, __LINE__);
DeallocateCUDAMemory<score_t>(&cuda_hessians_, __FILE__, __LINE__);
AllocateCUDAMemory<score_t>(&cuda_gradients_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
AllocateCUDAMemory<score_t>(&cuda_hessians_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
}
}
void CUDASingleGPUTreeLearner::ResetConfig(const Config* config) {
......@@ -444,13 +465,13 @@ void CUDASingleGPUTreeLearner::CheckSplitValid(
double sum_right_gradients = 0.0f, sum_right_hessians = 0.0f;
for (size_t i = 0; i < left_data_indices.size(); ++i) {
const data_size_t index = left_data_indices[i];
sum_left_gradients += gradients_[index];
sum_left_hessians += hessians_[index];
sum_left_gradients += host_gradients_[index];
sum_left_hessians += host_hessians_[index];
}
for (size_t i = 0; i < right_data_indices.size(); ++i) {
const data_size_t index = right_data_indices[i];
sum_right_gradients += gradients_[index];
sum_right_hessians += hessians_[index];
sum_right_gradients += host_gradients_[index];
sum_right_hessians += host_hessians_[index];
}
CHECK_LE(std::fabs(sum_left_gradients - split_sum_left_gradients), 1e-6f);
CHECK_LE(std::fabs(sum_left_hessians - leaf_sum_hessians_[left_leaf]), 1e-6f);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment