"include/git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "2db6377a227b78a03044237f0c599cd65627b9f9"
Unverified Commit 24af9fa5 authored by shiyu1994; committed by GitHub
Browse files

[CUDA] Add Poisson regression objective for cuda_exp and refactor objective...


[CUDA] Add Poisson regression objective for cuda_exp and refactor objective functions for cuda_exp (#5486)

* add poisson regression objective for cuda_exp

* enable Poisson regression for cuda_exp

* refactor cuda objective functions

* remove useless changes

* fix linter errors

* remove redundant buffer in cuda poisson regression objective

* fix log of cuda_exp binary objective

* fix threshold of poisson objective result

* remove useless changes

* fix compilation errors

* add cuda quantile regression objective

* remove cuda quantile regression objective
Co-authored-by: James Lamb <jaylamb20@gmail.com>
parent f7e64a8f
......@@ -184,6 +184,40 @@ __device__ __forceinline__ void GlobalMemoryPrefixSum(T* array, const size_t len
}
}
// Warp-level min reduction via shuffle-down: after the loop, lane 0 of the
// warp holds the minimum of the first `len` lanes' values. `len` must be in
// [0, warpSize]; with len == 0 the input value is returned unchanged.
template <typename T>
__device__ __forceinline__ T ShuffleReduceMinWarp(T value, const data_size_t len) {
  if (len > 0) {
    // participation mask covering the low `len` lanes of the warp
    const uint32_t mask = (0xffffffff >> (warpSize - len));
    for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
      // NOTE(review): when (lane + offset) >= len the source lane is outside
      // `mask`, and __shfl_down_sync from a non-participating lane is
      // documented as undefined — confirm this is benign on the target archs.
      const T other_value = __shfl_down_sync(mask, value, offset);
      value = (other_value < value) ? other_value : value;
    }
  }
  return value;
}
// Min-reduce values from a 1-dimensional block (block size must be no greater than 1024).
// Two stages: each warp reduces its own lanes and lane 0 publishes the warp
// minimum to shared_mem_buffer; after a block barrier, warp 0 reduces the
// per-warp minima. Only thread 0 of the block is guaranteed to return the
// block-wide minimum. shared_mem_buffer must hold at least
// ceil(blockDim.x / warpSize) elements (32 suffices for a 1024-thread block).
template <typename T>
__device__ __forceinline__ T ShuffleReduceMin(T value, T* shared_mem_buffer, const size_t len) {
  const uint32_t warpLane = threadIdx.x % warpSize;
  const uint32_t warpID = threadIdx.x / warpSize;
  // number of valid lanes in this thread's warp (the last warp may be partial)
  const data_size_t warp_len = min(static_cast<data_size_t>(warpSize), static_cast<data_size_t>(len) - static_cast<data_size_t>(warpID * warpSize));
  value = ShuffleReduceMinWarp<T>(value, warp_len);
  if (warpLane == 0) {
    shared_mem_buffer[warpID] = value;
  }
  // every warp must publish its partial minimum before warp 0 reads them
  __syncthreads();
  const data_size_t num_warp = static_cast<data_size_t>((len + warpSize - 1) / warpSize);
  if (warpID == 0) {
    // lanes beyond num_warp reuse warp 0's partial minimum; duplicating an
    // existing partial cannot lower the result below the true minimum
    value = (warpLane < num_warp ? shared_mem_buffer[warpLane] : shared_mem_buffer[0]);
    value = ShuffleReduceMinWarp<T>(value, num_warp);
  }
  return value;
}
template <typename VAL_T, typename REDUCE_T>
void ShuffleReduceMinGlobal(const VAL_T* values, size_t n, REDUCE_T* block_buffer);
template <typename VAL_T, typename INDEX_T, bool ASCENDING>
__device__ __forceinline__ void BitonicArgSort_1024(const VAL_T* scores, INDEX_T* indices, const INDEX_T num_items) {
INDEX_T depth = 1;
......
......@@ -13,11 +13,68 @@
#include <LightGBM/objective_function.h>
#include <LightGBM/meta.h>
#include <string>
#include <vector>
namespace LightGBM {
class CUDAObjectiveInterface {
template <typename HOST_OBJECTIVE>
class CUDAObjectiveInterface: public HOST_OBJECTIVE {
public:
virtual void ConvertOutputCUDA(const data_size_t /*num_data*/, const double* /*input*/, double* /*output*/) const {}
explicit CUDAObjectiveInterface(const Config& config): HOST_OBJECTIVE(config) {}
explicit CUDAObjectiveInterface(const std::vector<std::string>& strs): HOST_OBJECTIVE(strs) {}
void Init(const Metadata& metadata, data_size_t num_data) {
HOST_OBJECTIVE::Init(metadata, num_data);
cuda_labels_ = metadata.cuda_metadata()->cuda_label();
cuda_weights_ = metadata.cuda_metadata()->cuda_weights();
}
virtual void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const {
LaunchConvertOutputCUDAKernel(num_data, input, output);
}
std::function<void(data_size_t, const double*, double*)> GetCUDAConvertOutputFunc() const override {
return [this] (data_size_t num_data, const double* input, double* output) {
ConvertOutputCUDA(num_data, input, output);
};
}
double BoostFromScore(int class_id) const override {
return LaunchCalcInitScoreKernel(class_id);
}
bool IsCUDAObjective() const override { return true; }
void GetGradients(const double* scores, score_t* gradients, score_t* hessians) const override {
LaunchGetGradientsKernel(scores, gradients, hessians);
SynchronizeCUDADevice(__FILE__, __LINE__);
}
void RenewTreeOutputCUDA(const double* score, const data_size_t* data_indices_in_leaf, const data_size_t* num_data_in_leaf,
const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const override {
global_timer.Start("CUDAObjectiveInterface::LaunchRenewTreeOutputCUDAKernel");
LaunchRenewTreeOutputCUDAKernel(score, data_indices_in_leaf, num_data_in_leaf, data_start_in_leaf, num_leaves, leaf_value);
SynchronizeCUDADevice(__FILE__, __LINE__);
global_timer.Stop("CUDAObjectiveInterface::LaunchRenewTreeOutputCUDAKernel");
}
protected:
virtual void LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const = 0;
virtual double LaunchCalcInitScoreKernel(const int class_id) const {
return HOST_OBJECTIVE::BoostFromScore(class_id);
}
virtual void LaunchConvertOutputCUDAKernel(const data_size_t /*num_data*/, const double* /*input*/, double* /*output*/) const {}
virtual void LaunchRenewTreeOutputCUDAKernel(
const double* /*score*/, const data_size_t* /*data_indices_in_leaf*/, const data_size_t* /*num_data_in_leaf*/,
const data_size_t* /*data_start_in_leaf*/, const int /*num_leaves*/, double* /*leaf_value*/) const {}
const label_t* cuda_labels_;
const label_t* cuda_weights_;
};
} // namespace LightGBM
......
......@@ -55,27 +55,16 @@ __global__ void ShufflePrefixSumGlobalAddBase(size_t len, const T* block_prefix_
}
template <typename T>
void ShufflePrefixSumGlobalInner(T* values, size_t len, T* block_prefix_sum_buffer) {
void ShufflePrefixSumGlobal(T* values, size_t len, T* block_prefix_sum_buffer) {
const int num_blocks = (static_cast<int>(len) + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE;
ShufflePrefixSumGlobalKernel<<<num_blocks, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(values, len, block_prefix_sum_buffer);
ShufflePrefixSumGlobalReduceBlockKernel<<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_prefix_sum_buffer, num_blocks);
ShufflePrefixSumGlobalAddBase<<<num_blocks, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(len, block_prefix_sum_buffer, values);
}
template <>
void ShufflePrefixSumGlobal(uint16_t* values, size_t len, uint16_t* block_prefix_sum_buffer) {
ShufflePrefixSumGlobalInner<uint16_t>(values, len, block_prefix_sum_buffer);
}
template <>
void ShufflePrefixSumGlobal(uint32_t* values, size_t len, uint32_t* block_prefix_sum_buffer) {
ShufflePrefixSumGlobalInner<uint32_t>(values, len, block_prefix_sum_buffer);
}
template <>
void ShufflePrefixSumGlobal(uint64_t* values, size_t len, uint64_t* block_prefix_sum_buffer) {
ShufflePrefixSumGlobalInner<uint64_t>(values, len, block_prefix_sum_buffer);
}
template void ShufflePrefixSumGlobal<uint16_t>(uint16_t* values, size_t len, uint16_t* block_prefix_sum_buffer);
template void ShufflePrefixSumGlobal<uint32_t>(uint32_t* values, size_t len, uint32_t* block_prefix_sum_buffer);
template void ShufflePrefixSumGlobal<uint64_t>(uint64_t* values, size_t len, uint64_t* block_prefix_sum_buffer);
__global__ void BitonicArgSortItemsGlobalKernel(const double* scores,
const int num_queries,
......@@ -130,18 +119,52 @@ __global__ void ShuffleReduceSumGlobalKernel(const VAL_T* values, const data_siz
}
template <typename VAL_T, typename REDUCE_T>
void ShuffleReduceSumGlobalInner(const VAL_T* values, size_t n, REDUCE_T* block_buffer) {
void ShuffleReduceSumGlobal(const VAL_T* values, size_t n, REDUCE_T* block_buffer) {
const data_size_t num_value = static_cast<data_size_t>(n);
const data_size_t num_blocks = (num_value + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE;
ShuffleReduceSumGlobalKernel<VAL_T, REDUCE_T><<<num_blocks, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(values, num_value, block_buffer);
BlockReduceSum<REDUCE_T><<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_buffer, num_blocks);
}
template <>
void ShuffleReduceSumGlobal<label_t, double>(const label_t* values, size_t n, double* block_buffer) {
ShuffleReduceSumGlobalInner(values, n, block_buffer);
template void ShuffleReduceSumGlobal<label_t, double>(const label_t* values, size_t n, double* block_buffer);
// Stage 1 of the global min reduction: each block min-reduces its slice of
// `values` and writes one partial minimum to block_buffer[blockIdx.x].
// Launched with GLOBAL_PREFIX_SUM_BLOCK_SIZE threads per block; requires
// num_value > 0 (the host wrapper never launches with an empty input).
template <typename VAL_T, typename REDUCE_T>
__global__ void ShuffleReduceMinGlobalKernel(const VAL_T* values, const data_size_t num_value, REDUCE_T* block_buffer) {
  __shared__ REDUCE_T shared_buffer[32];
  const data_size_t data_index = static_cast<data_size_t>(blockIdx.x * blockDim.x + threadIdx.x);
  // Pad out-of-range threads with the first element rather than 0: padding a
  // min reduction with 0 would incorrectly bias the result toward 0 whenever
  // every input value is positive.
  const REDUCE_T value = (data_index < num_value ?
      static_cast<REDUCE_T>(values[data_index]) :
      static_cast<REDUCE_T>(values[0]));
  const REDUCE_T reduce_value = ShuffleReduceMin<REDUCE_T>(value, shared_buffer, blockDim.x);
  if (threadIdx.x == 0) {
    block_buffer[blockIdx.x] = reduce_value;
  }
}
// Stage 2 of the global min reduction: a single block folds the per-block
// partial minima in block_buffer[0..num_blocks) down to block_buffer[0].
// Requires num_blocks >= 1.
template <typename T>
__global__ void ShuffleBlockReduceMin(T* block_buffer, const data_size_t num_blocks) {
  __shared__ T shared_buffer[32];
  // Seed with an actual partial minimum instead of 0: a 0 seed would clamp
  // the result to min(0, true minimum), which is wrong for all-positive data.
  T thread_min = block_buffer[0];
  for (data_size_t block_index = static_cast<data_size_t>(threadIdx.x); block_index < num_blocks; block_index += static_cast<data_size_t>(blockDim.x)) {
    const T value = block_buffer[block_index];
    if (value < thread_min) {
      thread_min = value;
    }
  }
  thread_min = ShuffleReduceMin<T>(thread_min, shared_buffer, blockDim.x);
  if (threadIdx.x == 0) {
    block_buffer[0] = thread_min;
  }
}
// Host-side entry point for a two-stage global min reduction: one partial
// minimum per block, then a single-block pass that folds the per-block
// partials so the final minimum lands in block_buffer[0].
template <typename VAL_T, typename REDUCE_T>
void ShuffleReduceMinGlobal(const VAL_T* values, size_t n, REDUCE_T* block_buffer) {
  const data_size_t total = static_cast<data_size_t>(n);
  // ceil-div so the grid covers the tail of the input
  const data_size_t grid_size = (total + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE;
  ShuffleReduceMinGlobalKernel<VAL_T, REDUCE_T><<<grid_size, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(values, total, block_buffer);
  ShuffleBlockReduceMin<REDUCE_T><<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_buffer, grid_size);
}
template void ShuffleReduceMinGlobal<label_t, double>(const label_t* values, size_t n, double* block_buffer);
template <typename VAL_T, typename REDUCE_T>
__global__ void ShuffleReduceDotProdGlobalKernel(const VAL_T* values1, const VAL_T* values2, const data_size_t num_value, REDUCE_T* block_buffer) {
__shared__ REDUCE_T shared_buffer[32];
......@@ -155,17 +178,14 @@ __global__ void ShuffleReduceDotProdGlobalKernel(const VAL_T* values1, const VAL
}
template <typename VAL_T, typename REDUCE_T>
void ShuffleReduceDotProdGlobalInner(const VAL_T* values1, const VAL_T* values2, size_t n, REDUCE_T* block_buffer) {
void ShuffleReduceDotProdGlobal(const VAL_T* values1, const VAL_T* values2, size_t n, REDUCE_T* block_buffer) {
const data_size_t num_value = static_cast<data_size_t>(n);
const data_size_t num_blocks = (num_value + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE;
ShuffleReduceDotProdGlobalKernel<VAL_T, REDUCE_T><<<num_blocks, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(values1, values2, num_value, block_buffer);
BlockReduceSum<REDUCE_T><<<1, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(block_buffer, num_blocks);
}
template <>
void ShuffleReduceDotProdGlobal<label_t, double>(const label_t* values1, const label_t* values2, size_t n, double* block_buffer) {
ShuffleReduceDotProdGlobalInner(values1, values2, n, block_buffer);
}
template void ShuffleReduceDotProdGlobal<label_t, double>(const label_t* values1, const label_t* values2, size_t n, double* block_buffer);
template <typename INDEX_T, typename VAL_T, typename REDUCE_T>
__global__ void GlobalInclusiveArgPrefixSumKernel(
......@@ -209,7 +229,7 @@ __global__ void GlobalInclusivePrefixSumAddBlockBaseKernel(const T* block_buffer
}
template <typename VAL_T, typename REDUCE_T, typename INDEX_T>
void GlobalInclusiveArgPrefixSumInner(const INDEX_T* sorted_indices, const VAL_T* in_values, REDUCE_T* out_values, REDUCE_T* block_buffer, size_t n) {
void GlobalInclusiveArgPrefixSum(const INDEX_T* sorted_indices, const VAL_T* in_values, REDUCE_T* out_values, REDUCE_T* block_buffer, size_t n) {
const data_size_t num_data = static_cast<data_size_t>(n);
const data_size_t num_blocks = (num_data + GLOBAL_PREFIX_SUM_BLOCK_SIZE - 1) / GLOBAL_PREFIX_SUM_BLOCK_SIZE;
GlobalInclusiveArgPrefixSumKernel<INDEX_T, VAL_T, REDUCE_T><<<num_blocks, GLOBAL_PREFIX_SUM_BLOCK_SIZE>>>(
......@@ -223,10 +243,7 @@ void GlobalInclusiveArgPrefixSumInner(const INDEX_T* sorted_indices, const VAL_T
SynchronizeCUDADevice(__FILE__, __LINE__);
}
template <>
void GlobalInclusiveArgPrefixSum<label_t, double, data_size_t>(const data_size_t* sorted_indices, const label_t* in_values, double* out_values, double* block_buffer, size_t n) {
GlobalInclusiveArgPrefixSumInner<label_t, double, data_size_t>(sorted_indices, in_values, out_values, block_buffer, n);
}
template void GlobalInclusiveArgPrefixSum<label_t, double, data_size_t>(const data_size_t* sorted_indices, const label_t* in_values, double* out_values, double* block_buffer, size_t n);
template <typename VAL_T, typename INDEX_T, bool ASCENDING>
__global__ void BitonicArgSortGlobalKernel(const VAL_T* values, INDEX_T* indices, const int num_total_data) {
......@@ -424,7 +441,7 @@ void BitonicArgSortGlobal<data_size_t, int, true>(const data_size_t* values, int
}
template <typename VAL_T, typename INDEX_T, typename WEIGHT_T, typename REDUCE_WEIGHT_T, bool ASCENDING, bool USE_WEIGHT>
__device__ VAL_T PercentileDeviceInner(const VAL_T* values,
__device__ VAL_T PercentileDevice(const VAL_T* values,
const WEIGHT_T* weights,
INDEX_T* indices,
REDUCE_WEIGHT_T* weights_prefix_sum,
......@@ -472,27 +489,21 @@ __device__ VAL_T PercentileDeviceInner(const VAL_T* values,
}
}
template <>
__device__ double PercentileDevice<double, data_size_t, label_t, double, false, true>(
template __device__ double PercentileDevice<double, data_size_t, label_t, double, false, true>(
const double* values,
const label_t* weights,
data_size_t* indices,
double* weights_prefix_sum,
const double alpha,
const data_size_t len) {
return PercentileDeviceInner<double, data_size_t, label_t, double, false, true>(values, weights, indices, weights_prefix_sum, alpha, len);
}
const data_size_t len);
template <>
__device__ double PercentileDevice<double, data_size_t, label_t, double, false, false>(
template __device__ double PercentileDevice<double, data_size_t, label_t, double, false, false>(
const double* values,
const label_t* weights,
data_size_t* indices,
double* weights_prefix_sum,
const double alpha,
const data_size_t len) {
return PercentileDeviceInner<double, data_size_t, label_t, double, false, false>(values, weights, indices, weights_prefix_sum, alpha, len);
}
const data_size_t len);
} // namespace LightGBM
......
......@@ -160,7 +160,7 @@ class BinaryLogloss: public ObjectiveFunction {
pavg = std::min(pavg, 1.0 - kEpsilon);
pavg = std::max<double>(pavg, kEpsilon);
double initscore = std::log(pavg / (1.0f - pavg)) / sigmoid_;
Log::Info("[%s:%s]: pavg=%f -> initscore=%f", GetName(), __func__, pavg, initscore);
Log::Info("[%s:%s]: pavg=%f -> initscore=%f", GetName(), __func__, pavg, initscore);
return initscore;
}
......
......@@ -14,7 +14,7 @@
namespace LightGBM {
CUDABinaryLogloss::CUDABinaryLogloss(const Config& config):
BinaryLogloss(config), ova_class_id_(-1) {
CUDAObjectiveInterface<BinaryLogloss>(config), ova_class_id_(-1) {
cuda_label_ = nullptr;
cuda_ova_label_ = nullptr;
cuda_weights_ = nullptr;
......@@ -24,9 +24,11 @@ BinaryLogloss(config), ova_class_id_(-1) {
}
CUDABinaryLogloss::CUDABinaryLogloss(const Config& config, const int ova_class_id):
BinaryLogloss(config, [ova_class_id](label_t label) { return static_cast<int>(label) == ova_class_id; }), ova_class_id_(ova_class_id) {}
CUDAObjectiveInterface<BinaryLogloss>(config), ova_class_id_(ova_class_id) {
is_pos_ = [ova_class_id](label_t label) { return static_cast<int>(label) == ova_class_id; };
}
CUDABinaryLogloss::CUDABinaryLogloss(const std::vector<std::string>& strs): BinaryLogloss(strs) {}
CUDABinaryLogloss::CUDABinaryLogloss(const std::vector<std::string>& strs): CUDAObjectiveInterface<BinaryLogloss>(strs) {}
CUDABinaryLogloss::~CUDABinaryLogloss() {
DeallocateCUDAMemory<label_t>(&cuda_ova_label_, __FILE__, __LINE__);
......@@ -36,13 +38,13 @@ CUDABinaryLogloss::~CUDABinaryLogloss() {
}
void CUDABinaryLogloss::Init(const Metadata& metadata, data_size_t num_data) {
BinaryLogloss::Init(metadata, num_data);
CUDAObjectiveInterface<BinaryLogloss>::Init(metadata, num_data);
if (ova_class_id_ == -1) {
cuda_label_ = metadata.cuda_metadata()->cuda_label();
cuda_ova_label_ = nullptr;
} else {
InitCUDAMemoryFromHostMemory<label_t>(&cuda_ova_label_, metadata.cuda_metadata()->cuda_label(), static_cast<size_t>(num_data), __FILE__, __LINE__);
LaunchResetOVACUDALableKernel();
LaunchResetOVACUDALabelKernel();
cuda_label_ = cuda_ova_label_;
}
cuda_weights_ = metadata.cuda_metadata()->cuda_weights();
......@@ -57,26 +59,6 @@ void CUDABinaryLogloss::Init(const Metadata& metadata, data_size_t num_data) {
}
}
void CUDABinaryLogloss::GetGradients(const double* scores, score_t* gradients, score_t* hessians) const {
LaunchGetGradientsKernel(scores, gradients, hessians);
SynchronizeCUDADevice(__FILE__, __LINE__);
}
double CUDABinaryLogloss::BoostFromScore(int) const {
LaunchBoostFromScoreKernel();
SynchronizeCUDADevice(__FILE__, __LINE__);
double boost_from_score = 0.0f;
CopyFromCUDADeviceToHost<double>(&boost_from_score, cuda_boost_from_score_, 1, __FILE__, __LINE__);
double pavg = 0.0f;
CopyFromCUDADeviceToHost<double>(&pavg, cuda_sum_weights_, 1, __FILE__, __LINE__);
Log::Info("[%s:%s]: pavg=%f -> initscore=%f", GetName(), __func__, pavg, boost_from_score);
return boost_from_score;
}
void CUDABinaryLogloss::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const {
LaunchConvertOutputCUDAKernel(num_data, input, output);
}
} // namespace LightGBM
#endif // USE_CUDA_EXP
......@@ -86,7 +86,7 @@ __global__ void BoostFromScoreKernel_2_BinaryLogloss(double* out_cuda_sum_labels
*out_cuda_sum_labels = init_score;
}
void CUDABinaryLogloss::LaunchBoostFromScoreKernel() const {
double CUDABinaryLogloss::LaunchCalcInitScoreKernel(const int /*class_id*/) const {
const int num_blocks = (num_data_ + CALC_INIT_SCORE_BLOCK_SIZE_BINARY - 1) / CALC_INIT_SCORE_BLOCK_SIZE_BINARY;
SetCUDAMemory<double>(cuda_boost_from_score_, 0, 1, __FILE__, __LINE__);
if (cuda_weights_ == nullptr) {
......@@ -103,6 +103,13 @@ void CUDABinaryLogloss::LaunchBoostFromScoreKernel() const {
BoostFromScoreKernel_2_BinaryLogloss<true><<<1, 1>>>(cuda_boost_from_score_, cuda_sum_weights_, num_data_, sigmoid_);
}
SynchronizeCUDADevice(__FILE__, __LINE__);
double boost_from_score = 0.0f;
CopyFromCUDADeviceToHost<double>(&boost_from_score, cuda_boost_from_score_, 1, __FILE__, __LINE__);
double pavg = 0.0f;
CopyFromCUDADeviceToHost<double>(&pavg, cuda_sum_weights_, 1, __FILE__, __LINE__);
// for some test cases in test_utilities.py which check the log output
Log::Info("[%s:%s]: pavg=%f -> initscore=%f", GetName(), "BoostFromScore", pavg, boost_from_score);
return boost_from_score;
}
template <bool USE_LABEL_WEIGHT, bool USE_WEIGHT>
......@@ -180,7 +187,7 @@ void CUDABinaryLogloss::LaunchConvertOutputCUDAKernel(const data_size_t num_data
ConvertOutputCUDAKernel_BinaryLogloss<<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(sigmoid_, num_data, input, output);
}
__global__ void ResetOVACUDALableKernel(
__global__ void ResetOVACUDALabelKernel(
const int ova_class_id,
const data_size_t num_data,
label_t* cuda_label) {
......@@ -191,9 +198,9 @@ __global__ void ResetOVACUDALableKernel(
}
}
void CUDABinaryLogloss::LaunchResetOVACUDALableKernel() const {
void CUDABinaryLogloss::LaunchResetOVACUDALabelKernel() const {
const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_BINARY - 1) / GET_GRADIENTS_BLOCK_SIZE_BINARY;
ResetOVACUDALableKernel<<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(ova_class_id_, num_data_, cuda_ova_label_);
ResetOVACUDALabelKernel<<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_BINARY>>>(ova_class_id_, num_data_, cuda_ova_label_);
}
} // namespace LightGBM
......
......@@ -21,7 +21,7 @@
namespace LightGBM {
class CUDABinaryLogloss : public CUDAObjectiveInterface, public BinaryLogloss {
class CUDABinaryLogloss : public CUDAObjectiveInterface<BinaryLogloss> {
public:
explicit CUDABinaryLogloss(const Config& config);
......@@ -33,28 +33,14 @@ class CUDABinaryLogloss : public CUDAObjectiveInterface, public BinaryLogloss {
void Init(const Metadata& metadata, data_size_t num_data) override;
void GetGradients(const double* scores, score_t* gradients, score_t* hessians) const override;
double BoostFromScore(int) const override;
void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override;
std::function<void(data_size_t, const double*, double*)> GetCUDAConvertOutputFunc() const override {
return [this] (data_size_t num_data, const double* input, double* output) {
ConvertOutputCUDA(num_data, input, output);
};
}
bool IsCUDAObjective() const override { return true; }
private:
void LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const;
void LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const override;
void LaunchBoostFromScoreKernel() const;
double LaunchCalcInitScoreKernel(const int class_id) const override;
void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const;
void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const override;
void LaunchResetOVACUDALableKernel() const;
void LaunchResetOVACUDALabelKernel() const;
// CUDA memory, held by other objects
const label_t* cuda_label_;
......
......@@ -12,37 +12,26 @@
namespace LightGBM {
CUDAMulticlassSoftmax::CUDAMulticlassSoftmax(const Config& config): MulticlassSoftmax(config) {}
CUDAMulticlassSoftmax::CUDAMulticlassSoftmax(const Config& config): CUDAObjectiveInterface<MulticlassSoftmax>(config) {}
CUDAMulticlassSoftmax::CUDAMulticlassSoftmax(const std::vector<std::string>& strs): MulticlassSoftmax(strs) {}
CUDAMulticlassSoftmax::CUDAMulticlassSoftmax(const std::vector<std::string>& strs): CUDAObjectiveInterface<MulticlassSoftmax>(strs) {}
CUDAMulticlassSoftmax::~CUDAMulticlassSoftmax() {}
void CUDAMulticlassSoftmax::Init(const Metadata& metadata, data_size_t num_data) {
MulticlassSoftmax::Init(metadata, num_data);
cuda_label_ = metadata.cuda_metadata()->cuda_label();
cuda_weights_ = metadata.cuda_metadata()->cuda_weights();
CUDAObjectiveInterface<MulticlassSoftmax>::Init(metadata, num_data);
cuda_softmax_buffer_.Resize(static_cast<size_t>(num_data) * static_cast<size_t>(num_class_));
SynchronizeCUDADevice(__FILE__, __LINE__);
}
void CUDAMulticlassSoftmax::GetGradients(const double* score, score_t* gradients, score_t* hessians) const {
LaunchGetGradientsKernel(score, gradients, hessians);
SynchronizeCUDADevice(__FILE__, __LINE__);
}
void CUDAMulticlassSoftmax::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const {
LaunchConvertOutputCUDAKernel(num_data, input, output);
}
CUDAMulticlassOVA::CUDAMulticlassOVA(const Config& config): MulticlassOVA(config) {
CUDAMulticlassOVA::CUDAMulticlassOVA(const Config& config): CUDAObjectiveInterface<MulticlassOVA>(config) {
for (int i = 0; i < num_class_; ++i) {
cuda_binary_loss_.emplace_back(new CUDABinaryLogloss(config, i));
}
}
CUDAMulticlassOVA::CUDAMulticlassOVA(const std::vector<std::string>& strs): MulticlassOVA(strs) {}
CUDAMulticlassOVA::CUDAMulticlassOVA(const std::vector<std::string>& strs): CUDAObjectiveInterface<MulticlassOVA>(strs) {}
CUDAMulticlassOVA::~CUDAMulticlassOVA() {}
......
......@@ -36,7 +36,6 @@ __global__ void GetGradientsKernel_MulticlassSoftmax(
const data_size_t offset = data_index * num_class;
double* softmax_result = cuda_softmax_buffer + offset;
for (int k = 0; k < num_class; ++k) {
const double point_score = cuda_scores[k * num_data + data_index];
softmax_result[k] = cuda_scores[k * num_data + data_index];
}
SoftmaxCUDA(softmax_result, num_class);
......@@ -71,11 +70,11 @@ void CUDAMulticlassSoftmax::LaunchGetGradientsKernel(const double* scores, score
const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_MULTICLASS - 1) / GET_GRADIENTS_BLOCK_SIZE_MULTICLASS;
if (cuda_weights_ == nullptr) {
GetGradientsKernel_MulticlassSoftmax<false><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_MULTICLASS>>>(
scores, cuda_label_, cuda_weights_, factor_, num_class_, num_data_,
scores, cuda_labels_, cuda_weights_, factor_, num_class_, num_data_,
cuda_softmax_buffer_.RawData(), gradients, hessians);
} else {
GetGradientsKernel_MulticlassSoftmax<true><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_MULTICLASS>>>(
scores, cuda_label_, cuda_weights_, factor_, num_class_, num_data_,
scores, cuda_labels_, cuda_weights_, factor_, num_class_, num_data_,
cuda_softmax_buffer_.RawData(), gradients, hessians);
}
}
......
......@@ -21,7 +21,7 @@
namespace LightGBM {
class CUDAMulticlassSoftmax: public CUDAObjectiveInterface, public MulticlassSoftmax {
class CUDAMulticlassSoftmax: public CUDAObjectiveInterface<MulticlassSoftmax> {
public:
explicit CUDAMulticlassSoftmax(const Config& config);
......@@ -31,33 +31,17 @@ class CUDAMulticlassSoftmax: public CUDAObjectiveInterface, public MulticlassSof
void Init(const Metadata& metadata, data_size_t num_data) override;
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override;
void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override;
std::function<void(data_size_t, const double*, double*)> GetCUDAConvertOutputFunc() const override {
return [this] (data_size_t num_data, const double* input, double* output) {
ConvertOutputCUDA(num_data, input, output);
};
}
bool IsCUDAObjective() const override { return true; }
private:
void LaunchGetGradientsKernel(const double* scores, score_t* gradients, score_t* hessians) const;
void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const;
// CUDA memory, held by other objects
const label_t* cuda_label_;
const label_t* cuda_weights_;
// CUDA memory, held by this object
CUDAVector<double> cuda_softmax_buffer_;
};
class CUDAMulticlassOVA: public CUDAObjectiveInterface, public MulticlassOVA {
class CUDAMulticlassOVA: public CUDAObjectiveInterface<MulticlassOVA> {
public:
explicit CUDAMulticlassOVA(const Config& config);
......@@ -82,6 +66,8 @@ class CUDAMulticlassOVA: public CUDAObjectiveInterface, public MulticlassOVA {
bool IsCUDAObjective() const override { return true; }
private:
void LaunchGetGradientsKernel(const double* /*scores*/, score_t* /*gradients*/, score_t* /*hessians*/) const {}
std::vector<std::unique_ptr<CUDABinaryLogloss>> cuda_binary_loss_;
};
......
......@@ -13,68 +13,36 @@
namespace LightGBM {
CUDALambdarankNDCG::CUDALambdarankNDCG(const Config& config):
LambdarankNDCG(config) {}
CUDALambdarankNDCG::CUDALambdarankNDCG(const Config& config): CUDALambdaRankObjectiveInterface<LambdarankNDCG>(config) {}
CUDALambdarankNDCG::CUDALambdarankNDCG(const std::vector<std::string>& strs): LambdarankNDCG(strs) {}
CUDALambdarankNDCG::CUDALambdarankNDCG(const std::vector<std::string>& strs): CUDALambdaRankObjectiveInterface<LambdarankNDCG>(strs) {}
void CUDALambdarankNDCG::Init(const Metadata& metadata, data_size_t num_data) {
const int num_threads = OMP_NUM_THREADS();
LambdarankNDCG::Init(metadata, num_data);
std::vector<uint16_t> thread_max_num_items_in_query(num_threads);
Threading::For<data_size_t>(0, num_queries_, 1,
[this, &thread_max_num_items_in_query] (int thread_index, data_size_t start, data_size_t end) {
for (data_size_t query_index = start; query_index < end; ++query_index) {
const data_size_t query_item_count = query_boundaries_[query_index + 1] - query_boundaries_[query_index];
if (query_item_count > thread_max_num_items_in_query[thread_index]) {
thread_max_num_items_in_query[thread_index] = query_item_count;
}
}
});
data_size_t max_items_in_query = 0;
for (int thread_index = 0; thread_index < num_threads; ++thread_index) {
if (thread_max_num_items_in_query[thread_index] > max_items_in_query) {
max_items_in_query = thread_max_num_items_in_query[thread_index];
}
}
max_items_in_query_aligned_ = 1;
--max_items_in_query;
while (max_items_in_query > 0) {
max_items_in_query >>= 1;
max_items_in_query_aligned_ <<= 1;
}
if (max_items_in_query_aligned_ > 2048) {
cuda_item_indices_buffer_.Resize(static_cast<size_t>(metadata.query_boundaries()[metadata.num_queries()]));
}
cuda_labels_ = metadata.cuda_metadata()->cuda_label();
cuda_query_boundaries_ = metadata.cuda_metadata()->cuda_query_boundaries();
cuda_inverse_max_dcgs_.Resize(inverse_max_dcgs_.size());
CopyFromHostToCUDADevice(cuda_inverse_max_dcgs_.RawData(), inverse_max_dcgs_.data(), inverse_max_dcgs_.size(), __FILE__, __LINE__);
cuda_label_gain_.Resize(label_gain_.size());
CopyFromHostToCUDADevice(cuda_label_gain_.RawData(), label_gain_.data(), label_gain_.size(), __FILE__, __LINE__);
}
CUDALambdarankNDCG::~CUDALambdarankNDCG() {}
void CUDALambdarankNDCG::GetGradients(const double* score, score_t* gradients, score_t* hessians) const {
LaunchGetGradientsKernel(score, gradients, hessians);
void CUDALambdarankNDCG::Init(const Metadata& metadata, data_size_t num_data) {
CUDALambdaRankObjectiveInterface<LambdarankNDCG>::Init(metadata, num_data);
cuda_inverse_max_dcgs_.Resize(this->inverse_max_dcgs_.size());
CopyFromHostToCUDADevice(cuda_inverse_max_dcgs_.RawData(), this->inverse_max_dcgs_.data(), this->inverse_max_dcgs_.size(), __FILE__, __LINE__);
cuda_label_gain_.Resize(this->label_gain_.size());
CopyFromHostToCUDADevice(cuda_label_gain_.RawData(), this->label_gain_.data(), this->label_gain_.size(), __FILE__, __LINE__);
}
CUDARankXENDCG::CUDARankXENDCG(const Config& config): CUDALambdarankNDCG(config) {}
CUDARankXENDCG::CUDARankXENDCG(const Config& config): CUDALambdaRankObjectiveInterface<RankXENDCG>(config) {}
CUDARankXENDCG::CUDARankXENDCG(const std::vector<std::string>& strs): CUDALambdarankNDCG(strs) {}
CUDARankXENDCG::CUDARankXENDCG(const std::vector<std::string>& strs): CUDALambdaRankObjectiveInterface<RankXENDCG>(strs) {}
CUDARankXENDCG::~CUDARankXENDCG() {}
void CUDARankXENDCG::Init(const Metadata& metadata, data_size_t num_data) {
CUDALambdarankNDCG::Init(metadata, num_data);
CUDALambdaRankObjectiveInterface<RankXENDCG>::Init(metadata, num_data);
for (data_size_t i = 0; i < num_queries_; ++i) {
rands_.emplace_back(seed_ + i);
}
item_rands_.resize(num_data, 0.0f);
AllocateCUDAMemory<double>(&cuda_item_rands_, static_cast<size_t>(num_data), __FILE__, __LINE__);
cuda_item_rands_.Resize(static_cast<size_t>(num_data));
if (max_items_in_query_aligned_ >= 2048) {
AllocateCUDAMemory<double>(&cuda_params_buffer_, static_cast<size_t>(num_data_), __FILE__, __LINE__);
cuda_params_buffer_.Resize(static_cast<size_t>(num_data_));
}
}
......@@ -94,13 +62,6 @@ void CUDARankXENDCG::GenerateItemRands() const {
OMP_THROW_EX();
}
void CUDARankXENDCG::GetGradients(const double* score, score_t* gradients, score_t* hessians) const {
GenerateItemRands();
CopyFromHostToCUDADevice<double>(cuda_item_rands_, item_rands_.data(), item_rands_.size(), __FILE__, __LINE__);
LaunchGetGradientsKernel(score, gradients, hessians);
}
} // namespace LightGBM
#endif // USE_CUDA_EXP
......@@ -616,12 +616,15 @@ __global__ void GetGradientsKernel_RankXENDCG_GlobalMemory(
}
void CUDARankXENDCG::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const {
GenerateItemRands();
CopyFromHostToCUDADevice<double>(cuda_item_rands_.RawData(), item_rands_.data(), item_rands_.size(), __FILE__, __LINE__);
const int num_blocks = (num_queries_ + NUM_QUERY_PER_BLOCK - 1) / NUM_QUERY_PER_BLOCK;
if (max_items_in_query_aligned_ <= 1024) {
GetGradientsKernel_RankXENDCG_SharedMemory<1024><<<num_blocks, max_items_in_query_aligned_>>>(
score,
cuda_labels_,
cuda_item_rands_,
cuda_item_rands_.RawData(),
num_data_,
num_queries_,
cuda_query_boundaries_,
......@@ -631,7 +634,7 @@ void CUDARankXENDCG::LaunchGetGradientsKernel(const double* score, score_t* grad
GetGradientsKernel_RankXENDCG_SharedMemory<2 * 1024><<<num_blocks, 1024>>>(
score,
cuda_labels_,
cuda_item_rands_,
cuda_item_rands_.RawData(),
num_data_,
num_queries_,
cuda_query_boundaries_,
......@@ -641,11 +644,11 @@ void CUDARankXENDCG::LaunchGetGradientsKernel(const double* score, score_t* grad
GetGradientsKernel_RankXENDCG_GlobalMemory<<<num_blocks, 1024>>>(
score,
cuda_labels_,
cuda_item_rands_,
cuda_item_rands_.RawData(),
num_data_,
num_queries_,
cuda_query_boundaries_,
cuda_params_buffer_,
cuda_params_buffer_.RawData(),
gradients,
hessians);
}
......
......@@ -22,28 +22,53 @@
namespace LightGBM {
class CUDALambdarankNDCG : public CUDAObjectiveInterface, public LambdarankNDCG {
template <typename HOST_OBJECTIVE>
class CUDALambdaRankObjectiveInterface : public CUDAObjectiveInterface<HOST_OBJECTIVE> {
public:
explicit CUDALambdarankNDCG(const Config& config);
explicit CUDALambdarankNDCG(const std::vector<std::string>& strs);
void Init(const Metadata& metadata, data_size_t num_data) override;
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override;
bool IsCUDAObjective() const override { return true; }
explicit CUDALambdaRankObjectiveInterface(const Config& config): CUDAObjectiveInterface<HOST_OBJECTIVE>(config) {}
explicit CUDALambdaRankObjectiveInterface(const std::vector<std::string>& strs): CUDAObjectiveInterface<HOST_OBJECTIVE>(strs) {}
~CUDALambdaRankObjectiveInterface() {}
void Init(const Metadata& metadata, data_size_t num_data) override {
CUDAObjectiveInterface<HOST_OBJECTIVE>::Init(metadata, num_data);
const int num_threads = OMP_NUM_THREADS();
std::vector<uint16_t> thread_max_num_items_in_query(num_threads);
Threading::For<data_size_t>(0, this->num_queries_, 1,
[this, &thread_max_num_items_in_query] (int thread_index, data_size_t start, data_size_t end) {
for (data_size_t query_index = start; query_index < end; ++query_index) {
const data_size_t query_item_count = this->query_boundaries_[query_index + 1] - this->query_boundaries_[query_index];
if (query_item_count > thread_max_num_items_in_query[thread_index]) {
thread_max_num_items_in_query[thread_index] = query_item_count;
}
}
});
data_size_t max_items_in_query = 0;
for (int thread_index = 0; thread_index < num_threads; ++thread_index) {
if (thread_max_num_items_in_query[thread_index] > max_items_in_query) {
max_items_in_query = thread_max_num_items_in_query[thread_index];
}
}
max_items_in_query_aligned_ = 1;
--max_items_in_query;
while (max_items_in_query > 0) {
max_items_in_query >>= 1;
max_items_in_query_aligned_ <<= 1;
}
if (max_items_in_query_aligned_ > 2048) {
cuda_item_indices_buffer_.Resize(static_cast<size_t>(metadata.query_boundaries()[metadata.num_queries()]));
}
this->cuda_labels_ = metadata.cuda_metadata()->cuda_label();
cuda_query_boundaries_ = metadata.cuda_metadata()->cuda_query_boundaries();
}
protected:
void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const;
// CUDA memory, held by this object
CUDAVector<double> cuda_inverse_max_dcgs_;
CUDAVector<double> cuda_label_gain_;
CUDAVector<int> cuda_item_indices_buffer_;
// CUDA memory, held by other objects
const label_t* cuda_labels_;
const data_size_t* cuda_query_boundaries_;
// Host memory
......@@ -51,7 +76,26 @@ class CUDALambdarankNDCG : public CUDAObjectiveInterface, public LambdarankNDCG
};
class CUDARankXENDCG : public CUDALambdarankNDCG {
class CUDALambdarankNDCG: public CUDALambdaRankObjectiveInterface<LambdarankNDCG> {
public:
explicit CUDALambdarankNDCG(const Config& config);
explicit CUDALambdarankNDCG(const std::vector<std::string>& strs);
void Init(const Metadata& mdtadata, data_size_t num_data) override;
~CUDALambdarankNDCG();
private:
void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override;
// CUDA memory, held by this object
CUDAVector<double> cuda_inverse_max_dcgs_;
CUDAVector<double> cuda_label_gain_;
};
class CUDARankXENDCG : public CUDALambdaRankObjectiveInterface<RankXENDCG> {
public:
explicit CUDARankXENDCG(const Config& config);
......@@ -61,19 +105,14 @@ class CUDARankXENDCG : public CUDALambdarankNDCG {
void Init(const Metadata& metadata, data_size_t num_data) override;
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override;
bool IsCUDAObjective() const override { return true; }
private:
protected:
void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const;
void GenerateItemRands() const;
mutable std::vector<double> item_rands_;
mutable std::vector<Random> rands_;
mutable double* cuda_item_rands_;
mutable double* cuda_params_buffer_;
CUDAVector<double> cuda_item_rands_;
CUDAVector<double> cuda_params_buffer_;
};
......
......@@ -14,49 +14,27 @@
namespace LightGBM {
CUDARegressionL2loss::CUDARegressionL2loss(const Config& config):
RegressionL2loss(config) {}
CUDARegressionObjectiveInterface<RegressionL2loss>(config) {}
CUDARegressionL2loss::CUDARegressionL2loss(const std::vector<std::string>& strs):
RegressionL2loss(strs) {}
CUDARegressionObjectiveInterface<RegressionL2loss>(strs) {}
CUDARegressionL2loss::~CUDARegressionL2loss() {}
void CUDARegressionL2loss::Init(const Metadata& metadata, data_size_t num_data) {
RegressionL2loss::Init(metadata, num_data);
cuda_labels_ = metadata.cuda_metadata()->cuda_label();
cuda_weights_ = metadata.cuda_metadata()->cuda_weights();
num_get_gradients_blocks_ = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION;
cuda_block_buffer_.Resize(static_cast<size_t>(num_get_gradients_blocks_));
if (sqrt_) {
cuda_trans_label_.Resize(trans_label_.size());
CopyFromHostToCUDADevice<label_t>(cuda_trans_label_.RawData(), trans_label_.data(), trans_label_.size(), __FILE__, __LINE__);
cuda_labels_ = cuda_trans_label_.RawData();
}
}
void CUDARegressionL2loss::GetGradients(const double* score, score_t* gradients, score_t* hessians) const {
LaunchGetGradientsKernel(score, gradients, hessians);
}
double CUDARegressionL2loss::BoostFromScore(int) const {
return LaunchCalcInitScoreKernel();
CUDARegressionObjectiveInterface<RegressionL2loss>::Init(metadata, num_data);
}
void CUDARegressionL2loss::ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const {
LaunchConvertOutputCUDAKernel(num_data, input, output);
}
CUDARegressionL1loss::CUDARegressionL1loss(const Config& config):
CUDARegressionL2loss(config) {}
CUDARegressionObjectiveInterface<RegressionL1loss>(config) {}
CUDARegressionL1loss::CUDARegressionL1loss(const std::vector<std::string>& strs):
CUDARegressionL2loss(strs) {}
CUDARegressionObjectiveInterface<RegressionL1loss>(strs) {}
CUDARegressionL1loss::~CUDARegressionL1loss() {}
void CUDARegressionL1loss::Init(const Metadata& metadata, data_size_t num_data) {
CUDARegressionL2loss::Init(metadata, num_data);
CUDARegressionObjectiveInterface<RegressionL1loss>::Init(metadata, num_data);
cuda_data_indices_buffer_.Resize(static_cast<size_t>(num_data));
cuda_percentile_result_.Resize(1);
if (cuda_weights_ != nullptr) {
......@@ -68,43 +46,43 @@ void CUDARegressionL1loss::Init(const Metadata& metadata, data_size_t num_data)
cuda_residual_buffer_.Resize(static_cast<size_t>(num_data));
}
void CUDARegressionL1loss::RenewTreeOutputCUDA(
const double* score,
const data_size_t* data_indices_in_leaf,
const data_size_t* num_data_in_leaf,
const data_size_t* data_start_in_leaf,
const int num_leaves,
double* leaf_value) const {
global_timer.Start("CUDARegressionL1loss::LaunchRenewTreeOutputCUDAKernel");
LaunchRenewTreeOutputCUDAKernel(score, data_indices_in_leaf, num_data_in_leaf, data_start_in_leaf, num_leaves, leaf_value);
SynchronizeCUDADevice(__FILE__, __LINE__);
global_timer.Stop("CUDARegressionL1loss::LaunchRenewTreeOutputCUDAKernel");
}
CUDARegressionHuberLoss::CUDARegressionHuberLoss(const Config& config):
CUDARegressionL2loss(config), alpha_(config.alpha) {
if (sqrt_) {
Log::Warning("Cannot use sqrt transform in %s Regression, will auto disable it", GetName());
sqrt_ = false;
}
}
CUDARegressionObjectiveInterface<RegressionHuberLoss>(config) {}
CUDARegressionHuberLoss::CUDARegressionHuberLoss(const std::vector<std::string>& strs):
CUDARegressionL2loss(strs) {}
CUDARegressionObjectiveInterface<RegressionHuberLoss>(strs) {}
CUDARegressionHuberLoss::~CUDARegressionHuberLoss() {}
CUDARegressionFairLoss::CUDARegressionFairLoss(const Config& config):
CUDARegressionL2loss(config), c_(config.fair_c) {}
CUDARegressionObjectiveInterface<RegressionFairLoss>(config) {}
CUDARegressionFairLoss::CUDARegressionFairLoss(const std::vector<std::string>& strs):
CUDARegressionL2loss(strs) {}
CUDARegressionObjectiveInterface<RegressionFairLoss>(strs) {}
CUDARegressionFairLoss::~CUDARegressionFairLoss() {}
CUDARegressionPoissonLoss::CUDARegressionPoissonLoss(const Config& config):
CUDARegressionObjectiveInterface<RegressionPoissonLoss>(config) {}
CUDARegressionPoissonLoss::CUDARegressionPoissonLoss(const std::vector<std::string>& strs):
CUDARegressionObjectiveInterface<RegressionPoissonLoss>(strs) {}
CUDARegressionPoissonLoss::~CUDARegressionPoissonLoss() {}
void CUDARegressionPoissonLoss::Init(const Metadata& metadata, data_size_t num_data) {
CUDARegressionObjectiveInterface<RegressionPoissonLoss>::Init(metadata, num_data);
LaunchCheckLabelKernel();
}
double CUDARegressionPoissonLoss::LaunchCalcInitScoreKernel(const int class_id) const {
return Common::SafeLog(CUDARegressionObjectiveInterface<RegressionPoissonLoss>::LaunchCalcInitScoreKernel(class_id));
}
} // namespace LightGBM
#endif // USE_CUDA_EXP
......@@ -11,21 +11,51 @@
namespace LightGBM {
double CUDARegressionL2loss::LaunchCalcInitScoreKernel() const {
template <typename HOST_OBJECTIVE>
void CUDARegressionObjectiveInterface<HOST_OBJECTIVE>::Init(const Metadata& metadata, data_size_t num_data) {
CUDAObjectiveInterface<HOST_OBJECTIVE>::Init(metadata, num_data);
const data_size_t num_get_gradients_blocks = (this->num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION;
cuda_block_buffer_.Resize(static_cast<size_t>(num_get_gradients_blocks));
if (this->sqrt_) {
cuda_trans_label_.Resize(this->trans_label_.size());
CopyFromHostToCUDADevice<label_t>(cuda_trans_label_.RawData(), this->trans_label_.data(), this->trans_label_.size(), __FILE__, __LINE__);
this->cuda_labels_ = cuda_trans_label_.RawData();
}
}
template void CUDARegressionObjectiveInterface<RegressionL2loss>::Init(const Metadata& metadata, data_size_t num_data);
template void CUDARegressionObjectiveInterface<RegressionL1loss>::Init(const Metadata& metadata, data_size_t num_data);
template void CUDARegressionObjectiveInterface<RegressionHuberLoss>::Init(const Metadata& metadata, data_size_t num_data);
template void CUDARegressionObjectiveInterface<RegressionFairLoss>::Init(const Metadata& metadata, data_size_t num_data);
template void CUDARegressionObjectiveInterface<RegressionPoissonLoss>::Init(const Metadata& metadata, data_size_t num_data);
template void CUDARegressionObjectiveInterface<RegressionQuantileloss>::Init(const Metadata& metadata, data_size_t num_data);
template <typename HOST_OBJECTIVE>
double CUDARegressionObjectiveInterface<HOST_OBJECTIVE>::LaunchCalcInitScoreKernel(const int /*class_id*/) const {
double label_sum = 0.0f, weight_sum = 0.0f;
if (cuda_weights_ == nullptr) {
ShuffleReduceSumGlobal<label_t, double>(cuda_labels_, static_cast<size_t>(num_data_), cuda_block_buffer_.RawData());
if (this->cuda_weights_ == nullptr) {
ShuffleReduceSumGlobal<label_t, double>(this->cuda_labels_,
static_cast<size_t>(this->num_data_), cuda_block_buffer_.RawData());
CopyFromCUDADeviceToHost<double>(&label_sum, cuda_block_buffer_.RawData(), 1, __FILE__, __LINE__);
weight_sum = static_cast<double>(num_data_);
weight_sum = static_cast<double>(this->num_data_);
} else {
ShuffleReduceDotProdGlobal<label_t, double>(cuda_labels_, cuda_weights_, static_cast<size_t>(num_data_), cuda_block_buffer_.RawData());
ShuffleReduceDotProdGlobal<label_t, double>(this->cuda_labels_,
this->cuda_weights_, static_cast<size_t>(this->num_data_), cuda_block_buffer_.RawData());
CopyFromCUDADeviceToHost<double>(&label_sum, cuda_block_buffer_.RawData(), 1, __FILE__, __LINE__);
ShuffleReduceSumGlobal<label_t, double>(cuda_weights_, static_cast<size_t>(num_data_), cuda_block_buffer_.RawData());
ShuffleReduceSumGlobal<label_t, double>(this->cuda_weights_,
static_cast<size_t>(this->num_data_), cuda_block_buffer_.RawData());
CopyFromCUDADeviceToHost<double>(&weight_sum, cuda_block_buffer_.RawData(), 1, __FILE__, __LINE__);
}
return label_sum / weight_sum;
}
template double CUDARegressionObjectiveInterface<RegressionL2loss>::LaunchCalcInitScoreKernel(const int class_id) const;
template double CUDARegressionObjectiveInterface<RegressionL1loss>::LaunchCalcInitScoreKernel(const int class_id) const;
template double CUDARegressionObjectiveInterface<RegressionHuberLoss>::LaunchCalcInitScoreKernel(const int class_id) const;
template double CUDARegressionObjectiveInterface<RegressionFairLoss>::LaunchCalcInitScoreKernel(const int class_id) const;
template double CUDARegressionObjectiveInterface<RegressionPoissonLoss>::LaunchCalcInitScoreKernel(const int class_id) const;
template double CUDARegressionObjectiveInterface<RegressionQuantileloss>::LaunchCalcInitScoreKernel(const int class_id) const;
__global__ void ConvertOutputCUDAKernel_Regression(const bool sqrt, const data_size_t num_data, const double* input, double* output) {
const int data_index = static_cast<data_size_t>(blockIdx.x * blockDim.x + threadIdx.x);
if (data_index < num_data) {
......@@ -69,7 +99,7 @@ void CUDARegressionL2loss::LaunchGetGradientsKernel(const double* score, score_t
}
double CUDARegressionL1loss::LaunchCalcInitScoreKernel() const {
double CUDARegressionL1loss::LaunchCalcInitScoreKernel(const int /*class_id*/) const {
const double alpha = 0.5f;
if (cuda_weights_ == nullptr) {
PercentileGlobal<label_t, data_size_t, label_t, double, false, false>(
......@@ -133,7 +163,8 @@ __global__ void RenewTreeOutputCUDAKernel_RegressionL1(
double* weight_prefix_sum_buffer_pointer = weight_prefix_sum_buffer + data_start;
const double* residual_buffer_pointer = residual_buffer + data_start;
const double alpha = 0.5f;
for (data_size_t inner_data_index = data_start + static_cast<data_size_t>(threadIdx.x); inner_data_index < data_start + num_data; inner_data_index += static_cast<data_size_t>(blockDim.x)) {
for (data_size_t inner_data_index = data_start + static_cast<data_size_t>(threadIdx.x);
inner_data_index < data_start + num_data; inner_data_index += static_cast<data_size_t>(blockDim.x)) {
const data_size_t data_index = data_indices_in_leaf[inner_data_index];
const label_t data_label = label[data_index];
const double data_score = score[data_index];
......@@ -254,6 +285,65 @@ void CUDARegressionFairLoss::LaunchGetGradientsKernel(const double* score, score
}
}
void CUDARegressionPoissonLoss::LaunchCheckLabelKernel() const {
ShuffleReduceSumGlobal<label_t, double>(cuda_labels_, static_cast<size_t>(num_data_), cuda_block_buffer_.RawData());
double label_sum = 0.0f;
CopyFromCUDADeviceToHost<double>(&label_sum, cuda_block_buffer_.RawData(), 1, __FILE__, __LINE__);
ShuffleReduceMinGlobal<label_t, double>(cuda_labels_, static_cast<size_t>(num_data_), cuda_block_buffer_.RawData());
double label_min = 0.0f;
CopyFromCUDADeviceToHost<double>(&label_min, cuda_block_buffer_.RawData(), 1, __FILE__, __LINE__);
if (label_min < 0.0f) {
Log::Fatal("[%s]: at least one target label is negative", GetName());
}
if (label_sum == 0.0f) {
Log::Fatal("[%s]: sum of labels is zero", GetName());
}
}
template <bool USE_WEIGHT>
__global__ void GetGradientsKernel_Poisson(const double* cuda_scores, const label_t* cuda_labels, const label_t* cuda_weights, const data_size_t num_data,
const double max_delta_step, score_t* cuda_out_gradients, score_t* cuda_out_hessians) {
const data_size_t data_index = static_cast<data_size_t>(blockDim.x * blockIdx.x + threadIdx.x);
const double exp_max_delta_step = std::exp(max_delta_step);
if (data_index < num_data) {
if (!USE_WEIGHT) {
const double exp_score = exp(cuda_scores[data_index]);
cuda_out_gradients[data_index] = static_cast<score_t>(exp_score - cuda_labels[data_index]);
cuda_out_hessians[data_index] = static_cast<score_t>(exp_score * exp_max_delta_step);
} else {
const double exp_score = exp(cuda_scores[data_index]);
const score_t weight = static_cast<score_t>(cuda_weights[data_index]);
cuda_out_gradients[data_index] = static_cast<score_t>((exp_score - cuda_labels[data_index]) * weight);
cuda_out_hessians[data_index] = static_cast<score_t>(exp_score * exp_max_delta_step * weight);
}
}
}
void CUDARegressionPoissonLoss::LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const {
const int num_blocks = (num_data_ + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION;
if (cuda_weights_ == nullptr) {
GetGradientsKernel_Poisson<false><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_REGRESSION>>>(
score, cuda_labels_, nullptr, num_data_, max_delta_step_, gradients, hessians);
} else {
GetGradientsKernel_Poisson<true><<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_REGRESSION>>>(
score, cuda_labels_, cuda_weights_, num_data_, max_delta_step_, gradients, hessians);
}
}
__global__ void ConvertOutputCUDAKernel_Regression_Poisson(const data_size_t num_data, const double* input, double* output) {
const int data_index = static_cast<data_size_t>(blockIdx.x * blockDim.x + threadIdx.x);
if (data_index < num_data) {
output[data_index] = exp(input[data_index]);
}
}
void CUDARegressionPoissonLoss::LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const {
const int num_blocks = (num_data + GET_GRADIENTS_BLOCK_SIZE_REGRESSION - 1) / GET_GRADIENTS_BLOCK_SIZE_REGRESSION;
ConvertOutputCUDAKernel_Regression_Poisson<<<num_blocks, GET_GRADIENTS_BLOCK_SIZE_REGRESSION>>>(num_data, input, output);
}
} // namespace LightGBM
......
......@@ -20,55 +20,40 @@
namespace LightGBM {
class CUDARegressionL2loss : public CUDAObjectiveInterface, public RegressionL2loss {
template <typename HOST_OBJECTIVE>
class CUDARegressionObjectiveInterface: public CUDAObjectiveInterface<HOST_OBJECTIVE> {
public:
explicit CUDARegressionL2loss(const Config& config);
explicit CUDARegressionObjectiveInterface(const Config& config): CUDAObjectiveInterface<HOST_OBJECTIVE>(config) {}
explicit CUDARegressionL2loss(const std::vector<std::string>& strs);
~CUDARegressionL2loss();
explicit CUDARegressionObjectiveInterface(const std::vector<std::string>& strs): CUDAObjectiveInterface<HOST_OBJECTIVE>(strs) {}
void Init(const Metadata& metadata, data_size_t num_data) override;
void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override;
protected:
double LaunchCalcInitScoreKernel(const int class_id) const override;
void ConvertOutputCUDA(const data_size_t num_data, const double* input, double* output) const override;
CUDAVector<double> cuda_block_buffer_;
CUDAVector<label_t> cuda_trans_label_;
};
double BoostFromScore(int) const override;
class CUDARegressionL2loss : public CUDARegressionObjectiveInterface<RegressionL2loss> {
public:
explicit CUDARegressionL2loss(const Config& config);
std::function<void(data_size_t, const double*, double*)> GetCUDAConvertOutputFunc() const override {
return [this] (data_size_t num_data, const double* input, double* output) {
ConvertOutputCUDA(num_data, input, output);
};
}
explicit CUDARegressionL2loss(const std::vector<std::string>& strs);
bool IsConstantHessian() const override {
if (cuda_weights_ == nullptr) {
return true;
} else {
return false;
}
}
~CUDARegressionL2loss();
bool IsCUDAObjective() const override { return true; }
void Init(const Metadata& metadata, data_size_t num_data) override;
protected:
virtual double LaunchCalcInitScoreKernel() const;
virtual void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const;
virtual void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const;
void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override;
const label_t* cuda_labels_;
const label_t* cuda_weights_;
CUDAVector<label_t> cuda_trans_label_;
CUDAVector<double> cuda_block_buffer_;
data_size_t num_get_gradients_blocks_;
data_size_t num_init_score_blocks_;
void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const override;
};
class CUDARegressionL1loss : public CUDARegressionL2loss {
class CUDARegressionL1loss : public CUDARegressionObjectiveInterface<RegressionL1loss> {
public:
explicit CUDARegressionL1loss(const Config& config);
......@@ -78,11 +63,6 @@ class CUDARegressionL1loss : public CUDARegressionL2loss {
void Init(const Metadata& metadata, data_size_t num_data) override;
void RenewTreeOutputCUDA(const double* score, const data_size_t* data_indices_in_leaf, const data_size_t* num_data_in_leaf,
const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const override;
bool IsRenewTreeOutput() const override { return true; }
protected:
CUDAVector<data_size_t> cuda_data_indices_buffer_;
CUDAVector<double> cuda_weights_prefix_sum_;
......@@ -91,17 +71,17 @@ class CUDARegressionL1loss : public CUDARegressionL2loss {
CUDAVector<label_t> cuda_weight_by_leaf_buffer_;
CUDAVector<label_t> cuda_percentile_result_;
double LaunchCalcInitScoreKernel() const override;
double LaunchCalcInitScoreKernel(const int class_id) const override;
void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override;
void LaunchRenewTreeOutputCUDAKernel(
const double* score, const data_size_t* data_indices_in_leaf, const data_size_t* num_data_in_leaf,
const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const;
const data_size_t* data_start_in_leaf, const int num_leaves, double* leaf_value) const override;
};
class CUDARegressionHuberLoss : public CUDARegressionL2loss {
class CUDARegressionHuberLoss : public CUDARegressionObjectiveInterface<RegressionHuberLoss> {
public:
explicit CUDARegressionHuberLoss(const Config& config);
......@@ -109,17 +89,13 @@ class CUDARegressionHuberLoss : public CUDARegressionL2loss {
~CUDARegressionHuberLoss();
bool IsRenewTreeOutput() const override { return true; }
private:
void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override;
const double alpha_ = 0.0f;
};
// http://research.microsoft.com/en-us/um/people/zhang/INRIA/Publis/Tutorial-Estim/node24.html
class CUDARegressionFairLoss : public CUDARegressionL2loss {
class CUDARegressionFairLoss : public CUDARegressionObjectiveInterface<RegressionFairLoss> {
public:
explicit CUDARegressionFairLoss(const Config& config);
......@@ -127,14 +103,29 @@ class CUDARegressionFairLoss : public CUDARegressionL2loss {
~CUDARegressionFairLoss();
bool IsConstantHessian() const override {
return false;
}
private:
void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override;
};
class CUDARegressionPoissonLoss : public CUDARegressionObjectiveInterface<RegressionPoissonLoss> {
public:
explicit CUDARegressionPoissonLoss(const Config& config);
explicit CUDARegressionPoissonLoss(const std::vector<std::string>& strs);
~CUDARegressionPoissonLoss();
void Init(const Metadata& metadata, data_size_t num_data) override;
protected:
void LaunchGetGradientsKernel(const double* score, score_t* gradients, score_t* hessians) const override;
void LaunchConvertOutputCUDAKernel(const data_size_t num_data, const double* input, double* output) const override;
double LaunchCalcInitScoreKernel(const int class_id) const override;
const double c_ = 0.0f;
void LaunchCheckLabelKernel() const;
};
......
......@@ -32,8 +32,7 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string&
} else if (type == std::string("fair")) {
return new CUDARegressionFairLoss(config);
} else if (type == std::string("poisson")) {
Log::Warning("Objective poisson is not implemented in cuda_exp version. Fall back to boosting on CPU.");
return new RegressionPoissonLoss(config);
return new CUDARegressionPoissonLoss(config);
} else if (type == std::string("binary")) {
return new CUDABinaryLogloss(config);
} else if (type == std::string("lambdarank")) {
......
......@@ -358,7 +358,7 @@ class RankXENDCG : public RankingObjective {
const char* GetName() const override { return "rank_xendcg"; }
private:
protected:
mutable std::vector<Random> rands_;
};
......
......@@ -341,7 +341,7 @@ class RegressionHuberLoss: public RegressionL2loss {
return "huber";
}
private:
protected:
/*! \brief delta for Huber loss */
double alpha_;
};
......@@ -386,7 +386,7 @@ class RegressionFairLoss: public RegressionL2loss {
return false;
}
private:
protected:
/*! \brief c for Fair loss */
double c_;
};
......@@ -398,6 +398,7 @@ class RegressionFairLoss: public RegressionL2loss {
class RegressionPoissonLoss: public RegressionL2loss {
public:
explicit RegressionPoissonLoss(const Config& config): RegressionL2loss(config) {
Log::Warning("RegressionPoissonLoss is created again");
max_delta_step_ = static_cast<double>(config.poisson_max_delta_step);
if (sqrt_) {
Log::Warning("Cannot use sqrt transform in %s Regression, will auto disable it", GetName());
......@@ -473,7 +474,7 @@ class RegressionPoissonLoss: public RegressionL2loss {
return false;
}
private:
protected:
/*! \brief used to safeguard optimization */
double max_delta_step_;
};
......@@ -568,7 +569,7 @@ class RegressionQuantileloss : public RegressionL2loss {
}
}
private:
protected:
score_t alpha_;
};
......
......@@ -112,7 +112,7 @@ def test_rf():
assert evals_result['valid_0']['binary_logloss'][-1] == pytest.approx(ret)
@pytest.mark.parametrize('objective', ['regression', 'regression_l1', 'huber', 'fair'])
@pytest.mark.parametrize('objective', ['regression', 'regression_l1', 'huber', 'fair', 'poisson'])
def test_regression(objective):
X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
......@@ -136,6 +136,8 @@ def test_regression(objective):
assert ret < 35
elif objective == 'fair':
assert ret < 17
elif objective == 'poisson':
assert ret < 8
else:
assert ret < 7
assert evals_result['valid_0']['l2'][-1] == pytest.approx(ret)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment