Unverified Commit f901f471 authored by shiyu1994's avatar shiyu1994 Committed by GitHub
Browse files

[CUDA] CUDA Quantized Training (fixes #5606) (#5933)

* add quantized training (first stage)

* add histogram construction functions for integer gradients

* add stochastic rounding

* update docs

* fix compilation errors by adding template instantiations

* update files for compilation

* fix compilation of gpu version

* initialize gradient discretizer before share states

* add a test case for quantized training

* add quantized training for data distributed training

* Delete origin.pred

* Delete ifelse.pred

* Delete LightGBM_model.txt

* remove useless changes

* fix lint error

* remove debug loggings

* fix mismatch of vector and allocator types

* remove changes in main.cpp

* fix bugs with uninitialized gradient discretizer

* initialize ordered gradients in gradient discretizer

* disable quantized training with gpu and cuda

fix msvc compilation errors and warnings

* fix bug in data parallel tree learner

* make quantized training test deterministic

* make quantized training in test case more accurate

* refactor test_quantized_training

* fix leaf splits initialization with quantized training

* check distributed quantized training result

* add cuda gradient discretizer

* add quantized training for CUDA version in tree learner

* remove cuda computability 6.1 and 6.2

* fix parts of gpu quantized training errors and warnings

* fix build-python.sh to install locally built version

* fix memory access bugs

* fix lint errors

* mark cuda quantized training on cuda with categorical features as unsupported

* rename cuda_utils.h to cuda_utils.hu

* enable quantized training with cuda

* fix cuda quantized training with sparse row data

* allow using global memory buffer in histogram construction with cuda quantized training

* recover build-python.sh

enlarge allowed package size to 100M
parent 3d9ada76
......@@ -25,7 +25,7 @@ if [ $PY_MINOR_VER -gt 7 ]; then
pydistcheck \
--inspect \
--ignore 'compiled-objects-have-debug-symbols,distro-too-large-compressed' \
--max-allowed-size-uncompressed '70M' \
--max-allowed-size-uncompressed '100M' \
--max-allowed-files 800 \
${DIST_DIR}/* || exit -1
elif { test $(uname -m) = "aarch64"; }; then
......
......@@ -13,7 +13,7 @@
#include <stdio.h>
#include <LightGBM/bin.h>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/utils/log.h>
#include <algorithm>
......
......@@ -9,7 +9,7 @@
#define LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_
#include <LightGBM/config.h>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/bin.h>
#include <LightGBM/utils/openmp_wrapper.h>
......
......@@ -8,7 +8,7 @@
#ifndef LIGHTGBM_CUDA_CUDA_METADATA_HPP_
#define LIGHTGBM_CUDA_CUDA_METADATA_HPP_
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/meta.h>
#include <vector>
......
......@@ -9,7 +9,7 @@
#ifdef USE_CUDA
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/metric.h>
namespace LightGBM {
......
......@@ -9,7 +9,7 @@
#ifdef USE_CUDA
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/objective_function.h>
#include <LightGBM/meta.h>
......
......@@ -10,7 +10,7 @@
#include <LightGBM/bin.h>
#include <LightGBM/config.h>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/dataset.h>
#include <LightGBM/train_share_states.h>
#include <LightGBM/utils/openmp_wrapper.h>
......
......@@ -24,12 +24,14 @@ class CUDASplitInfo {
double left_sum_gradients;
double left_sum_hessians;
int64_t left_sum_of_gradients_hessians;
data_size_t left_count;
double left_gain;
double left_value;
double right_sum_gradients;
double right_sum_hessians;
int64_t right_sum_of_gradients_hessians;
data_size_t right_count;
double right_gain;
double right_value;
......
......@@ -7,15 +7,21 @@
#define LIGHTGBM_CUDA_CUDA_UTILS_H_
#ifdef USE_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <LightGBM/utils/log.h>
#include <algorithm>
#include <vector>
#include <cmath>
namespace LightGBM {
typedef unsigned long long atomic_add_long_t;
#define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
if (code != cudaSuccess) {
......@@ -125,13 +131,19 @@ class CUDAVector {
T* new_data = nullptr;
AllocateCUDAMemory<T>(&new_data, size, __FILE__, __LINE__);
if (size_ > 0 && data_ != nullptr) {
CopyFromCUDADeviceToCUDADevice<T>(new_data, data_, size, __FILE__, __LINE__);
const size_t size_for_old_content = std::min<size_t>(size_, size);
CopyFromCUDADeviceToCUDADevice<T>(new_data, data_, size_for_old_content, __FILE__, __LINE__);
}
DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
data_ = new_data;
size_ = size;
}
// Copies a host-side std::vector into this device vector, growing or
// shrinking the device buffer to exactly the host size first.
void InitFromHostVector(const std::vector<T>& host_vector) {
  const size_t count = host_vector.size();
  Resize(count);
  CopyFromHostToCUDADevice(data_, host_vector.data(), count, __FILE__, __LINE__);
}
void Clear() {
if (size_ > 0 && data_ != nullptr) {
DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
......@@ -171,6 +183,10 @@ class CUDAVector {
return data_;
}
// Fills the device buffer with `value` via SetCUDAMemory.
// NOTE(review): if SetCUDAMemory is backed by cudaMemset, this fill is
// byte-wise, so only byte-repeating values (e.g. 0) are meaningful for
// multi-byte T — confirm against SetCUDAMemory's implementation.
void SetValue(int value) {
  SetCUDAMemory<T>(data_, value, size_, __FILE__, __LINE__);
}
const T* RawDataReadOnly() const {
return data_;
}
......
......@@ -6,7 +6,7 @@
#ifndef LIGHTGBM_SAMPLE_STRATEGY_H_
#define LIGHTGBM_SAMPLE_STRATEGY_H_
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/common.h>
#include <LightGBM/utils/threading.h>
......
......@@ -8,7 +8,7 @@
#ifdef USE_CUDA
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include "../score_updater.hpp"
......
......@@ -5,7 +5,7 @@
#ifdef USE_CUDA
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
namespace LightGBM {
......
......@@ -389,10 +389,6 @@ void Config::CheckParamConflict() {
if (deterministic) {
Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic.");
}
if (use_quantized_grad) {
Log::Warning("Quantized training is not supported by CUDA tree learner. Switch to full precision training.");
use_quantized_grad = false;
}
}
// linear tree learner must be serial type and run on CPU device
if (linear_tree) {
......
......@@ -10,7 +10,7 @@
#ifdef USE_CUDA
#include <LightGBM/cuda/cuda_metric.hpp>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <vector>
......
......@@ -10,7 +10,7 @@
#ifdef USE_CUDA
#include <LightGBM/cuda/cuda_metric.hpp>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <vector>
......
......@@ -10,7 +10,7 @@
#ifdef USE_CUDA
#include <LightGBM/cuda/cuda_metric.hpp>
#include <LightGBM/cuda/cuda_utils.h>
#include <LightGBM/cuda/cuda_utils.hu>
#include <vector>
......
......@@ -40,6 +40,9 @@ CUDABestSplitFinder::CUDABestSplitFinder(
select_features_by_node_(select_features_by_node),
cuda_hist_(cuda_hist) {
InitFeatureMetaInfo(train_data);
if (has_categorical_feature_ && config->use_quantized_grad) {
Log::Fatal("Quantized training on GPU with categorical features is not supported yet.");
}
cuda_leaf_best_split_info_ = nullptr;
cuda_best_split_info_ = nullptr;
cuda_best_split_info_buffer_ = nullptr;
......@@ -326,13 +329,23 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(
const data_size_t num_data_in_smaller_leaf,
const data_size_t num_data_in_larger_leaf,
const double sum_hessians_in_smaller_leaf,
const double sum_hessians_in_larger_leaf) {
const double sum_hessians_in_larger_leaf,
const score_t* grad_scale,
const score_t* hess_scale,
const uint8_t smaller_num_bits_in_histogram_bins,
const uint8_t larger_num_bits_in_histogram_bins) {
const bool is_smaller_leaf_valid = (num_data_in_smaller_leaf > min_data_in_leaf_ &&
sum_hessians_in_smaller_leaf > min_sum_hessian_in_leaf_);
const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ &&
sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_ && larger_leaf_index >= 0);
LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits,
smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
if (grad_scale != nullptr && hess_scale != nullptr) {
LaunchFindBestSplitsDiscretizedForLeafKernel(smaller_leaf_splits, larger_leaf_splits,
smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid,
grad_scale, hess_scale, smaller_num_bits_in_histogram_bins, larger_num_bits_in_histogram_bins);
} else {
LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits,
smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
}
global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel");
LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
SynchronizeCUDADevice(__FILE__, __LINE__);
......
......@@ -320,6 +320,175 @@ __device__ void FindBestSplitsForLeafKernelInner(
}
}
// Scans one feature's discretized (integer-gradient) histogram for the best
// split threshold of a single leaf.  Each thread owns one histogram bin; a
// block-wide prefix sum over the packed (gradient, hessian) counters yields
// left/right partial sums for every candidate threshold, and a block-wide
// reduction picks the highest-gain one.
//
// Template parameters:
//   USE_RAND / USE_L1 / USE_SMOOTHING - compile-time config switches
//     (extra_trees random threshold, L1 regularization, path smoothing).
//   REVERSE - scan direction: true accumulates from the last bin backwards
//     (producing right-side sums), false accumulates forward (left-side sums).
//   BIN_HIST_TYPE / ACC_HIST_TYPE - integer type of the stored histogram bins
//     and of the prefix-sum accumulator, respectively.
//   USE_16BIT_BIN_HIST / USE_16BIT_ACC_HIST - whether gradient/hessian are
//     packed as 16+16 bits in an int32 instead of 32+32 bits in an int64.
//
// Expects one thread block per split-finding task, with blockDim.x covering
// the feature's bins (each thread handles at most one bin).  grad_scale and
// hess_scale convert integer sums back to real-valued gradient/hessian sums.
template <bool USE_RAND, bool USE_L1, bool USE_SMOOTHING, bool REVERSE, typename BIN_HIST_TYPE, typename ACC_HIST_TYPE, bool USE_16BIT_BIN_HIST, bool USE_16BIT_ACC_HIST>
__device__ void FindBestSplitsDiscretizedForLeafKernelInner(
  // input feature information
  const BIN_HIST_TYPE* feature_hist_ptr,
  // input task information
  const SplitFindTask* task,
  CUDARandom* cuda_random,
  // input config parameter values
  const double lambda_l1,
  const double lambda_l2,
  const double path_smooth,
  const data_size_t min_data_in_leaf,
  const double min_sum_hessian_in_leaf,
  const double min_gain_to_split,
  // input parent node information
  const double parent_gain,
  const int64_t sum_gradients_hessians,
  const data_size_t num_data,
  const double parent_output,
  // gradient scale
  const double grad_scale,
  const double hess_scale,
  // output parameters
  CUDASplitInfo* cuda_best_split_info) {
  // The leaf's hessian sum lives in the lower 32 bits of the packed
  // (gradient << 32 | hessian) counter; hessians are non-negative so the
  // masked value is interpreted directly.
  const double sum_hessians = static_cast<double>(sum_gradients_hessians & 0x00000000ffffffff) * hess_scale;
  // Data count per unit of hessian; used to recover per-side data counts
  // from per-side hessian sums.
  const double cnt_factor = num_data / sum_hessians;
  // A candidate split must beat parent_gain + min_gain_to_split.
  const double min_gain_shift = parent_gain + min_gain_to_split;
  cuda_best_split_info->is_valid = false;
  ACC_HIST_TYPE local_grad_hess_hist = 0;
  double local_gain = 0.0f;
  bool threshold_found = false;
  uint32_t threshold_value = 0;
  __shared__ int rand_threshold;
  // extra_trees mode: thread 0 draws one random candidate threshold that the
  // whole block will restrict itself to.
  if (USE_RAND && threadIdx.x == 0) {
    if (task->num_bin - 2 > 0) {
      rand_threshold = cuda_random->NextInt(0, task->num_bin - 2);
    }
  }
  __shared__ uint32_t best_thread_index;
  // Scratch buffers for the prefix sum and the best-gain reduction
  // (sized for up to 32 warps per block).
  __shared__ double shared_double_buffer[32];
  __shared__ bool shared_bool_buffer[32];
  __shared__ uint32_t shared_int_buffer[64];
  const unsigned int threadIdx_x = threadIdx.x;
  // Skip the bin holding the default (most-frequent) value when requested;
  // which bin that is depends on the scan direction.
  const bool skip_sum = REVERSE ?
    (task->skip_default_bin && (task->num_bin - 1 - threadIdx_x) == static_cast<int>(task->default_bin)) :
    (task->skip_default_bin && (threadIdx_x + task->mfb_offset) == static_cast<int>(task->default_bin));
  const uint32_t feature_num_bin_minus_offset = task->num_bin - task->mfb_offset;
  // Load this thread's histogram bin.  When bins are stored as packed
  // 16+16-bit counters but accumulated in 64 bits, unpack the int32 into the
  // (gradient << 32 | hessian) int64 layout first.
  if (!REVERSE) {
    if (threadIdx_x < feature_num_bin_minus_offset && !skip_sum) {
      const unsigned int bin_offset = threadIdx_x;
      if (USE_16BIT_BIN_HIST && !USE_16BIT_ACC_HIST) {
        const int32_t local_grad_hess_hist_int32 = feature_hist_ptr[bin_offset];
        local_grad_hess_hist = (static_cast<int64_t>(static_cast<int16_t>(local_grad_hess_hist_int32 >> 16)) << 32) | (static_cast<int64_t>(local_grad_hess_hist_int32 & 0x0000ffff));
      } else {
        local_grad_hess_hist = feature_hist_ptr[bin_offset];
      }
    }
  } else {
    // Reverse scan reads the bins back-to-front so the prefix sum directly
    // produces right-side partial sums.
    if (threadIdx_x >= static_cast<unsigned int>(task->na_as_missing) &&
        threadIdx_x < feature_num_bin_minus_offset && !skip_sum) {
      const unsigned int read_index = feature_num_bin_minus_offset - 1 - threadIdx_x;
      if (USE_16BIT_BIN_HIST && !USE_16BIT_ACC_HIST) {
        const int32_t local_grad_hess_hist_int32 = feature_hist_ptr[read_index];
        local_grad_hess_hist = (static_cast<int64_t>(static_cast<int16_t>(local_grad_hess_hist_int32 >> 16)) << 32) | (static_cast<int64_t>(local_grad_hess_hist_int32 & 0x0000ffff));
      } else {
        local_grad_hess_hist = feature_hist_ptr[read_index];
      }
    }
  }
  __syncthreads();
  local_gain = kMinScore;
  // Inclusive prefix sum across the block: afterwards each thread holds the
  // packed (gradient, hessian) sum of all bins up to and including its own.
  local_grad_hess_hist = ShufflePrefixSum<ACC_HIST_TYPE>(local_grad_hess_hist, reinterpret_cast<ACC_HIST_TYPE*>(shared_int_buffer));
  double sum_left_gradient = 0.0f;
  double sum_left_hessian = 0.0f;
  double sum_right_gradient = 0.0f;
  double sum_right_hessian = 0.0f;
  data_size_t left_count = 0;
  data_size_t right_count = 0;
  int64_t sum_left_gradient_hessian = 0;
  int64_t sum_right_gradient_hessian = 0;
  if (REVERSE) {
    // This thread's prefix sum is the right-side sum; the left side is the
    // leaf total minus it.
    if (threadIdx_x >= static_cast<unsigned int>(task->na_as_missing) && threadIdx_x <= task->num_bin - 2 && !skip_sum) {
      // Widen a 16+16-bit accumulator to the canonical 32+32-bit packing
      // before splitting it into gradient and hessian halves.
      sum_right_gradient_hessian = USE_16BIT_ACC_HIST ?
        (static_cast<int64_t>(static_cast<int16_t>(local_grad_hess_hist >> 16)) << 32) | static_cast<int64_t>(local_grad_hess_hist & 0x0000ffff) :
        local_grad_hess_hist;
      sum_right_gradient = static_cast<double>(static_cast<int32_t>((sum_right_gradient_hessian & 0xffffffff00000000) >> 32)) * grad_scale;
      sum_right_hessian = static_cast<double>(static_cast<int32_t>(sum_right_gradient_hessian & 0x00000000ffffffff)) * hess_scale;
      // Recover the data count from the hessian sum (round to nearest).
      right_count = static_cast<data_size_t>(__double2int_rn(sum_right_hessian * cnt_factor));
      sum_left_gradient_hessian = sum_gradients_hessians - sum_right_gradient_hessian;
      sum_left_gradient = static_cast<double>(static_cast<int32_t>((sum_left_gradient_hessian & 0xffffffff00000000)>> 32)) * grad_scale;
      sum_left_hessian = static_cast<double>(static_cast<int32_t>(sum_left_gradient_hessian & 0x00000000ffffffff)) * hess_scale;
      left_count = num_data - right_count;
      if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf &&
          sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf &&
          (!USE_RAND || static_cast<int>(task->num_bin - 2 - threadIdx_x) == rand_threshold)) {
        double current_gain = CUDALeafSplits::GetSplitGains<USE_L1, USE_SMOOTHING>(
          sum_left_gradient, sum_left_hessian + kEpsilon, sum_right_gradient,
          sum_right_hessian + kEpsilon, lambda_l1,
          lambda_l2, path_smooth, left_count, right_count, parent_output);
        // gain with split is worse than without split
        if (current_gain > min_gain_shift) {
          local_gain = current_gain - min_gain_shift;
          threshold_value = static_cast<uint32_t>(task->num_bin - 2 - threadIdx_x);
          threshold_found = true;
        }
      }
    }
  } else {
    // Forward scan: this thread's prefix sum is the left-side sum.
    // NOTE(review): feature_num_bin_minus_offset is unsigned — if it could
    // ever be < 2, `- 2` would wrap; presumably num_bin - mfb_offset >= 2
    // always holds for split tasks — confirm.
    if (threadIdx_x <= feature_num_bin_minus_offset - 2 && !skip_sum) {
      sum_left_gradient_hessian = USE_16BIT_ACC_HIST ?
        (static_cast<int64_t>(static_cast<int16_t>(local_grad_hess_hist >> 16)) << 32) | static_cast<int64_t>(local_grad_hess_hist & 0x0000ffff) :
        local_grad_hess_hist;
      sum_left_gradient = static_cast<double>(static_cast<int32_t>((sum_left_gradient_hessian & 0xffffffff00000000) >> 32)) * grad_scale;
      sum_left_hessian = static_cast<double>(static_cast<int32_t>(sum_left_gradient_hessian & 0x00000000ffffffff)) * hess_scale;
      left_count = static_cast<data_size_t>(__double2int_rn(sum_left_hessian * cnt_factor));
      sum_right_gradient_hessian = sum_gradients_hessians - sum_left_gradient_hessian;
      sum_right_gradient = static_cast<double>(static_cast<int32_t>((sum_right_gradient_hessian & 0xffffffff00000000) >> 32)) * grad_scale;
      sum_right_hessian = static_cast<double>(static_cast<int32_t>(sum_right_gradient_hessian & 0x00000000ffffffff)) * hess_scale;
      right_count = num_data - left_count;
      if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf &&
          sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf &&
          (!USE_RAND || static_cast<int>(threadIdx_x + task->mfb_offset) == rand_threshold)) {
        double current_gain = CUDALeafSplits::GetSplitGains<USE_L1, USE_SMOOTHING>(
          sum_left_gradient, sum_left_hessian + kEpsilon, sum_right_gradient,
          sum_right_hessian + kEpsilon, lambda_l1,
          lambda_l2, path_smooth, left_count, right_count, parent_output);
        // gain with split is worse than without split
        if (current_gain > min_gain_shift) {
          local_gain = current_gain - min_gain_shift;
          threshold_value = static_cast<uint32_t>(threadIdx_x + task->mfb_offset);
          threshold_found = true;
        }
      }
    }
  }
  __syncthreads();
  // Block-wide argmax over local_gain; every thread learns the winner's index.
  const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_double_buffer, shared_bool_buffer, shared_int_buffer);
  if (threadIdx_x == 0) {
    best_thread_index = result;
  }
  __syncthreads();
  // Only the winning thread (if any thread found a valid threshold) writes
  // the split description.
  if (threshold_found && threadIdx_x == best_thread_index) {
    cuda_best_split_info->is_valid = true;
    cuda_best_split_info->threshold = threshold_value;
    cuda_best_split_info->gain = local_gain;
    cuda_best_split_info->default_left = task->assume_out_default_left;
    const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput<USE_L1, USE_SMOOTHING>(sum_left_gradient,
      sum_left_hessian, lambda_l1, lambda_l2, path_smooth, left_count, parent_output);
    const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput<USE_L1, USE_SMOOTHING>(sum_right_gradient,
      sum_right_hessian, lambda_l1, lambda_l2, path_smooth, right_count, parent_output);
    cuda_best_split_info->left_sum_gradients = sum_left_gradient;
    cuda_best_split_info->left_sum_hessians = sum_left_hessian;
    cuda_best_split_info->left_sum_of_gradients_hessians = sum_left_gradient_hessian;
    cuda_best_split_info->left_count = left_count;
    cuda_best_split_info->right_sum_gradients = sum_right_gradient;
    cuda_best_split_info->right_sum_hessians = sum_right_hessian;
    cuda_best_split_info->right_sum_of_gradients_hessians = sum_right_gradient_hessian;
    cuda_best_split_info->right_count = right_count;
    cuda_best_split_info->left_value = left_output;
    cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput<USE_L1>(sum_left_gradient,
      sum_left_hessian, lambda_l1, lambda_l2, left_output);
    cuda_best_split_info->right_value = right_output;
    cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput<USE_L1>(sum_right_gradient,
      sum_right_hessian, lambda_l1, lambda_l2, right_output);
  }
}
template <bool USE_RAND, bool USE_L1, bool USE_SMOOTHING>
__device__ void FindBestSplitsForLeafKernelCategoricalInner(
// input feature information
......@@ -715,6 +884,169 @@ __global__ void FindBestSplitsForLeafKernel(
}
}
// Per-task dispatcher kernel for discretized (quantized-gradient) split
// finding.  Launched with one block per split-finding task; IS_LARGER selects
// whether this launch works on the larger or the smaller leaf, and the output
// slot for larger-leaf results is offset by num_tasks.  The histogram
// bit-width of the chosen leaf decides which template instantiation of the
// inner scan is used.
template <bool USE_RAND, bool USE_L1, bool USE_SMOOTHING, bool IS_LARGER>
__global__ void FindBestSplitsDiscretizedForLeafKernel(
  // input feature information
  const int8_t* is_feature_used_bytree,
  // input task information
  const int num_tasks,
  const SplitFindTask* tasks,
  CUDARandom* cuda_randoms,
  // input leaf information
  const CUDALeafSplitsStruct* smaller_leaf_splits,
  const CUDALeafSplitsStruct* larger_leaf_splits,
  const uint8_t smaller_leaf_num_bits_in_histogram_bin,
  const uint8_t larger_leaf_num_bits_in_histogram_bin,
  // input config parameter values
  const data_size_t min_data_in_leaf,
  const double min_sum_hessian_in_leaf,
  const double min_gain_to_split,
  const double lambda_l1,
  const double lambda_l2,
  const double path_smooth,
  const double cat_smooth,
  const double cat_l2,
  const int max_cat_threshold,
  const int min_data_per_group,
  const int max_cat_to_onehot,
  // gradient scale
  const score_t* grad_scale,
  const score_t* hess_scale,
  // output
  CUDASplitInfo* cuda_best_split_info) {
  // One block handles exactly one split-finding task.
  const unsigned int task_index = blockIdx.x;
  const SplitFindTask* task = tasks + task_index;
  const int inner_feature_index = task->inner_feature_index;
  // Pick the leaf this launch is responsible for.
  const double parent_gain = IS_LARGER ? larger_leaf_splits->gain : smaller_leaf_splits->gain;
  const int64_t sum_gradients_hessians = IS_LARGER ? larger_leaf_splits->sum_of_gradients_hessians : smaller_leaf_splits->sum_of_gradients_hessians;
  const data_size_t num_data = IS_LARGER ? larger_leaf_splits->num_data_in_leaf : smaller_leaf_splits->num_data_in_leaf;
  const double parent_output = IS_LARGER ? larger_leaf_splits->leaf_value : smaller_leaf_splits->leaf_value;
  // Larger-leaf results are stored after the smaller-leaf results.
  const unsigned int output_offset = IS_LARGER ? (task_index + num_tasks) : task_index;
  CUDASplitInfo* out = cuda_best_split_info + output_offset;
  // Two CUDARandom states per task: even index for smaller, odd for larger.
  CUDARandom* cuda_random = USE_RAND ?
    (IS_LARGER ? cuda_randoms + task_index * 2 + 1 : cuda_randoms + task_index * 2) : nullptr;
  const bool use_16bit_bin = IS_LARGER ? (larger_leaf_num_bits_in_histogram_bin <= 16) : (smaller_leaf_num_bits_in_histogram_bin <= 16);
  if (is_feature_used_bytree[inner_feature_index]) {
    if (task->is_categorical) {
      // Categorical features are unsupported under CUDA quantized training
      // (the host side raises Log::Fatal beforehand); trap aborts the kernel
      // if this is ever reached.
      __threadfence(); // ensure store issued before trap
      asm("trap;");
    } else {
      if (!task->reverse) {
        if (use_16bit_bin) {
          // 16+16-bit packed histogram bins, 16+16-bit accumulator.
          const int32_t* hist_ptr =
            reinterpret_cast<const int32_t*>(IS_LARGER ? larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + task->hist_offset;
          FindBestSplitsDiscretizedForLeafKernelInner<USE_RAND, USE_L1, USE_SMOOTHING, false, int32_t, int32_t, true, true>(
            // input feature information
            hist_ptr,
            // input task information
            task,
            cuda_random,
            // input config parameter values
            lambda_l1,
            lambda_l2,
            path_smooth,
            min_data_in_leaf,
            min_sum_hessian_in_leaf,
            min_gain_to_split,
            // input parent node information
            parent_gain,
            sum_gradients_hessians,
            num_data,
            parent_output,
            // gradient scale
            *grad_scale,
            *hess_scale,
            // output parameters
            out);
        } else {
          // NOTE(review): with <int32_t, int64_t, false, false> the inner
          // scan reads int32 bins without the 16->64-bit unpack path —
          // confirm these flags match the histogram layout used for leaves
          // needing more than 16 bits per counter.
          const int32_t* hist_ptr =
            reinterpret_cast<const int32_t*>(IS_LARGER ? larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + task->hist_offset;
          FindBestSplitsDiscretizedForLeafKernelInner<USE_RAND, USE_L1, USE_SMOOTHING, false, int32_t, int64_t, false, false>(
            // input feature information
            hist_ptr,
            // input task information
            task,
            cuda_random,
            // input config parameter values
            lambda_l1,
            lambda_l2,
            path_smooth,
            min_data_in_leaf,
            min_sum_hessian_in_leaf,
            min_gain_to_split,
            // input parent node information
            parent_gain,
            sum_gradients_hessians,
            num_data,
            parent_output,
            // gradient scale
            *grad_scale,
            *hess_scale,
            // output parameters
            out);
        }
      } else {
        // Reverse-direction scan (REVERSE = true) for tasks that accumulate
        // from the last bin backwards.
        if (use_16bit_bin) {
          const int32_t* hist_ptr =
            reinterpret_cast<const int32_t*>(IS_LARGER ? larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + task->hist_offset;
          FindBestSplitsDiscretizedForLeafKernelInner<USE_RAND, USE_L1, USE_SMOOTHING, true, int32_t, int32_t, true, true>(
            // input feature information
            hist_ptr,
            // input task information
            task,
            cuda_random,
            // input config parameter values
            lambda_l1,
            lambda_l2,
            path_smooth,
            min_data_in_leaf,
            min_sum_hessian_in_leaf,
            min_gain_to_split,
            // input parent node information
            parent_gain,
            sum_gradients_hessians,
            num_data,
            parent_output,
            // gradient scale
            *grad_scale,
            *hess_scale,
            // output parameters
            out);
        } else {
          const int32_t* hist_ptr =
            reinterpret_cast<const int32_t*>(IS_LARGER ? larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + task->hist_offset;
          FindBestSplitsDiscretizedForLeafKernelInner<USE_RAND, USE_L1, USE_SMOOTHING, true, int32_t, int64_t, false, false>(
            // input feature information
            hist_ptr,
            // input task information
            task,
            cuda_random,
            // input config parameter values
            lambda_l1,
            lambda_l2,
            path_smooth,
            min_data_in_leaf,
            min_sum_hessian_in_leaf,
            min_gain_to_split,
            // input parent node information
            parent_gain,
            sum_gradients_hessians,
            num_data,
            parent_output,
            // gradient scale
            *grad_scale,
            *hess_scale,
            // output parameters
            out);
        }
      }
    }
  } else {
    // Feature not used by this tree: mark the output slot invalid.
    out->is_valid = false;
  }
}
template <bool USE_RAND, bool USE_L1, bool USE_SMOOTHING, bool REVERSE>
__device__ void FindBestSplitsForLeafKernelInner_GlobalMemory(
// input feature information
......@@ -1466,6 +1798,108 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernelInner2(LaunchFindBest
#undef FindBestSplitsForLeafKernel_ARGS
#undef GlobalMemory_Buffer_ARGS
#define LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS \
const CUDALeafSplitsStruct* smaller_leaf_splits, \
const CUDALeafSplitsStruct* larger_leaf_splits, \
const int smaller_leaf_index, \
const int larger_leaf_index, \
const bool is_smaller_leaf_valid, \
const bool is_larger_leaf_valid, \
const score_t* grad_scale, \
const score_t* hess_scale, \
const uint8_t smaller_num_bits_in_histogram_bins, \
const uint8_t larger_num_bits_in_histogram_bins
#define LaunchFindBestSplitsDiscretizedForLeafKernel_ARGS \
smaller_leaf_splits, \
larger_leaf_splits, \
smaller_leaf_index, \
larger_leaf_index, \
is_smaller_leaf_valid, \
is_larger_leaf_valid, \
grad_scale, \
hess_scale, \
smaller_num_bits_in_histogram_bins, \
larger_num_bits_in_histogram_bins
#define FindBestSplitsDiscretizedForLeafKernel_ARGS \
cuda_is_feature_used_bytree_, \
num_tasks_, \
cuda_split_find_tasks_.RawData(), \
cuda_randoms_.RawData(), \
smaller_leaf_splits, \
larger_leaf_splits, \
smaller_num_bits_in_histogram_bins, \
larger_num_bits_in_histogram_bins, \
min_data_in_leaf_, \
min_sum_hessian_in_leaf_, \
min_gain_to_split_, \
lambda_l1_, \
lambda_l2_, \
path_smooth_, \
cat_smooth_, \
cat_l2_, \
max_cat_threshold_, \
min_data_per_group_, \
max_cat_to_onehot_, \
grad_scale, \
hess_scale, \
cuda_best_split_info_
// Entry point of the discretized split-finding launch chain.  Bails out when
// neither leaf can legally be split, then folds the extra_trees (random
// threshold) setting into a compile-time template flag.
void CUDABestSplitFinder::LaunchFindBestSplitsDiscretizedForLeafKernel(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS) {
  // Nothing to do when both leaves fail the validity checks.
  if (!is_smaller_leaf_valid && !is_larger_leaf_valid) {
    return;
  }
  if (extra_trees_) {
    LaunchFindBestSplitsDiscretizedForLeafKernelInner0<true>(LaunchFindBestSplitsDiscretizedForLeafKernel_ARGS);
  } else {
    LaunchFindBestSplitsDiscretizedForLeafKernelInner0<false>(LaunchFindBestSplitsDiscretizedForLeafKernel_ARGS);
  }
}
// Folds the L1-regularization setting into a compile-time template flag:
// a positive lambda_l1 selects the USE_L1 = true specialization.
template <bool USE_RAND>
void CUDABestSplitFinder::LaunchFindBestSplitsDiscretizedForLeafKernelInner0(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS) {
  if (lambda_l1_ > 0.0f) {
    LaunchFindBestSplitsDiscretizedForLeafKernelInner1<USE_RAND, true>(LaunchFindBestSplitsDiscretizedForLeafKernel_ARGS);
  } else {
    LaunchFindBestSplitsDiscretizedForLeafKernelInner1<USE_RAND, false>(LaunchFindBestSplitsDiscretizedForLeafKernel_ARGS);
  }
}
// Folds the path-smoothing setting into a compile-time template flag.
template <bool USE_RAND, bool USE_L1>
void CUDABestSplitFinder::LaunchFindBestSplitsDiscretizedForLeafKernelInner1(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS) {
  if (use_smoothing_) {
    LaunchFindBestSplitsDiscretizedForLeafKernelInner2<USE_RAND, USE_L1, true>(LaunchFindBestSplitsDiscretizedForLeafKernel_ARGS);
  } else {
    LaunchFindBestSplitsDiscretizedForLeafKernelInner2<USE_RAND, USE_L1, false>(LaunchFindBestSplitsDiscretizedForLeafKernel_ARGS);
  }
}
// Final launcher: instantiates the discretized split-finding kernel for the
// smaller and the larger leaf.  Each valid leaf gets its own launch (one
// block per split-finding task) on its own stream.
template <bool USE_RAND, bool USE_L1, bool USE_SMOOTHING>
void CUDABestSplitFinder::LaunchFindBestSplitsDiscretizedForLeafKernelInner2(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS) {
  if (!use_global_memory_) {
    if (is_smaller_leaf_valid) {
      FindBestSplitsDiscretizedForLeafKernel<USE_RAND, USE_L1, USE_SMOOTHING, false>
        <<<num_tasks_, NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER, 0, cuda_streams_[0]>>>
        (FindBestSplitsDiscretizedForLeafKernel_ARGS);
    }
    // NOTE(review): this device-wide sync serializes the two launches, so the
    // smaller- and larger-leaf kernels never overlap despite separate
    // streams — confirm whether the barrier is required here.
    SynchronizeCUDADevice(__FILE__, __LINE__);
    if (is_larger_leaf_valid) {
      FindBestSplitsDiscretizedForLeafKernel<USE_RAND, USE_L1, USE_SMOOTHING, true>
        <<<num_tasks_, NUM_THREADS_PER_BLOCK_BEST_SPLIT_FINDER, 0, cuda_streams_[1]>>>
        (FindBestSplitsDiscretizedForLeafKernel_ARGS);
    }
  } else {
    // TODO(shiyu1994): global-memory-buffer variant not implemented yet;
    // reaching this path launches nothing and leaves the split outputs
    // untouched.
  }
}
#undef LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS
#undef LaunchFindBestSplitsDiscretizedForLeafKernel_ARGS
#undef FindBestSplitsDiscretizedForLeafKernel_ARGS
__device__ void ReduceBestSplit(bool* found, double* gain, uint32_t* shared_read_index,
uint32_t num_features_aligned) {
const uint32_t threadIdx_x = threadIdx.x;
......
......@@ -67,7 +67,11 @@ class CUDABestSplitFinder {
const data_size_t num_data_in_smaller_leaf,
const data_size_t num_data_in_larger_leaf,
const double sum_hessians_in_smaller_leaf,
const double sum_hessians_in_larger_leaf);
const double sum_hessians_in_larger_leaf,
const score_t* grad_scale,
const score_t* hess_scale,
const uint8_t smaller_num_bits_in_histogram_bins,
const uint8_t larger_num_bits_in_histogram_bins);
const CUDASplitInfo* FindBestFromAllSplits(
const int cur_num_leaves,
......@@ -114,6 +118,31 @@ class CUDABestSplitFinder {
#undef LaunchFindBestSplitsForLeafKernel_PARAMS
#define LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS \
const CUDALeafSplitsStruct* smaller_leaf_splits, \
const CUDALeafSplitsStruct* larger_leaf_splits, \
const int smaller_leaf_index, \
const int larger_leaf_index, \
const bool is_smaller_leaf_valid, \
const bool is_larger_leaf_valid, \
const score_t* grad_scale, \
const score_t* hess_scale, \
const uint8_t smaller_num_bits_in_histogram_bins, \
const uint8_t larger_num_bits_in_histogram_bins
void LaunchFindBestSplitsDiscretizedForLeafKernel(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS);
template <bool USE_RAND>
void LaunchFindBestSplitsDiscretizedForLeafKernelInner0(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS);
template <bool USE_RAND, bool USE_L1>
void LaunchFindBestSplitsDiscretizedForLeafKernelInner1(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS);
template <bool USE_RAND, bool USE_L1, bool USE_SMOOTHING>
void LaunchFindBestSplitsDiscretizedForLeafKernelInner2(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS);
#undef LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS
void LaunchSyncBestSplitForLeafKernel(
const int host_smaller_leaf_index,
const int host_larger_leaf_index,
......
......@@ -368,6 +368,12 @@ void CUDADataPartition::ResetByLeafPred(const std::vector<int>& leaf_pred, int n
cur_num_leaves_ = num_leaves;
}
// Thin forwarder to the CUDA kernel launcher that reduces per-leaf
// gradient/hessian statistics into the provided device buffers for `tree`.
// NOTE(review): the parameter is named `leaf_hess_state_buffer` while its
// sibling is `leaf_grad_stat_buffer` — likely a "stat"/"state" typo, but the
// name must stay consistent with the header declaration.
void CUDADataPartition::ReduceLeafGradStat(
  const score_t* gradients, const score_t* hessians,
  CUDATree* tree, double* leaf_grad_stat_buffer, double* leaf_hess_state_buffer) const {
  LaunchReduceLeafGradStat(gradients, hessians, tree, leaf_grad_stat_buffer, leaf_hess_state_buffer);
}
} // namespace LightGBM
#endif // USE_CUDA
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment