Unverified Commit e50a9151, authored by Belinda Trotta, committed by GitHub
Browse files

Path smoothing (#2950)

* Path smoothing

* Try to fix issue with gpu version.

* Fix failing CI for R package.

* Minor fixes.

* Minor refactor.

* Restore old code to get CI working.

* Fix style issues.

* Fix ci for R package.

* Minor fixes for docs and code style.

* Update docs.
parent 6823af94
...@@ -98,7 +98,7 @@ if grep -q -R "WARNING" "$LOG_FILE_NAME"; then ...@@ -98,7 +98,7 @@ if grep -q -R "WARNING" "$LOG_FILE_NAME"; then
exit -1 exit -1
fi fi
ALLOWED_CHECK_NOTES=2 ALLOWED_CHECK_NOTES=3
NUM_CHECK_NOTES=$( NUM_CHECK_NOTES=$(
cat ${LOG_FILE_NAME} \ cat ${LOG_FILE_NAME} \
| grep -e '^Status: .* NOTE.*' \ | grep -e '^Status: .* NOTE.*' \
......
...@@ -81,4 +81,6 @@ Deal with Over-fitting ...@@ -81,4 +81,6 @@ Deal with Over-fitting
- Try ``extra_trees`` - Try ``extra_trees``
- Try increasing ``path_smooth``
.. _Optuna: https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258 .. _Optuna: https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258
...@@ -522,6 +522,22 @@ Learning Control Parameters ...@@ -522,6 +522,22 @@ Learning Control Parameters
- applied once per forest - applied once per forest
- ``path_smooth`` :raw-html:`<a id="path_smooth" title="Permalink to this parameter" href="#path_smooth">&#x1F517;&#xFE0E;</a>`, default = ``0``, type = double, constraints: ``path_smooth >= 0.0``
- controls smoothing applied to tree nodes
- helps prevent overfitting on leaves with few samples
- if set to zero, no smoothing is applied
- if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2``
- larger values give stronger regularisation
- the weight of each node is ``((n / path_smooth) * w + w_p) / (n / path_smooth + 1)``, where ``n`` is the number of samples in the node, ``w`` is the optimal node weight to minimise the loss (approximately ``-sum_gradients / sum_hessians``), and ``w_p`` is the weight of the parent node
- note that the parent output ``w_p`` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth
- ``verbosity`` :raw-html:`<a id="verbosity" title="Permalink to this parameter" href="#verbosity">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int, aliases: ``verbose`` - ``verbosity`` :raw-html:`<a id="verbosity" title="Permalink to this parameter" href="#verbosity">&#x1F517;&#xFE0E;</a>`, default = ``1``, type = int, aliases: ``verbose``
- controls the level of LightGBM's verbosity - controls the level of LightGBM's verbosity
......
...@@ -495,6 +495,16 @@ struct Config { ...@@ -495,6 +495,16 @@ struct Config {
// desc = applied once per forest // desc = applied once per forest
std::vector<double> cegb_penalty_feature_coupled; std::vector<double> cegb_penalty_feature_coupled;
// check = >= 0.0
// desc = controls smoothing applied to tree nodes
// desc = helps prevent overfitting on leaves with few samples
// desc = if set to zero, no smoothing is applied
// desc = if ``path_smooth > 0`` then ``min_data_in_leaf`` must be at least ``2``
// desc = larger values give stronger regularisation
// descl2 = the weight of each node is ``((n / path_smooth) * w + w_p) / (n / path_smooth + 1)``, where ``n`` is the number of samples in the node, ``w`` is the optimal node weight to minimise the loss (approximately ``-sum_gradients / sum_hessians``), and ``w_p`` is the weight of the parent node
// descl2 = note that the parent output ``w_p`` itself has smoothing applied, unless it is the root node, so that the smoothing effect accumulates with the tree depth
double path_smooth = 0;
// alias = verbose // alias = verbose
// desc = controls the level of LightGBM's verbosity // desc = controls the level of LightGBM's verbosity
// desc = ``< 0``: Fatal, ``= 0``: Error (Warning), ``= 1``: Info, ``> 1``: Debug // desc = ``< 0``: Fatal, ``= 0``: Error (Warning), ``= 1``: Info, ``> 1``: Debug
......
...@@ -142,6 +142,9 @@ class Tree { ...@@ -142,6 +142,9 @@ class Tree {
/*! \brief Get depth of specific leaf*/ /*! \brief Get depth of specific leaf*/
inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; } inline int leaf_depth(int leaf_idx) const { return leaf_depth_[leaf_idx]; }
/*! \brief Get parent of specific leaf*/
inline int leaf_parent(int leaf_idx) const {return leaf_parent_[leaf_idx]; }
/*! \brief Get feature of specific split*/ /*! \brief Get feature of specific split*/
inline int split_feature(int split_idx) const { return split_feature_[split_idx]; } inline int split_feature(int split_idx) const { return split_feature_[split_idx]; }
...@@ -163,8 +166,6 @@ class Tree { ...@@ -163,8 +166,6 @@ class Tree {
return split_feature_inner_[node_idx]; return split_feature_inner_[node_idx];
} }
inline int leaf_parent(int leaf_idx) const { return leaf_parent_[leaf_idx]; }
inline uint32_t threshold_in_bin(int node_idx) const { inline uint32_t threshold_in_bin(int node_idx) const {
return threshold_in_bin_[node_idx]; return threshold_in_bin_[node_idx];
} }
......
...@@ -314,6 +314,14 @@ void Config::CheckParamConflict() { ...@@ -314,6 +314,14 @@ void Config::CheckParamConflict() {
force_col_wise = true; force_col_wise = true;
force_row_wise = false; force_row_wise = false;
} }
// min_data_in_leaf must be at least 2 if path smoothing is active. This is because, when a split is evaluated,
// the leaf count is estimated from the proportion of the hessian falling in the leaf, rounded up to the nearest int, so it can
// be 1 even when the leaf actually contains no data. In rare cases this can cause a bug, because with path smoothing the
// calculated split gain can be positive even with zero gradient and hessian.
if (path_smooth > kEpsilon && min_data_in_leaf < 2) {
min_data_in_leaf = 2;
Log::Warning("min_data_in_leaf has been increased to 2 because this is required when path smoothing is active.");
}
if (is_parallel && monotone_constraints_method == std::string("intermediate")) { if (is_parallel && monotone_constraints_method == std::string("intermediate")) {
// In distributed mode, local node doesn't have histograms on all features, cannot perform "intermediate" monotone constraints. // In distributed mode, local node doesn't have histograms on all features, cannot perform "intermediate" monotone constraints.
Log::Warning("Cannot use \"intermediate\" monotone constraints in parallel learning, auto set to \"basic\" method."); Log::Warning("Cannot use \"intermediate\" monotone constraints in parallel learning, auto set to \"basic\" method.");
......
...@@ -229,6 +229,7 @@ const std::unordered_set<std::string>& Config::parameter_set() { ...@@ -229,6 +229,7 @@ const std::unordered_set<std::string>& Config::parameter_set() {
"cegb_penalty_split", "cegb_penalty_split",
"cegb_penalty_feature_lazy", "cegb_penalty_feature_lazy",
"cegb_penalty_feature_coupled", "cegb_penalty_feature_coupled",
"path_smooth",
"verbosity", "verbosity",
"input_model", "input_model",
"output_model", "output_model",
...@@ -450,6 +451,9 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str ...@@ -450,6 +451,9 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str
cegb_penalty_feature_coupled = Common::StringToArray<double>(tmp_str, ','); cegb_penalty_feature_coupled = Common::StringToArray<double>(tmp_str, ',');
} }
GetDouble(params, "path_smooth", &path_smooth);
CHECK_GE(path_smooth, 0.0);
GetInt(params, "verbosity", &verbosity); GetInt(params, "verbosity", &verbosity);
GetString(params, "input_model", &input_model); GetString(params, "input_model", &input_model);
...@@ -654,6 +658,7 @@ std::string Config::SaveMembersToString() const { ...@@ -654,6 +658,7 @@ std::string Config::SaveMembersToString() const {
str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n"; str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n";
str_buf << "[cegb_penalty_feature_lazy: " << Common::Join(cegb_penalty_feature_lazy, ",") << "]\n"; str_buf << "[cegb_penalty_feature_lazy: " << Common::Join(cegb_penalty_feature_lazy, ",") << "]\n";
str_buf << "[cegb_penalty_feature_coupled: " << Common::Join(cegb_penalty_feature_coupled, ",") << "]\n"; str_buf << "[cegb_penalty_feature_coupled: " << Common::Join(cegb_penalty_feature_coupled, ",") << "]\n";
str_buf << "[path_smooth: " << path_smooth << "]\n";
str_buf << "[verbosity: " << verbosity << "]\n"; str_buf << "[verbosity: " << verbosity << "]\n";
str_buf << "[max_bin: " << max_bin << "]\n"; str_buf << "[max_bin: " << max_bin << "]\n";
str_buf << "[max_bin_by_feature: " << Common::Join(max_bin_by_feature, ",") << "]\n"; str_buf << "[max_bin_by_feature: " << Common::Join(max_bin_by_feature, ",") << "]\n";
......
...@@ -85,22 +85,24 @@ class FeatureHistogram { ...@@ -85,22 +85,24 @@ class FeatureHistogram {
void FindBestThreshold(double sum_gradient, double sum_hessian, void FindBestThreshold(double sum_gradient, double sum_hessian,
data_size_t num_data, data_size_t num_data,
const ConstraintEntry& constraints, const ConstraintEntry& constraints,
double parent_output,
SplitInfo* output) { SplitInfo* output) {
output->default_left = true; output->default_left = true;
output->gain = kMinScore; output->gain = kMinScore;
find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data,
constraints, output); constraints, parent_output, output);
output->gain *= meta_->penalty; output->gain *= meta_->penalty;
} }
template <bool USE_RAND, bool USE_L1, bool USE_MAX_OUTPUT> template <bool USE_RAND, bool USE_L1, bool USE_MAX_OUTPUT, bool USE_SMOOTHING>
double BeforeNumercal(double sum_gradient, double sum_hessian, double BeforeNumercal(double sum_gradient, double sum_hessian, double parent_output, data_size_t num_data,
SplitInfo* output, int* rand_threshold) { SplitInfo* output, int* rand_threshold) {
is_splittable_ = false; is_splittable_ = false;
output->monotone_type = meta_->monotone_type; output->monotone_type = meta_->monotone_type;
double gain_shift = GetLeafGain<USE_L1, USE_MAX_OUTPUT>(
sum_gradient, sum_hessian, meta_->config->lambda_l1, double gain_shift = GetLeafGain<USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
meta_->config->lambda_l2, meta_->config->max_delta_step); sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step, meta_->config->path_smooth, num_data, parent_output);
*rand_threshold = 0; *rand_threshold = 0;
if (USE_RAND) { if (USE_RAND) {
if (meta_->num_bin - 2 > 0) { if (meta_->num_bin - 2 > 0) {
...@@ -144,21 +146,30 @@ class FeatureHistogram { ...@@ -144,21 +146,30 @@ class FeatureHistogram {
template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT> template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT>
void FuncForNumricalL2() { void FuncForNumricalL2() {
#define TEMPLATE_PREFIX USE_RAND, USE_MC, USE_L1, USE_MAX_OUTPUT if (meta_->config->path_smooth > kEpsilon) {
FuncForNumricalL3<USE_RAND, USE_MC, USE_L1, USE_MAX_OUTPUT, true>();
} else {
FuncForNumricalL3<USE_RAND, USE_MC, USE_L1, USE_MAX_OUTPUT, false>();
}
}
template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT, bool USE_SMOOTHING>
void FuncForNumricalL3() {
#define TEMPLATE_PREFIX USE_RAND, USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING
#define LAMBDA_ARGUMENTS \ #define LAMBDA_ARGUMENTS \
double sum_gradient, double sum_hessian, data_size_t num_data, \ double sum_gradient, double sum_hessian, data_size_t num_data, \
const ConstraintEntry &constraints, SplitInfo *output const ConstraintEntry &constraints, double parent_output, SplitInfo *output
#define BEFORE_ARGUMENTS sum_gradient, sum_hessian, output, &rand_threshold #define BEFORE_ARGUMENTS sum_gradient, sum_hessian, parent_output, num_data, output, &rand_threshold
#define FUNC_ARGUMENTS \ #define FUNC_ARGUMENTS \
sum_gradient, sum_hessian, num_data, constraints, min_gain_shift, output, \ sum_gradient, sum_hessian, num_data, constraints, min_gain_shift, \
rand_threshold output, rand_threshold, parent_output
if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) { if (meta_->num_bin > 2 && meta_->missing_type != MissingType::None) {
if (meta_->missing_type == MissingType::Zero) { if (meta_->missing_type == MissingType::Zero) {
find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
int rand_threshold = 0; int rand_threshold = 0;
double min_gain_shift = double min_gain_shift =
BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>( BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
BEFORE_ARGUMENTS); BEFORE_ARGUMENTS);
FindBestThresholdSequentially<TEMPLATE_PREFIX, true, true, false>( FindBestThresholdSequentially<TEMPLATE_PREFIX, true, true, false>(
FUNC_ARGUMENTS); FUNC_ARGUMENTS);
...@@ -169,7 +180,7 @@ class FeatureHistogram { ...@@ -169,7 +180,7 @@ class FeatureHistogram {
find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
int rand_threshold = 0; int rand_threshold = 0;
double min_gain_shift = double min_gain_shift =
BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>( BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
BEFORE_ARGUMENTS); BEFORE_ARGUMENTS);
FindBestThresholdSequentially<TEMPLATE_PREFIX, true, false, true>( FindBestThresholdSequentially<TEMPLATE_PREFIX, true, false, true>(
FUNC_ARGUMENTS); FUNC_ARGUMENTS);
...@@ -182,7 +193,7 @@ class FeatureHistogram { ...@@ -182,7 +193,7 @@ class FeatureHistogram {
find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
int rand_threshold = 0; int rand_threshold = 0;
double min_gain_shift = double min_gain_shift =
BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>( BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
BEFORE_ARGUMENTS); BEFORE_ARGUMENTS);
FindBestThresholdSequentially<TEMPLATE_PREFIX, true, false, false>( FindBestThresholdSequentially<TEMPLATE_PREFIX, true, false, false>(
FUNC_ARGUMENTS); FUNC_ARGUMENTS);
...@@ -191,10 +202,9 @@ class FeatureHistogram { ...@@ -191,10 +202,9 @@ class FeatureHistogram {
find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) { find_best_threshold_fun_ = [=](LAMBDA_ARGUMENTS) {
int rand_threshold = 0; int rand_threshold = 0;
double min_gain_shift = double min_gain_shift =
BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT>( BeforeNumercal<USE_RAND, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
BEFORE_ARGUMENTS); BEFORE_ARGUMENTS);
FindBestThresholdSequentially<USE_RAND, USE_MC, USE_L1, FindBestThresholdSequentially<TEMPLATE_PREFIX, true, false, false>(
USE_MAX_OUTPUT, true, false, false>(
FUNC_ARGUMENTS); FUNC_ARGUMENTS);
output->default_left = false; output->default_left = false;
}; };
...@@ -224,42 +234,52 @@ class FeatureHistogram { ...@@ -224,42 +234,52 @@ class FeatureHistogram {
template <bool USE_RAND, bool USE_MC> template <bool USE_RAND, bool USE_MC>
void FuncForCategoricalL1() { void FuncForCategoricalL1() {
if (meta_->config->path_smooth > kEpsilon) {
FuncForCategoricalL2<USE_RAND, USE_MC, true>();
} else {
FuncForCategoricalL2<USE_RAND, USE_MC, false>();
}
}
template <bool USE_RAND, bool USE_MC, bool USE_SMOOTHING>
void FuncForCategoricalL2() {
#define ARGUMENTS \ #define ARGUMENTS \
std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, \ std::placeholders::_1, std::placeholders::_2, std::placeholders::_3, \
std::placeholders::_4, std::placeholders::_5 std::placeholders::_4, std::placeholders::_5, std::placeholders::_6
if (meta_->config->lambda_l1 > 0) { if (meta_->config->lambda_l1 > 0) {
if (meta_->config->max_delta_step > 0) { if (meta_->config->max_delta_step > 0) {
find_best_threshold_fun_ = find_best_threshold_fun_ =
std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner<
USE_RAND, USE_MC, true, true>, USE_RAND, USE_MC, true, true, USE_SMOOTHING>,
this, ARGUMENTS); this, ARGUMENTS);
} else { } else {
find_best_threshold_fun_ = find_best_threshold_fun_ =
std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner<
USE_RAND, USE_MC, true, false>, USE_RAND, USE_MC, true, false, USE_SMOOTHING>,
this, ARGUMENTS); this, ARGUMENTS);
} }
} else { } else {
if (meta_->config->max_delta_step > 0) { if (meta_->config->max_delta_step > 0) {
find_best_threshold_fun_ = find_best_threshold_fun_ =
std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner<
USE_RAND, USE_MC, false, true>, USE_RAND, USE_MC, false, true, USE_SMOOTHING>,
this, ARGUMENTS); this, ARGUMENTS);
} else { } else {
find_best_threshold_fun_ = find_best_threshold_fun_ =
std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner< std::bind(&FeatureHistogram::FindBestThresholdCategoricalInner<
USE_RAND, USE_MC, false, false>, USE_RAND, USE_MC, false, false, USE_SMOOTHING>,
this, ARGUMENTS); this, ARGUMENTS);
} }
} }
#undef ARGUMENTS #undef ARGUMENTS
} }
template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT> template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT, bool USE_SMOOTHING>
void FindBestThresholdCategoricalInner(double sum_gradient, void FindBestThresholdCategoricalInner(double sum_gradient,
double sum_hessian, double sum_hessian,
data_size_t num_data, data_size_t num_data,
const ConstraintEntry& constraints, const ConstraintEntry& constraints,
double parent_output,
SplitInfo* output) { SplitInfo* output) {
is_splittable_ = false; is_splittable_ = false;
output->default_left = false; output->default_left = false;
...@@ -267,9 +287,17 @@ class FeatureHistogram { ...@@ -267,9 +287,17 @@ class FeatureHistogram {
data_size_t best_left_count = 0; data_size_t best_left_count = 0;
double best_sum_left_gradient = 0; double best_sum_left_gradient = 0;
double best_sum_left_hessian = 0; double best_sum_left_hessian = 0;
double gain_shift = GetLeafGain<USE_L1, USE_MAX_OUTPUT>( double gain_shift;
sum_gradient, sum_hessian, meta_->config->lambda_l1, if (USE_SMOOTHING) {
meta_->config->lambda_l2, meta_->config->max_delta_step); gain_shift = GetLeafGainGivenOutput<USE_L1>(
sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output);
} else {
// Need special case for no smoothing to preserve existing behaviour. If no smoothing, the parent output is calculated
// with the larger categorical l2, whereas min_split_gain uses the original l2.
gain_shift = GetLeafGain<USE_L1, USE_MAX_OUTPUT, false>(sum_gradient, sum_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->max_delta_step, 0,
num_data, 0);
}
double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
bool is_full_categorical = meta_->missing_type == MissingType::None; bool is_full_categorical = meta_->missing_type == MissingType::None;
...@@ -317,10 +345,10 @@ class FeatureHistogram { ...@@ -317,10 +345,10 @@ class FeatureHistogram {
} }
} }
// current split gain // current split gain
double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT>( double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
sum_other_gradient, sum_other_hessian, grad, hess + kEpsilon, sum_other_gradient, sum_other_hessian, grad, hess + kEpsilon,
meta_->config->lambda_l1, l2, meta_->config->max_delta_step, meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
constraints, 0); constraints, 0, meta_->config->path_smooth, other_count, cnt, parent_output);
// gain with split is worse than without split // gain with split is worse than without split
if (current_gain <= min_gain_shift) { if (current_gain <= min_gain_shift) {
continue; continue;
...@@ -419,10 +447,11 @@ class FeatureHistogram { ...@@ -419,10 +447,11 @@ class FeatureHistogram {
continue; continue;
} }
} }
double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT>( double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_left_gradient, sum_left_hessian, sum_right_gradient,
sum_right_hessian, meta_->config->lambda_l1, l2, sum_right_hessian, meta_->config->lambda_l1, l2,
meta_->config->max_delta_step, constraints, 0); meta_->config->max_delta_step, constraints, 0, meta_->config->path_smooth,
left_count, right_count, parent_output);
if (current_gain <= min_gain_shift) { if (current_gain <= min_gain_shift) {
continue; continue;
} }
...@@ -440,19 +469,18 @@ class FeatureHistogram { ...@@ -440,19 +469,18 @@ class FeatureHistogram {
} }
if (is_splittable_) { if (is_splittable_) {
output->left_output = output->left_output = CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>( best_sum_left_gradient, best_sum_left_hessian,
best_sum_left_gradient, best_sum_left_hessian, meta_->config->lambda_l1, l2, meta_->config->max_delta_step,
meta_->config->lambda_l1, l2, meta_->config->max_delta_step, constraints, meta_->config->path_smooth, best_left_count, parent_output);
constraints);
output->left_count = best_left_count; output->left_count = best_left_count;
output->left_sum_gradient = best_sum_left_gradient; output->left_sum_gradient = best_sum_left_gradient;
output->left_sum_hessian = best_sum_left_hessian - kEpsilon; output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
output->right_output = output->right_output = CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>( sum_gradient - best_sum_left_gradient,
sum_gradient - best_sum_left_gradient, sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2,
sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, l2, meta_->config->max_delta_step, constraints, meta_->config->path_smooth,
meta_->config->max_delta_step, constraints); num_data - best_left_count, parent_output);
output->right_count = num_data - best_left_count; output->right_count = num_data - best_left_count;
output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = output->right_sum_hessian =
...@@ -484,22 +512,38 @@ class FeatureHistogram { ...@@ -484,22 +512,38 @@ class FeatureHistogram {
void GatherInfoForThreshold(double sum_gradient, double sum_hessian, void GatherInfoForThreshold(double sum_gradient, double sum_hessian,
uint32_t threshold, data_size_t num_data, uint32_t threshold, data_size_t num_data,
SplitInfo* output) { double parent_output, SplitInfo* output) {
if (meta_->bin_type == BinType::NumericalBin) { if (meta_->bin_type == BinType::NumericalBin) {
GatherInfoForThresholdNumerical(sum_gradient, sum_hessian, threshold, GatherInfoForThresholdNumerical(sum_gradient, sum_hessian, threshold,
num_data, output); num_data, parent_output, output);
} else { } else {
GatherInfoForThresholdCategorical(sum_gradient, sum_hessian, threshold, GatherInfoForThresholdCategorical(sum_gradient, sum_hessian, threshold,
num_data, output); num_data, parent_output, output);
} }
} }
void GatherInfoForThresholdNumerical(double sum_gradient, double sum_hessian, void GatherInfoForThresholdNumerical(double sum_gradient, double sum_hessian,
uint32_t threshold, data_size_t num_data, uint32_t threshold, data_size_t num_data,
SplitInfo* output) { double parent_output, SplitInfo* output) {
double gain_shift = GetLeafGain<true, true>( bool use_smoothing = meta_->config->path_smooth > kEpsilon;
if (use_smoothing) {
GatherInfoForThresholdNumericalInner<true>(sum_gradient, sum_hessian,
threshold, num_data,
parent_output, output);
} else {
GatherInfoForThresholdNumericalInner<false>(sum_gradient, sum_hessian,
threshold, num_data,
parent_output, output);
}
}
template<bool USE_SMOOTHING>
void GatherInfoForThresholdNumericalInner(double sum_gradient, double sum_hessian,
uint32_t threshold, data_size_t num_data,
double parent_output, SplitInfo* output) {
double gain_shift = GetLeafGainGivenOutput<true>(
sum_gradient, sum_hessian, meta_->config->lambda_l1, sum_gradient, sum_hessian, meta_->config->lambda_l1,
meta_->config->lambda_l2, meta_->config->max_delta_step); meta_->config->lambda_l2, parent_output);
double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
// do stuff here // do stuff here
...@@ -544,12 +588,14 @@ class FeatureHistogram { ...@@ -544,12 +588,14 @@ class FeatureHistogram {
double sum_left_hessian = sum_hessian - sum_right_hessian; double sum_left_hessian = sum_hessian - sum_right_hessian;
data_size_t left_count = num_data - right_count; data_size_t left_count = num_data - right_count;
double current_gain = double current_gain =
GetLeafGain<true, true>( GetLeafGain<true, true, USE_SMOOTHING>(
sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1,
meta_->config->lambda_l2, meta_->config->max_delta_step) + meta_->config->lambda_l2, meta_->config->max_delta_step,
GetLeafGain<true, true>( meta_->config->path_smooth, left_count, parent_output) +
GetLeafGain<true, true, USE_SMOOTHING>(
sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1,
meta_->config->lambda_l2, meta_->config->max_delta_step); meta_->config->lambda_l2, meta_->config->max_delta_step,
meta_->config->path_smooth, right_count, parent_output);
// gain with split is worse than without split // gain with split is worse than without split
if (std::isnan(current_gain) || current_gain <= min_gain_shift) { if (std::isnan(current_gain) || current_gain <= min_gain_shift) {
...@@ -561,16 +607,18 @@ class FeatureHistogram { ...@@ -561,16 +607,18 @@ class FeatureHistogram {
// update split information // update split information
output->threshold = threshold; output->threshold = threshold;
output->left_output = CalculateSplittedLeafOutput<true, true>( output->left_output = CalculateSplittedLeafOutput<true, true, USE_SMOOTHING>(
sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1,
meta_->config->lambda_l2, meta_->config->max_delta_step); meta_->config->lambda_l2, meta_->config->max_delta_step,
meta_->config->path_smooth, left_count, parent_output);
output->left_count = left_count; output->left_count = left_count;
output->left_sum_gradient = sum_left_gradient; output->left_sum_gradient = sum_left_gradient;
output->left_sum_hessian = sum_left_hessian - kEpsilon; output->left_sum_hessian = sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput<true, true>( output->right_output = CalculateSplittedLeafOutput<true, true, USE_SMOOTHING>(
sum_gradient - sum_left_gradient, sum_hessian - sum_left_hessian, sum_gradient - sum_left_gradient, sum_hessian - sum_left_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step); meta_->config->max_delta_step, meta_->config->path_smooth,
right_count, parent_output);
output->right_count = num_data - left_count; output->right_count = num_data - left_count;
output->right_sum_gradient = sum_gradient - sum_left_gradient; output->right_sum_gradient = sum_gradient - sum_left_gradient;
output->right_sum_hessian = sum_hessian - sum_left_hessian - kEpsilon; output->right_sum_hessian = sum_hessian - sum_left_hessian - kEpsilon;
...@@ -578,15 +626,28 @@ class FeatureHistogram { ...@@ -578,15 +626,28 @@ class FeatureHistogram {
output->default_left = true; output->default_left = true;
} }
void GatherInfoForThresholdCategorical(double sum_gradient, void GatherInfoForThresholdCategorical(double sum_gradient, double sum_hessian,
double sum_hessian, uint32_t threshold, uint32_t threshold, data_size_t num_data,
data_size_t num_data, double parent_output, SplitInfo* output) {
SplitInfo* output) { bool use_smoothing = meta_->config->path_smooth > kEpsilon;
if (use_smoothing) {
GatherInfoForThresholdCategoricalInner<true>(sum_gradient, sum_hessian, threshold,
num_data, parent_output, output);
} else {
GatherInfoForThresholdCategoricalInner<false>(sum_gradient, sum_hessian, threshold,
num_data, parent_output, output);
}
}
template<bool USE_SMOOTHING>
void GatherInfoForThresholdCategoricalInner(double sum_gradient,
double sum_hessian, uint32_t threshold,
data_size_t num_data, double parent_output,
SplitInfo* output) {
// get SplitInfo for a given one-hot categorical split. // get SplitInfo for a given one-hot categorical split.
output->default_left = false; output->default_left = false;
double gain_shift = GetLeafGain<true, true>( double gain_shift = GetLeafGainGivenOutput<true>(
sum_gradient, sum_hessian, meta_->config->lambda_l1, sum_gradient, sum_hessian, meta_->config->lambda_l1, meta_->config->lambda_l2, parent_output);
meta_->config->lambda_l2, meta_->config->max_delta_step);
double min_gain_shift = gain_shift + meta_->config->min_gain_to_split; double min_gain_shift = gain_shift + meta_->config->min_gain_to_split;
bool is_full_categorical = meta_->missing_type == MissingType::None; bool is_full_categorical = meta_->missing_type == MissingType::None;
int used_bin = meta_->num_bin - 1 + is_full_categorical; int used_bin = meta_->num_bin - 1 + is_full_categorical;
...@@ -610,28 +671,33 @@ class FeatureHistogram { ...@@ -610,28 +671,33 @@ class FeatureHistogram {
double sum_right_gradient = sum_gradient - sum_left_gradient; double sum_right_gradient = sum_gradient - sum_left_gradient;
// current split gain // current split gain
double current_gain = double current_gain =
GetLeafGain<true, true>(sum_right_gradient, sum_right_hessian, GetLeafGain<true, true, USE_SMOOTHING>(sum_right_gradient, sum_right_hessian,
meta_->config->lambda_l1, l2, meta_->config->lambda_l1, l2,
meta_->config->max_delta_step) + meta_->config->max_delta_step,
GetLeafGain<true, true>(sum_left_gradient, sum_left_hessian, meta_->config->path_smooth, right_count,
meta_->config->lambda_l1, l2, parent_output) +
meta_->config->max_delta_step); GetLeafGain<true, true, USE_SMOOTHING>(sum_left_gradient, sum_left_hessian,
meta_->config->lambda_l1, l2,
meta_->config->max_delta_step,
meta_->config->path_smooth, left_count,
parent_output);
if (std::isnan(current_gain) || current_gain <= min_gain_shift) { if (std::isnan(current_gain) || current_gain <= min_gain_shift) {
output->gain = kMinScore; output->gain = kMinScore;
Log::Warning( Log::Warning(
"'Forced Split' will be ignored since the gain getting worse."); "'Forced Split' will be ignored since the gain getting worse.");
return; return;
} }
output->left_output = CalculateSplittedLeafOutput<true, true, USE_SMOOTHING>(
output->left_output = CalculateSplittedLeafOutput<true, true>(
sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, l2, sum_left_gradient, sum_left_hessian, meta_->config->lambda_l1, l2,
meta_->config->max_delta_step); meta_->config->max_delta_step, meta_->config->path_smooth, left_count,
parent_output);
output->left_count = left_count; output->left_count = left_count;
output->left_sum_gradient = sum_left_gradient; output->left_sum_gradient = sum_left_gradient;
output->left_sum_hessian = sum_left_hessian - kEpsilon; output->left_sum_hessian = sum_left_hessian - kEpsilon;
output->right_output = CalculateSplittedLeafOutput<true, true>( output->right_output = CalculateSplittedLeafOutput<true, true, USE_SMOOTHING>(
sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2, sum_right_gradient, sum_right_hessian, meta_->config->lambda_l1, l2,
meta_->config->max_delta_step); meta_->config->max_delta_step, meta_->config->path_smooth, right_count,
parent_output);
output->right_count = right_count; output->right_count = right_count;
output->right_sum_gradient = sum_gradient - sum_left_gradient; output->right_sum_gradient = sum_gradient - sum_left_gradient;
output->right_sum_hessian = sum_right_hessian - kEpsilon; output->right_sum_hessian = sum_right_hessian - kEpsilon;
...@@ -670,35 +736,37 @@ class FeatureHistogram { ...@@ -670,35 +736,37 @@ class FeatureHistogram {
return Common::Sign(s) * reg_s; return Common::Sign(s) * reg_s;
} }
template <bool USE_L1, bool USE_MAX_OUTPUT> template <bool USE_L1, bool USE_MAX_OUTPUT, bool USE_SMOOTHING>
static double CalculateSplittedLeafOutput(double sum_gradients, static double CalculateSplittedLeafOutput(double sum_gradients,
double sum_hessians, double l1, double sum_hessians, double l1,
double l2, double max_delta_step) { double l2, double max_delta_step,
double smoothing, data_size_t num_data,
double parent_output) {
double ret;
if (USE_L1) { if (USE_L1) {
double ret = -ThresholdL1(sum_gradients, l1) / (sum_hessians + l2); ret = -ThresholdL1(sum_gradients, l1) / (sum_hessians + l2);
if (USE_MAX_OUTPUT) {
if (max_delta_step > 0 && std::fabs(ret) > max_delta_step) {
return Common::Sign(ret) * max_delta_step;
}
}
return ret;
} else { } else {
double ret = -sum_gradients / (sum_hessians + l2); ret = -sum_gradients / (sum_hessians + l2);
if (USE_MAX_OUTPUT) { }
if (max_delta_step > 0 && std::fabs(ret) > max_delta_step) { if (USE_MAX_OUTPUT) {
return Common::Sign(ret) * max_delta_step; if (max_delta_step > 0 && std::fabs(ret) > max_delta_step) {
} ret = Common::Sign(ret) * max_delta_step;
} }
return ret;
} }
if (USE_SMOOTHING) {
ret = ret * (num_data / smoothing) / (num_data / smoothing + 1) \
+ parent_output / (num_data / smoothing + 1);
}
return ret;
} }
template <bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT> template <bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT, bool USE_SMOOTHING>
static double CalculateSplittedLeafOutput( static double CalculateSplittedLeafOutput(
double sum_gradients, double sum_hessians, double l1, double l2, double sum_gradients, double sum_hessians, double l1, double l2,
double max_delta_step, const ConstraintEntry& constraints) { double max_delta_step, const ConstraintEntry& constraints,
double ret = CalculateSplittedLeafOutput<USE_L1, USE_MAX_OUTPUT>( double smoothing, data_size_t num_data, double parent_output) {
sum_gradients, sum_hessians, l1, l2, max_delta_step); double ret = CalculateSplittedLeafOutput<USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
sum_gradients, sum_hessians, l1, l2, max_delta_step, smoothing, num_data, parent_output);
if (USE_MC) { if (USE_MC) {
if (ret < constraints.min) { if (ret < constraints.min) {
ret = constraints.min; ret = constraints.min;
...@@ -710,30 +778,36 @@ class FeatureHistogram { ...@@ -710,30 +778,36 @@ class FeatureHistogram {
} }
private: private:
template <bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT> template <bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT, bool USE_SMOOTHING>
static double GetSplitGains(double sum_left_gradients, static double GetSplitGains(double sum_left_gradients,
double sum_left_hessians, double sum_left_hessians,
double sum_right_gradients, double sum_right_gradients,
double sum_right_hessians, double l1, double l2, double sum_right_hessians, double l1, double l2,
double max_delta_step, double max_delta_step,
const ConstraintEntry& constraints, const ConstraintEntry& constraints,
int8_t monotone_constraint) { int8_t monotone_constraint,
double smoothing,
data_size_t left_count,
data_size_t right_count,
double parent_output) {
if (!USE_MC) { if (!USE_MC) {
return GetLeafGain<USE_L1, USE_MAX_OUTPUT>(sum_left_gradients, return GetLeafGain<USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(sum_left_gradients,
sum_left_hessians, l1, l2, sum_left_hessians, l1, l2,
max_delta_step) + max_delta_step, smoothing,
GetLeafGain<USE_L1, USE_MAX_OUTPUT>(sum_right_gradients, left_count, parent_output) +
sum_right_hessians, l1, l2, GetLeafGain<USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(sum_right_gradients,
max_delta_step); sum_right_hessians, l1, l2,
max_delta_step, smoothing,
right_count, parent_output);
} else { } else {
double left_output = double left_output =
CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>( CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step, sum_left_gradients, sum_left_hessians, l1, l2, max_delta_step,
constraints); constraints, smoothing, left_count, parent_output);
double right_output = double right_output =
CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>( CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step, sum_right_gradients, sum_right_hessians, l1, l2, max_delta_step,
constraints); constraints, smoothing, right_count, parent_output);
if (((monotone_constraint > 0) && (left_output > right_output)) || if (((monotone_constraint > 0) && (left_output > right_output)) ||
((monotone_constraint < 0) && (left_output < right_output))) { ((monotone_constraint < 0) && (left_output < right_output))) {
return 0; return 0;
...@@ -745,10 +819,11 @@ class FeatureHistogram { ...@@ -745,10 +819,11 @@ class FeatureHistogram {
} }
} }
template <bool USE_L1, bool USE_MAX_OUTPUT> template <bool USE_L1, bool USE_MAX_OUTPUT, bool USE_SMOOTHING>
static double GetLeafGain(double sum_gradients, double sum_hessians, static double GetLeafGain(double sum_gradients, double sum_hessians,
double l1, double l2, double max_delta_step) { double l1, double l2, double max_delta_step,
if (!USE_MAX_OUTPUT) { double smoothing, data_size_t num_data, double parent_output) {
if (!USE_MAX_OUTPUT && !USE_SMOOTHING) {
if (USE_L1) { if (USE_L1) {
const double sg_l1 = ThresholdL1(sum_gradients, l1); const double sg_l1 = ThresholdL1(sum_gradients, l1);
return (sg_l1 * sg_l1) / (sum_hessians + l2); return (sg_l1 * sg_l1) / (sum_hessians + l2);
...@@ -756,10 +831,9 @@ class FeatureHistogram { ...@@ -756,10 +831,9 @@ class FeatureHistogram {
return (sum_gradients * sum_gradients) / (sum_hessians + l2); return (sum_gradients * sum_gradients) / (sum_hessians + l2);
} }
} else { } else {
double output = CalculateSplittedLeafOutput<USE_L1, USE_MAX_OUTPUT>( double output = CalculateSplittedLeafOutput<USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
sum_gradients, sum_hessians, l1, l2, max_delta_step); sum_gradients, sum_hessians, l1, l2, max_delta_step, smoothing, num_data, parent_output);
return GetLeafGainGivenOutput<USE_L1>(sum_gradients, sum_hessians, l1, l2, return GetLeafGainGivenOutput<USE_L1>(sum_gradients, sum_hessians, l1, l2, output);
output);
} }
} }
...@@ -776,13 +850,13 @@ class FeatureHistogram { ...@@ -776,13 +850,13 @@ class FeatureHistogram {
} }
} }
template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT, template <bool USE_RAND, bool USE_MC, bool USE_L1, bool USE_MAX_OUTPUT, bool USE_SMOOTHING,
bool REVERSE, bool SKIP_DEFAULT_BIN, bool NA_AS_MISSING> bool REVERSE, bool SKIP_DEFAULT_BIN, bool NA_AS_MISSING>
void FindBestThresholdSequentially(double sum_gradient, double sum_hessian, void FindBestThresholdSequentially(double sum_gradient, double sum_hessian,
data_size_t num_data, data_size_t num_data,
const ConstraintEntry& constraints, const ConstraintEntry& constraints,
double min_gain_shift, SplitInfo* output, double min_gain_shift, SplitInfo* output,
int rand_threshold) { int rand_threshold, double parent_output) {
const int8_t offset = meta_->offset; const int8_t offset = meta_->offset;
double best_sum_left_gradient = NAN; double best_sum_left_gradient = NAN;
double best_sum_left_hessian = NAN; double best_sum_left_hessian = NAN;
...@@ -837,11 +911,12 @@ class FeatureHistogram { ...@@ -837,11 +911,12 @@ class FeatureHistogram {
} }
} }
// current split gain // current split gain
double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT>( double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_left_gradient, sum_left_hessian, sum_right_gradient,
sum_right_hessian, meta_->config->lambda_l1, sum_right_hessian, meta_->config->lambda_l1,
meta_->config->lambda_l2, meta_->config->max_delta_step, meta_->config->lambda_l2, meta_->config->max_delta_step,
constraints, meta_->monotone_type); constraints, meta_->monotone_type, meta_->config->path_smooth,
left_count, right_count, parent_output);
// gain with split is worse than without split // gain with split is worse than without split
if (current_gain <= min_gain_shift) { if (current_gain <= min_gain_shift) {
continue; continue;
...@@ -921,11 +996,12 @@ class FeatureHistogram { ...@@ -921,11 +996,12 @@ class FeatureHistogram {
} }
} }
// current split gain // current split gain
double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT>( double current_gain = GetSplitGains<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
sum_left_gradient, sum_left_hessian, sum_right_gradient, sum_left_gradient, sum_left_hessian, sum_right_gradient,
sum_right_hessian, meta_->config->lambda_l1, sum_right_hessian, meta_->config->lambda_l1,
meta_->config->lambda_l2, meta_->config->max_delta_step, meta_->config->lambda_l2, meta_->config->max_delta_step,
constraints, meta_->monotone_type); constraints, meta_->monotone_type, meta_->config->path_smooth, left_count,
right_count, parent_output);
// gain with split is worse than without split // gain with split is worse than without split
if (current_gain <= min_gain_shift) { if (current_gain <= min_gain_shift) {
continue; continue;
...@@ -948,19 +1024,21 @@ class FeatureHistogram { ...@@ -948,19 +1024,21 @@ class FeatureHistogram {
// update split information // update split information
output->threshold = best_threshold; output->threshold = best_threshold;
output->left_output = output->left_output =
CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>( CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
best_sum_left_gradient, best_sum_left_hessian, best_sum_left_gradient, best_sum_left_hessian,
meta_->config->lambda_l1, meta_->config->lambda_l2, meta_->config->lambda_l1, meta_->config->lambda_l2,
meta_->config->max_delta_step, constraints); meta_->config->max_delta_step, constraints, meta_->config->path_smooth,
best_left_count, parent_output);
output->left_count = best_left_count; output->left_count = best_left_count;
output->left_sum_gradient = best_sum_left_gradient; output->left_sum_gradient = best_sum_left_gradient;
output->left_sum_hessian = best_sum_left_hessian - kEpsilon; output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
output->right_output = output->right_output =
CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT>( CalculateSplittedLeafOutput<USE_MC, USE_L1, USE_MAX_OUTPUT, USE_SMOOTHING>(
sum_gradient - best_sum_left_gradient, sum_gradient - best_sum_left_gradient,
sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1, sum_hessian - best_sum_left_hessian, meta_->config->lambda_l1,
meta_->config->lambda_l2, meta_->config->max_delta_step, meta_->config->lambda_l2, meta_->config->max_delta_step,
constraints); constraints, meta_->config->path_smooth, num_data - best_left_count,
parent_output);
output->right_count = num_data - best_left_count; output->right_count = num_data - best_left_count;
output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = output->right_sum_hessian =
...@@ -976,7 +1054,7 @@ class FeatureHistogram { ...@@ -976,7 +1054,7 @@ class FeatureHistogram {
bool is_splittable_ = true; bool is_splittable_ = true;
std::function<void(double, double, data_size_t, const ConstraintEntry&, std::function<void(double, double, data_size_t, const ConstraintEntry&,
SplitInfo*)> double, SplitInfo*)>
find_best_threshold_fun_; find_best_threshold_fun_;
}; };
...@@ -1133,7 +1211,8 @@ class HistogramPool { ...@@ -1133,7 +1211,8 @@ class HistogramPool {
if (old_config->lambda_l1 != config->lambda_l1 || if (old_config->lambda_l1 != config->lambda_l1 ||
old_config->monotone_constraints != config->monotone_constraints || old_config->monotone_constraints != config->monotone_constraints ||
old_config->extra_trees != config->extra_trees || old_config->extra_trees != config->extra_trees ||
old_config->max_delta_step != config->max_delta_step) { old_config->max_delta_step != config->max_delta_step ||
old_config->path_smooth != config->path_smooth) {
#pragma omp parallel for schedule(static) #pragma omp parallel for schedule(static)
for (int i = 0; i < cache_size_; ++i) { for (int i = 0; i < cache_size_; ++i) {
for (int j = 0; j < train_data->num_features(); ++j) { for (int j = 0; j < train_data->num_features(); ++j) {
......
...@@ -1090,8 +1090,8 @@ void GPUTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right ...@@ -1090,8 +1090,8 @@ void GPUTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right
Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf());
} }
} else { } else {
smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian); smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), best_split_info.right_sum_gradient, best_split_info.right_sum_hessian, best_split_info.right_output);
larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian); larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), best_split_info.left_sum_gradient, best_split_info.left_sum_hessian, best_split_info.left_output);
if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) || if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) ||
(best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) { (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) {
Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf());
......
...@@ -21,7 +21,7 @@ class LeafSplits { ...@@ -21,7 +21,7 @@ class LeafSplits {
public: public:
explicit LeafSplits(data_size_t num_data) explicit LeafSplits(data_size_t num_data)
:num_data_in_leaf_(num_data), num_data_(num_data), :num_data_in_leaf_(num_data), num_data_(num_data),
data_indices_(nullptr) { data_indices_(nullptr), weight_(0) {
} }
void ResetNumData(data_size_t num_data) { void ResetNumData(data_size_t num_data) {
num_data_ = num_data; num_data_ = num_data;
...@@ -37,11 +37,13 @@ class LeafSplits { ...@@ -37,11 +37,13 @@ class LeafSplits {
* \param sum_gradients * \param sum_gradients
* \param sum_hessians * \param sum_hessians
*/ */
void Init(int leaf, const DataPartition* data_partition, double sum_gradients, double sum_hessians) { void Init(int leaf, const DataPartition* data_partition, double sum_gradients,
double sum_hessians, double weight) {
leaf_index_ = leaf; leaf_index_ = leaf;
data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); data_indices_ = data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_);
sum_gradients_ = sum_gradients; sum_gradients_ = sum_gradients;
sum_hessians_ = sum_hessians; sum_hessians_ = sum_hessians;
weight_ = weight;
} }
/*! /*!
...@@ -135,6 +137,10 @@ class LeafSplits { ...@@ -135,6 +137,10 @@ class LeafSplits {
/*! \brief Get indices of data of current leaf */ /*! \brief Get indices of data of current leaf */
const data_size_t* data_indices() const { return data_indices_; } const data_size_t* data_indices() const { return data_indices_; }
/*! \brief Get weight of current leaf */
double weight() const { return weight_; }
private: private:
/*! \brief current leaf index */ /*! \brief current leaf index */
...@@ -149,6 +155,8 @@ class LeafSplits { ...@@ -149,6 +155,8 @@ class LeafSplits {
double sum_hessians_; double sum_hessians_;
/*! \brief indices of data of current leaf */ /*! \brief indices of data of current leaf */
const data_size_t* data_indices_; const data_size_t* data_indices_;
/*! \brief weight of current leaf */
double weight_;
}; };
} // namespace LightGBM } // namespace LightGBM
......
...@@ -214,9 +214,16 @@ Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t* ...@@ -214,9 +214,16 @@ Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t*
sum_grad += gradients[idx]; sum_grad += gradients[idx];
sum_hess += hessians[idx]; sum_hess += hessians[idx];
} }
double output = FeatureHistogram::CalculateSplittedLeafOutput<true, true>( double output;
sum_grad, sum_hess, config_->lambda_l1, config_->lambda_l2, if ((config_->path_smooth > kEpsilon) & (i > 0)) {
config_->max_delta_step); output = FeatureHistogram::CalculateSplittedLeafOutput<true, true, true>(
sum_grad, sum_hess, config_->lambda_l1, config_->lambda_l2,
config_->max_delta_step, config_->path_smooth, cnt_leaf_data, tree->leaf_parent(i));
} else {
output = FeatureHistogram::CalculateSplittedLeafOutput<true, true, false>(
sum_grad, sum_hess, config_->lambda_l1, config_->lambda_l2,
config_->max_delta_step, config_->path_smooth, cnt_leaf_data, 0);
}
auto old_leaf_output = tree->LeafOutput(i); auto old_leaf_output = tree->LeafOutput(i);
auto new_leaf_output = output * tree->shrinkage(); auto new_leaf_output = output * tree->shrinkage();
tree->SetLeafOutput(i, config_->refit_decay_rate * old_leaf_output + (1.0 - config_->refit_decay_rate) * new_leaf_output); tree->SetLeafOutput(i, config_->refit_decay_rate * old_leaf_output + (1.0 - config_->refit_decay_rate) * new_leaf_output);
...@@ -449,6 +456,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf, ...@@ -449,6 +456,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf,
left_leaf_splits->sum_hessians(), left_leaf_splits->sum_hessians(),
left_threshold, left_threshold,
left_leaf_splits->num_data_in_leaf(), left_leaf_splits->num_data_in_leaf(),
left_leaf_splits->weight(),
&left_split); &left_split);
left_split.feature = left_feature; left_split.feature = left_feature;
forceSplitMap[*left_leaf] = left_split; forceSplitMap[*left_leaf] = left_split;
...@@ -470,6 +478,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf, ...@@ -470,6 +478,7 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf,
right_leaf_splits->sum_hessians(), right_leaf_splits->sum_hessians(),
right_threshold, right_threshold,
right_leaf_splits->num_data_in_leaf(), right_leaf_splits->num_data_in_leaf(),
right_leaf_splits->weight(),
&right_split); &right_split);
right_split.feature = right_feature; right_split.feature = right_feature;
forceSplitMap[*right_leaf] = right_split; forceSplitMap[*right_leaf] = right_split;
...@@ -613,18 +622,22 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf, ...@@ -613,18 +622,22 @@ void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf,
CHECK_GT(best_split_info.left_count, 0); CHECK_GT(best_split_info.left_count, 0);
smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(), smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(),
best_split_info.left_sum_gradient, best_split_info.left_sum_gradient,
best_split_info.left_sum_hessian); best_split_info.left_sum_hessian,
best_split_info.left_output);
larger_leaf_splits_->Init(*right_leaf, data_partition_.get(), larger_leaf_splits_->Init(*right_leaf, data_partition_.get(),
best_split_info.right_sum_gradient, best_split_info.right_sum_gradient,
best_split_info.right_sum_hessian); best_split_info.right_sum_hessian,
best_split_info.right_output);
} else { } else {
CHECK_GT(best_split_info.right_count, 0); CHECK_GT(best_split_info.right_count, 0);
smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(), smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(),
best_split_info.right_sum_gradient, best_split_info.right_sum_gradient,
best_split_info.right_sum_hessian); best_split_info.right_sum_hessian,
best_split_info.right_output);
larger_leaf_splits_->Init(*left_leaf, data_partition_.get(), larger_leaf_splits_->Init(*left_leaf, data_partition_.get(),
best_split_info.left_sum_gradient, best_split_info.left_sum_gradient,
best_split_info.left_sum_hessian); best_split_info.left_sum_hessian,
best_split_info.left_output);
} }
auto leaves_need_update = constraints_->Update( auto leaves_need_update = constraints_->Update(
tree, is_numerical_split, *left_leaf, *right_leaf, tree, is_numerical_split, *left_leaf, *right_leaf,
...@@ -685,9 +698,19 @@ void SerialTreeLearner::ComputeBestSplitForFeature( ...@@ -685,9 +698,19 @@ void SerialTreeLearner::ComputeBestSplitForFeature(
return; return;
} }
SplitInfo new_split; SplitInfo new_split;
double parent_output;
if (leaf_splits->leaf_index() == 0) {
// for root leaf the "parent" output is its own output because we don't apply any smoothing to the root
parent_output = FeatureHistogram::CalculateSplittedLeafOutput<true, true, true, false>(
leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), config_->lambda_l1,
config_->lambda_l2, config_->max_delta_step, constraints_->Get(leaf_splits->leaf_index()),
config_->path_smooth, static_cast<data_size_t>(num_data), 0);
} else {
parent_output = leaf_splits->weight();
}
histogram_array_[feature_index].FindBestThreshold( histogram_array_[feature_index].FindBestThreshold(
leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data, leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data,
constraints_->Get(leaf_splits->leaf_index()), &new_split); constraints_->Get(leaf_splits->leaf_index()), parent_output, &new_split);
new_split.feature = real_fidx; new_split.feature = real_fidx;
if (cegb_ != nullptr) { if (cegb_ != nullptr) {
new_split.gain -= new_split.gain -=
......
...@@ -436,17 +436,21 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Split(Tree* tree, int best_Leaf, ...@@ -436,17 +436,21 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Split(Tree* tree, int best_Leaf,
if (best_split_info.left_count < best_split_info.right_count) { if (best_split_info.left_count < best_split_info.right_count) {
smaller_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(), smaller_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(),
best_split_info.left_sum_gradient, best_split_info.left_sum_gradient,
best_split_info.left_sum_hessian); best_split_info.left_sum_hessian,
best_split_info.left_output);
larger_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(), larger_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(),
best_split_info.right_sum_gradient, best_split_info.right_sum_gradient,
best_split_info.right_sum_hessian); best_split_info.right_sum_hessian,
best_split_info.right_output);
} else { } else {
smaller_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(), smaller_leaf_splits_global_->Init(*right_leaf, this->data_partition_.get(),
best_split_info.right_sum_gradient, best_split_info.right_sum_gradient,
best_split_info.right_sum_hessian); best_split_info.right_sum_hessian,
best_split_info.right_output);
larger_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(), larger_leaf_splits_global_->Init(*left_leaf, this->data_partition_.get(),
best_split_info.left_sum_gradient, best_split_info.left_sum_gradient,
best_split_info.left_sum_hessian); best_split_info.left_sum_hessian,
best_split_info.left_output);
} }
} }
......
...@@ -2118,6 +2118,23 @@ class TestEngine(unittest.TestCase): ...@@ -2118,6 +2118,23 @@ class TestEngine(unittest.TestCase):
err_new = mean_squared_error(y, predicted_new) err_new = mean_squared_error(y, predicted_new)
self.assertLess(err, err_new) self.assertLess(err, err_new)
def test_path_smoothing(self):
# check path smoothing increases regularization
X, y = load_boston(True)
lgb_x = lgb.Dataset(X, label=y)
params = {'objective': 'regression',
'num_leaves': 32,
'verbose': -1,
'seed': 0}
est = lgb.train(params, lgb_x, num_boost_round=10)
predicted = est.predict(X)
err = mean_squared_error(y, predicted)
params['path_smooth'] = 1
est = lgb.train(params, lgb_x, num_boost_round=10)
predicted_new = est.predict(X)
err_new = mean_squared_error(y, predicted_new)
self.assertLess(err, err_new)
@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed') @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
def test_trees_to_dataframe(self): def test_trees_to_dataframe(self):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment