Commit cc11525d authored by ChenZhiyong, committed by Guolin Ke
Browse files

refine categorical split (#919)

* refine categorical split

* add test
parent b1b24ee2
...@@ -224,7 +224,12 @@ public: ...@@ -224,7 +224,12 @@ public:
int gpu_device_id = -1; int gpu_device_id = -1;
/*! \brief Set to true to use double precision math on GPU (default using single precision) */ /*! \brief Set to true to use double precision math on GPU (default using single precision) */
bool gpu_use_dp = false; bool gpu_use_dp = false;
int max_cat_group = 64;
int min_data_per_group = 10;
int max_cat_threshold = 256; int max_cat_threshold = 256;
double cat_smooth_ratio = 0.01;
double min_cat_smooth = 5;
double max_cat_smooth = 100;
LIGHTGBM_EXPORT void Set(const std::unordered_map<std::string, std::string>& params) override; LIGHTGBM_EXPORT void Set(const std::unordered_map<std::string, std::string>& params) override;
}; };
...@@ -463,8 +468,8 @@ struct ParameterAlias { ...@@ -463,8 +468,8 @@ struct ParameterAlias {
"snapshot_freq", "verbosity", "sparse_threshold", "enable_load_from_binary_file", "snapshot_freq", "verbosity", "sparse_threshold", "enable_load_from_binary_file",
"max_conflict_rate", "poisson_max_delta_step", "gaussian_eta", "max_conflict_rate", "poisson_max_delta_step", "gaussian_eta",
"histogram_pool_size", "output_freq", "is_provide_training_metric", "machine_list_filename", "histogram_pool_size", "output_freq", "is_provide_training_metric", "machine_list_filename",
"zero_as_missing", "max_cat_threshold", "zero_as_missing", "init_score_file", "valid_init_score_file", "is_predict_contrib",
"init_score_file", "valid_init_score_file", "is_predict_contrib" "max_cat_threshold", "max_cat_group", "cat_smooth_ratio", "min_cat_smooth", "max_cat_smooth", "min_data_per_group"
}); });
std::unordered_map<std::string, std::string> tmp_map; std::unordered_map<std::string, std::string> tmp_map;
for (const auto& pair : *params) { for (const auto& pair : *params) {
......
...@@ -370,7 +370,18 @@ void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params) ...@@ -370,7 +370,18 @@ void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params)
GetInt(params, "gpu_platform_id", &gpu_platform_id); GetInt(params, "gpu_platform_id", &gpu_platform_id);
GetInt(params, "gpu_device_id", &gpu_device_id); GetInt(params, "gpu_device_id", &gpu_device_id);
GetBool(params, "gpu_use_dp", &gpu_use_dp); GetBool(params, "gpu_use_dp", &gpu_use_dp);
GetInt(params, "max_cat_group", &max_cat_group);
GetInt(params, "max_cat_threshold", &max_cat_threshold); GetInt(params, "max_cat_threshold", &max_cat_threshold);
GetDouble(params, "cat_smooth_ratio", &cat_smooth_ratio);
GetDouble(params, "min_cat_smooth", &min_cat_smooth);
GetDouble(params, "max_cat_smooth", &max_cat_smooth);
GetInt(params, "min_data_per_group", &min_data_per_group);
CHECK(max_cat_group > 1);
CHECK(max_cat_threshold > 0);
CHECK(cat_smooth_ratio >= 0);
CHECK(min_cat_smooth >= 1);
CHECK(max_cat_smooth > min_cat_smooth);
CHECK(min_data_per_group > 0);
} }
void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& params) { void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& params) {
......
...@@ -106,58 +106,90 @@ public: ...@@ -106,58 +106,90 @@ public:
output->default_left = false; output->default_left = false;
double best_gain = kMinScore; double best_gain = kMinScore;
data_size_t best_left_count = 0; data_size_t best_left_count = 0;
double best_sum_left_gradient = 0.0f; double best_sum_left_gradient = 0;
double best_sum_left_hessian = 0.0f; double best_sum_left_hessian = 0;
double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split; double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
bool is_full_categorical = meta_->missing_type == MissingType::None;
int used_bin = meta_->num_bin - 1;
if (is_full_categorical) ++used_bin;
std::vector<int> sorted_idx(used_bin);
for (int i = 0; i < used_bin; ++i) sorted_idx[i] = i;
double smooth_hess = meta_->tree_config->cat_smooth_ratio * num_data;
smooth_hess = std::min(meta_->tree_config->max_cat_smooth, std::max(smooth_hess, meta_->tree_config->min_cat_smooth));
const double smooth_grad = smooth_hess * sum_gradient / sum_hessian;
auto ctr_fun = [&smooth_hess, &smooth_grad](double sum_grad, double sum_hess) {
return (sum_grad + smooth_grad) / (sum_hess + smooth_hess);
};
std::sort(sorted_idx.begin(), sorted_idx.end(),
[this, &ctr_fun](int i, int j) {
return ctr_fun(data_[i].sum_gradients, data_[i].sum_hessians) < ctr_fun(data_[j].sum_gradients, data_[j].sum_hessians);
});
std::vector<int> find_direction(1, 1);
std::vector<int> start_position(1, 0);
if (!is_full_categorical
|| meta_->tree_config->max_cat_threshold * 2 < meta_->num_bin) {
find_direction.push_back(-1);
start_position.push_back(used_bin - 1);
}
is_splittable_ = false; is_splittable_ = false;
int best_threshold = -1;
int best_dir = 1;
for (size_t out_i = 0; out_i < find_direction.size(); ++out_i) {
auto dir = find_direction[out_i];
auto start_pos = start_position[out_i];
data_size_t rest_group = meta_->tree_config->max_cat_group;
data_size_t min_data_per_group = std::max(meta_->tree_config->min_data_per_group, num_data / rest_group);
data_size_t cnt_cur_group = 0;
double sum_left_gradient = 0.0f;
double sum_left_hessian = kEpsilon;
data_size_t left_count = 0;
for (int i = 0; i < used_bin && i < meta_->tree_config->max_cat_threshold; ++i) {
auto t = sorted_idx[start_pos];
start_pos += dir;
uint32_t best_threshold = 0; sum_left_gradient += data_[t].sum_gradients;
bool is_full_categorical = meta_->missing_type == MissingType::None; sum_left_hessian += data_[t].sum_hessians;
left_count += data_[t].cnt;
cnt_cur_group += data_[t].cnt;
int used_bin = meta_->num_bin - 1 + is_full_categorical; if (left_count < meta_->tree_config->min_data_in_leaf
|| sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
data_size_t right_count = num_data - left_count;
if (right_count < meta_->tree_config->min_data_in_leaf || right_count < min_data_per_group) break;
// from right to left, and we don't need data in bin0 double sum_right_hessian = sum_hessian - sum_left_hessian;
for (int t = 0; t < used_bin; ++t) { if (sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;
// if data not enough, or sum hessian too small
if (data_[t].cnt < meta_->tree_config->min_data_in_leaf
|| data_[t].sum_hessians < meta_->tree_config->min_sum_hessian_in_leaf) continue;
data_size_t other_count = num_data - data_[t].cnt;
// if data not enough
if (other_count < meta_->tree_config->min_data_in_leaf) continue;
double sum_other_hessian = sum_hessian - data_[t].sum_hessians - kEpsilon; if (cnt_cur_group < min_data_per_group) continue;
// if sum hessian too small
if (sum_other_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
double sum_other_gradient = sum_gradient - data_[t].sum_gradients; cnt_cur_group = 0;
// current split gain if (--rest_group > 0) min_data_per_group = std::max(meta_->tree_config->min_data_per_group, right_count / rest_group);
double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian,
meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
+ GetLeafSplitGain(data_[t].sum_gradients, data_[t].sum_hessians + kEpsilon,
meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
// gain with split is worse than without split
if (current_gain <= min_gain_shift) continue;
// mark to is splittable double sum_right_gradient = sum_gradient - sum_left_gradient;
double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
+ GetLeafSplitGain(sum_right_gradient, sum_right_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
if (current_gain <= min_gain_shift) continue;
is_splittable_ = true; is_splittable_ = true;
// better split point
if (current_gain > best_gain) { if (current_gain > best_gain) {
best_threshold = static_cast<uint32_t>(t); best_left_count = left_count;
best_sum_left_gradient = data_[t].sum_gradients; best_sum_left_gradient = sum_left_gradient;
best_sum_left_hessian = data_[t].sum_hessians + kEpsilon; best_sum_left_hessian = sum_left_hessian;
best_left_count = data_[t].cnt; best_threshold = i;
best_gain = current_gain; best_gain = current_gain;
best_dir = dir;
}
} }
} }
if (is_splittable_) { if (is_splittable_) {
// update split information
output->num_cat_threshold = 1;
output->cat_threshold.resize(output->num_cat_threshold);
output->cat_threshold[0] = best_threshold;
output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian,
meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2); meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
output->left_count = best_left_count; output->left_count = best_left_count;
...@@ -170,6 +202,17 @@ public: ...@@ -170,6 +202,17 @@ public:
output->right_sum_gradient = sum_gradient - best_sum_left_gradient; output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon; output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
output->gain = best_gain - min_gain_shift; output->gain = best_gain - min_gain_shift;
output->num_cat_threshold = best_threshold + 1;
output->cat_threshold = std::vector<uint32_t>(output->num_cat_threshold);
if (best_dir == 1) {
for (int i = 0; i < output->num_cat_threshold; ++i) {
output->cat_threshold[i] = sorted_idx[i];
}
} else {
for (int i = 0; i < output->num_cat_threshold; ++i) {
output->cat_threshold[i] = sorted_idx[used_bin - 1 - i];
}
}
} }
} }
...@@ -287,7 +330,7 @@ private: ...@@ -287,7 +330,7 @@ private:
best_gain = current_gain; best_gain = current_gain;
} }
} }
} else{ } else {
double sum_left_gradient = 0.0f; double sum_left_gradient = 0.0f;
double sum_left_hessian = kEpsilon; double sum_left_hessian = kEpsilon;
data_size_t left_count = 0; data_size_t left_count = 0;
......
...@@ -132,7 +132,7 @@ class TestEngine(unittest.TestCase): ...@@ -132,7 +132,7 @@ class TestEngine(unittest.TestCase):
lgb_eval = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_train, y_train)
params = { params = {
'objective': 'binary', 'objective': 'regression',
'metric': 'auc', 'metric': 'auc',
'verbose': -1, 'verbose': -1,
'boost_from_average': False, 'boost_from_average': False,
...@@ -149,7 +149,7 @@ class TestEngine(unittest.TestCase): ...@@ -149,7 +149,7 @@ class TestEngine(unittest.TestCase):
verbose_eval=True, verbose_eval=True,
evals_result=evals_result) evals_result=evals_result)
pred = gbm.predict(X_train) pred = gbm.predict(X_train)
self.assertAlmostEqual(pred[-1], pred[0], places=5) np.testing.assert_almost_equal(pred, y)
def test_missing_value_handle_zero(self): def test_missing_value_handle_zero(self):
x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan] x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
...@@ -161,7 +161,7 @@ class TestEngine(unittest.TestCase): ...@@ -161,7 +161,7 @@ class TestEngine(unittest.TestCase):
lgb_eval = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_train, y_train)
params = { params = {
'objective': 'binary', 'objective': 'regression',
'metric': 'auc', 'metric': 'auc',
'verbose': -1, 'verbose': -1,
'boost_from_average': False, 'boost_from_average': False,
...@@ -178,8 +178,7 @@ class TestEngine(unittest.TestCase): ...@@ -178,8 +178,7 @@ class TestEngine(unittest.TestCase):
verbose_eval=True, verbose_eval=True,
evals_result=evals_result) evals_result=evals_result)
pred = gbm.predict(X_train) pred = gbm.predict(X_train)
self.assertAlmostEqual(pred[-1], pred[-2], places=5) np.testing.assert_almost_equal(pred, y)
self.assertAlmostEqual(pred[-1], pred[0], places=5)
def test_missing_value_handle_none(self): def test_missing_value_handle_none(self):
x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan] x = [0, 1, 2, 3, 4, 5, 6, 7, np.nan]
...@@ -191,7 +190,7 @@ class TestEngine(unittest.TestCase): ...@@ -191,7 +190,7 @@ class TestEngine(unittest.TestCase):
lgb_eval = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_train, y_train)
params = { params = {
'objective': 'binary', 'objective': 'regression',
'metric': 'auc', 'metric': 'auc',
'verbose': -1, 'verbose': -1,
'boost_from_average': False, 'boost_from_average': False,
...@@ -211,6 +210,37 @@ class TestEngine(unittest.TestCase): ...@@ -211,6 +210,37 @@ class TestEngine(unittest.TestCase):
self.assertAlmostEqual(pred[0], pred[1], places=5) self.assertAlmostEqual(pred[0], pred[1], places=5)
self.assertAlmostEqual(pred[-1], pred[0], places=5) self.assertAlmostEqual(pred[-1], pred[0], places=5)
def test_categorical_handle(self):
    """Fit a single two-leaf tree on one categorical column and compare predictions to labels.

    The column holds the eight codes 0..7 and the labels alternate 0, 1, 0, 1, ...
    The same arrays are used for both the training set and the validation set.
    After one boosting round with a learning rate of 1, the predictions on the
    training features must equal the labels within the default tolerance of
    ``np.testing.assert_almost_equal``.
    """
    category_codes = [0, 1, 2, 3, 4, 5, 6, 7]
    y = [0, 1, 0, 1, 0, 1, 0, 1]

    # One sample per row, one (categorical) feature per sample.
    features = np.array(category_codes).reshape(len(category_codes), 1)
    labels = np.array(y)

    train_set = lgb.Dataset(features, labels)
    valid_set = lgb.Dataset(features, labels)

    # The minimum-count settings are all 1 so that eight rows are enough to split on.
    params = {
        'objective': 'regression',
        'metric': 'auc',
        'verbose': -1,
        'boost_from_average': False,
        'min_data': 1,
        'num_leaves': 2,
        'learning_rate': 1,
        'min_data_in_bin': 1,
        'min_data_per_group': 1,
        'zero_as_missing': True,
        'categorical_column': 0
    }

    history = {}
    booster = lgb.train(
        params,
        train_set,
        num_boost_round=1,
        valid_sets=valid_set,
        verbose_eval=True,
        evals_result=history,
    )

    predictions = booster.predict(features)
    np.testing.assert_almost_equal(predictions, y)
def test_multiclass(self): def test_multiclass(self):
X, y = load_digits(10, True) X, y = load_digits(10, True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment