Commit c421f898 authored by Belinda Trotta's avatar Belinda Trotta Committed by Guolin Ke
Browse files

Bug fix: small values of max_bin cause program to crash (#2299)

* Fix bug where small values of max_bin cause crash.

* Revert "Fix bug where small values of max_bin cause crash."

This reverts commit fe5c8e2547057c1fa5750bcddd359dd7708fab4b.

* Fix bug where small values of max_bin cause crash.

* Reset random seed in test, remove extra blank line.

* Minor bug fix. Remove extra blank line.

* Change old test to account for new binning behavior.
parent 9558417a
......@@ -177,11 +177,10 @@ namespace LightGBM {
left_cnt = num_distinct_values;
}
if (left_cnt > 0) {
if ((left_cnt > 0) && (max_bin > 1)) {
int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - cnt_zero) * (max_bin - 1));
left_max_bin = std::max(1, left_max_bin);
bin_upper_bound = GreedyFindBin(distinct_values, counts, left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
bin_upper_bound.back() = -kZeroThreshold;
}
int right_start = -1;
......@@ -192,16 +191,32 @@ namespace LightGBM {
}
}
if (right_start >= 0) {
int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound.size());
CHECK(right_max_bin > 0);
if (bin_upper_bound.size() == 0) {
if (max_bin > 2) {
// create zero bin
bin_upper_bound.push_back(-kZeroThreshold);
bin_upper_bound.push_back(kZeroThreshold);
}
else if (max_bin > 1) {
bin_upper_bound.push_back(kZeroThreshold);
}
} else {
bin_upper_bound.back() = -kZeroThreshold;
if (max_bin > 2) {
// create zero bin
bin_upper_bound.push_back(kZeroThreshold);
}
}
int right_max_bin = max_bin - static_cast<int>(bin_upper_bound.size());
if ((right_start >= 0) && (right_max_bin > 0)) {
auto right_bounds = GreedyFindBin(distinct_values + right_start, counts + right_start,
num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
bin_upper_bound.push_back(kZeroThreshold);
bin_upper_bound.insert(bin_upper_bound.end(), right_bounds.begin(), right_bounds.end());
} else {
bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
}
CHECK(bin_upper_bound.size() <= max_bin);
return bin_upper_bound;
}
......
......@@ -921,12 +921,32 @@ class TestEngine(unittest.TestCase):
}
lgb_data = lgb.Dataset(X, label=y)
est = lgb.train(params, lgb_data, num_boost_round=1)
self.assertEqual(len(np.unique(est.predict(X))), 100)
self.assertEqual(len(np.unique(est.predict(X))), 99)
params['max_bin_by_feature'] = [2, 100]
lgb_data = lgb.Dataset(X, label=y)
est = lgb.train(params, lgb_data, num_boost_round=1)
self.assertEqual(len(np.unique(est.predict(X))), 3)
def test_small_max_bin(self):
np.random.seed(0)
y = np.random.choice([0, 1], 100)
x = np.zeros((100, 1))
x[:30, 0] = -1
x[30:60, 0] = 1
x[60:, 0] = 2
params = {'objective': 'binary',
'seed': 0,
'min_data_in_leaf': 1,
'verbose': -1,
'max_bin': 2}
lgb_x = lgb.Dataset(x, label=y)
est = lgb.train(params, lgb_x, num_boost_round=5)
x[0, 0] = np.nan
params['max_bin'] = 3
lgb_x = lgb.Dataset(x, label=y)
est = lgb.train(params, lgb_x, num_boost_round=5)
np.random.seed() # reset seed
def test_refit(self):
X, y = load_breast_cancer(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment