Unverified commit 465d1262, authored by Guolin Ke and committed by GitHub
Browse files

check sorted indices in Subset (#2510)

* Update sparse_bin.hpp

* check sorted in c_api

* fix python package

* fix tests

* fix test

* std::is_sorted

* Update basic.py
parent 198d0f3b
......@@ -1095,7 +1095,7 @@ class Dataset(object):
free_raw_data=self.free_raw_data)
ret._predictor = self._predictor
ret.pandas_categorical = self.pandas_categorical
ret.used_indices = used_indices
ret.used_indices = sorted(used_indices)
return ret
def save_binary(self, filename):
......
......@@ -339,8 +339,8 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi
ret = _CVBooster()
for train_idx, test_idx in folds:
train_set = full_data.subset(train_idx)
valid_set = full_data.subset(test_idx)
train_set = full_data.subset(sorted(train_idx))
valid_set = full_data.subset(sorted(test_idx))
# run preprocessing on the data set if needed
if fpreproc is not None:
train_set, valid_set, tparam = fpreproc(train_set, valid_set, params.copy())
......
......@@ -921,6 +921,9 @@ int LGBM_DatasetGetSubset(
const int32_t lower = 0;
const int32_t upper = full_dataset->num_data() - 1;
Common::CheckElementsIntervalClosed(used_row_indices, lower, upper, num_used_row_indices, "Used indices of subset");
if (!std::is_sorted(used_row_indices, used_row_indices + num_used_row_indices)) {
Log::Fatal("used_row_indices should be sorted in Subset");
}
auto ret = std::unique_ptr<Dataset>(new Dataset(num_used_row_indices));
ret->CopyFeatureMapperFrom(full_dataset);
ret->CopySubset(full_dataset, used_row_indices, num_used_row_indices, true);
......
......@@ -820,9 +820,9 @@ class TestEngine(unittest.TestCase):
data = np.random.random((500, 2))
y = [1] * 250 + [0] * 250
lgb_train = lgb.Dataset(data, y, free_raw_data=False)
subset_index_1 = np.random.choice(np.arange(500), 300, replace=False)
subset_index_1 = sorted(np.random.choice(np.arange(500), 300, replace=False))
subset_data_1 = lgb_train.subset(subset_index_1)
subset_index_2 = np.random.choice(np.arange(500), 200, replace=False)
subset_index_2 = sorted(np.random.choice(np.arange(500), 200, replace=False))
subset_data_2 = lgb_train.subset(subset_index_2)
params = {
'objective': 'binary',
......@@ -1601,8 +1601,8 @@ class TestEngine(unittest.TestCase):
iter_min = min([iter_min_l1, iter_min_l2])
iter_min_valid1 = min([iter_valid1_l1, iter_valid1_l2])
iter_cv_l1 = 3
iter_cv_l2 = 17
iter_cv_l1 = 4
iter_cv_l2 = 12
self.assertEqual(len(set([iter_cv_l1, iter_cv_l2])), 2)
iter_cv_min = min([iter_cv_l1, iter_cv_l2])
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment