"...git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "d1f873524e52319c24f909c42b24b2abeb84c3f5"
Unverified Commit 877d58fa authored by Nikita Titov's avatar Nikita Titov Committed by GitHub
Browse files

simplify start_iteration param for predict in Python and some code cleanup for...

simplify start_iteration param for predict in Python and some code cleanup for start_iteration (#3288)

* simplify start_iteration param for predict in Python and some code cleanup for start_iteration

* revert docs changes about the prediction result shape
parent 97d5758f
...@@ -38,9 +38,8 @@ test_that("start_iteration works correctly", { ...@@ -38,9 +38,8 @@ test_that("start_iteration works correctly", {
, label = train$label , label = train$label
, num_leaves = 4L , num_leaves = 4L
, learning_rate = 0.6 , learning_rate = 0.6
, nrounds = 100L , nrounds = 50L
, objective = "binary" , objective = "binary"
, save_name = tempfile(fileext = ".model")
, valids = list("test" = dtest) , valids = list("test" = dtest)
, early_stopping_rounds = 2L , early_stopping_rounds = 2L
) )
...@@ -50,7 +49,7 @@ test_that("start_iteration works correctly", { ...@@ -50,7 +49,7 @@ test_that("start_iteration works correctly", {
pred2 <- rep(0.0, length(pred1)) pred2 <- rep(0.0, length(pred1))
pred_contrib2 <- rep(0.0, length(pred2)) pred_contrib2 <- rep(0.0, length(pred2))
step <- 11L step <- 11L
end_iter <- 99L end_iter <- 49L
if (bst$best_iter != -1L) { if (bst$best_iter != -1L) {
end_iter <- bst$best_iter - 1L end_iter <- bst$best_iter - 1L
} }
......
...@@ -2813,7 +2813,7 @@ class Booster(object): ...@@ -2813,7 +2813,7 @@ class Booster(object):
default=json_default_with_numpy)) default=json_default_with_numpy))
return ret return ret
def predict(self, data, start_iteration=None, num_iteration=None, def predict(self, data, start_iteration=0, num_iteration=None,
raw_score=False, pred_leaf=False, pred_contrib=False, raw_score=False, pred_leaf=False, pred_contrib=False,
data_has_header=False, is_reshape=True, **kwargs): data_has_header=False, is_reshape=True, **kwargs):
"""Make a prediction. """Make a prediction.
...@@ -2823,14 +2823,14 @@ class Booster(object): ...@@ -2823,14 +2823,14 @@ class Booster(object):
data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse data : string, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse
Data source for prediction. Data source for prediction.
If string, it represents the path to txt file. If string, it represents the path to txt file.
start_iteration : int or None, optional (default=None) start_iteration : int, optional (default=0)
Start index of the iteration to predict. Start index of the iteration to predict.
If None or <= 0, starts from the first iteration. If <= 0, starts from the first iteration.
num_iteration : int or None, optional (default=None) num_iteration : int or None, optional (default=None)
Limit number of iterations in the prediction. Total number of iterations used in the prediction.
If None, if the best iteration exists and start_iteration is None or <= 0, the best iteration is used; If None, if the best iteration exists and start_iteration <= 0, the best iteration is used;
otherwise, all iterations from start_iteration are used. otherwise, all iterations from ``start_iteration`` are used (no limits).
If <= 0, all iterations from start_iteration are used (no limits). If <= 0, all iterations from ``start_iteration`` are used (no limits).
raw_score : bool, optional (default=False) raw_score : bool, optional (default=False)
Whether to predict raw scores. Whether to predict raw scores.
pred_leaf : bool, optional (default=False) pred_leaf : bool, optional (default=False)
...@@ -2861,10 +2861,8 @@ class Booster(object): ...@@ -2861,10 +2861,8 @@ class Booster(object):
Can be sparse or a list of sparse objects (each element represents predictions for one class) for feature contributions (when ``pred_contrib=True``). Can be sparse or a list of sparse objects (each element represents predictions for one class) for feature contributions (when ``pred_contrib=True``).
""" """
predictor = self._to_predictor(copy.deepcopy(kwargs)) predictor = self._to_predictor(copy.deepcopy(kwargs))
if start_iteration is None or start_iteration < 0:
start_iteration = 0
if num_iteration is None: if num_iteration is None:
if start_iteration == 0: if start_iteration <= 0:
num_iteration = self.best_iteration num_iteration = self.best_iteration
else: else:
num_iteration = -1 num_iteration = -1
......
...@@ -612,7 +612,7 @@ class LGBMModel(_LGBMModelBase): ...@@ -612,7 +612,7 @@ class LGBMModel(_LGBMModelBase):
del train_set, valid_sets del train_set, valid_sets
return self return self
def predict(self, X, raw_score=False, start_iteration=None, num_iteration=None, def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None,
pred_leaf=False, pred_contrib=False, **kwargs): pred_leaf=False, pred_contrib=False, **kwargs):
"""Return the predicted value for each sample. """Return the predicted value for each sample.
...@@ -622,13 +622,14 @@ class LGBMModel(_LGBMModelBase): ...@@ -622,13 +622,14 @@ class LGBMModel(_LGBMModelBase):
Input features matrix. Input features matrix.
raw_score : bool, optional (default=False) raw_score : bool, optional (default=False)
Whether to predict raw scores. Whether to predict raw scores.
start_iteration : int or None, optional (default=None) start_iteration : int, optional (default=0)
Start index of the iteration to predict. Start index of the iteration to predict.
If None or <= 0, starts from the first iteration. If <= 0, starts from the first iteration.
num_iteration : int or None, optional (default=None) num_iteration : int or None, optional (default=None)
Limit number of iterations in the prediction. Total number of iterations used in the prediction.
If None, if the best iteration exists, it is used; otherwise, all trees are used. If None, if the best iteration exists and start_iteration <= 0, the best iteration is used;
If <= 0, all trees are used (no limits). otherwise, all iterations from ``start_iteration`` are used (no limits).
If <= 0, all iterations from ``start_iteration`` are used (no limits).
pred_leaf : bool, optional (default=False) pred_leaf : bool, optional (default=False)
Whether to predict leaf index. Whether to predict leaf index.
pred_contrib : bool, optional (default=False) pred_contrib : bool, optional (default=False)
...@@ -835,7 +836,7 @@ class LGBMClassifier(LGBMModel, _LGBMClassifierBase): ...@@ -835,7 +836,7 @@ class LGBMClassifier(LGBMModel, _LGBMClassifierBase):
fit.__doc__ = LGBMModel.fit.__doc__ fit.__doc__ = LGBMModel.fit.__doc__
def predict(self, X, raw_score=False, start_iteration=None, num_iteration=None, def predict(self, X, raw_score=False, start_iteration=0, num_iteration=None,
pred_leaf=False, pred_contrib=False, **kwargs): pred_leaf=False, pred_contrib=False, **kwargs):
"""Docstring is inherited from the LGBMModel.""" """Docstring is inherited from the LGBMModel."""
result = self.predict_proba(X, raw_score, start_iteration, num_iteration, result = self.predict_proba(X, raw_score, start_iteration, num_iteration,
...@@ -848,7 +849,7 @@ class LGBMClassifier(LGBMModel, _LGBMClassifierBase): ...@@ -848,7 +849,7 @@ class LGBMClassifier(LGBMModel, _LGBMClassifierBase):
predict.__doc__ = LGBMModel.predict.__doc__ predict.__doc__ = LGBMModel.predict.__doc__
def predict_proba(self, X, raw_score=False, start_iteration=None, num_iteration=None, def predict_proba(self, X, raw_score=False, start_iteration=0, num_iteration=None,
pred_leaf=False, pred_contrib=False, **kwargs): pred_leaf=False, pred_contrib=False, **kwargs):
"""Return the predicted probability for each class for each sample. """Return the predicted probability for each class for each sample.
...@@ -858,13 +859,14 @@ class LGBMClassifier(LGBMModel, _LGBMClassifierBase): ...@@ -858,13 +859,14 @@ class LGBMClassifier(LGBMModel, _LGBMClassifierBase):
Input features matrix. Input features matrix.
raw_score : bool, optional (default=False) raw_score : bool, optional (default=False)
Whether to predict raw scores. Whether to predict raw scores.
start_iteration : int or None, optional (default=None) start_iteration : int, optional (default=0)
Start index of the iteration to predict. Start index of the iteration to predict.
If None or <= 0, starts from the first iteration. If <= 0, starts from the first iteration.
num_iteration : int or None, optional (default=None) num_iteration : int or None, optional (default=None)
Limit number of iterations in the prediction. Total number of iterations used in the prediction.
If None, if the best iteration exists, it is used; otherwise, all trees are used. If None, if the best iteration exists and start_iteration <= 0, the best iteration is used;
If <= 0, all trees are used (no limits). otherwise, all iterations from ``start_iteration`` are used (no limits).
If <= 0, all iterations from ``start_iteration`` are used (no limits).
pred_leaf : bool, optional (default=False) pred_leaf : bool, optional (default=False)
Whether to predict leaf index. Whether to predict leaf index.
pred_contrib : bool, optional (default=False) pred_contrib : bool, optional (default=False)
......
...@@ -226,7 +226,6 @@ class Predictor { ...@@ -226,7 +226,6 @@ class Predictor {
data_size_t, const std::vector<std::string>& lines) { data_size_t, const std::vector<std::string>& lines) {
std::vector<std::pair<int, double>> oneline_features; std::vector<std::pair<int, double>> oneline_features;
std::vector<std::string> result_to_write(lines.size()); std::vector<std::string> result_to_write(lines.size());
Log::Warning("before predict_fun_ is called");
OMP_INIT_EX(); OMP_INIT_EX();
#pragma omp parallel for schedule(static) firstprivate(oneline_features) #pragma omp parallel for schedule(static) firstprivate(oneline_features)
for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); ++i) { for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); ++i) {
...@@ -241,7 +240,6 @@ class Predictor { ...@@ -241,7 +240,6 @@ class Predictor {
result_to_write[i] = str_result; result_to_write[i] = str_result;
OMP_LOOP_EX_END(); OMP_LOOP_EX_END();
} }
Log::Warning("after predict_fun_ is called");
OMP_THROW_EX(); OMP_THROW_EX();
for (data_size_t i = 0; i < static_cast<data_size_t>(result_to_write.size()); ++i) { for (data_size_t i = 0; i < static_cast<data_size_t>(result_to_write.size()); ++i) {
writer->Write(result_to_write[i].c_str(), result_to_write[i].size()); writer->Write(result_to_write[i].c_str(), result_to_write[i].size());
......
...@@ -2321,7 +2321,7 @@ class TestEngine(unittest.TestCase): ...@@ -2321,7 +2321,7 @@ class TestEngine(unittest.TestCase):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
train_data = lgb.Dataset(X_train, label=y_train) train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test) valid_data = lgb.Dataset(X_test, label=y_test)
booster = lgb.train(params, train_data, num_boost_round=100, early_stopping_rounds=early_stopping_rounds, valid_sets=[valid_data]) booster = lgb.train(params, train_data, num_boost_round=50, early_stopping_rounds=early_stopping_rounds, valid_sets=[valid_data])
# test that the predict once with all iterations equals summed results with start_iteration and num_iteration # test that the predict once with all iterations equals summed results with start_iteration and num_iteration
all_pred = booster.predict(X, raw_score=True) all_pred = booster.predict(X, raw_score=True)
...@@ -2330,17 +2330,15 @@ class TestEngine(unittest.TestCase): ...@@ -2330,17 +2330,15 @@ class TestEngine(unittest.TestCase):
for step in steps: for step in steps:
pred = np.zeros_like(all_pred) pred = np.zeros_like(all_pred)
pred_contrib = np.zeros_like(all_pred_contrib) pred_contrib = np.zeros_like(all_pred_contrib)
for start_iter in range(0, 100, step): for start_iter in range(0, 50, step):
pred += booster.predict(X, num_iteration=step, start_iteration=start_iter, raw_score=True) pred += booster.predict(X, start_iteration=start_iter, num_iteration=step, raw_score=True)
pred_contrib += booster.predict(X, num_iteration=step, start_iteration=start_iter, pred_contrib=True) pred_contrib += booster.predict(X, start_iteration=start_iter, num_iteration=step, pred_contrib=True)
np.testing.assert_allclose(all_pred, pred) np.testing.assert_allclose(all_pred, pred)
np.testing.assert_allclose(all_pred_contrib, pred_contrib) np.testing.assert_allclose(all_pred_contrib, pred_contrib)
# test the case where start_iteration <= 0, and num_iteration is None # test the case where start_iteration <= 0, and num_iteration is None
pred1 = booster.predict(X, start_iteration=-1) pred1 = booster.predict(X, start_iteration=-1)
pred2 = booster.predict(X, num_iteration=booster.best_iteration) pred2 = booster.predict(X, num_iteration=booster.best_iteration)
pred3 = booster.predict(X, num_iteration=booster.best_iteration, start_iteration=0)
np.testing.assert_allclose(pred1, pred2) np.testing.assert_allclose(pred1, pred2)
np.testing.assert_allclose(pred1, pred3)
# test the case where start_iteration > 0, and num_iteration <= 0 # test the case where start_iteration > 0, and num_iteration <= 0
pred4 = booster.predict(X, start_iteration=10, num_iteration=-1) pred4 = booster.predict(X, start_iteration=10, num_iteration=-1)
...@@ -2351,14 +2349,14 @@ class TestEngine(unittest.TestCase): ...@@ -2351,14 +2349,14 @@ class TestEngine(unittest.TestCase):
# test the case where start_iteration > 0, and num_iteration <= 0, with pred_leaf=True # test the case where start_iteration > 0, and num_iteration <= 0, with pred_leaf=True
pred4 = booster.predict(X, start_iteration=10, num_iteration=-1, pred_leaf=True) pred4 = booster.predict(X, start_iteration=10, num_iteration=-1, pred_leaf=True)
pred5 = booster.predict(X, start_iteration=10, num_iteration=90, pred_leaf=True) pred5 = booster.predict(X, start_iteration=10, num_iteration=40, pred_leaf=True)
pred6 = booster.predict(X, start_iteration=10, num_iteration=0, pred_leaf=True) pred6 = booster.predict(X, start_iteration=10, num_iteration=0, pred_leaf=True)
np.testing.assert_allclose(pred4, pred5) np.testing.assert_allclose(pred4, pred5)
np.testing.assert_allclose(pred4, pred6) np.testing.assert_allclose(pred4, pred6)
# test the case where start_iteration > 0, and num_iteration <= 0, with pred_contrib=True # test the case where start_iteration > 0, and num_iteration <= 0, with pred_contrib=True
pred4 = booster.predict(X, start_iteration=10, num_iteration=-1, pred_contrib=True) pred4 = booster.predict(X, start_iteration=10, num_iteration=-1, pred_contrib=True)
pred5 = booster.predict(X, start_iteration=10, num_iteration=90, pred_contrib=True) pred5 = booster.predict(X, start_iteration=10, num_iteration=40, pred_contrib=True)
pred6 = booster.predict(X, start_iteration=10, num_iteration=0, pred_contrib=True) pred6 = booster.predict(X, start_iteration=10, num_iteration=0, pred_contrib=True)
np.testing.assert_allclose(pred4, pred5) np.testing.assert_allclose(pred4, pred5)
np.testing.assert_allclose(pred4, pred6) np.testing.assert_allclose(pred4, pred6)
...@@ -2373,7 +2371,7 @@ class TestEngine(unittest.TestCase): ...@@ -2373,7 +2371,7 @@ class TestEngine(unittest.TestCase):
} }
# test both with and without early stopping # test both with and without early stopping
inner_test(X, y, params, early_stopping_rounds=1) inner_test(X, y, params, early_stopping_rounds=1)
inner_test(X, y, params, early_stopping_rounds=10) inner_test(X, y, params, early_stopping_rounds=5)
inner_test(X, y, params, early_stopping_rounds=None) inner_test(X, y, params, early_stopping_rounds=None)
# test for multi-class # test for multi-class
...@@ -2387,7 +2385,7 @@ class TestEngine(unittest.TestCase): ...@@ -2387,7 +2385,7 @@ class TestEngine(unittest.TestCase):
} }
# test both with and without early stopping # test both with and without early stopping
inner_test(X, y, params, early_stopping_rounds=1) inner_test(X, y, params, early_stopping_rounds=1)
inner_test(X, y, params, early_stopping_rounds=10) inner_test(X, y, params, early_stopping_rounds=5)
inner_test(X, y, params, early_stopping_rounds=None) inner_test(X, y, params, early_stopping_rounds=None)
# test for binary # test for binary
...@@ -2400,5 +2398,5 @@ class TestEngine(unittest.TestCase): ...@@ -2400,5 +2398,5 @@ class TestEngine(unittest.TestCase):
} }
# test both with and without early stopping # test both with and without early stopping
inner_test(X, y, params, early_stopping_rounds=1) inner_test(X, y, params, early_stopping_rounds=1)
inner_test(X, y, params, early_stopping_rounds=10) inner_test(X, y, params, early_stopping_rounds=5)
inner_test(X, y, params, early_stopping_rounds=None) inner_test(X, y, params, early_stopping_rounds=None)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment