Unverified Commit eec60731 authored by Nikita Titov's avatar Nikita Titov Committed by GitHub
Browse files

[python][sklearn] do not modify args in fit function and minor code cleanup (#2619)

* clean code

* clean code

* do not modify args in fit function

* added test
parent b36926d8
...@@ -350,7 +350,6 @@ class LGBMModel(_LGBMModelBase): ...@@ -350,7 +350,6 @@ class LGBMModel(_LGBMModelBase):
params.update(self._other_params) params.update(self._other_params)
return params return params
# minor change to support `**kwargs`
def set_params(self, **params): def set_params(self, **params):
"""Set the parameters of this estimator. """Set the parameters of this estimator.
...@@ -555,8 +554,8 @@ class LGBMModel(_LGBMModelBase): ...@@ -555,8 +554,8 @@ class LGBMModel(_LGBMModelBase):
self._n_features = _X.shape[1] self._n_features = _X.shape[1]
def _construct_dataset(X, y, sample_weight, init_score, group, params): def _construct_dataset(X, y, sample_weight, init_score, group, params):
ret = Dataset(X, label=y, weight=sample_weight, group=group, params=params) return Dataset(X, label=y, weight=sample_weight, group=group,
return ret.set_init_score(init_score) init_score=init_score, params=params)
train_set = _construct_dataset(_X, _y, sample_weight, init_score, group, params) train_set = _construct_dataset(_X, _y, sample_weight, init_score, group, params)
...@@ -616,7 +615,7 @@ class LGBMModel(_LGBMModelBase): ...@@ -616,7 +615,7 @@ class LGBMModel(_LGBMModelBase):
self._best_score = self._Booster.best_score self._best_score = self._Booster.best_score
# free dataset # free dataset
self.booster_.free_dataset() self._Booster.free_dataset()
del train_set, valid_sets del train_set, valid_sets
return self return self
...@@ -669,7 +668,7 @@ class LGBMModel(_LGBMModelBase): ...@@ -669,7 +668,7 @@ class LGBMModel(_LGBMModelBase):
"match the input. Model n_features_ is %s and " "match the input. Model n_features_ is %s and "
"input n_features is %s " "input n_features is %s "
% (self._n_features, n_features)) % (self._n_features, n_features))
return self.booster_.predict(X, raw_score=raw_score, num_iteration=num_iteration, return self._Booster.predict(X, raw_score=raw_score, num_iteration=num_iteration,
pred_leaf=pred_leaf, pred_contrib=pred_contrib, **kwargs) pred_leaf=pred_leaf, pred_contrib=pred_contrib, **kwargs)
@property @property
...@@ -720,14 +719,12 @@ class LGBMModel(_LGBMModelBase): ...@@ -720,14 +719,12 @@ class LGBMModel(_LGBMModelBase):
.. note:: .. note::
Feature importance in sklearn interface used to normalize to 1,
it's deprecated after 2.0.4 and is the same as Booster.feature_importance() now.
``importance_type`` attribute is passed to the function ``importance_type`` attribute is passed to the function
to configure the type of importance values to be extracted. to configure the type of importance values to be extracted.
""" """
if self._n_features is None: if self._n_features is None:
raise LGBMNotFittedError('No feature_importances found. Need to call fit beforehand.') raise LGBMNotFittedError('No feature_importances found. Need to call fit beforehand.')
return self.booster_.feature_importance(importance_type=self.importance_type) return self._Booster.feature_importance(importance_type=self.importance_type)
class LGBMRegressor(LGBMModel, _LGBMRegressorBase): class LGBMRegressor(LGBMModel, _LGBMRegressorBase):
...@@ -780,30 +777,33 @@ class LGBMClassifier(LGBMModel, _LGBMClassifierBase): ...@@ -780,30 +777,33 @@ class LGBMClassifier(LGBMModel, _LGBMClassifierBase):
self._n_classes = len(self._classes) self._n_classes = len(self._classes)
if self._n_classes > 2: if self._n_classes > 2:
# Switch to using a multiclass objective in the underlying LGBM instance # Switch to using a multiclass objective in the underlying LGBM instance
ova_aliases = ("multiclassova", "multiclass_ova", "ova", "ovr") ova_aliases = {"multiclassova", "multiclass_ova", "ova", "ovr"}
if self._objective not in ova_aliases and not callable(self._objective): if self._objective not in ova_aliases and not callable(self._objective):
self._objective = "multiclass" self._objective = "multiclass"
if eval_metric in ('logloss', 'binary_logloss'): if eval_metric in {'logloss', 'binary_logloss'}:
eval_metric = "multi_logloss" eval_metric = "multi_logloss"
elif eval_metric in ('error', 'binary_error'): elif eval_metric in {'error', 'binary_error'}:
eval_metric = "multi_error" eval_metric = "multi_error"
else: else:
if eval_metric in ('logloss', 'multi_logloss'): if eval_metric in {'logloss', 'multi_logloss'}:
eval_metric = 'binary_logloss' eval_metric = 'binary_logloss'
elif eval_metric in ('error', 'multi_error'): elif eval_metric in {'error', 'multi_error'}:
eval_metric = 'binary_error' eval_metric = 'binary_error'
# do not modify args, as it causes errors in model selection tools
valid_sets = None
if eval_set is not None: if eval_set is not None:
if isinstance(eval_set, tuple): if isinstance(eval_set, tuple):
eval_set = [eval_set] eval_set = [eval_set]
valid_sets = [None] * len(eval_set)
for i, (valid_x, valid_y) in enumerate(eval_set): for i, (valid_x, valid_y) in enumerate(eval_set):
if valid_x is X and valid_y is y: if valid_x is X and valid_y is y:
eval_set[i] = (valid_x, _y) valid_sets[i] = (valid_x, _y)
else: else:
eval_set[i] = (valid_x, self._le.transform(valid_y)) valid_sets[i] = (valid_x, self._le.transform(valid_y))
super(LGBMClassifier, self).fit(X, _y, sample_weight=sample_weight, super(LGBMClassifier, self).fit(X, _y, sample_weight=sample_weight,
init_score=init_score, eval_set=eval_set, init_score=init_score, eval_set=valid_sets,
eval_names=eval_names, eval_names=eval_names,
eval_sample_weight=eval_sample_weight, eval_sample_weight=eval_sample_weight,
eval_class_weight=eval_class_weight, eval_class_weight=eval_class_weight,
...@@ -903,7 +903,7 @@ class LGBMRanker(LGBMModel): ...@@ -903,7 +903,7 @@ class LGBMRanker(LGBMModel):
sample_weight=None, init_score=None, group=None, sample_weight=None, init_score=None, group=None,
eval_set=None, eval_names=None, eval_sample_weight=None, eval_set=None, eval_names=None, eval_sample_weight=None,
eval_init_score=None, eval_group=None, eval_metric=None, eval_init_score=None, eval_group=None, eval_metric=None,
eval_at=[1], early_stopping_rounds=None, verbose=True, eval_at=[1, 2, 3, 4, 5], early_stopping_rounds=None, verbose=True,
feature_name='auto', categorical_feature='auto', feature_name='auto', categorical_feature='auto',
callbacks=None, init_model=None): callbacks=None, init_model=None):
"""Docstring is inherited from the LGBMModel.""" """Docstring is inherited from the LGBMModel."""
...@@ -942,6 +942,6 @@ class LGBMRanker(LGBMModel): ...@@ -942,6 +942,6 @@ class LGBMRanker(LGBMModel):
_base_doc = fit.__doc__ _base_doc = fit.__doc__
_before_early_stop, _early_stop, _after_early_stop = _base_doc.partition('early_stopping_rounds :') _before_early_stop, _early_stop, _after_early_stop = _base_doc.partition('early_stopping_rounds :')
fit.__doc__ = (_before_early_stop fit.__doc__ = (_before_early_stop
+ 'eval_at : list of int, optional (default=[1])\n' + 'eval_at : list of int, optional (default=[1, 2, 3, 4, 5])\n'
+ ' ' * 12 + 'The evaluation positions of the specified metric.\n' + ' ' * 12 + 'The evaluation positions of the specified metric.\n'
+ ' ' * 8 + _early_stop + _after_early_stop) + ' ' * 8 + _early_stop + _after_early_stop)
...@@ -148,16 +148,27 @@ class TestSklearn(unittest.TestCase): ...@@ -148,16 +148,27 @@ class TestSklearn(unittest.TestCase):
self.assertLessEqual(score, 1.) self.assertLessEqual(score, 1.)
def test_grid_search(self): def test_grid_search(self):
X, y = load_boston(True) X, y = load_iris(True)
params = {'boosting_type': ['dart', 'gbdt'], y = np.array(list(map(str, y))) # utilize label encoder at its max power
'n_estimators': [5, 8], X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
'drop_rate': [0.05, 0.1]} params = {'subsample': 0.8,
grid = GridSearchCV(lgb.LGBMRegressor(n_estimators=10), params, cv=3) 'subsample_freq': 1}
grid.fit(X, y) grid_params = {'boosting_type': ['rf', 'gbdt'],
self.assertIn(grid.best_params_['boosting_type'], ['dart', 'gbdt']) 'n_estimators': [4, 6],
self.assertIn(grid.best_params_['n_estimators'], [5, 8]) 'reg_alpha': [0.01, 0.005]}
self.assertIn(grid.best_params_['drop_rate'], [0.05, 0.1]) fit_params = {'verbose': False,
self.assertLess(grid.best_score_, 0.3) 'eval_set': [(X_test, y_test)],
'eval_metric': constant_metric,
'early_stopping_rounds': 2}
grid = GridSearchCV(lgb.LGBMClassifier(**params), grid_params, cv=2)
grid.fit(X, y, **fit_params)
self.assertIn(grid.best_params_['boosting_type'], ['rf', 'gbdt'])
self.assertIn(grid.best_params_['n_estimators'], [4, 6])
self.assertIn(grid.best_params_['reg_alpha'], [0.01, 0.005])
self.assertLess(grid.best_score_, 0.9)
self.assertEqual(grid.best_estimator_.best_iteration_, 1)
self.assertLess(grid.best_estimator_.best_score_['valid_0']['multi_logloss'], 0.25)
self.assertEqual(grid.best_estimator_.best_score_['valid_0']['error'], 0)
def test_clone_and_property(self): def test_clone_and_property(self):
X, y = load_boston(True) X, y = load_boston(True)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment