"python-package/vscode:/vscode.git/clone" did not exist on "b78175b7468c727d1bd8d45bf675557598272c8b"
Commit 3f0061ca authored by Nikita Titov, committed by Guolin Ke

[python] parameters renaming for sklearn naming convention (#854)

* updated scikit-learn interface

* fixed better description

* updated set_params()

* removed backward compatibility

* removed excess lines

* replaced pop with setdefault

* added deprecated warnings

* added tests
parent 49412ba7
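
In user code, the rename looks like the sketch below. This is an illustration based on the diff that follows, not part of the commit itself, and it assumes the usual `import lightgbm as lgb` entry point:

```python
import lightgbm as lgb

# New, sklearn-style names introduced by this commit:
clf = lgb.LGBMClassifier(random_state=42, n_jobs=-1)

# Old names still work (absorbed via **kwargs and mapped back in fit()),
# but get_params()/fit() now emit LGBMDeprecationWarning for them:
clf_old = lgb.LGBMClassifier(seed=42, nthread=-1)
```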
python-package/lightgbm/sklearn.py
@@ -4,6 +4,7 @@
 from __future__ import absolute_import
 
 import numpy as np
+import warnings
 
 from .basic import Dataset, LightGBMError
 from .compat import (SKLEARN_INSTALLED, LGBMClassifierBase, LGBMDeprecated,
@@ -12,6 +13,11 @@ from .compat import (SKLEARN_INSTALLED, LGBMClassifierBase, LGBMDeprecated,
 from .engine import train
 
 
+# DeprecationWarning is not shown by default, so create our own with a higher level
+class LGBMDeprecationWarning(UserWarning):
+    pass
+
+
 def _objective_function_wrapper(func):
     """Decorate an objective function
     Note: for multi-class task, the y_pred is group by class_id first, then group by row_id
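
The reason for basing the new warning on `UserWarning` rather than `DeprecationWarning` is visibility; a minimal illustration (not from the commit):

```python
import warnings

# DeprecationWarning is filtered out by default (on modern Python it only
# surfaces when triggered directly from __main__), so library users would
# typically never see it:
warnings.warn('probably hidden', DeprecationWarning)

# UserWarning and its subclasses are shown by default, which is what
# LGBMDeprecationWarning relies on:
warnings.warn('shown by default', UserWarning)
```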
@@ -127,51 +133,52 @@ class LGBMModel(LGBMModelBase):
                  subsample_for_bin=50000, objective=None,
                  min_split_gain=0, min_child_weight=5, min_child_samples=10,
                  subsample=1, subsample_freq=1, colsample_bytree=1,
-                 reg_alpha=0, reg_lambda=0, seed=0, nthread=-1, silent=True, **kwargs):
+                 reg_alpha=0, reg_lambda=0, random_state=0,
+                 n_jobs=-1, silent=True, **kwargs):
         """
         Implementation of the Scikit-Learn API for LightGBM.
 
         Parameters
         ----------
         boosting_type : string
-            gbdt, traditional Gradient Boosting Decision Tree
-            dart, Dropouts meet Multiple Additive Regression Trees
+            gbdt, traditional Gradient Boosting Decision Tree.
+            dart, Dropouts meet Multiple Additive Regression Trees.
         num_leaves : int
             Maximum tree leaves for base learners.
         max_depth : int
             Maximum tree depth for base learners, -1 means no limit.
         learning_rate : float
-            Boosting learning rate
+            Boosting learning rate.
         n_estimators : int
             Number of boosted trees to fit.
         max_bin : int
-            Number of bucketed bin for feature values
+            Number of bucketed bins for feature values.
         subsample_for_bin : int
             Number of samples for constructing bins.
         objective : string or callable
             Specify the learning task and the corresponding learning objective or
             a custom objective function to be used (see note below).
-            default: binary for LGBMClassifier, lambdarank for LGBMRanker
+            Default: binary for LGBMClassifier, lambdarank for LGBMRanker.
         min_split_gain : float
             Minimum loss reduction required to make a further partition on a leaf node of the tree.
         min_child_weight : int
-            Minimum sum of instance weight(hessian) needed in a child(leaf)
+            Minimum sum of instance weight (hessian) needed in a child (leaf).
         min_child_samples : int
-            Minimum number of data need in a child(leaf)
+            Minimum number of data needed in a child (leaf).
         subsample : float
             Subsample ratio of the training instance.
         subsample_freq : int
-            frequence of subsample, <=0 means no enable
+            Frequency of subsample; <=0 means disabled.
         colsample_bytree : float
             Subsample ratio of columns when constructing each tree.
         reg_alpha : float
-            L1 regularization term on weights
+            L1 regularization term on weights.
         reg_lambda : float
-            L2 regularization term on weights
-        seed : int
+            L2 regularization term on weights.
+        random_state : int
             Random number seed.
-        nthread : int
-            Number of parallel threads
+        n_jobs : int
+            Number of parallel threads.
         silent : boolean
             Whether to print messages while running boosting.
         **kwargs : other parameters
@@ -186,15 +193,15 @@ class LGBMModel(LGBMModelBase):
         or ``objective(y_true, y_pred, group) -> grad, hess``:
 
             y_true: array_like of shape [n_samples]
-                The target values
+                The target values.
             y_pred: array_like of shape [n_samples] or shape [n_samples * n_class]
-                The predicted values
+                The predicted values.
             group: array_like
-                group/query data, used for ranking task
+                Group/query data, used for the ranking task.
             grad: array_like of shape [n_samples] or shape [n_samples * n_class]
                 The value of the gradient for each sample point.
             hess: array_like of shape [n_samples] or shape [n_samples * n_class]
-                The value of the second derivative for each sample point
+                The value of the second derivative for each sample point.
 
         For a multi-class task, y_pred is grouped by class_id first, then by row_id;
         to get the i-th row of y_pred for the j-th class, access y_pred[j * num_data + i].
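
For concreteness, here is a hypothetical custom objective matching the documented `objective(y_true, y_pred) -> grad, hess` signature; the function name and the choice of binary logistic loss are illustrative only, not part of the commit:

```python
import numpy as np

def logistic_obj(y_true, y_pred):
    # y_pred holds raw scores; convert to probabilities first
    prob = 1.0 / (1.0 + np.exp(-y_pred))
    grad = prob - y_true          # first derivative of the log loss
    hess = prob * (1.0 - prob)    # second derivative of the log loss
    return grad, hess

# Passed through the `objective` parameter documented above, e.g.:
# model = lgb.LGBMClassifier(objective=logistic_obj)
```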
@@ -229,8 +236,8 @@ class LGBMModel(LGBMModelBase):
         self.colsample_bytree = colsample_bytree
         self.reg_alpha = reg_alpha
         self.reg_lambda = reg_lambda
-        self.seed = seed
-        self.nthread = nthread
+        self.random_state = random_state
+        self.n_jobs = n_jobs
         self.silent = silent
         self._Booster = None
         self.evals_result = None
@@ -246,6 +253,12 @@ class LGBMModel(LGBMModelBase):
     def get_params(self, deep=True):
         params = super(LGBMModel, self).get_params(deep=deep)
         params.update(self.other_params)
+        if 'seed' in params:
+            warnings.warn('The `seed` parameter is deprecated and will be removed in the next version. '
+                          'Please use `random_state` instead.', LGBMDeprecationWarning)
+        if 'nthread' in params:
+            warnings.warn('The `nthread` parameter is deprecated and will be removed in the next version. '
+                          'Please use `n_jobs` instead.', LGBMDeprecationWarning)
         return params
 
     # minor change to support `**kwargs`
@@ -333,6 +346,9 @@ class LGBMModel(LGBMModelBase):
         """
         evals_result = {}
         params = self.get_params()
+        # sklearn interface has another naming convention
+        params.setdefault('seed', params.pop('random_state'))
+        params.setdefault('nthread', params.pop('n_jobs'))
         # user can set verbose with kwargs, it has higher priority
         if 'verbose' not in params and self.silent:
             params['verbose'] = -1
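
The `setdefault`/`pop` pair above translates the sklearn-style names back to the core Booster names while letting an explicit `seed` or `nthread` supplied through `**kwargs` take priority. A standalone sketch of just that logic, with made-up values:

```python
# `seed` arrived via **kwargs; `random_state` is the sklearn-style attribute.
params = {'random_state': 0, 'n_jobs': -1, 'seed': 42}

# pop() removes the sklearn name; setdefault() only writes the core name
# if the user did not already set it explicitly.
params.setdefault('seed', params.pop('random_state'))
params.setdefault('nthread', params.pop('n_jobs'))

print(params)  # {'seed': 42, 'nthread': -1}
```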
tests/python_package_test/test_sklearn.py
@@ -3,6 +3,7 @@
 import math
 import os
 import unittest
+import warnings
 
 import lightgbm as lgb
 import numpy as np
@@ -158,3 +159,21 @@ class TestSklearn(unittest.TestCase):
         clf.fit(data.data, data.target)
         importances = clf.feature_importances_
         self.assertEqual(len(importances), 4)
+
+    def test_sklearn_backward_compatibility(self):
+        iris = load_iris()
+        X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)
+        # Tests that `seed` is the same as `random_state`
+        clf_1 = lgb.sklearn.LGBMClassifier(seed=42, subsample=0.6, colsample_bytree=0.8)
+        clf_2 = lgb.sklearn.LGBMClassifier(random_state=42, subsample=0.6, colsample_bytree=0.8)
+        y_pred_1 = clf_1.fit(X_train, y_train).predict_proba(X_test)
+        y_pred_2 = clf_2.fit(X_train, y_train).predict_proba(X_test)
+        np.testing.assert_allclose(y_pred_1, y_pred_2)
+        # Tests that warnings were raised
+        with warnings.catch_warnings(record=True) as w:
+            clf_1.get_params()
+            clf_2.set_params(nthread=-1).fit(X_train, y_train)
+            self.assertEqual(len(w), 2)
+            self.assertTrue(issubclass(w[-1].category, Warning))
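
Because `LGBMDeprecationWarning` derives from `UserWarning`, it is visible by default. Users who want to keep using the old names quietly during migration can filter it; a sketch, assuming the class is reachable as `lgb.sklearn.LGBMDeprecationWarning` at this commit:

```python
import warnings
import lightgbm as lgb

# Suppress only LightGBM's deprecation notices; all other warnings stay visible.
warnings.simplefilter('ignore', lgb.sklearn.LGBMDeprecationWarning)
```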