Commit bd5e5e3e authored by Nikita Titov's avatar Nikita Titov Committed by Guolin Ke
Browse files

[python] max_bin parameter deprecated (#1046)

* made max_bin parameter deprecated

* fixed accidental docstrings in Sphinx

* concrete version when deprecated stuff will be removed

* added warnings in case of duplicated params to Dataset

* fixed indents in docs
parent 6c0baa37
......@@ -48,7 +48,8 @@ Core Parameters
- ``convert_model`` for converting model file into if-else format, see more information in `Convert model parameters <#convert-model-parameters>`__
- ``application``, default=\ ``regression``, type=enum,
options=\ ``regression``, ``regression_l2``, ``regression_l1``, ``huber``, ``fair``, ``poisson``, ``binary``, ``lambdarank``, ``multiclass``,
options=\ ``regression``, ``regression_l2``, ``regression_l1``, ``huber``, ``fair``, ``poisson``, ``quantile``, ``quantile_l2``,
``binary``, ``lambdarank``, ``multiclass``,
alias=\ ``objective``, ``app``
- ``regression``, regression application
......@@ -63,9 +64,9 @@ Core Parameters
- ``poisson``, `Poisson regression`_
- ``quantile``, `Quantile Regression`_
- ``quantile``, `Quantile regression`_
- ``quantile_l2``, like the ``quantile``, but use L2 loss instead
- ``quantile_l2``, like the ``quantile``, but L2 loss is used instead
- ``binary``, binary classification application
......@@ -496,7 +497,7 @@ Objective Parameters
- ``alpha``, default=\ ``0.9``, type=double
- parameter for `Huber loss`_ and `Quantile Regression`_. Will be used in ``regression`` task
- parameter for `Huber loss`_ and `Quantile regression`_. Will be used in ``regression`` task
- ``fair_c``, default=\ ``1.0``, type=double
......@@ -546,7 +547,9 @@ Objective Parameters
- ``reg_sqrt``, default=\ ``false``, type=bool
- only used in Regression. Will fit ``sqrt(label)`` instead. And prediction result is also automatically converted to ``pow2(prediction)``
- only used in ``regression``
- will fit ``sqrt(label)`` instead and prediction result will be also automatically converted to ``pow2(prediction)``
Metric Parameters
-----------------
......@@ -560,7 +563,7 @@ Metric Parameters
- ``l2_root``, root square loss, alias=\ ``root_mean_squared_error``, ``rmse``
- ``quantile``, `Quantile Regression`_
- ``quantile``, `Quantile regression`_
- ``huber``, `Huber loss`_
......@@ -725,7 +728,7 @@ You can specific query/group id in data file now. Please refer to parameter ``gr
.. _Huber loss: https://en.wikipedia.org/wiki/Huber_loss
.. _Quantile Regression: https://en.wikipedia.org/wiki/Quantile_regression
.. _Quantile regression: https://en.wikipedia.org/wiki/Quantile_regression
.. _Fair loss: https://www.kaggle.com/c/allstate-claims-severity/discussion/24520
......
......@@ -15,7 +15,7 @@ import scipy.sparse
from .compat import (DataFrame, Series, integer_types, json,
json_default_with_numpy, numeric_types, range_,
string_type)
string_type, LGBMDeprecationWarning)
from .libpath import find_lib_path
......@@ -570,6 +570,7 @@ class Dataset(object):
Label of the data.
max_bin : int or None, optional (default=None)
Max number of discrete bins for features.
If None, default value from parameters of CLI-version will be used.
reference : Dataset or None, optional (default=None)
If this is Dataset for validation, training data should be used as reference.
weight : list, numpy 1-D array or None, optional (default=None)
......@@ -632,19 +633,26 @@ class Dataset(object):
data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data, feature_name, categorical_feature, self.pandas_categorical)
label = _label_from_pandas(label)
self.data_has_header = False
"""process for args"""
# process for args
params = {} if params is None else params
args_names = getattr(self.__class__, '_lazy_init').__code__.co_varnames[:getattr(self.__class__, '_lazy_init').__code__.co_argcount]
for key, _ in params.items():
if key in args_names:
warnings.warn('{0} keyword has been found in `params` and will be ignored. '
'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
self.max_bin = max_bin
self.predictor = predictor
if self.max_bin is not None:
params["max_bin"] = self.max_bin
warnings.warn('The `max_bin` parameter is deprecated and will be removed in 2.0.12 version. '
'Please use `params` to pass this parameter.', LGBMDeprecationWarning)
if "verbosity" in params:
params.setdefault("verbose", params.pop("verbosity"))
if silent:
params["verbose"] = 0
elif "verbose" not in params:
params["verbose"] = 1
"""get categorical features"""
# get categorical features
if categorical_feature is not None:
categorical_indices = set()
feature_dict = {}
......@@ -666,15 +674,15 @@ class Dataset(object):
params['categorical_column'] = sorted(categorical_indices)
params_str = param_dict_to_str(params)
"""process for reference dataset"""
# process for reference dataset
ref_dataset = None
if isinstance(reference, Dataset):
ref_dataset = reference.construct().handle
elif reference is not None:
raise TypeError('Reference dataset should be None or dataset instance')
"""start construct data"""
# start construct data
if isinstance(data, string_type):
"""check data has header or not"""
# check data has header or not
if str(params.get("has_header", "")).lower() == "true" \
or str(params.get("header", "")).lower() == "true":
self.data_has_header = True
......@@ -739,7 +747,7 @@ class Dataset(object):
if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
else:
"""change non-float data to float data, need to copy"""
# change non-float data to float data, need to copy
data = np.array(mat.reshape(mat.size), dtype=np.float32)
ptr_data, type_ptr_data = c_float_array(data)
......@@ -812,12 +820,12 @@ class Dataset(object):
if self.handle is None:
if self.reference is not None:
if self.used_indices is None:
"""create valid"""
# create valid
self._lazy_init(self.data, label=self.label, max_bin=self.max_bin, reference=self.reference,
weight=self.weight, group=self.group, init_score=self.init_score, predictor=self._predictor,
silent=self.silent, feature_name=self.feature_name, params=self.params)
else:
"""construct subset"""
# construct subset
used_indices = list_to_1d_numpy(self.used_indices, np.int32, name='used_indices')
self.handle = ctypes.c_void_p()
params_str = param_dict_to_str(self.params)
......@@ -830,7 +838,7 @@ class Dataset(object):
if self.get_label() is None:
raise ValueError("Label should not be None.")
else:
"""create train"""
# create train
self._lazy_init(self.data, label=self.label, max_bin=self.max_bin,
weight=self.weight, group=self.group, init_score=self.init_score,
predictor=self._predictor, silent=self.silent, feature_name=self.feature_name,
......@@ -933,7 +941,7 @@ class Dataset(object):
if self.handle is None:
raise Exception("Cannot set %s before construct dataset" % field_name)
if data is None:
"""set to None"""
# set to None
_safe_call(_LIB.LGBM_DatasetSetField(
self.handle,
c_str(field_name),
......@@ -1276,17 +1284,17 @@ class Booster(object):
elif "verbose" not in params:
params["verbose"] = 1
if train_set is not None:
"""Training task"""
# Training task
if not isinstance(train_set, Dataset):
raise TypeError('Training data should be Dataset instance, met {}'.format(type(train_set).__name__))
params_str = param_dict_to_str(params)
"""construct booster object"""
# construct booster object
self.handle = ctypes.c_void_p()
_safe_call(_LIB.LGBM_BoosterCreate(
train_set.construct().handle,
c_str(params_str),
ctypes.byref(self.handle)))
"""save reference to data"""
# save reference to data
self.train_set = train_set
self.valid_sets = []
self.name_valid_sets = []
......@@ -1301,12 +1309,12 @@ class Booster(object):
self.handle,
ctypes.byref(out_num_class)))
self.__num_class = out_num_class.value
"""buffer for inner predict"""
# buffer for inner predict
self.__inner_predict_buffer = [None]
self.__is_predicted_cur_iter = [False]
self.__get_eval_info()
self.pandas_categorical = train_set.pandas_categorical
"""set network if necessary"""
# set network if necessary
if "machines" in params:
machines = params["machines"]
if isinstance(machines, string_type):
......@@ -1321,7 +1329,7 @@ class Booster(object):
listen_time_out=params.get("listen_time_out", 120),
num_machines=params.get("num_machines", num_machines))
elif model_file is not None:
"""Prediction task"""
# Prediction task
out_num_iterations = ctypes.c_int(0)
self.handle = ctypes.c_void_p()
_safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
......@@ -1481,7 +1489,7 @@ class Booster(object):
Whether the update was successfully finished.
"""
"""need reset training data"""
# need reset training data
if train_set is not None and train_set is not self.train_set:
if not isinstance(train_set, Dataset):
raise TypeError('Training data should be Dataset instance, met {}'.format(type(train_set).__name__))
......@@ -1581,7 +1589,7 @@ class Booster(object):
if data is self.valid_sets[i]:
data_idx = i + 1
break
"""need to push new valid data"""
# need to push new valid data
if data_idx == -1:
self.add_valid(data, name)
data_idx = self.__num_dataset - 1
......@@ -1809,7 +1817,7 @@ class Booster(object):
List with names of features.
"""
num_feature = self.num_feature()
"""Get name of features"""
# Get name of features
tmp_out_len = ctypes.c_int(0)
string_buffers = [ctypes.create_string_buffer(255) for i in range_(num_feature)]
ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers))
......@@ -1901,7 +1909,7 @@ class Booster(object):
n_preds = self.valid_sets[data_idx - 1].num_data() * self.__num_class
self.__inner_predict_buffer[data_idx] = \
np.array([0.0 for _ in range_(n_preds)], dtype=np.float64, copy=False)
"""avoid to predict many time in one iteration"""
# avoid to predict many time in one iteration
if not self.__is_predicted_cur_iter[data_idx]:
tmp_out_len = ctypes.c_int64(0)
data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_double))
......@@ -1922,13 +1930,13 @@ class Booster(object):
if self.__need_reload_eval_info:
self.__need_reload_eval_info = False
out_num_eval = ctypes.c_int(0)
"""Get num of inner evals"""
# Get num of inner evals
_safe_call(_LIB.LGBM_BoosterGetEvalCounts(
self.handle,
ctypes.byref(out_num_eval)))
self.__num_inner_eval = out_num_eval.value
if self.__num_inner_eval > 0:
"""Get name of evals"""
# Get name of evals
tmp_out_len = ctypes.c_int(0)
string_buffers = [ctypes.create_string_buffer(255) for i in range_(self.__num_inner_eval)]
ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers))
......
......@@ -96,3 +96,8 @@ except ImportError:
_LGBMCheckArray = None
_LGBMCheckConsistentLength = None
_LGBMCheckClassificationTargets = None
# DeprecationWarning is filtered out by default Python warning settings, so
# define a UserWarning subclass that is shown to users without extra configuration.
class LGBMDeprecationWarning(UserWarning):
    """Deprecation warning displayed by default (unlike DeprecationWarning)."""
......@@ -92,7 +92,7 @@ def train(params, train_set, num_boost_round=100,
booster : Booster
The trained Booster model.
"""
"""create predictor first"""
# create predictor first
for alias in ["num_boost_round", "num_iterations", "num_iteration", "num_tree", "num_trees", "num_round", "num_rounds"]:
if alias in params:
num_boost_round = int(params.pop(alias))
......@@ -111,7 +111,7 @@ def train(params, train_set, num_boost_round=100,
else:
predictor = None
init_iteration = predictor.num_total_iteration if predictor is not None else 0
"""check dataset"""
# check dataset
if not isinstance(train_set, Dataset):
raise TypeError("Training only accepts Dataset object")
......@@ -130,7 +130,7 @@ def train(params, train_set, num_boost_round=100,
if isinstance(valid_names, string_type):
valid_names = [valid_names]
for i, valid_data in enumerate(valid_sets):
"""reduce cost for prediction training data"""
# reduce cost for prediction training data
if valid_data is train_set:
is_valid_contain_train = True
if valid_names is not None:
......@@ -145,7 +145,7 @@ def train(params, train_set, num_boost_round=100,
name_valid_sets.append(valid_names[i])
else:
name_valid_sets.append('valid_' + str(i))
"""process callbacks"""
# process callbacks
if callbacks is None:
callbacks = set()
else:
......@@ -173,7 +173,7 @@ def train(params, train_set, num_boost_round=100,
callbacks_before_iter = sorted(callbacks_before_iter, key=attrgetter('order'))
callbacks_after_iter = sorted(callbacks_after_iter, key=attrgetter('order'))
"""construct booster"""
# construct booster
try:
booster = Booster(params=params, train_set=train_set)
if is_valid_contain_train:
......@@ -186,7 +186,7 @@ def train(params, train_set, num_boost_round=100,
valid_set._reverse_update_params()
booster.best_iteration = 0
"""start training"""
# start training
for i in range_(init_iteration, init_iteration + num_boost_round):
for cb in callbacks_before_iter:
cb(callback.CallbackEnv(model=booster,
......
......@@ -15,15 +15,10 @@ from .basic import Dataset, LightGBMError
from .compat import (SKLEARN_INSTALLED, _LGBMClassifierBase,
LGBMNotFittedError, _LGBMLabelEncoder, _LGBMModelBase,
_LGBMRegressorBase, _LGBMCheckXY, _LGBMCheckArray, _LGBMCheckConsistentLength,
_LGBMCheckClassificationTargets, argc_, range_)
_LGBMCheckClassificationTargets, argc_, range_, LGBMDeprecationWarning)
from .engine import train
# Plain DeprecationWarning is hidden by the default warning filters; derive from
# UserWarning instead so deprecation notices reach users out of the box.
class LGBMDeprecationWarning(UserWarning):
    """Deprecation warning that is visible under default warning filters."""
def _objective_function_wrapper(func):
"""Decorate an objective function
Note: for multi-class task, the y_pred is group by class_id first, then group by row_id.
......@@ -162,7 +157,7 @@ class LGBMModel(_LGBMModelBase):
n_estimators : int, optional (default=10)
Number of boosted trees to fit.
max_bin : int, optional (default=255)
Number of bucketed bin for feature values.
Number of bucketed bins for feature values.
subsample_for_bin : int, optional (default=50000)
Number of samples for constructing bins.
objective : string, callable or None, optional (default=None)
......@@ -279,10 +274,10 @@ class LGBMModel(_LGBMModelBase):
params = super(LGBMModel, self).get_params(deep=deep)
params.update(self._other_params)
if 'seed' in params:
warnings.warn('The `seed` parameter is deprecated and will be removed in next version. '
warnings.warn('The `seed` parameter is deprecated and will be removed in 2.0.12 version. '
'Please use `random_state` instead.', LGBMDeprecationWarning)
if 'nthread' in params:
warnings.warn('The `nthread` parameter is deprecated and will be removed in next version. '
warnings.warn('The `nthread` parameter is deprecated and will be removed in 2.0.12 version. '
'Please use `n_jobs` instead.', LGBMDeprecationWarning)
return params
......@@ -432,7 +427,7 @@ class LGBMModel(_LGBMModelBase):
if isinstance(eval_set, tuple):
eval_set = [eval_set]
for i, valid_data in enumerate(eval_set):
"""reduce cost for prediction training data"""
# reduce cost for prediction training data
if valid_data[0] is X and valid_data[1] is y:
valid_set = train_set
else:
......@@ -584,12 +579,12 @@ class LGBMModel(_LGBMModelBase):
return self.booster_.feature_importance()
def booster(self):
warnings.warn('The `booster()` method is deprecated and will be removed in next version. '
warnings.warn('The `booster()` method is deprecated and will be removed in 2.0.12 version. '
'Please use attribute `booster_` instead.', LGBMDeprecationWarning)
return self.booster_
def feature_importance(self):
warnings.warn('The `feature_importance()` method is deprecated and will be removed in next version. '
warnings.warn('The `feature_importance()` method is deprecated and will be removed in 2.0.12 version. '
'Please use attribute `feature_importances_` instead.', LGBMDeprecationWarning)
return self.feature_importances_
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment