Commit bd5e5e3e authored by Nikita Titov's avatar Nikita Titov Committed by Guolin Ke
Browse files

[python] max_bin parameter deprecated (#1046)

* made max_bin parameter deprecated

* fixed accidental docstrings in Sphinx

* concrete version when deprecated stuff will be removed

* added warnings in case of duplicated params to Dataset

* fixed indents in docs
parent 6c0baa37
......@@ -48,7 +48,8 @@ Core Parameters
- ``convert_model`` for converting model file into if-else format, see more information in `Convert model parameters <#convert-model-parameters>`__
- ``application``, default=\ ``regression``, type=enum,
options=\ ``regression``, ``regression_l2``, ``regression_l1``, ``huber``, ``fair``, ``poisson``, ``binary``, ``lambdarank``, ``multiclass``,
options=\ ``regression``, ``regression_l2``, ``regression_l1``, ``huber``, ``fair``, ``poisson``, ``quantile``, ``quantile_l2``,
``binary``, ``lambdarank``, ``multiclass``,
alias=\ ``objective``, ``app``
- ``regression``, regression application
......@@ -63,9 +64,9 @@ Core Parameters
- ``poisson``, `Poisson regression`_
- ``quantile``, `Quantile Regression`_
- ``quantile``, `Quantile regression`_
- ``quantile_l2``, like the ``quantile``, but use L2 loss instead
- ``quantile_l2``, like the ``quantile``, but L2 loss is used instead
- ``binary``, binary classification application
......@@ -496,7 +497,7 @@ Objective Parameters
- ``alpha``, default=\ ``0.9``, type=double
- parameter for `Huber loss`_ and `Quantile Regression`_. Will be used in ``regression`` task
- parameter for `Huber loss`_ and `Quantile regression`_. Will be used in ``regression`` task
- ``fair_c``, default=\ ``1.0``, type=double
......@@ -546,7 +547,9 @@ Objective Parameters
- ``reg_sqrt``, default=\ ``false``, type=bool
- only used in Regression. Will fit ``sqrt(label)`` instead. And prediction result is also automatically converted to ``pow2(prediction)``
- only used in ``regression``
- will fit ``sqrt(label)`` instead and prediction result will be also automatically converted to ``pow2(prediction)``
Metric Parameters
-----------------
......@@ -560,7 +563,7 @@ Metric Parameters
- ``l2_root``, root square loss, alias=\ ``root_mean_squared_error``, ``rmse``
- ``quantile``, `Quantile Regression`_
- ``quantile``, `Quantile regression`_
- ``huber``, `Huber loss`_
......@@ -725,7 +728,7 @@ You can specific query/group id in data file now. Please refer to parameter ``gr
.. _Huber loss: https://en.wikipedia.org/wiki/Huber_loss
.. _Quantile Regression: https://en.wikipedia.org/wiki/Quantile_regression
.. _Quantile regression: https://en.wikipedia.org/wiki/Quantile_regression
.. _Fair loss: https://www.kaggle.com/c/allstate-claims-severity/discussion/24520
......
......@@ -15,7 +15,7 @@ import scipy.sparse
from .compat import (DataFrame, Series, integer_types, json,
json_default_with_numpy, numeric_types, range_,
string_type)
string_type, LGBMDeprecationWarning)
from .libpath import find_lib_path
......@@ -570,6 +570,7 @@ class Dataset(object):
Label of the data.
max_bin : int or None, optional (default=None)
Max number of discrete bins for features.
If None, default value from parameters of CLI-version will be used.
reference : Dataset or None, optional (default=None)
If this is Dataset for validation, training data should be used as reference.
weight : list, numpy 1-D array or None, optional (default=None)
......@@ -632,19 +633,26 @@ class Dataset(object):
data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data, feature_name, categorical_feature, self.pandas_categorical)
label = _label_from_pandas(label)
self.data_has_header = False
"""process for args"""
# process for args
params = {} if params is None else params
args_names = getattr(self.__class__, '_lazy_init').__code__.co_varnames[:getattr(self.__class__, '_lazy_init').__code__.co_argcount]
for key, _ in params.items():
if key in args_names:
warnings.warn('{0} keyword has been found in `params` and will be ignored. '
'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))
self.max_bin = max_bin
self.predictor = predictor
if self.max_bin is not None:
params["max_bin"] = self.max_bin
warnings.warn('The `max_bin` parameter is deprecated and will be removed in 2.0.12 version. '
'Please use `params` to pass this parameter.', LGBMDeprecationWarning)
if "verbosity" in params:
params.setdefault("verbose", params.pop("verbosity"))
if silent:
params["verbose"] = 0
elif "verbose" not in params:
params["verbose"] = 1
"""get categorical features"""
# get categorical features
if categorical_feature is not None:
categorical_indices = set()
feature_dict = {}
......@@ -666,15 +674,15 @@ class Dataset(object):
params['categorical_column'] = sorted(categorical_indices)
params_str = param_dict_to_str(params)
"""process for reference dataset"""
# process for reference dataset
ref_dataset = None
if isinstance(reference, Dataset):
ref_dataset = reference.construct().handle
elif reference is not None:
raise TypeError('Reference dataset should be None or dataset instance')
"""start construct data"""
# start construct data
if isinstance(data, string_type):
"""check data has header or not"""
# check data has header or not
if str(params.get("has_header", "")).lower() == "true" \
or str(params.get("header", "")).lower() == "true":
self.data_has_header = True
......@@ -739,7 +747,7 @@ class Dataset(object):
if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
else:
"""change non-float data to float data, need to copy"""
# change non-float data to float data, need to copy
data = np.array(mat.reshape(mat.size), dtype=np.float32)
ptr_data, type_ptr_data = c_float_array(data)
......@@ -812,12 +820,12 @@ class Dataset(object):
if self.handle is None:
if self.reference is not None:
if self.used_indices is None:
"""create valid"""
# create valid
self._lazy_init(self.data, label=self.label, max_bin=self.max_bin, reference=self.reference,
weight=self.weight, group=self.group, init_score=self.init_score, predictor=self._predictor,
silent=self.silent, feature_name=self.feature_name, params=self.params)
else:
"""construct subset"""
# construct subset
used_indices = list_to_1d_numpy(self.used_indices, np.int32, name='used_indices')
self.handle = ctypes.c_void_p()
params_str = param_dict_to_str(self.params)
......@@ -830,7 +838,7 @@ class Dataset(object):
if self.get_label() is None:
raise ValueError("Label should not be None.")
else:
"""create train"""
# create train
self._lazy_init(self.data, label=self.label, max_bin=self.max_bin,
weight=self.weight, group=self.group, init_score=self.init_score,
predictor=self._predictor, silent=self.silent, feature_name=self.feature_name,
......@@ -933,7 +941,7 @@ class Dataset(object):
if self.handle is None:
raise Exception("Cannot set %s before construct dataset" % field_name)
if data is None:
"""set to None"""
# set to None
_safe_call(_LIB.LGBM_DatasetSetField(
self.handle,
c_str(field_name),
......@@ -1276,17 +1284,17 @@ class Booster(object):
elif "verbose" not in params:
params["verbose"] = 1
if train_set is not None:
"""Training task"""
# Training task
if not isinstance(train_set, Dataset):
raise TypeError('Training data should be Dataset instance, met {}'.format(type(train_set).__name__))
params_str = param_dict_to_str(params)
"""construct booster object"""
# construct booster object
self.handle = ctypes.c_void_p()
_safe_call(_LIB.LGBM_BoosterCreate(
train_set.construct().handle,
c_str(params_str),
ctypes.byref(self.handle)))
"""save reference to data"""
# save reference to data
self.train_set = train_set
self.valid_sets = []
self.name_valid_sets = []
......@@ -1301,12 +1309,12 @@ class Booster(object):
self.handle,
ctypes.byref(out_num_class)))
self.__num_class = out_num_class.value
"""buffer for inner predict"""
# buffer for inner predict
self.__inner_predict_buffer = [None]
self.__is_predicted_cur_iter = [False]
self.__get_eval_info()
self.pandas_categorical = train_set.pandas_categorical
"""set network if necessary"""
# set network if necessary
if "machines" in params:
machines = params["machines"]
if isinstance(machines, string_type):
......@@ -1321,7 +1329,7 @@ class Booster(object):
listen_time_out=params.get("listen_time_out", 120),
num_machines=params.get("num_machines", num_machines))
elif model_file is not None:
"""Prediction task"""
# Prediction task
out_num_iterations = ctypes.c_int(0)
self.handle = ctypes.c_void_p()
_safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
......@@ -1481,7 +1489,7 @@ class Booster(object):
Whether the update was successfully finished.
"""
"""need reset training data"""
# need reset training data
if train_set is not None and train_set is not self.train_set:
if not isinstance(train_set, Dataset):
raise TypeError('Training data should be Dataset instance, met {}'.format(type(train_set).__name__))
......@@ -1581,7 +1589,7 @@ class Booster(object):
if data is self.valid_sets[i]:
data_idx = i + 1
break
"""need to push new valid data"""
# need to push new valid data
if data_idx == -1:
self.add_valid(data, name)
data_idx = self.__num_dataset - 1
......@@ -1809,7 +1817,7 @@ class Booster(object):
List with names of features.
"""
num_feature = self.num_feature()
"""Get name of features"""
# Get name of features
tmp_out_len = ctypes.c_int(0)
string_buffers = [ctypes.create_string_buffer(255) for i in range_(num_feature)]
ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers))
......@@ -1901,7 +1909,7 @@ class Booster(object):
n_preds = self.valid_sets[data_idx - 1].num_data() * self.__num_class
self.__inner_predict_buffer[data_idx] = \
np.array([0.0 for _ in range_(n_preds)], dtype=np.float64, copy=False)
"""avoid to predict many time in one iteration"""
# avoid to predict many time in one iteration
if not self.__is_predicted_cur_iter[data_idx]:
tmp_out_len = ctypes.c_int64(0)
data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_double))
......@@ -1922,13 +1930,13 @@ class Booster(object):
if self.__need_reload_eval_info:
self.__need_reload_eval_info = False
out_num_eval = ctypes.c_int(0)
"""Get num of inner evals"""
# Get num of inner evals
_safe_call(_LIB.LGBM_BoosterGetEvalCounts(
self.handle,
ctypes.byref(out_num_eval)))
self.__num_inner_eval = out_num_eval.value
if self.__num_inner_eval > 0:
"""Get name of evals"""
# Get name of evals
tmp_out_len = ctypes.c_int(0)
string_buffers = [ctypes.create_string_buffer(255) for i in range_(self.__num_inner_eval)]
ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers))
......
......@@ -96,3 +96,8 @@ except ImportError:
_LGBMCheckArray = None
_LGBMCheckConsistentLength = None
_LGBMCheckClassificationTargets = None
# DeprecationWarning is filtered out by default Python warning settings, so
# define a UserWarning subclass that is shown to users without extra configuration.
class LGBMDeprecationWarning(UserWarning):
    """Deprecation warning displayed by default (unlike DeprecationWarning)."""
......@@ -92,7 +92,7 @@ def train(params, train_set, num_boost_round=100,
booster : Booster
The trained Booster model.
"""
"""create predictor first"""
# create predictor first
for alias in ["num_boost_round", "num_iterations", "num_iteration", "num_tree", "num_trees", "num_round", "num_rounds"]:
if alias in params:
num_boost_round = int(params.pop(alias))
......@@ -111,7 +111,7 @@ def train(params, train_set, num_boost_round=100,
else:
predictor = None
init_iteration = predictor.num_total_iteration if predictor is not None else 0
"""check dataset"""
# check dataset
if not isinstance(train_set, Dataset):
raise TypeError("Training only accepts Dataset object")
......@@ -130,7 +130,7 @@ def train(params, train_set, num_boost_round=100,
if isinstance(valid_names, string_type):
valid_names = [valid_names]
for i, valid_data in enumerate(valid_sets):
"""reduce cost for prediction training data"""
# reduce cost for prediction training data
if valid_data is train_set:
is_valid_contain_train = True
if valid_names is not None:
......@@ -145,7 +145,7 @@ def train(params, train_set, num_boost_round=100,
name_valid_sets.append(valid_names[i])
else:
name_valid_sets.append('valid_' + str(i))
"""process callbacks"""
# process callbacks
if callbacks is None:
callbacks = set()
else:
......@@ -173,7 +173,7 @@ def train(params, train_set, num_boost_round=100,
callbacks_before_iter = sorted(callbacks_before_iter, key=attrgetter('order'))
callbacks_after_iter = sorted(callbacks_after_iter, key=attrgetter('order'))
"""construct booster"""
# construct booster
try:
booster = Booster(params=params, train_set=train_set)
if is_valid_contain_train:
......@@ -186,7 +186,7 @@ def train(params, train_set, num_boost_round=100,
valid_set._reverse_update_params()
booster.best_iteration = 0
"""start training"""
# start training
for i in range_(init_iteration, init_iteration + num_boost_round):
for cb in callbacks_before_iter:
cb(callback.CallbackEnv(model=booster,
......
......@@ -15,15 +15,10 @@ from .basic import Dataset, LightGBMError
from .compat import (SKLEARN_INSTALLED, _LGBMClassifierBase,
LGBMNotFittedError, _LGBMLabelEncoder, _LGBMModelBase,
_LGBMRegressorBase, _LGBMCheckXY, _LGBMCheckArray, _LGBMCheckConsistentLength,
_LGBMCheckClassificationTargets, argc_, range_)
_LGBMCheckClassificationTargets, argc_, range_, LGBMDeprecationWarning)
from .engine import train
# Plain DeprecationWarning is hidden by the default warning filters; derive from
# UserWarning instead so deprecation notices reach users out of the box.
class LGBMDeprecationWarning(UserWarning):
    """Deprecation warning that is visible under default warning filters."""
def _objective_function_wrapper(func):
"""Decorate an objective function
Note: for multi-class task, the y_pred is group by class_id first, then group by row_id.
......@@ -162,7 +157,7 @@ class LGBMModel(_LGBMModelBase):
n_estimators : int, optional (default=10)
Number of boosted trees to fit.
max_bin : int, optional (default=255)
Number of bucketed bin for feature values.
Number of bucketed bins for feature values.
subsample_for_bin : int, optional (default=50000)
Number of samples for constructing bins.
objective : string, callable or None, optional (default=None)
......@@ -279,10 +274,10 @@ class LGBMModel(_LGBMModelBase):
params = super(LGBMModel, self).get_params(deep=deep)
params.update(self._other_params)
if 'seed' in params:
warnings.warn('The `seed` parameter is deprecated and will be removed in next version. '
warnings.warn('The `seed` parameter is deprecated and will be removed in 2.0.12 version. '
'Please use `random_state` instead.', LGBMDeprecationWarning)
if 'nthread' in params:
warnings.warn('The `nthread` parameter is deprecated and will be removed in next version. '
warnings.warn('The `nthread` parameter is deprecated and will be removed in 2.0.12 version. '
'Please use `n_jobs` instead.', LGBMDeprecationWarning)
return params
......@@ -432,7 +427,7 @@ class LGBMModel(_LGBMModelBase):
if isinstance(eval_set, tuple):
eval_set = [eval_set]
for i, valid_data in enumerate(eval_set):
"""reduce cost for prediction training data"""
# reduce cost for prediction training data
if valid_data[0] is X and valid_data[1] is y:
valid_set = train_set
else:
......@@ -584,12 +579,12 @@ class LGBMModel(_LGBMModelBase):
return self.booster_.feature_importance()
def booster(self):
warnings.warn('The `booster()` method is deprecated and will be removed in next version. '
warnings.warn('The `booster()` method is deprecated and will be removed in 2.0.12 version. '
'Please use attribute `booster_` instead.', LGBMDeprecationWarning)
return self.booster_
def feature_importance(self):
warnings.warn('The `feature_importance()` method is deprecated and will be removed in next version. '
warnings.warn('The `feature_importance()` method is deprecated and will be removed in 2.0.12 version. '
'Please use attribute `feature_importances_` instead.', LGBMDeprecationWarning)
return self.feature_importances_
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment