Commit 1c6c7046 authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

add @property to sklearn interface (#155)

* add @property to sklearn interface

* add deprecated; fix binary_metric
parent 6b73b4ae
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
* [Scikit-learn API](Python-API.md#scikit-learn-api) * [Scikit-learn API](Python-API.md#scikit-learn-api)
- [Common Methods](Python-API.md#common-methods) - [Common Methods](Python-API.md#common-methods)
- [Common Attributes](Python-API.md#common-attributes)
- [LGBMClassifier](Python-API.md#lgbmclassifier) - [LGBMClassifier](Python-API.md#lgbmclassifier)
- [LGBMRegressor](Python-API.md#lgbmregressor) - [LGBMRegressor](Python-API.md#lgbmregressor)
- [LGBMRanker](Python-API.md#lgbmranker) - [LGBMRanker](Python-API.md#lgbmranker)
...@@ -675,35 +676,6 @@ The methods of each Class is in alphabetical order. ...@@ -675,35 +676,6 @@ The methods of each Class is in alphabetical order.
X_leaves : array_like, shape=[n_samples, n_trees] X_leaves : array_like, shape=[n_samples, n_trees]
####booster()
Get the underlying lightgbm Booster of this model.
This will raise an exception when it's called before fit().
Returns
-------
booster : a lightgbm booster of underlying model
####evals_result()
Return the evaluation results.
Returns
-------
evals_result : dictionary
####feature_importance()
Return the feature importances of each feature.
Returns
-------
result : array
Array of normalized feature importances
####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None, callbacks=None) ####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None, callbacks=None)
Fit the gradient boosting model. Fit the gradient boosting model.
...@@ -771,7 +743,7 @@ The methods of each Class is in alphabetical order. ...@@ -771,7 +743,7 @@ The methods of each Class is in alphabetical order.
if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i] if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
####predict(data, raw_score=False, num_iteration=0) ####predict(X, raw_score=False, num_iteration=0)
Return the predicted value for each sample. Return the predicted value for each sample.
...@@ -786,11 +758,26 @@ The methods of each Class is in alphabetical order. ...@@ -786,11 +758,26 @@ The methods of each Class is in alphabetical order.
Returns Returns
------- -------
predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes] predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]
###Common Attributes
####booster_
Get the underlying lightgbm Booster of this model.
####evals_result_
Get the evaluation results.
####feature_importance_
Get normalized feature importances.
###LGBMClassifier ###LGBMClassifier
####predict_proba(data, raw_score=False, num_iteration=0) ####predict_proba(X, raw_score=False, num_iteration=0)
Return the predicted probability for each class for each sample. Return the predicted probability for each class for each sample.
...@@ -805,6 +792,14 @@ The methods of each Class is in alphabetical order. ...@@ -805,6 +792,14 @@ The methods of each Class is in alphabetical order.
Returns Returns
------- -------
predicted_probability : array_like, shape=[n_samples, n_classes] predicted_probability : array_like, shape=[n_samples, n_classes]
####classes_
Get class label array.
####n_classes_
Get number of classes.
###LGBMRegressor ###LGBMRegressor
......
...@@ -34,7 +34,7 @@ print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) ...@@ -34,7 +34,7 @@ print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('Calculate feature importances...') print('Calculate feature importances...')
# feature importances # feature importances
print('Feature importances:', list(gbm.feature_importance())) print('Feature importances:', list(gbm.feature_importance_))
# other scikit-learn modules # other scikit-learn modules
estimator = lgb.LGBMRegressor(num_leaves=31) estimator = lgb.LGBMRegressor(num_leaves=31)
......
...@@ -132,7 +132,7 @@ def param_dict_to_str(data): ...@@ -132,7 +132,7 @@ def param_dict_to_str(data):
% (key, type(val).__name__)) % (key, type(val).__name__))
return ' '.join(pairs) return ' '.join(pairs)
class _temp_file: class _temp_file(object):
def __enter__(self): def __enter__(self):
with NamedTemporaryFile(prefix="lightgbm_tmp_", delete=True) as f: with NamedTemporaryFile(prefix="lightgbm_tmp_", delete=True) as f:
self.name = f.name self.name = f.name
...@@ -146,7 +146,7 @@ class _temp_file: ...@@ -146,7 +146,7 @@ class _temp_file:
return ret return ret
def writelines(self, lines): def writelines(self, lines):
with open(self.name, "w+") as f: with open(self.name, "w+") as f:
ret = f.writelines(lines) f.writelines(lines)
"""marco definition of data type in c_api of LightGBM""" """marco definition of data type in c_api of LightGBM"""
C_API_DTYPE_FLOAT32 = 0 C_API_DTYPE_FLOAT32 = 0
......
...@@ -5,13 +5,14 @@ from __future__ import absolute_import ...@@ -5,13 +5,14 @@ from __future__ import absolute_import
import inspect import inspect
import numpy as np import numpy as np
from .basic import LightGBMError, Dataset, is_str from .basic import LightGBMError, Dataset
from .engine import train from .engine import train
'''sklearn''' '''sklearn'''
try: try:
from sklearn.base import BaseEstimator from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin, ClassifierMixin from sklearn.base import RegressorMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import LabelEncoder
from sklearn.utils import deprecated
SKLEARN_INSTALLED = True SKLEARN_INSTALLED = True
LGBMModelBase = BaseEstimator LGBMModelBase = BaseEstimator
LGBMRegressorBase = RegressorMixin LGBMRegressorBase = RegressorMixin
...@@ -251,25 +252,13 @@ class LGBMModel(LGBMModelBase): ...@@ -251,25 +252,13 @@ class LGBMModel(LGBMModelBase):
self.uniform_drop = uniform_drop self.uniform_drop = uniform_drop
self.xgboost_dart_mode = xgboost_dart_mode self.xgboost_dart_mode = xgboost_dart_mode
self._Booster = None self._Booster = None
self.evals_result = None
self.best_iteration = -1 self.best_iteration = -1
if callable(self.objective): if callable(self.objective):
self.fobj = _objective_function_wrapper(self.objective) self.fobj = _objective_function_wrapper(self.objective)
else: else:
self.fobj = None self.fobj = None
def booster(self):
"""
Get the underlying lightgbm Booster of this model.
This will raise an exception when fit was not called
Returns
-------
booster : a lightgbm booster of underlying model
"""
if self._Booster is None:
raise LightGBMError('Need to call fit beforehand')
return self._Booster
def fit(self, X, y, def fit(self, X, y,
sample_weight=None, init_score=None, group=None, sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None, eval_set=None, eval_sample_weight=None,
...@@ -349,19 +338,15 @@ class LGBMModel(LGBMModelBase): ...@@ -349,19 +338,15 @@ class LGBMModel(LGBMModelBase):
params['num_class'] = self.n_classes_ params['num_class'] = self.n_classes_
if hasattr(self, 'eval_at'): if hasattr(self, 'eval_at'):
params['ndcg_eval_at'] = self.eval_at params['ndcg_eval_at'] = self.eval_at
if self.fobj: if self.fobj:
params["objective"] = "None" # objective = nullptr for unknown objective
else: params['objective'] = 'None'
params["objective"] = self.objective
if callable(eval_metric): if callable(eval_metric):
feval = _eval_function_wrapper(eval_metric) feval = _eval_function_wrapper(eval_metric)
elif is_str(eval_metric) or isinstance(eval_metric, list):
feval = None
params.update({'metric': eval_metric})
else: else:
feval = None feval = None
params['metric'] = eval_metric
def _construct_dataset(X, y, sample_weight, init_score, group, params): def _construct_dataset(X, y, sample_weight, init_score, group, params):
ret = Dataset(X, label=y, max_bin=self.max_bin, weight=sample_weight, group=group, params=params) ret = Dataset(X, label=y, max_bin=self.max_bin, weight=sample_weight, group=group, params=params)
...@@ -383,10 +368,7 @@ class LGBMModel(LGBMModelBase): ...@@ -383,10 +368,7 @@ class LGBMModel(LGBMModelBase):
if collection is None: if collection is None:
return None return None
elif isinstance(collection, list): elif isinstance(collection, list):
if len(collection) > i: return collection[i] if len(collection) > i else None
return collection[i]
else:
return None
elif isinstance(collection, dict): elif isinstance(collection, dict):
return collection.get(i, None) return collection.get(i, None)
else: else:
...@@ -406,16 +388,13 @@ class LGBMModel(LGBMModelBase): ...@@ -406,16 +388,13 @@ class LGBMModel(LGBMModelBase):
callbacks=callbacks) callbacks=callbacks)
if evals_result: if evals_result:
for val in evals_result.items(): self.evals_result = evals_result
evals_result_key = list(val[1].keys())[0]
evals_result[val[0]][evals_result_key] = val[1][evals_result_key]
self.evals_result_ = evals_result
if early_stopping_rounds is not None: if early_stopping_rounds is not None:
self.best_iteration = self._Booster.best_iteration self.best_iteration = self._Booster.best_iteration
return self return self
def predict(self, data, raw_score=False, num_iteration=0): def predict(self, X, raw_score=False, num_iteration=0):
""" """
Return the predicted value for each sample. Return the predicted value for each sample.
...@@ -431,9 +410,7 @@ class LGBMModel(LGBMModelBase): ...@@ -431,9 +410,7 @@ class LGBMModel(LGBMModelBase):
------- -------
predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes] predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]
""" """
return self._Booster.predict(data, return self.booster_.predict(X, raw_score=raw_score, num_iteration=num_iteration)
raw_score=raw_score,
num_iteration=num_iteration)
def apply(self, X, num_iteration=0): def apply(self, X, num_iteration=0):
""" """
...@@ -451,35 +428,35 @@ class LGBMModel(LGBMModelBase): ...@@ -451,35 +428,35 @@ class LGBMModel(LGBMModelBase):
------- -------
X_leaves : array_like, shape=[n_samples, n_trees] X_leaves : array_like, shape=[n_samples, n_trees]
""" """
return self._Booster.predict(X, return self.booster_.predict(X, pred_leaf=True, num_iteration=num_iteration)
pred_leaf=True,
num_iteration=num_iteration)
def evals_result(self): @property
""" def booster_(self):
Return the evaluation results. """Get the underlying lightgbm Booster of this model."""
if self._Booster is None:
raise LightGBMError('No booster found. Need to call fit beforehand.')
return self._Booster
Returns @property
------- def evals_result_(self):
evals_result : dictionary """Get the evaluation results."""
""" if self.evals_result is None:
if self.evals_result_: raise LightGBMError('No results found. Need to call fit with eval set beforehand.')
evals_result = self.evals_result_ return self.evals_result
else:
raise LightGBMError('No results found.') @property
def feature_importance_(self):
"""Get normailized feature importances."""
importace_array = self.booster_.feature_importance().astype(np.float32)
return importace_array / importace_array.sum()
return evals_result @deprecated('Use attribute booster_ instead.')
def booster(self):
return self.booster_
@deprecated('Use attribute feature_importance_ instead.')
def feature_importance(self): def feature_importance(self):
""" return self.feature_importance_
Feature importances
Returns
-------
Array of normailized feature importances
Array of normalized feature importances
"""
importace_array = self._Booster.feature_importance().astype(np.float32)
return importace_array / importace_array.sum()
class LGBMRegressor(LGBMModel, LGBMRegressorBase): class LGBMRegressor(LGBMModel, LGBMRegressorBase):
...@@ -513,6 +490,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase): ...@@ -513,6 +490,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
is_unbalance=False, seed=0, is_unbalance=False, seed=0,
drop_rate=0.1, skip_drop=0.5, max_drop=50, drop_rate=0.1, skip_drop=0.5, max_drop=50,
uniform_drop=False, xgboost_dart_mode=False): uniform_drop=False, xgboost_dart_mode=False):
self.classes, self.n_classes = None, None
super(LGBMClassifier, self).__init__(boosting_type=boosting_type, num_leaves=num_leaves, super(LGBMClassifier, self).__init__(boosting_type=boosting_type, num_leaves=num_leaves,
max_depth=max_depth, learning_rate=learning_rate, max_depth=max_depth, learning_rate=learning_rate,
n_estimators=n_estimators, max_bin=max_bin, n_estimators=n_estimators, max_bin=max_bin,
...@@ -533,12 +511,12 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase): ...@@ -533,12 +511,12 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
early_stopping_rounds=None, verbose=True, early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None, feature_name=None, categorical_feature=None,
callbacks=None): callbacks=None):
self._le = LGBMLabelEncoder().fit(y) self._le = LGBMLabelEncoder().fit(y)
y = self._le.transform(y) y = self._le.transform(y)
self.n_classes_ = len(self._le.classes_) self.classes = self._le.classes_
if self.n_classes_ > 2: self.n_classes = len(self.classes_)
if self.n_classes > 2:
# Switch to using a multiclass objective in the underlying LGBM instance # Switch to using a multiclass objective in the underlying LGBM instance
self.objective = "multiclass" self.objective = "multiclass"
if eval_set is not None and eval_metric == "binary_logloss": if eval_set is not None and eval_metric == "binary_logloss":
...@@ -558,18 +536,12 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase): ...@@ -558,18 +536,12 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
callbacks=callbacks) callbacks=callbacks)
return self return self
def predict(self, data, raw_score=False, num_iteration=0): def predict(self, X, raw_score=False, num_iteration=0):
class_probs = self._Booster.predict(data, class_probs = self.predict_proba(X, raw_score, num_iteration)
raw_score=raw_score, class_index = np.argmax(class_probs, axis=1)
num_iteration=num_iteration) return self._le.inverse_transform(class_index)
if len(class_probs.shape) > 1:
column_indexes = np.argmax(class_probs, axis=1)
else:
column_indexes = np.repeat(0, class_probs.shape[0])
column_indexes[class_probs > 0.5] = 1
return self._le.inverse_transform(column_indexes)
def predict_proba(self, data, raw_score=False, num_iteration=0): def predict_proba(self, X, raw_score=False, num_iteration=0):
""" """
Return the predicted probability for each class for each sample. Return the predicted probability for each class for each sample.
...@@ -585,15 +557,25 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase): ...@@ -585,15 +557,25 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
------- -------
predicted_probability : array_like, shape=[n_samples, n_classes] predicted_probability : array_like, shape=[n_samples, n_classes]
""" """
class_probs = self._Booster.predict(data, class_probs = self.booster_.predict(X, raw_score=raw_score, num_iteration=num_iteration)
raw_score=raw_score, if self.n_classes > 2:
num_iteration=num_iteration)
if self.n_classes_ > 2:
return class_probs return class_probs
else: else:
classone_probs = class_probs return np.vstack((1. - class_probs, class_probs)).transpose()
classzero_probs = 1.0 - classone_probs
return np.vstack((classzero_probs, classone_probs)).transpose() @property
def classes_(self):
"""Get class label array."""
if self.classes is None:
raise LightGBMError('No classes found. Need to call fit beforehand.')
return self.classes
@property
def n_classes_(self):
"""Get number of classes"""
if self.n_classes is None:
raise LightGBMError('No classes found. Need to call fit beforehand.')
return self.n_classes
class LGBMRanker(LGBMModel): class LGBMRanker(LGBMModel):
......
...@@ -127,7 +127,7 @@ public: ...@@ -127,7 +127,7 @@ public:
explicit BinaryErrorMetric(const MetricConfig& config) :BinaryMetric<BinaryErrorMetric>(config) {} explicit BinaryErrorMetric(const MetricConfig& config) :BinaryMetric<BinaryErrorMetric>(config) {}
inline static score_t LossOnPoint(float label, score_t prob) { inline static score_t LossOnPoint(float label, score_t prob) {
if (prob < 0.5f) { if (prob <= 0.5f) {
return label; return label;
} else { } else {
return 1.0f - label; return 1.0f - label;
......
...@@ -17,13 +17,14 @@ def multi_logloss(y_true, y_pred): ...@@ -17,13 +17,14 @@ def multi_logloss(y_true, y_pred):
def test_template(params = {'objective' : 'regression', 'metric' : 'l2'}, def test_template(params = {'objective' : 'regression', 'metric' : 'l2'},
X_y=load_boston(True), feval=mean_squared_error, X_y=load_boston(True), feval=mean_squared_error,
num_round=100, init_model=None, custom_eval=None, num_round=100, init_model=None, custom_eval=None,
return_data=False, return_model=False, early_stopping_rounds=10): early_stopping_rounds=10,
return_data=False, return_model=False):
params['verbose'], params['seed'] = -1, 42
X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42) X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
lgb_train = lgb.Dataset(X_train, y_train, params=params) lgb_train = lgb.Dataset(X_train, y_train, params=params)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
if return_data: return lgb_train, lgb_eval if return_data: return lgb_train, lgb_eval
evals_result = {} evals_result = {}
params['verbose'] = params['seed'] = 0
gbm = lgb.train(params, lgb_train, gbm = lgb.train(params, lgb_train,
num_boost_round=num_round, num_boost_round=num_round,
valid_sets=lgb_eval, valid_sets=lgb_eval,
......
...@@ -86,19 +86,28 @@ class TestSklearn(unittest.TestCase): ...@@ -86,19 +86,28 @@ class TestSklearn(unittest.TestCase):
gbm.fit(X_train, y_train) gbm.fit(X_train, y_train)
self.assertIn(gbm.best_params_['n_estimators'], [15, 20]) self.assertIn(gbm.best_params_['n_estimators'], [15, 20])
def test_clone(self): def test_clone_and_property(self):
gbm = test_template(return_model=True) gbm = test_template(return_model=True)
gbm_clone = clone(gbm) gbm_clone = clone(gbm)
self.assertIsInstance(gbm.booster_, lgb.Booster)
self.assertIsInstance(gbm.feature_importance_, np.ndarray)
clf = test_template(load_digits(2, True), model=lgb.LGBMClassifier, return_model=True)
self.assertListEqual(sorted(clf.classes_), [0, 1])
self.assertEqual(clf.n_classes_, 2)
self.assertIsInstance(clf.booster_, lgb.Booster)
self.assertIsInstance(clf.feature_importance_, np.ndarray)
def test_joblib(self): def test_joblib(self):
gbm = test_template(num_round=10, return_model=True) gbm = test_template(num_round=10, return_model=True)
joblib.dump(gbm, 'lgb.pkl') joblib.dump(gbm, 'lgb.pkl')
gbm_pickle = joblib.load('lgb.pkl') gbm_pickle = joblib.load('lgb.pkl')
self.assertIsInstance(gbm_pickle.booster_, lgb.Booster)
self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params()) self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params())
self.assertListEqual(list(gbm.feature_importance_), list(gbm_pickle.feature_importance_))
X_train, X_test, y_train, y_test = test_template(return_data=True) X_train, X_test, y_train, y_test = test_template(return_data=True)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False) gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
gbm_pickle.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False) gbm_pickle.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
self.assertDictEqual(gbm.evals_result(), gbm_pickle.evals_result()) self.assertDictEqual(gbm.evals_result_, gbm_pickle.evals_result_)
pred_origin = gbm.predict(X_test) pred_origin = gbm.predict(X_test)
pred_pickle = gbm_pickle.predict(X_test) pred_pickle = gbm_pickle.predict(X_test)
self.assertEqual(len(pred_origin), len(pred_pickle)) self.assertEqual(len(pred_origin), len(pred_pickle))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment