Commit 1c6c7046 authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

add @property to sklearn interface (#155)

* add @property to sklearn interface

* add deprecated; fix binary_metric
parent 6b73b4ae
......@@ -10,6 +10,7 @@
* [Scikit-learn API](Python-API.md#scikit-learn-api)
- [Common Methods](Python-API.md#common-methods)
- [Common Attributes](Python-API.md#common-attributes)
- [LGBMClassifier](Python-API.md#lgbmclassifier)
- [LGBMRegressor](Python-API.md#lgbmregressor)
- [LGBMRanker](Python-API.md#lgbmranker)
......@@ -675,35 +676,6 @@ The methods of each Class is in alphabetical order.
X_leaves : array_like, shape=[n_samples, n_trees]
####booster()
Get the underlying lightgbm Booster of this model.
This will raise an exception when it's called before fit().
Returns
-------
booster : a lightgbm booster of underlying model
####evals_result()
Return the evaluation results.
Returns
-------
evals_result : dictionary
####feature_importance()
Return the feature importances of each feature.
Returns
-------
result : array
Array of normalized feature importances
####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None, callbacks=None)
Fit the gradient boosting model.
......@@ -771,7 +743,7 @@ The methods of each Class is in alphabetical order.
if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
####predict(data, raw_score=False, num_iteration=0)
####predict(X, raw_score=False, num_iteration=0)
Return the predicted value for each sample.
......@@ -788,9 +760,24 @@ The methods of each Class is in alphabetical order.
predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]
###Common Attributes
####booster_
Get the underlying lightgbm Booster of this model.
####evals_result_
Get the evaluation results.
####feature_importance_
Get normalized feature importances.
###LGBMClassifier
####predict_proba(data, raw_score=False, num_iteration=0)
####predict_proba(X, raw_score=False, num_iteration=0)
Return the predicted probability for each class for each sample.
......@@ -806,6 +793,14 @@ The methods of each Class is in alphabetical order.
-------
predicted_probability : array_like, shape=[n_samples, n_classes]
####classes_
Get class label array.
####n_classes_
Get number of classes.
###LGBMRegressor
......
......@@ -34,7 +34,7 @@ print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('Calculate feature importances...')
# feature importances
print('Feature importances:', list(gbm.feature_importance()))
print('Feature importances:', list(gbm.feature_importance_))
# other scikit-learn modules
estimator = lgb.LGBMRegressor(num_leaves=31)
......
......@@ -132,7 +132,7 @@ def param_dict_to_str(data):
% (key, type(val).__name__))
return ' '.join(pairs)
class _temp_file:
class _temp_file(object):
def __enter__(self):
with NamedTemporaryFile(prefix="lightgbm_tmp_", delete=True) as f:
self.name = f.name
......@@ -146,7 +146,7 @@ class _temp_file:
return ret
def writelines(self, lines):
with open(self.name, "w+") as f:
ret = f.writelines(lines)
f.writelines(lines)
"""marco definition of data type in c_api of LightGBM"""
C_API_DTYPE_FLOAT32 = 0
......
......@@ -5,13 +5,14 @@ from __future__ import absolute_import
import inspect
import numpy as np
from .basic import LightGBMError, Dataset, is_str
from .basic import LightGBMError, Dataset
from .engine import train
'''sklearn'''
try:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import deprecated
SKLEARN_INSTALLED = True
LGBMModelBase = BaseEstimator
LGBMRegressorBase = RegressorMixin
......@@ -251,25 +252,13 @@ class LGBMModel(LGBMModelBase):
self.uniform_drop = uniform_drop
self.xgboost_dart_mode = xgboost_dart_mode
self._Booster = None
self.evals_result = None
self.best_iteration = -1
if callable(self.objective):
self.fobj = _objective_function_wrapper(self.objective)
else:
self.fobj = None
def booster(self):
"""
Get the underlying lightgbm Booster of this model.
This will raise an exception when fit was not called
Returns
-------
booster : a lightgbm booster of underlying model
"""
if self._Booster is None:
raise LightGBMError('Need to call fit beforehand')
return self._Booster
def fit(self, X, y,
sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None,
......@@ -349,19 +338,15 @@ class LGBMModel(LGBMModelBase):
params['num_class'] = self.n_classes_
if hasattr(self, 'eval_at'):
params['ndcg_eval_at'] = self.eval_at
if self.fobj:
params["objective"] = "None"
else:
params["objective"] = self.objective
# objective = nullptr for unknown objective
params['objective'] = 'None'
if callable(eval_metric):
feval = _eval_function_wrapper(eval_metric)
elif is_str(eval_metric) or isinstance(eval_metric, list):
feval = None
params.update({'metric': eval_metric})
else:
feval = None
params['metric'] = eval_metric
def _construct_dataset(X, y, sample_weight, init_score, group, params):
ret = Dataset(X, label=y, max_bin=self.max_bin, weight=sample_weight, group=group, params=params)
......@@ -383,10 +368,7 @@ class LGBMModel(LGBMModelBase):
if collection is None:
return None
elif isinstance(collection, list):
if len(collection) > i:
return collection[i]
else:
return None
return collection[i] if len(collection) > i else None
elif isinstance(collection, dict):
return collection.get(i, None)
else:
......@@ -406,16 +388,13 @@ class LGBMModel(LGBMModelBase):
callbacks=callbacks)
if evals_result:
for val in evals_result.items():
evals_result_key = list(val[1].keys())[0]
evals_result[val[0]][evals_result_key] = val[1][evals_result_key]
self.evals_result_ = evals_result
self.evals_result = evals_result
if early_stopping_rounds is not None:
self.best_iteration = self._Booster.best_iteration
return self
def predict(self, data, raw_score=False, num_iteration=0):
def predict(self, X, raw_score=False, num_iteration=0):
"""
Return the predicted value for each sample.
......@@ -431,9 +410,7 @@ class LGBMModel(LGBMModelBase):
-------
predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]
"""
return self._Booster.predict(data,
raw_score=raw_score,
num_iteration=num_iteration)
return self.booster_.predict(X, raw_score=raw_score, num_iteration=num_iteration)
def apply(self, X, num_iteration=0):
"""
......@@ -451,35 +428,35 @@ class LGBMModel(LGBMModelBase):
-------
X_leaves : array_like, shape=[n_samples, n_trees]
"""
return self._Booster.predict(X,
pred_leaf=True,
num_iteration=num_iteration)
return self.booster_.predict(X, pred_leaf=True, num_iteration=num_iteration)
def evals_result(self):
"""
Return the evaluation results.
@property
def booster_(self):
    """Get the underlying lightgbm Booster of this model."""
    # fit() populates self._Booster; before that, accessing this property is an error.
    fitted_booster = self._Booster
    if fitted_booster is None:
        raise LightGBMError('No booster found. Need to call fit beforehand.')
    return fitted_booster
Returns
-------
evals_result : dictionary
"""
if self.evals_result_:
evals_result = self.evals_result_
else:
raise LightGBMError('No results found.')
@property
def evals_result_(self):
    """Get the evaluation results."""
    # Populated by fit() only when an eval set was supplied; None otherwise.
    results = self.evals_result
    if results is None:
        raise LightGBMError('No results found. Need to call fit with eval set beforehand.')
    return results
@property
def feature_importance_(self):
    """Get normalized feature importances.

    Returns
    -------
    importances : array of float32
        Per-feature importances from the underlying booster, scaled so
        that they sum to 1.

    Raises
    ------
    LightGBMError
        If the model has not been fitted yet (via the ``booster_`` property).
    """
    # Fixed typos from the original: "normailized" docstring and
    # "importace_array" local name. Behavior is unchanged.
    # NOTE(review): if all raw importances are zero this divides by zero
    # and yields NaNs — presumably not reachable on a fitted model; confirm.
    importance_array = self.booster_.feature_importance().astype(np.float32)
    return importance_array / importance_array.sum()
return evals_result
@deprecated('Use attribute booster_ instead.')
def booster(self):
    """Deprecated accessor kept for backward compatibility; use the booster_ property."""
    return self.booster_
@deprecated('Use attribute feature_importance_ instead.')
def feature_importance(self):
"""
Feature importances
Returns
-------
Array of normalized feature importances
"""
importace_array = self._Booster.feature_importance().astype(np.float32)
return importace_array / importace_array.sum()
return self.feature_importance_
class LGBMRegressor(LGBMModel, LGBMRegressorBase):
......@@ -513,6 +490,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
is_unbalance=False, seed=0,
drop_rate=0.1, skip_drop=0.5, max_drop=50,
uniform_drop=False, xgboost_dart_mode=False):
self.classes, self.n_classes = None, None
super(LGBMClassifier, self).__init__(boosting_type=boosting_type, num_leaves=num_leaves,
max_depth=max_depth, learning_rate=learning_rate,
n_estimators=n_estimators, max_bin=max_bin,
......@@ -533,12 +511,12 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
early_stopping_rounds=None, verbose=True,
feature_name=None, categorical_feature=None,
callbacks=None):
self._le = LGBMLabelEncoder().fit(y)
y = self._le.transform(y)
self.n_classes_ = len(self._le.classes_)
if self.n_classes_ > 2:
self.classes = self._le.classes_
self.n_classes = len(self.classes_)
if self.n_classes > 2:
# Switch to using a multiclass objective in the underlying LGBM instance
self.objective = "multiclass"
if eval_set is not None and eval_metric == "binary_logloss":
......@@ -558,18 +536,12 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
callbacks=callbacks)
return self
def predict(self, data, raw_score=False, num_iteration=0):
class_probs = self._Booster.predict(data,
raw_score=raw_score,
num_iteration=num_iteration)
if len(class_probs.shape) > 1:
column_indexes = np.argmax(class_probs, axis=1)
else:
column_indexes = np.repeat(0, class_probs.shape[0])
column_indexes[class_probs > 0.5] = 1
return self._le.inverse_transform(column_indexes)
def predict(self, X, raw_score=False, num_iteration=0):
    """Predict class labels for X as the argmax over per-class probabilities."""
    probabilities = self.predict_proba(X, raw_score, num_iteration)
    best_indices = np.argmax(probabilities, axis=1)
    # Map encoded class indices back to the original labels seen at fit time.
    return self._le.inverse_transform(best_indices)
def predict_proba(self, data, raw_score=False, num_iteration=0):
def predict_proba(self, X, raw_score=False, num_iteration=0):
"""
Return the predicted probability for each class for each sample.
......@@ -585,15 +557,25 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
-------
predicted_probability : array_like, shape=[n_samples, n_classes]
"""
class_probs = self._Booster.predict(data,
raw_score=raw_score,
num_iteration=num_iteration)
if self.n_classes_ > 2:
class_probs = self.booster_.predict(X, raw_score=raw_score, num_iteration=num_iteration)
if self.n_classes > 2:
return class_probs
else:
classone_probs = class_probs
classzero_probs = 1.0 - classone_probs
return np.vstack((classzero_probs, classone_probs)).transpose()
return np.vstack((1. - class_probs, class_probs)).transpose()
@property
def classes_(self):
    """Get class label array."""
    # self.classes is assigned by fit(); None means the model is unfitted.
    labels = self.classes
    if labels is None:
        raise LightGBMError('No classes found. Need to call fit beforehand.')
    return labels
@property
def n_classes_(self):
    """Get the number of classes."""
    # self.n_classes is assigned by fit(); None means the model is unfitted.
    count = self.n_classes
    if count is None:
        raise LightGBMError('No classes found. Need to call fit beforehand.')
    return count
class LGBMRanker(LGBMModel):
......
......@@ -127,7 +127,7 @@ public:
explicit BinaryErrorMetric(const MetricConfig& config) :BinaryMetric<BinaryErrorMetric>(config) {}
inline static score_t LossOnPoint(float label, score_t prob) {
if (prob < 0.5f) {
if (prob <= 0.5f) {
return label;
} else {
return 1.0f - label;
......
......@@ -17,13 +17,14 @@ def multi_logloss(y_true, y_pred):
def test_template(params = {'objective' : 'regression', 'metric' : 'l2'},
X_y=load_boston(True), feval=mean_squared_error,
num_round=100, init_model=None, custom_eval=None,
return_data=False, return_model=False, early_stopping_rounds=10):
early_stopping_rounds=10,
return_data=False, return_model=False):
params['verbose'], params['seed'] = -1, 42
X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
lgb_train = lgb.Dataset(X_train, y_train, params=params)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
if return_data: return lgb_train, lgb_eval
evals_result = {}
params['verbose'] = params['seed'] = 0
gbm = lgb.train(params, lgb_train,
num_boost_round=num_round,
valid_sets=lgb_eval,
......
......@@ -86,19 +86,28 @@ class TestSklearn(unittest.TestCase):
gbm.fit(X_train, y_train)
self.assertIn(gbm.best_params_['n_estimators'], [15, 20])
def test_clone(self):
def test_clone_and_property(self):
gbm = test_template(return_model=True)
gbm_clone = clone(gbm)
self.assertIsInstance(gbm.booster_, lgb.Booster)
self.assertIsInstance(gbm.feature_importance_, np.ndarray)
clf = test_template(load_digits(2, True), model=lgb.LGBMClassifier, return_model=True)
self.assertListEqual(sorted(clf.classes_), [0, 1])
self.assertEqual(clf.n_classes_, 2)
self.assertIsInstance(clf.booster_, lgb.Booster)
self.assertIsInstance(clf.feature_importance_, np.ndarray)
def test_joblib(self):
gbm = test_template(num_round=10, return_model=True)
joblib.dump(gbm, 'lgb.pkl')
gbm_pickle = joblib.load('lgb.pkl')
self.assertIsInstance(gbm_pickle.booster_, lgb.Booster)
self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params())
self.assertListEqual(list(gbm.feature_importance_), list(gbm_pickle.feature_importance_))
X_train, X_test, y_train, y_test = test_template(return_data=True)
gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
gbm_pickle.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
self.assertDictEqual(gbm.evals_result(), gbm_pickle.evals_result())
self.assertDictEqual(gbm.evals_result_, gbm_pickle.evals_result_)
pred_origin = gbm.predict(X_test)
pred_pickle = gbm_pickle.predict(X_test)
self.assertEqual(len(pred_origin), len(pred_pickle))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment