Merge branch 'master' of https://github.com/Microsoft/LightGBM

dcbdc675 · Guolin Ke · 795ff82f · 1c6c7046 · dcbdc675 · dcbdc675
Commit dcbdc675 authored Jan 03, 2017 by Guolin Ke
7 changed files
--- a/docs/Python-API.md
+++ b/docs/Python-API.md
@@ -10,6 +10,7 @@

 * [Scikit-learn API](Python-API.md#scikit-learn-api)
    - [Common Methods](Python-API.md#common-methods)
+    - [Common Attributes](Python-API.md#common-attributes)
    - [LGBMClassifier](Python-API.md#lgbmclassifier)
    - [LGBMRegressor](Python-API.md#lgbmregressor)
    - [LGBMRanker](Python-API.md#lgbmranker)
@@ -675,35 +676,6 @@ The methods of each Class is in alphabetical order.
    X_leaves : array_like, shape=[n_samples, n_trees]
    

-####booster()
-
-    Get the underlying lightgbm Booster of this model.
-    This will raise an exception when it's called before fit().
-
-    Returns
-    -------
-    booster : a lightgbm booster of underlying model
-    
-
-####evals_result()
-
-    Return the evaluation results.
-
-    Returns
-    -------
-    evals_result : dictionary
-    
-
-####feature_importance()
-
-    Return the feature importances of each feature.
-
-    Returns
-    -------
-    result : array
-        Array of normailized feature importances
-    
-
 ####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None, callbacks=None)

    Fit the gradient boosting model.
@@ -771,7 +743,7 @@ The methods of each Class is in alphabetical order.
      if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]


-####predict(data, raw_score=False, num_iteration=0)
+####predict(X, raw_score=False, num_iteration=0)

    Return the predicted value for each sample.

@@ -786,11 +758,26 @@ The methods of each Class is in alphabetical order.
    Returns
    -------
    predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]
-    
+
+
+###Common Attributes
+
+####booster_
+
+    Get the underlying lightgbm Booster of this model.
+
+####evals_result_
+
+    Get the evaluation results.
+
+####feature_importance_
+
+    Get normalized feature importances.
+

 ###LGBMClassifier

-####predict_proba(data, raw_score=False, num_iteration=0)
+####predict_proba(X, raw_score=False, num_iteration=0)

    Return the predicted probability for each class for each sample.

@@ -805,6 +792,14 @@ The methods of each Class is in alphabetical order.
    Returns
    -------
    predicted_probability : array_like, shape=[n_samples, n_classes]
+
+####classes_
+
+    Get class label array.
+
+####n_classes_
+
+    Get number of classes.
    

 ###LGBMRegressor

--- a/examples/python-guide/sklearn_example.py
+++ b/examples/python-guide/sklearn_example.py
@@ -34,7 +34,7 @@ print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

 print('Calculate feature importances...')
 # feature importances
-print('Feature importances:', list(gbm.feature_importance()))
+print('Feature importances:', list(gbm.feature_importance_))

 # other scikit-learn modules
 estimator = lgb.LGBMRegressor(num_leaves=31)

--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -132,7 +132,7 @@ def param_dict_to_str(data):
                            % (key, type(val).__name__))
    return ' '.join(pairs)

-class _temp_file:
+class _temp_file(object):
    def __enter__(self):
        with NamedTemporaryFile(prefix="lightgbm_tmp_", delete=True) as f:
            self.name = f.name
@@ -146,7 +146,7 @@ class _temp_file:
        return ret
    def writelines(self, lines):
        with open(self.name, "w+") as f:
-            ret = f.writelines(lines)
+            f.writelines(lines)

 """marco definition of data type in c_api of LightGBM"""
 C_API_DTYPE_FLOAT32 = 0

--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -5,13 +5,14 @@ from __future__ import absolute_import
 import inspect

 import numpy as np
-from .basic import LightGBMError, Dataset, is_str
+from .basic import LightGBMError, Dataset
 from .engine import train
 '''sklearn'''
 try:
    from sklearn.base import BaseEstimator
    from sklearn.base import RegressorMixin, ClassifierMixin
    from sklearn.preprocessing import LabelEncoder
+    from sklearn.utils import deprecated
    SKLEARN_INSTALLED = True
    LGBMModelBase = BaseEstimator
    LGBMRegressorBase = RegressorMixin
@@ -251,25 +252,13 @@ class LGBMModel(LGBMModelBase):
        self.uniform_drop = uniform_drop
        self.xgboost_dart_mode = xgboost_dart_mode
        self._Booster = None
+        self.evals_result = None
        self.best_iteration = -1
        if callable(self.objective):
            self.fobj = _objective_function_wrapper(self.objective)
        else:
            self.fobj = None

-    def booster(self):
-        """
-        Get the underlying lightgbm Booster of this model.
-        This will raise an exception when fit was not called
-
-        Returns
-        -------
-        booster : a lightgbm booster of underlying model
-        """
-        if self._Booster is None:
-            raise LightGBMError('Need to call fit beforehand')
-        return self._Booster
-
    def fit(self, X, y,
            sample_weight=None, init_score=None, group=None,
            eval_set=None, eval_sample_weight=None,
@@ -349,19 +338,15 @@ class LGBMModel(LGBMModelBase):
            params['num_class'] = self.n_classes_
        if hasattr(self, 'eval_at'):
            params['ndcg_eval_at'] = self.eval_at
-
        if self.fobj:
-            params["objective"] = "None"
-        else:
-            params["objective"] = self.objective
+            # objective = nullptr for unknown objective
+            params['objective'] = 'None'

        if callable(eval_metric):
            feval = _eval_function_wrapper(eval_metric)
-        elif is_str(eval_metric) or isinstance(eval_metric, list):
-            feval = None
-            params.update({'metric': eval_metric})
        else:
            feval = None
+            params['metric'] = eval_metric

        def _construct_dataset(X, y, sample_weight, init_score, group, params):
            ret = Dataset(X, label=y, max_bin=self.max_bin, weight=sample_weight, group=group, params=params)
@@ -383,10 +368,7 @@ class LGBMModel(LGBMModelBase):
                        if collection is None:
                            return None
                        elif isinstance(collection, list):
-                            if len(collection) > i:
-                                return collection[i]
-                            else:
-                                return None
+                            return collection[i] if len(collection) > i else None
                        elif isinstance(collection, dict):
                            return collection.get(i, None)
                        else:
@@ -406,16 +388,13 @@ class LGBMModel(LGBMModelBase):
                              callbacks=callbacks)

        if evals_result:
-            for val in evals_result.items():
-                evals_result_key = list(val[1].keys())[0]
-                evals_result[val[0]][evals_result_key] = val[1][evals_result_key]
-            self.evals_result_ = evals_result
+            self.evals_result = evals_result

        if early_stopping_rounds is not None:
            self.best_iteration = self._Booster.best_iteration
        return self

-    def predict(self, data, raw_score=False, num_iteration=0):
+    def predict(self, X, raw_score=False, num_iteration=0):
        """
        Return the predicted value for each sample.

@@ -431,9 +410,7 @@ class LGBMModel(LGBMModelBase):
        -------
        predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]
        """
-        return self._Booster.predict(data,
-                                     raw_score=raw_score,
-                                     num_iteration=num_iteration)
+        return self.booster_.predict(X, raw_score=raw_score, num_iteration=num_iteration)

    def apply(self, X, num_iteration=0):
        """
@@ -451,35 +428,35 @@ class LGBMModel(LGBMModelBase):
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]
        """
-        return self._Booster.predict(X,
-                                     pred_leaf=True,
-                                     num_iteration=num_iteration)
+        return self.booster_.predict(X, pred_leaf=True, num_iteration=num_iteration)

-    def evals_result(self):
-        """
-        Return the evaluation results.
+    @property
+    def booster_(self):
+        """Get the underlying lightgbm Booster of this model."""
+        if self._Booster is None:
+            raise LightGBMError('No booster found. Need to call fit beforehand.')
+        return self._Booster

-        Returns
-        -------
-        evals_result : dictionary
-        """
-        if self.evals_result_:
-            evals_result = self.evals_result_
-        else:
-            raise LightGBMError('No results found.')
+    @property
+    def evals_result_(self):
+        """Get the evaluation results."""
+        if self.evals_result is None:
+            raise LightGBMError('No results found. Need to call fit with eval set beforehand.')
+        return self.evals_result
+
+    @property
+    def feature_importance_(self):
+        """Get normalized feature importances."""
+        importace_array = self.booster_.feature_importance().astype(np.float32)
+        return importace_array / importace_array.sum()

-        return evals_result
+    @deprecated('Use attribute booster_ instead.')
+    def booster(self):
+        return self.booster_

+    @deprecated('Use attribute feature_importance_ instead.')
    def feature_importance(self):
-        """
-        Feature importances
-
-        Returns
-        -------
-        Array of normailized feature importances
-        """
-        importace_array = self._Booster.feature_importance().astype(np.float32)
-        return importace_array / importace_array.sum()
+        return self.feature_importance_

 class LGBMRegressor(LGBMModel, LGBMRegressorBase):

@@ -513,6 +490,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
                 is_unbalance=False, seed=0,
                 drop_rate=0.1, skip_drop=0.5, max_drop=50,
                 uniform_drop=False, xgboost_dart_mode=False):
+        self.classes, self.n_classes = None, None
        super(LGBMClassifier, self).__init__(boosting_type=boosting_type, num_leaves=num_leaves,
                                             max_depth=max_depth, learning_rate=learning_rate,
                                             n_estimators=n_estimators, max_bin=max_bin,
@@ -533,12 +511,12 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
            early_stopping_rounds=None, verbose=True,
            feature_name=None, categorical_feature=None,
            callbacks=None):
-
        self._le = LGBMLabelEncoder().fit(y)
        y = self._le.transform(y)

-        self.n_classes_ = len(self._le.classes_)
-        if self.n_classes_ > 2:
+        self.classes = self._le.classes_
+        self.n_classes = len(self.classes_)
+        if self.n_classes > 2:
            # Switch to using a multiclass objective in the underlying LGBM instance
            self.objective = "multiclass"
            if eval_set is not None and eval_metric == "binary_logloss":
@@ -558,18 +536,12 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
                                        callbacks=callbacks)
        return self

-    def predict(self, data, raw_score=False, num_iteration=0):
-        class_probs = self._Booster.predict(data,
-                                            raw_score=raw_score,
-                                            num_iteration=num_iteration)
-        if len(class_probs.shape) > 1:
-            column_indexes = np.argmax(class_probs, axis=1)
-        else:
-            column_indexes = np.repeat(0, class_probs.shape[0])
-            column_indexes[class_probs > 0.5] = 1
-        return self._le.inverse_transform(column_indexes)
+    def predict(self, X, raw_score=False, num_iteration=0):
+        class_probs = self.predict_proba(X, raw_score, num_iteration)
+        class_index = np.argmax(class_probs, axis=1)
+        return self._le.inverse_transform(class_index)

-    def predict_proba(self, data, raw_score=False, num_iteration=0):
+    def predict_proba(self, X, raw_score=False, num_iteration=0):
        """
        Return the predicted probability for each class for each sample.

@@ -585,15 +557,25 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
        -------
        predicted_probability : array_like, shape=[n_samples, n_classes]
        """
-        class_probs = self._Booster.predict(data,
-                                            raw_score=raw_score,
-                                            num_iteration=num_iteration)
-        if self.n_classes_ > 2:
+        class_probs = self.booster_.predict(X, raw_score=raw_score, num_iteration=num_iteration)
+        if self.n_classes > 2:
            return class_probs
        else:
-            classone_probs = class_probs
-            classzero_probs = 1.0 - classone_probs
-            return np.vstack((classzero_probs, classone_probs)).transpose()
+            return np.vstack((1. - class_probs, class_probs)).transpose()
+
+    @property
+    def classes_(self):
+        """Get class label array."""
+        if self.classes is None:
+            raise LightGBMError('No classes found. Need to call fit beforehand.')
+        return self.classes
+
+    @property
+    def n_classes_(self):
+        """Get number of classes."""
+        if self.n_classes is None:
+            raise LightGBMError('No classes found. Need to call fit beforehand.')
+        return self.n_classes

 class LGBMRanker(LGBMModel):


--- a/src/metric/binary_metric.hpp
+++ b/src/metric/binary_metric.hpp
@@ -127,7 +127,7 @@ public:
  explicit BinaryErrorMetric(const MetricConfig& config) :BinaryMetric<BinaryErrorMetric>(config) {}

  inline static score_t LossOnPoint(float label, score_t prob) {
-    if (prob < 0.5f) {
+    if (prob <= 0.5f) {
      return label;
    } else {
      return 1.0f - label;

--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -17,13 +17,14 @@ def multi_logloss(y_true, y_pred):
 def test_template(params = {'objective' : 'regression', 'metric' : 'l2'},
                  X_y=load_boston(True), feval=mean_squared_error,
                  num_round=100, init_model=None, custom_eval=None,
-                  return_data=False, return_model=False, early_stopping_rounds=10):
+                  early_stopping_rounds=10,
+                  return_data=False, return_model=False):
+    params['verbose'], params['seed'] = -1, 42
    X_train, X_test, y_train, y_test = train_test_split(*X_y, test_size=0.1, random_state=42)
    lgb_train = lgb.Dataset(X_train, y_train, params=params)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, params=params)
    if return_data: return lgb_train, lgb_eval
    evals_result = {}
-    params['verbose'] = params['seed'] = 0
    gbm = lgb.train(params, lgb_train,
                    num_boost_round=num_round,
                    valid_sets=lgb_eval,

--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -86,19 +86,28 @@ class TestSklearn(unittest.TestCase):
        gbm.fit(X_train, y_train)
        self.assertIn(gbm.best_params_['n_estimators'], [15, 20])

-    def test_clone(self):
+    def test_clone_and_property(self):
        gbm = test_template(return_model=True)
        gbm_clone = clone(gbm)
+        self.assertIsInstance(gbm.booster_, lgb.Booster)
+        self.assertIsInstance(gbm.feature_importance_, np.ndarray)
+        clf = test_template(load_digits(2, True), model=lgb.LGBMClassifier, return_model=True)
+        self.assertListEqual(sorted(clf.classes_), [0, 1])
+        self.assertEqual(clf.n_classes_, 2)
+        self.assertIsInstance(clf.booster_, lgb.Booster)
+        self.assertIsInstance(clf.feature_importance_, np.ndarray)

    def test_joblib(self):
        gbm = test_template(num_round=10, return_model=True)
        joblib.dump(gbm, 'lgb.pkl')
        gbm_pickle = joblib.load('lgb.pkl')
+        self.assertIsInstance(gbm_pickle.booster_, lgb.Booster)
        self.assertDictEqual(gbm.get_params(), gbm_pickle.get_params())
+        self.assertListEqual(list(gbm.feature_importance_), list(gbm_pickle.feature_importance_))
        X_train, X_test, y_train, y_test = test_template(return_data=True)
        gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
        gbm_pickle.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
-        self.assertDictEqual(gbm.evals_result(), gbm_pickle.evals_result())
+        self.assertDictEqual(gbm.evals_result_, gbm_pickle.evals_result_)
        pred_origin = gbm.predict(X_test)
        pred_pickle = gbm_pickle.predict(X_test)
        self.assertEqual(len(pred_origin), len(pred_pickle))