Commit 2cd024e9 authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

add feature importance in python (#109)

* add feature importances in python; add pandas support

* solve best_iteration issue
parent 6f7669df
......@@ -17,7 +17,11 @@ X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# or you can simply use a tuple of length=2 here
# ATTENTION: you should carefully use lightgbm.Dataset
# it requires setting up categorical_feature when you init it
# rather than passing from lightgbm.train
# instead, you can simply use a tuple of length=2 like below
# it will help you construct Datasets with parameters in lightgbm.train
lgb_train = (X_train, y_train)
lgb_eval = (X_test, y_test)
......@@ -26,14 +30,12 @@ params = {
'task' : 'train',
'boosting_type' : 'gbdt',
'objective' : 'regression',
'metric' : 'l2',
'metric' : {'l2', 'auc'},
'num_leaves' : 31,
'learning_rate' : 0.05,
'feature_fraction' : 0.9,
'bagging_fraction' : 0.8,
'bagging_freq': 5,
# 'ndcg_eval_at' : [1, 3, 5, 10],
# this metric is not needed in this task, show as an example
'verbose' : 0
}
......@@ -49,9 +51,6 @@ gbm = lgb.train(params,
# save model to file
gbm.save_model('model.txt')
# load model from file
gbm = lgb.Booster(model_file='model.txt')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
......@@ -62,3 +61,7 @@ model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
json.dump(model_json, f, indent=4)
# feature importances
print('Feature importances:', gbm.feature_importance())
print('Feature importances:', gbm.feature_importance("gain"))
......@@ -26,3 +26,6 @@ gbm.fit(X_train, y_train,
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
# feature importances
print('Feature importances:', gbm.feature_importance())
This diff is collapsed.
......@@ -46,7 +46,7 @@ def print_evaluation(period=1, show_stdv=True):
The period to log the evaluation results
show_stdv : bool, optional
Whether show stdv if provided
Whether show stdv if provided
Returns
-------
......@@ -55,7 +55,7 @@ def print_evaluation(period=1, show_stdv=True):
"""
def callback(env):
"""internal function"""
if len(env.evaluation_result_list) == 0 or period is False:
if not env.evaluation_result_list or period <= 0:
return
if env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration:
result = '\t'.join([_format_eval_result(x, show_stdv) \
......@@ -83,15 +83,12 @@ def record_evaluation(eval_result):
def init(env):
"""internal function"""
for data_name, eval_name, _, _ in env.evaluation_result_list:
if data_name not in eval_result:
eval_result[data_name] = {}
if eval_name not in eval_result[data_name]:
eval_result[data_name][eval_name] = []
for data_name, _, _, _ in env.evaluation_result_list:
eval_result.setdefault(data_name, collections.defaultdict(list))
def callback(env):
"""internal function"""
if len(eval_result) == 0:
if not eval_result:
init(env)
for data_name, eval_name, result, _ in env.evaluation_result_list:
eval_result[data_name][eval_name].append(result)
......@@ -99,17 +96,17 @@ def record_evaluation(eval_result):
def reset_learning_rate(learning_rates):
"""Reset learning rate after iteration 1
"""Reset learning rate after first iteration
NOTE: the initial learning rate will still take in-effect on first iteration.
Parameters
----------
learning_rates: list or function
List of learning rate for each boosting round
or a customized function that calculates learning_rate in terms of
current number of round and the total number of boosting round (e.g. yields
learning rate decay)
List of learning rate for each boosting round \
or a customized function that calculates learning_rate in terms of \
current number of round and the total number of boosting round \
(e.g. yields learning rate decay)
- list l: learning_rate = l[current_round]
- function f: learning_rate = f(current_round, total_boost_round)
......@@ -121,13 +118,13 @@ def reset_learning_rate(learning_rates):
def callback(env):
"""internal function"""
booster = env.model
i = env.iteration
iteration = env.iteration
if isinstance(learning_rates, list):
if len(learning_rates) != env.end_iteration:
raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.")
booster.reset_parameter({'learning_rate':learning_rates[i]})
booster.reset_parameter({'learning_rate':learning_rates[iteration]})
else:
booster.reset_parameter({'learning_rate':learning_rates(i, env.end_iteration)})
booster.reset_parameter({'learning_rate':learning_rates(iteration, env.end_iteration)})
callback.before_iteration = True
return callback
......@@ -157,7 +154,7 @@ def early_stop(stopping_rounds, verbose=True):
best_msg = {}
def init(env):
"""internal function"""
if len(env.evaluation_result_list) == 0:
if not env.evaluation_result_list:
raise ValueError('For early stopping you need at least one set in evals.')
if verbose:
......@@ -169,13 +166,11 @@ def early_stop(stopping_rounds, verbose=True):
best_iter[i] = 0
if verbose:
best_msg[i] = ""
factor_to_bigger_better[i] = -1.0
if env.evaluation_result_list[i][3]:
factor_to_bigger_better[i] = 1.0
factor_to_bigger_better[i] = 1.0 if env.evaluation_result_list[i][3] else -1.0
def callback(env):
"""internal function"""
if len(best_score) == 0:
if not best_score:
init(env)
for i in range(len(env.evaluation_result_list)):
score = env.evaluation_result_list[i][2] * factor_to_bigger_better[i]
......@@ -190,6 +185,7 @@ def early_stop(stopping_rounds, verbose=True):
if env.model is not None:
env.model.set_attr(best_iteration=str(best_iter[i]))
if verbose:
print('early stopping, best iteration is:\n{}'.format(best_msg[i]))
print('early stopping, best iteration is:')
print(best_msg[i])
raise EarlyStopException(best_iter[i])
return callback
......@@ -8,7 +8,7 @@ from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from . import callback
def _construct_dataset(X_y, reference=None,
params=None, other_fields=None,
params=None, other_fields=None,
feature_name=None, categorical_feature=None,
predictor=None):
if 'max_bin' in params:
......@@ -21,9 +21,9 @@ def _construct_dataset(X_y, reference=None,
if other_fields is not None:
if not isinstance(other_fields, dict):
raise TypeError("other filed data should be dict type")
weight = None if 'weight' not in other_fields else other_fields['weight']
group = None if 'group' not in other_fields else other_fields['group']
init_score = None if 'init_score' not in other_fields else other_fields['init_score']
weight = other_fields.get('weight', None)
group = other_fields.get('group', None)
init_score = other_fields.get('init_score', None)
if is_str(X_y):
data = X_y
label = None
......@@ -58,15 +58,15 @@ def train(params, train_data, num_boost_round=100,
Parameters
----------
params : dict
params.
Parameters for training.
train_data : Dataset, tuple (X, y) or filename of data
Data to be trained.
num_boost_round: int
Number of boosting iterations.
valid_datas: list of Datasets, tuples (valid_X, valid_y) or filename of data
valid_datas: list of Datasets, tuples (valid_X, valid_y) or filenames of data
List of data to be evaluated during training
valid_names: list of string
names of valid_datas
Names of valid_datas
fobj : function
Customized objective function.
feval : function
......@@ -75,17 +75,17 @@ def train(params, train_data, num_boost_round=100,
init_model : file name of lightgbm model or 'Booster' instance
model used for continued train
train_fields : dict
other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score
Other data file in training data. e.g. train_fields['weight'] is weight data
Support fields: weight, group, init_score
valid_fields : dict
other data file in training data. \
Other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score
Support fields: weight, group, init_score
feature_name : list of str
feature names
categorical_feature : list of str/int
categorical features , int type to use index,
str type to use feature names (feature_name cannot be None)
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index, \
type str represents feature names (need to specify feature_name as well)
early_stopping_rounds: int
Activates early stopping.
Requires at least one validation data and one metric
......@@ -101,18 +101,18 @@ def train(params, train_data, num_boost_round=100,
passed with None means no using this function
verbose_eval : bool or int
Requires at least one item in evals.
If `verbose_eval` is True then the evaluation metric on the validation set is
If `verbose_eval` is True then the evaluation metric on the validation set is \
printed at each boosting stage.
If `verbose_eval` is an integer then the evaluation metric on the validation set
is printed at every given `verbose_eval` boosting stage. The last boosting stage
If `verbose_eval` is an integer then the evaluation metric on the validation set \
is printed at every given `verbose_eval` boosting stage. The last boosting stage \
/ the boosting stage found by using `early_stopping_rounds` is also printed.
Example: with verbose_eval=4 and at least one item in evals, an evaluation metric
Example: with verbose_eval=4 and at least one item in evals, an evaluation metric \
is printed every 4 boosting stages, instead of every boosting stage.
learning_rates: list or function
List of learning rate for each boosting round
or a customized function that calculates learning_rate in terms of
current number of round and the total number of boosting round (e.g. yields
learning rate decay)
List of learning rate for each boosting round \
or a customized function that calculates learning_rate in terms of \
current number of round and the total number of boosting round \
(e.g. yields learning rate decay)
- list l: learning_rate = l[current_round]
- function f: learning_rate = f(current_round, total_boost_round)
callbacks : list of callback functions
......@@ -131,12 +131,16 @@ def train(params, train_data, num_boost_round=100,
predictor = init_model
else:
predictor = None
init_iteration = predictor.num_total_iteration if predictor else 0
"""create dataset"""
if isinstance(train_data, Dataset):
train_set = train_data
if train_fields is not None:
for field, data in train_fields.items():
train_set.set_field(field, data)
else:
train_set = _construct_dataset(train_data, None, params,
other_fields=train_fields,
train_set = _construct_dataset(train_data, None, params,
other_fields=train_fields,
feature_name=feature_name,
categorical_feature=categorical_feature,
predictor=predictor)
......@@ -150,7 +154,7 @@ def train(params, train_data, num_boost_round=100,
if isinstance(valid_names, str):
valid_names = [valid_names]
for i, valid_data in enumerate(valid_datas):
other_fields = None if valid_fields is None else valid_fields[i]
other_fields = None if valid_fields is None else valid_fields.get(i, None)
"""reduce cost for prediction training data"""
if valid_data[0] is train_data[0] and valid_data[1] is train_data[1]:
is_valid_contain_train = True
......@@ -159,6 +163,9 @@ def train(params, train_data, num_boost_round=100,
continue
if isinstance(valid_data, Dataset):
valid_set = valid_data
if other_fields is not None:
for field, data in other_fields.items():
valid_set.set_field(field, data)
else:
valid_set = _construct_dataset(
valid_data,
......@@ -169,7 +176,7 @@ def train(params, train_data, num_boost_round=100,
categorical_feature=categorical_feature,
predictor=predictor)
valid_sets.append(valid_set)
if valid_names is not None:
if valid_names is not None and len(valid_names) > i:
name_valid_sets.append(valid_names[i])
else:
name_valid_sets.append('valid_'+str(i))
......@@ -179,13 +186,13 @@ def train(params, train_data, num_boost_round=100,
# Most of legacy advanced options becomes callbacks
if isinstance(verbose_eval, bool) and verbose_eval:
callbacks.append(callback.print_evaluation())
else:
if isinstance(verbose_eval, int):
callbacks.append(callback.print_evaluation(verbose_eval))
elif isinstance(verbose_eval, int):
callbacks.append(callback.print_evaluation(verbose_eval))
if early_stopping_rounds is not None:
callbacks.append(callback.early_stop(early_stopping_rounds,
verbose=bool(verbose_eval)))
if learning_rates is not None:
callbacks.append(callback.reset_learning_rate(learning_rates))
......@@ -197,32 +204,26 @@ def train(params, train_data, num_boost_round=100,
callbacks_after_iter = [
cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
"""construct booster"""
if 'metric' in params:
if is_str(params['metric']):
params['metric'] = params['metric'].split(',')
else:
params['metric'] = list(params['metric'])
booster = Booster(params=params, train_set=train_set)
if is_valid_contain_train:
booster.set_train_data_name(train_data_name)
for valid_set, name_valid_set in zip(valid_sets, name_valid_sets):
booster.add_valid(valid_set, name_valid_set)
"""start training"""
for i in range(num_boost_round):
for i in range(init_iteration, init_iteration + num_boost_round):
for cb in callbacks_before_iter:
cb(callback.CallbackEnv(model=booster,
cvfolds=None,
iteration=i,
begin_iteration=0,
end_iteration=num_boost_round,
begin_iteration=init_iteration,
end_iteration=init_iteration + num_boost_round,
evaluation_result_list=None))
booster.update(fobj=fobj)
evaluation_result_list = []
# check evaluation result.
if len(valid_sets) != 0:
if valid_sets:
if is_valid_contain_train:
evaluation_result_list.extend(booster.eval_train(feval))
evaluation_result_list.extend(booster.eval_valid(feval))
......@@ -231,8 +232,8 @@ def train(params, train_data, num_boost_round=100,
cb(callback.CallbackEnv(model=booster,
cvfolds=None,
iteration=i,
begin_iteration=0,
end_iteration=num_boost_round,
begin_iteration=init_iteration,
end_iteration=init_iteration + num_boost_round,
evaluation_result_list=evaluation_result_list))
except callback.EarlyStopException:
break
......@@ -347,24 +348,24 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
feval : function
Custom evaluation function.
train_fields : dict
other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score
Other data file in training data. e.g. train_fields['weight'] is weight data
Support fields: weight, group, init_score
feature_name : list of str
feature names
categorical_feature : list of str/int
categorical features , int type to use index,
str type to use feature names (feature_name cannot be None)
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index, \
type str represents feature names (need to specify feature_name as well)
early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least
Activates early stopping. CV error needs to decrease at least \
every <early_stopping_rounds> round(s) to continue.
Last entry in evaluation history is the one from best iteration.
fpreproc : function
Preprocessing function that takes (dtrain, dtest, param) and returns
Preprocessing function that takes (dtrain, dtest, param) and returns \
transformed versions of those.
verbose_eval : bool, int, or None, default None
Whether to display the progress. If None, progress will be displayed
when np.ndarray is returned. If True, progress will be displayed at
boosting stage. If an integer is given, progress will be displayed
Whether to display the progress. If None, progress will be displayed \
when np.ndarray is returned. If True, progress will be displayed at \
boosting stage. If an integer is given, progress will be displayed \
at every given `verbose_eval` boosting stage.
show_stdv : bool, default True
Whether to display the standard deviation in progress.
......@@ -378,25 +379,14 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
-------
evaluation history : list(string)
"""
if isinstance(metrics, str):
metrics = [metrics]
if isinstance(params, list):
params = dict(params)
if 'metric' not in params:
params['metric'] = []
else:
if is_str(params['metric']):
params['metric'] = params['metric'].split(',')
if metrics:
params.setdefault('metric', [])
if is_str(metrics):
params['metric'].append(metrics)
else:
params['metric'] = list(params['metric'])
if metrics is not None and len(metrics) > 0:
params['metric'].extend(metrics)
params['metric'].extend(metrics)
train_set = _construct_dataset(train_data, None, params,
train_set = _construct_dataset(train_data, None, params,
other_fields=train_fields,
feature_name=feature_name,
categorical_feature=categorical_feature)
......@@ -411,9 +401,8 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
verbose=False))
if isinstance(verbose_eval, bool) and verbose_eval:
callbacks.append(callback.print_evaluation(show_stdv=show_stdv))
else:
if isinstance(verbose_eval, int):
callbacks.append(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))
elif isinstance(verbose_eval, int):
callbacks.append(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))
callbacks_before_iter = [
cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
......
# coding: utf-8
# pylint: disable = invalid-name, W0105
# pylint: disable = invalid-name, W0105, C0111
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import
import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from .basic import LightGBMError, is_str
from .engine import train
# sklearn
try:
......@@ -66,7 +66,7 @@ def _point_wise_objective(func):
num_data = len(weight)
num_class = len(grad) // num_data
if num_class * num_data != len(grad):
raise ValueError("length of grad and hess should equal with num_class * num_data")
raise ValueError("length of grad and hess should equal to num_class * num_data")
for k in range(num_class):
for i in range(num_data):
idx = k * num_data + i
......@@ -169,6 +169,7 @@ class LGBMModel(LGBMModelBase):
self.is_unbalance = is_unbalance
self.seed = seed
self._Booster = None
self.best_iteration = -1
if callable(self.objective):
self.fobj = _point_wise_objective(self.objective)
else:
......@@ -190,7 +191,6 @@ class LGBMModel(LGBMModelBase):
def get_params(self, deep=False):
"""Get parameters"""
params = super(LGBMModel, self).get_params(deep=deep)
params['verbose'] = 0 if self.silent else 1
if self.nthread <= 0:
params.pop('nthread', None)
return params
......@@ -213,30 +213,31 @@ class LGBMModel(LGBMModelBase):
A list of (X, y) tuple pairs to use as a validation set for early-stopping
eval_metric : str, list of str, callable, optional
If a str, should be a built-in evaluation metric to use.
If callable, a custom evaluation metric. The call
signature is func(y_predicted, dataset) where dataset will be a
Dataset fobject such that you may need to call the get_label
If callable, a custom evaluation metric. The call \
signature is func(y_predicted, dataset) where dataset will be a \
Dataset object such that you may need to call the get_label \
method. And it must return (eval_name->str, eval_result->float, is_bigger_better->Bool)
early_stopping_rounds : int
verbose : bool
If `verbose` and an evaluation set is used, writes the evaluation
train_fields : dict
other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score
Other data file in training data. e.g. train_fields['weight'] is weight data
Support fields: weight, group, init_score
valid_fields : dict
other data file in training data. \
Other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score
Support fields: weight, group, init_score
feature_name : list of str
feature names
categorical_feature : list of str/int
categorical features , int type to use index,
str type to use feature names (feature_name cannot be None)
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index, \
type str represents feature names (need to specify feature_name as well)
other_params: dict
other parameters
Other parameters
"""
evals_result = {}
params = self.get_params()
params['verbose'] = 0 if self.silent else 1
if other_params is not None:
params.update(other_params)
......@@ -317,6 +318,14 @@ class LGBMModel(LGBMModelBase):
return evals_result
def feature_importance(self):
    """Return normalized feature importances of the underlying booster.

    Importances are taken from the fitted Booster and rescaled so
    they sum to 1.0 (each value is the fraction of total importance).

    Returns
    -------
    numpy array of float32
        Normalized feature importances, one entry per feature.
        NOTE(review): if all raw importances are zero the division
        yields NaN; assumes the model has been fit (`self._Booster`
        is set) — calling before fit raises AttributeError.
    """
    importance_array = self._Booster.feature_importance().astype(np.float32)
    # Normalize so the importances sum to 1 across all features.
    return importance_array / importance_array.sum()
class LGBMRegressor(LGBMModel, LGBMRegressorBase):
__doc__ = """Implementation of the scikit-learn API for LightGBM regression.
......@@ -394,7 +403,7 @@ def _group_wise_objective(func):
y_true: array_like of shape [n_samples]
The target values
group : array_like of shape
group size data of data
Group size data of data
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
The predicted values
Returns
......
......@@ -5,7 +5,7 @@ from __future__ import absolute_import
import sys
import os
from setuptools import setup, find_packages
# import subprocess
sys.path.insert(0, '.')
CURRENT_DIR = os.path.dirname(__file__)
......
......@@ -227,8 +227,6 @@ Tree::Tree(const std::string& str) {
leaf_parent_ = Common::StringToArray<int>(key_vals["leaf_parent"], ' ', num_leaves_);
leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], ' ', num_leaves_);
}
} // namespace LightGBM
......@@ -101,6 +101,7 @@ def test_early_stopping():
from sklearn.datasets import load_boston
from sklearn.cross_validation import KFold
from sklearn import datasets, metrics, model_selection
from sklearn.base import clone
boston = load_boston()
y = boston['target']
......@@ -111,6 +112,7 @@ def test_early_stopping():
eval_metric='l2',
early_stopping_rounds=10,
verbose=10)
lgb_model_clone = clone(lgb_model)
print(lgb_model.best_iteration)
test_binary_classification()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment