Commit 2cd024e9 authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

add feature importance in python (#109)

* add feature importances in python; add pandas support

* solve best_iteration issue
parent 6f7669df
......@@ -17,7 +17,11 @@ X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# or you can simply use a tuple of length=2 here
# ATTENTION: you should carefully use lightgbm.Dataset
# it requires setting up categorical_feature when you init it
# rather than passing from lightgbm.train
# instead, you can simply use a tuple of length=2 like below
# it will help you construct Datasets with parameters in lightgbm.train
lgb_train = (X_train, y_train)
lgb_eval = (X_test, y_test)
......@@ -26,14 +30,12 @@ params = {
'task' : 'train',
'boosting_type' : 'gbdt',
'objective' : 'regression',
'metric' : 'l2',
'metric' : {'l2', 'auc'},
'num_leaves' : 31,
'learning_rate' : 0.05,
'feature_fraction' : 0.9,
'bagging_fraction' : 0.8,
'bagging_freq': 5,
# 'ndcg_eval_at' : [1, 3, 5, 10],
# this metric is not needed in this task, show as an example
'verbose' : 0
}
......@@ -49,9 +51,6 @@ gbm = lgb.train(params,
# save model to file
gbm.save_model('model.txt')
# load model from file
gbm = lgb.Booster(model_file='model.txt')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
......@@ -62,3 +61,7 @@ model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
json.dump(model_json, f, indent=4)
# feature importances
print('Feature importances:', gbm.feature_importance())
print('Feature importances:', gbm.feature_importance("gain"))
......@@ -26,3 +26,6 @@ gbm.fit(X_train, y_train,
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
# feature importances
print('Feature importances:', gbm.feature_importance())
This diff is collapsed.
......@@ -46,7 +46,7 @@ def print_evaluation(period=1, show_stdv=True):
The period to log the evaluation results
show_stdv : bool, optional
Whether show stdv if provided
Whether show stdv if provided
Returns
-------
......@@ -55,7 +55,7 @@ def print_evaluation(period=1, show_stdv=True):
"""
def callback(env):
"""internal function"""
if len(env.evaluation_result_list) == 0 or period is False:
if not env.evaluation_result_list or period <= 0:
return
if env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration:
result = '\t'.join([_format_eval_result(x, show_stdv) \
......@@ -83,15 +83,12 @@ def record_evaluation(eval_result):
def init(env):
"""internal function"""
for data_name, eval_name, _, _ in env.evaluation_result_list:
if data_name not in eval_result:
eval_result[data_name] = {}
if eval_name not in eval_result[data_name]:
eval_result[data_name][eval_name] = []
for data_name, _, _, _ in env.evaluation_result_list:
eval_result.setdefault(data_name, collections.defaultdict(list))
def callback(env):
"""internal function"""
if len(eval_result) == 0:
if not eval_result:
init(env)
for data_name, eval_name, result, _ in env.evaluation_result_list:
eval_result[data_name][eval_name].append(result)
......@@ -99,17 +96,17 @@ def record_evaluation(eval_result):
def reset_learning_rate(learning_rates):
"""Reset learning rate after iteration 1
"""Reset learning rate after first iteration
NOTE: the initial learning rate will still take in-effect on first iteration.
Parameters
----------
learning_rates: list or function
List of learning rate for each boosting round
or a customized function that calculates learning_rate in terms of
current number of round and the total number of boosting round (e.g. yields
learning rate decay)
List of learning rate for each boosting round \
or a customized function that calculates learning_rate in terms of \
current number of round and the total number of boosting round \
(e.g. yields learning rate decay)
- list l: learning_rate = l[current_round]
- function f: learning_rate = f(current_round, total_boost_round)
......@@ -121,13 +118,13 @@ def reset_learning_rate(learning_rates):
def callback(env):
"""internal function"""
booster = env.model
i = env.iteration
iteration = env.iteration
if isinstance(learning_rates, list):
if len(learning_rates) != env.end_iteration:
raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.")
booster.reset_parameter({'learning_rate':learning_rates[i]})
booster.reset_parameter({'learning_rate':learning_rates[iteration]})
else:
booster.reset_parameter({'learning_rate':learning_rates(i, env.end_iteration)})
booster.reset_parameter({'learning_rate':learning_rates(iteration, env.end_iteration)})
callback.before_iteration = True
return callback
......@@ -157,7 +154,7 @@ def early_stop(stopping_rounds, verbose=True):
best_msg = {}
def init(env):
"""internal function"""
if len(env.evaluation_result_list) == 0:
if not env.evaluation_result_list:
raise ValueError('For early stopping you need at least one set in evals.')
if verbose:
......@@ -169,13 +166,11 @@ def early_stop(stopping_rounds, verbose=True):
best_iter[i] = 0
if verbose:
best_msg[i] = ""
factor_to_bigger_better[i] = -1.0
if env.evaluation_result_list[i][3]:
factor_to_bigger_better[i] = 1.0
factor_to_bigger_better[i] = 1.0 if env.evaluation_result_list[i][3] else -1.0
def callback(env):
"""internal function"""
if len(best_score) == 0:
if not best_score:
init(env)
for i in range(len(env.evaluation_result_list)):
score = env.evaluation_result_list[i][2] * factor_to_bigger_better[i]
......@@ -190,6 +185,7 @@ def early_stop(stopping_rounds, verbose=True):
if env.model is not None:
env.model.set_attr(best_iteration=str(best_iter[i]))
if verbose:
print('early stopping, best iteration is:\n{}'.format(best_msg[i]))
print('early stopping, best iteration is:')
print(best_msg[i])
raise EarlyStopException(best_iter[i])
return callback
......@@ -8,7 +8,7 @@ from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from . import callback
def _construct_dataset(X_y, reference=None,
params=None, other_fields=None,
params=None, other_fields=None,
feature_name=None, categorical_feature=None,
predictor=None):
if 'max_bin' in params:
......@@ -21,9 +21,9 @@ def _construct_dataset(X_y, reference=None,
if other_fields is not None:
if not isinstance(other_fields, dict):
raise TypeError("other filed data should be dict type")
weight = None if 'weight' not in other_fields else other_fields['weight']
group = None if 'group' not in other_fields else other_fields['group']
init_score = None if 'init_score' not in other_fields else other_fields['init_score']
weight = other_fields.get('weight', None)
group = other_fields.get('group', None)
init_score = other_fields.get('init_score', None)
if is_str(X_y):
data = X_y
label = None
......@@ -58,15 +58,15 @@ def train(params, train_data, num_boost_round=100,
Parameters
----------
params : dict
params.
Parameters for training.
train_data : Dataset, tuple (X, y) or filename of data
Data to be trained.
num_boost_round: int
Number of boosting iterations.
valid_datas: list of Datasets, tuples (valid_X, valid_y) or filename of data
valid_datas: list of Datasets, tuples (valid_X, valid_y) or filenames of data
List of data to be evaluated during training
valid_names: list of string
names of valid_datas
Names of valid_datas
fobj : function
Customized objective function.
feval : function
......@@ -75,17 +75,17 @@ def train(params, train_data, num_boost_round=100,
init_model : file name of lightgbm model or 'Booster' instance
model used for continued train
train_fields : dict
other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score
Other data file in training data. e.g. train_fields['weight'] is weight data
Support fields: weight, group, init_score
valid_fields : dict
other data file in training data. \
Other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score
Support fields: weight, group, init_score
feature_name : list of str
feature names
categorical_feature : list of str/int
categorical features , int type to use index,
str type to use feature names (feature_name cannot be None)
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index, \
type str represents feature names (need to specify feature_name as well)
early_stopping_rounds: int
Activates early stopping.
Requires at least one validation data and one metric
......@@ -101,18 +101,18 @@ def train(params, train_data, num_boost_round=100,
passed with None means no using this function
verbose_eval : bool or int
Requires at least one item in evals.
If `verbose_eval` is True then the evaluation metric on the validation set is
If `verbose_eval` is True then the evaluation metric on the validation set is \
printed at each boosting stage.
If `verbose_eval` is an integer then the evaluation metric on the validation set
is printed at every given `verbose_eval` boosting stage. The last boosting stage
If `verbose_eval` is an integer then the evaluation metric on the validation set \
is printed at every given `verbose_eval` boosting stage. The last boosting stage \
/ the boosting stage found by using `early_stopping_rounds` is also printed.
Example: with verbose_eval=4 and at least one item in evals, an evaluation metric
Example: with verbose_eval=4 and at least one item in evals, an evaluation metric \
is printed every 4 boosting stages, instead of every boosting stage.
learning_rates: list or function
List of learning rate for each boosting round
or a customized function that calculates learning_rate in terms of
current number of round and the total number of boosting round (e.g. yields
learning rate decay)
List of learning rate for each boosting round \
or a customized function that calculates learning_rate in terms of \
current number of round and the total number of boosting round \
(e.g. yields learning rate decay)
- list l: learning_rate = l[current_round]
- function f: learning_rate = f(current_round, total_boost_round)
callbacks : list of callback functions
......@@ -131,12 +131,16 @@ def train(params, train_data, num_boost_round=100,
predictor = init_model
else:
predictor = None
init_iteration = predictor.num_total_iteration if predictor else 0
"""create dataset"""
if isinstance(train_data, Dataset):
train_set = train_data
if train_fields is not None:
for field, data in train_fields.items():
train_set.set_field(field, data)
else:
train_set = _construct_dataset(train_data, None, params,
other_fields=train_fields,
train_set = _construct_dataset(train_data, None, params,
other_fields=train_fields,
feature_name=feature_name,
categorical_feature=categorical_feature,
predictor=predictor)
......@@ -150,7 +154,7 @@ def train(params, train_data, num_boost_round=100,
if isinstance(valid_names, str):
valid_names = [valid_names]
for i, valid_data in enumerate(valid_datas):
other_fields = None if valid_fields is None else valid_fields[i]
other_fields = None if valid_fields is None else valid_fields.get(i, None)
"""reduce cost for prediction training data"""
if valid_data[0] is train_data[0] and valid_data[1] is train_data[1]:
is_valid_contain_train = True
......@@ -159,6 +163,9 @@ def train(params, train_data, num_boost_round=100,
continue
if isinstance(valid_data, Dataset):
valid_set = valid_data
if other_fields is not None:
for field, data in other_fields.items():
valid_set.set_field(field, data)
else:
valid_set = _construct_dataset(
valid_data,
......@@ -169,7 +176,7 @@ def train(params, train_data, num_boost_round=100,
categorical_feature=categorical_feature,
predictor=predictor)
valid_sets.append(valid_set)
if valid_names is not None:
if valid_names is not None and len(valid_names) > i:
name_valid_sets.append(valid_names[i])
else:
name_valid_sets.append('valid_'+str(i))
......@@ -179,13 +186,13 @@ def train(params, train_data, num_boost_round=100,
# Most of legacy advanced options becomes callbacks
if isinstance(verbose_eval, bool) and verbose_eval:
callbacks.append(callback.print_evaluation())
else:
if isinstance(verbose_eval, int):
callbacks.append(callback.print_evaluation(verbose_eval))
elif isinstance(verbose_eval, int):
callbacks.append(callback.print_evaluation(verbose_eval))
if early_stopping_rounds is not None:
callbacks.append(callback.early_stop(early_stopping_rounds,
verbose=bool(verbose_eval)))
if learning_rates is not None:
callbacks.append(callback.reset_learning_rate(learning_rates))
......@@ -197,32 +204,26 @@ def train(params, train_data, num_boost_round=100,
callbacks_after_iter = [
cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
"""construct booster"""
if 'metric' in params:
if is_str(params['metric']):
params['metric'] = params['metric'].split(',')
else:
params['metric'] = list(params['metric'])
booster = Booster(params=params, train_set=train_set)
if is_valid_contain_train:
booster.set_train_data_name(train_data_name)
for valid_set, name_valid_set in zip(valid_sets, name_valid_sets):
booster.add_valid(valid_set, name_valid_set)
"""start training"""
for i in range(num_boost_round):
for i in range(init_iteration, init_iteration + num_boost_round):
for cb in callbacks_before_iter:
cb(callback.CallbackEnv(model=booster,
cvfolds=None,
iteration=i,
begin_iteration=0,
end_iteration=num_boost_round,
begin_iteration=init_iteration,
end_iteration=init_iteration + num_boost_round,
evaluation_result_list=None))
booster.update(fobj=fobj)
evaluation_result_list = []
# check evaluation result.
if len(valid_sets) != 0:
if valid_sets:
if is_valid_contain_train:
evaluation_result_list.extend(booster.eval_train(feval))
evaluation_result_list.extend(booster.eval_valid(feval))
......@@ -231,8 +232,8 @@ def train(params, train_data, num_boost_round=100,
cb(callback.CallbackEnv(model=booster,
cvfolds=None,
iteration=i,
begin_iteration=0,
end_iteration=num_boost_round,
begin_iteration=init_iteration,
end_iteration=init_iteration + num_boost_round,
evaluation_result_list=evaluation_result_list))
except callback.EarlyStopException:
break
......@@ -347,24 +348,24 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
feval : function
Custom evaluation function.
train_fields : dict
other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score
Other data file in training data. e.g. train_fields['weight'] is weight data
Support fields: weight, group, init_score
feature_name : list of str
feature names
categorical_feature : list of str/int
categorical features , int type to use index,
str type to use feature names (feature_name cannot be None)
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index, \
type str represents feature names (need to specify feature_name as well)
early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least
Activates early stopping. CV error needs to decrease at least \
every <early_stopping_rounds> round(s) to continue.
Last entry in evaluation history is the one from best iteration.
fpreproc : function
Preprocessing function that takes (dtrain, dtest, param) and returns
Preprocessing function that takes (dtrain, dtest, param) and returns \
transformed versions of those.
verbose_eval : bool, int, or None, default None
Whether to display the progress. If None, progress will be displayed
when np.ndarray is returned. If True, progress will be displayed at
boosting stage. If an integer is given, progress will be displayed
Whether to display the progress. If None, progress will be displayed \
when np.ndarray is returned. If True, progress will be displayed at \
boosting stage. If an integer is given, progress will be displayed \
at every given `verbose_eval` boosting stage.
show_stdv : bool, default True
Whether to display the standard deviation in progress.
......@@ -378,25 +379,14 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
-------
evaluation history : list(string)
"""
if isinstance(metrics, str):
metrics = [metrics]
if isinstance(params, list):
params = dict(params)
if 'metric' not in params:
params['metric'] = []
else:
if is_str(params['metric']):
params['metric'] = params['metric'].split(',')
if metrics:
params.setdefault('metric', [])
if is_str(metrics):
params['metric'].append(metrics)
else:
params['metric'] = list(params['metric'])
if metrics is not None and len(metrics) > 0:
params['metric'].extend(metrics)
params['metric'].extend(metrics)
train_set = _construct_dataset(train_data, None, params,
train_set = _construct_dataset(train_data, None, params,
other_fields=train_fields,
feature_name=feature_name,
categorical_feature=categorical_feature)
......@@ -411,9 +401,8 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
verbose=False))
if isinstance(verbose_eval, bool) and verbose_eval:
callbacks.append(callback.print_evaluation(show_stdv=show_stdv))
else:
if isinstance(verbose_eval, int):
callbacks.append(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))
elif isinstance(verbose_eval, int):
callbacks.append(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))
callbacks_before_iter = [
cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
......
# coding: utf-8
# pylint: disable = invalid-name, W0105
# pylint: disable = invalid-name, W0105, C0111
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import
import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from .basic import LightGBMError, is_str
from .engine import train
# sklearn
try:
......@@ -66,7 +66,7 @@ def _point_wise_objective(func):
num_data = len(weight)
num_class = len(grad) // num_data
if num_class * num_data != len(grad):
raise ValueError("length of grad and hess should equal with num_class * num_data")
raise ValueError("length of grad and hess should equal to num_class * num_data")
for k in range(num_class):
for i in range(num_data):
idx = k * num_data + i
......@@ -169,6 +169,7 @@ class LGBMModel(LGBMModelBase):
self.is_unbalance = is_unbalance
self.seed = seed
self._Booster = None
self.best_iteration = -1
if callable(self.objective):
self.fobj = _point_wise_objective(self.objective)
else:
......@@ -190,7 +191,6 @@ class LGBMModel(LGBMModelBase):
def get_params(self, deep=False):
"""Get parameters"""
params = super(LGBMModel, self).get_params(deep=deep)
params['verbose'] = 0 if self.silent else 1
if self.nthread <= 0:
params.pop('nthread', None)
return params
......@@ -213,30 +213,31 @@ class LGBMModel(LGBMModelBase):
A list of (X, y) tuple pairs to use as a validation set for early-stopping
eval_metric : str, list of str, callable, optional
If a str, should be a built-in evaluation metric to use.
If callable, a custom evaluation metric. The call
signature is func(y_predicted, dataset) where dataset will be a
Dataset fobject such that you may need to call the get_label
If callable, a custom evaluation metric. The call \
signature is func(y_predicted, dataset) where dataset will be a \
Dataset object such that you may need to call the get_label \
method. And it must return (eval_name->str, eval_result->float, is_bigger_better->Bool)
early_stopping_rounds : int
verbose : bool
If `verbose` and an evaluation set is used, writes the evaluation
train_fields : dict
other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score
Other data file in training data. e.g. train_fields['weight'] is weight data
Support fields: weight, group, init_score
valid_fields : dict
other data file in training data. \
Other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score
Support fields: weight, group, init_score
feature_name : list of str
feature names
categorical_feature : list of str/int
categorical features , int type to use index,
str type to use feature names (feature_name cannot be None)
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index, \
type str represents feature names (need to specify feature_name as well)
other_params: dict
other parameters
Other parameters
"""
evals_result = {}
params = self.get_params()
params['verbose'] = 0 if self.silent else 1
if other_params is not None:
params.update(other_params)
......@@ -317,6 +318,14 @@ class LGBMModel(LGBMModelBase):
return evals_result
def feature_importance(self):
    """Return normalized feature importances of the underlying booster.

    Importances are taken from the fitted Booster and rescaled so
    they sum to 1.0 (each value is the fraction of total importance).

    Returns
    -------
    numpy array of float32
        Normalized feature importances, one entry per feature.
        NOTE(review): if all raw importances are zero the division
        yields NaN; assumes the model has been fit (`self._Booster`
        is set) — calling before fit raises AttributeError.
    """
    importance_array = self._Booster.feature_importance().astype(np.float32)
    # Normalize so the importances sum to 1 across all features.
    return importance_array / importance_array.sum()
class LGBMRegressor(LGBMModel, LGBMRegressorBase):
__doc__ = """Implementation of the scikit-learn API for LightGBM regression.
......@@ -394,7 +403,7 @@ def _group_wise_objective(func):
y_true: array_like of shape [n_samples]
The target values
group : array_like of shape
group size data of data
Group size data of data
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
The predicted values
Returns
......
......@@ -5,7 +5,7 @@ from __future__ import absolute_import
import sys
import os
from setuptools import setup, find_packages
# import subprocess
sys.path.insert(0, '.')
CURRENT_DIR = os.path.dirname(__file__)
......
......@@ -227,8 +227,6 @@ Tree::Tree(const std::string& str) {
leaf_parent_ = Common::StringToArray<int>(key_vals["leaf_parent"], ' ', num_leaves_);
leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], ' ', num_leaves_);
}
} // namespace LightGBM
......@@ -101,6 +101,7 @@ def test_early_stopping():
from sklearn.datasets import load_boston
from sklearn.cross_validation import KFold
from sklearn import datasets, metrics, model_selection
from sklearn.base import clone
boston = load_boston()
y = boston['target']
......@@ -111,6 +112,7 @@ def test_early_stopping():
eval_metric='l2',
early_stopping_rounds=10,
verbose=10)
lgb_model_clone = clone(lgb_model)
print(lgb_model.best_iteration)
test_binary_classification()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment