Commit 2cd024e9 authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

add feature importance in python (#109)

* add feature importances in python; add pandas support

* solve best_iteration issue
parent 6f7669df
...@@ -17,7 +17,11 @@ X_test = df_test.drop(0, axis=1) ...@@ -17,7 +17,11 @@ X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm # create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train) lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# or you can simply use a tuple of length=2 here # ATTENTION: you should carefully use lightgbm.Dataset
# it requires setting up categorical_feature when you init it
# rather than passing from lightgbm.train
# instead, you can simply use a tuple of length=2 like below
# it will help you construct Datasets with parameters in lightgbm.train
lgb_train = (X_train, y_train) lgb_train = (X_train, y_train)
lgb_eval = (X_test, y_test) lgb_eval = (X_test, y_test)
...@@ -26,14 +30,12 @@ params = { ...@@ -26,14 +30,12 @@ params = {
'task' : 'train', 'task' : 'train',
'boosting_type' : 'gbdt', 'boosting_type' : 'gbdt',
'objective' : 'regression', 'objective' : 'regression',
'metric' : 'l2', 'metric' : {'l2', 'auc'},
'num_leaves' : 31, 'num_leaves' : 31,
'learning_rate' : 0.05, 'learning_rate' : 0.05,
'feature_fraction' : 0.9, 'feature_fraction' : 0.9,
'bagging_fraction' : 0.8, 'bagging_fraction' : 0.8,
'bagging_freq': 5, 'bagging_freq': 5,
# 'ndcg_eval_at' : [1, 3, 5, 10],
# this metric is not needed in this task, show as an example
'verbose' : 0 'verbose' : 0
} }
...@@ -49,9 +51,6 @@ gbm = lgb.train(params, ...@@ -49,9 +51,6 @@ gbm = lgb.train(params,
# save model to file # save model to file
gbm.save_model('model.txt') gbm.save_model('model.txt')
# load model from file
gbm = lgb.Booster(model_file='model.txt')
# predict # predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval # eval
...@@ -62,3 +61,7 @@ model_json = gbm.dump_model() ...@@ -62,3 +61,7 @@ model_json = gbm.dump_model()
with open('model.json', 'w+') as f: with open('model.json', 'w+') as f:
json.dump(model_json, f, indent=4) json.dump(model_json, f, indent=4)
# feature importances
print('Feature importances:', gbm.feature_importance())
print('Feature importances:', gbm.feature_importance("gain"))
...@@ -26,3 +26,6 @@ gbm.fit(X_train, y_train, ...@@ -26,3 +26,6 @@ gbm.fit(X_train, y_train,
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval # eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
# feature importances
print('Feature importances:', gbm.feature_importance())
This diff is collapsed.
...@@ -46,7 +46,7 @@ def print_evaluation(period=1, show_stdv=True): ...@@ -46,7 +46,7 @@ def print_evaluation(period=1, show_stdv=True):
The period to log the evaluation results The period to log the evaluation results
show_stdv : bool, optional show_stdv : bool, optional
Whether show stdv if provided Whether show stdv if provided
Returns Returns
------- -------
...@@ -55,7 +55,7 @@ def print_evaluation(period=1, show_stdv=True): ...@@ -55,7 +55,7 @@ def print_evaluation(period=1, show_stdv=True):
""" """
def callback(env): def callback(env):
"""internal function""" """internal function"""
if len(env.evaluation_result_list) == 0 or period is False: if not env.evaluation_result_list or period <= 0:
return return
if env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration: if env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration:
result = '\t'.join([_format_eval_result(x, show_stdv) \ result = '\t'.join([_format_eval_result(x, show_stdv) \
...@@ -83,15 +83,12 @@ def record_evaluation(eval_result): ...@@ -83,15 +83,12 @@ def record_evaluation(eval_result):
def init(env): def init(env):
"""internal function""" """internal function"""
for data_name, eval_name, _, _ in env.evaluation_result_list: for data_name, _, _, _ in env.evaluation_result_list:
if data_name not in eval_result: eval_result.setdefault(data_name, collections.defaultdict(list))
eval_result[data_name] = {}
if eval_name not in eval_result[data_name]:
eval_result[data_name][eval_name] = []
def callback(env): def callback(env):
"""internal function""" """internal function"""
if len(eval_result) == 0: if not eval_result:
init(env) init(env)
for data_name, eval_name, result, _ in env.evaluation_result_list: for data_name, eval_name, result, _ in env.evaluation_result_list:
eval_result[data_name][eval_name].append(result) eval_result[data_name][eval_name].append(result)
...@@ -99,17 +96,17 @@ def record_evaluation(eval_result): ...@@ -99,17 +96,17 @@ def record_evaluation(eval_result):
def reset_learning_rate(learning_rates): def reset_learning_rate(learning_rates):
"""Reset learning rate after iteration 1 """Reset learning rate after first iteration
NOTE: the initial learning rate will still take in-effect on first iteration. NOTE: the initial learning rate will still take in-effect on first iteration.
Parameters Parameters
---------- ----------
learning_rates: list or function learning_rates: list or function
List of learning rate for each boosting round List of learning rate for each boosting round \
or a customized function that calculates learning_rate in terms of or a customized function that calculates learning_rate in terms of \
current number of round and the total number of boosting round (e.g. yields current number of round and the total number of boosting round \
learning rate decay) (e.g. yields learning rate decay)
- list l: learning_rate = l[current_round] - list l: learning_rate = l[current_round]
- function f: learning_rate = f(current_round, total_boost_round) - function f: learning_rate = f(current_round, total_boost_round)
...@@ -121,13 +118,13 @@ def reset_learning_rate(learning_rates): ...@@ -121,13 +118,13 @@ def reset_learning_rate(learning_rates):
def callback(env): def callback(env):
"""internal function""" """internal function"""
booster = env.model booster = env.model
i = env.iteration iteration = env.iteration
if isinstance(learning_rates, list): if isinstance(learning_rates, list):
if len(learning_rates) != env.end_iteration: if len(learning_rates) != env.end_iteration:
raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.") raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.")
booster.reset_parameter({'learning_rate':learning_rates[i]}) booster.reset_parameter({'learning_rate':learning_rates[iteration]})
else: else:
booster.reset_parameter({'learning_rate':learning_rates(i, env.end_iteration)}) booster.reset_parameter({'learning_rate':learning_rates(iteration, env.end_iteration)})
callback.before_iteration = True callback.before_iteration = True
return callback return callback
...@@ -157,7 +154,7 @@ def early_stop(stopping_rounds, verbose=True): ...@@ -157,7 +154,7 @@ def early_stop(stopping_rounds, verbose=True):
best_msg = {} best_msg = {}
def init(env): def init(env):
"""internal function""" """internal function"""
if len(env.evaluation_result_list) == 0: if not env.evaluation_result_list:
raise ValueError('For early stopping you need at least one set in evals.') raise ValueError('For early stopping you need at least one set in evals.')
if verbose: if verbose:
...@@ -169,13 +166,11 @@ def early_stop(stopping_rounds, verbose=True): ...@@ -169,13 +166,11 @@ def early_stop(stopping_rounds, verbose=True):
best_iter[i] = 0 best_iter[i] = 0
if verbose: if verbose:
best_msg[i] = "" best_msg[i] = ""
factor_to_bigger_better[i] = -1.0 factor_to_bigger_better[i] = 1.0 if env.evaluation_result_list[i][3] else -1.0
if env.evaluation_result_list[i][3]:
factor_to_bigger_better[i] = 1.0
def callback(env): def callback(env):
"""internal function""" """internal function"""
if len(best_score) == 0: if not best_score:
init(env) init(env)
for i in range(len(env.evaluation_result_list)): for i in range(len(env.evaluation_result_list)):
score = env.evaluation_result_list[i][2] * factor_to_bigger_better[i] score = env.evaluation_result_list[i][2] * factor_to_bigger_better[i]
...@@ -190,6 +185,7 @@ def early_stop(stopping_rounds, verbose=True): ...@@ -190,6 +185,7 @@ def early_stop(stopping_rounds, verbose=True):
if env.model is not None: if env.model is not None:
env.model.set_attr(best_iteration=str(best_iter[i])) env.model.set_attr(best_iteration=str(best_iter[i]))
if verbose: if verbose:
print('early stopping, best iteration is:\n{}'.format(best_msg[i])) print('early stopping, best iteration is:')
print(best_msg[i])
raise EarlyStopException(best_iter[i]) raise EarlyStopException(best_iter[i])
return callback return callback
...@@ -8,7 +8,7 @@ from .basic import LightGBMError, Predictor, Dataset, Booster, is_str ...@@ -8,7 +8,7 @@ from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from . import callback from . import callback
def _construct_dataset(X_y, reference=None, def _construct_dataset(X_y, reference=None,
params=None, other_fields=None, params=None, other_fields=None,
feature_name=None, categorical_feature=None, feature_name=None, categorical_feature=None,
predictor=None): predictor=None):
if 'max_bin' in params: if 'max_bin' in params:
...@@ -21,9 +21,9 @@ def _construct_dataset(X_y, reference=None, ...@@ -21,9 +21,9 @@ def _construct_dataset(X_y, reference=None,
if other_fields is not None: if other_fields is not None:
if not isinstance(other_fields, dict): if not isinstance(other_fields, dict):
raise TypeError("other filed data should be dict type") raise TypeError("other filed data should be dict type")
weight = None if 'weight' not in other_fields else other_fields['weight'] weight = other_fields.get('weight', None)
group = None if 'group' not in other_fields else other_fields['group'] group = other_fields.get('group', None)
init_score = None if 'init_score' not in other_fields else other_fields['init_score'] init_score = other_fields.get('init_score', None)
if is_str(X_y): if is_str(X_y):
data = X_y data = X_y
label = None label = None
...@@ -58,15 +58,15 @@ def train(params, train_data, num_boost_round=100, ...@@ -58,15 +58,15 @@ def train(params, train_data, num_boost_round=100,
Parameters Parameters
---------- ----------
params : dict params : dict
params. Parameters for training.
train_data : Dataset, tuple (X, y) or filename of data train_data : Dataset, tuple (X, y) or filename of data
Data to be trained. Data to be trained.
num_boost_round: int num_boost_round: int
Number of boosting iterations. Number of boosting iterations.
valid_datas: list of Datasets, tuples (valid_X, valid_y) or filename of data valid_datas: list of Datasets, tuples (valid_X, valid_y) or filenames of data
List of data to be evaluated during training List of data to be evaluated during training
valid_names: list of string valid_names: list of string
names of valid_datas Names of valid_datas
fobj : function fobj : function
Customized objective function. Customized objective function.
feval : function feval : function
...@@ -75,17 +75,17 @@ def train(params, train_data, num_boost_round=100, ...@@ -75,17 +75,17 @@ def train(params, train_data, num_boost_round=100,
init_model : file name of lightgbm model or 'Booster' instance init_model : file name of lightgbm model or 'Booster' instance
model used for continued train model used for continued train
train_fields : dict train_fields : dict
other data file in training data. e.g. train_fields['weight'] is weight data Other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score Support fields: weight, group, init_score
valid_fields : dict valid_fields : dict
other data file in training data. \ Other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score Support fields: weight, group, init_score
feature_name : list of str feature_name : list of str
feature names Feature names
categorical_feature : list of str/int categorical_feature : list of str or int
categorical features , int type to use index, Categorical features, type int represents index, \
str type to use feature names (feature_name cannot be None) type str represents feature names (need to specify feature_name as well)
early_stopping_rounds: int early_stopping_rounds: int
Activates early stopping. Activates early stopping.
Requires at least one validation data and one metric Requires at least one validation data and one metric
...@@ -101,18 +101,18 @@ def train(params, train_data, num_boost_round=100, ...@@ -101,18 +101,18 @@ def train(params, train_data, num_boost_round=100,
passed with None means no using this function passed with None means no using this function
verbose_eval : bool or int verbose_eval : bool or int
Requires at least one item in evals. Requires at least one item in evals.
If `verbose_eval` is True then the evaluation metric on the validation set is If `verbose_eval` is True then the evaluation metric on the validation set is \
printed at each boosting stage. printed at each boosting stage.
If `verbose_eval` is an integer then the evaluation metric on the validation set If `verbose_eval` is an integer then the evaluation metric on the validation set \
is printed at every given `verbose_eval` boosting stage. The last boosting stage is printed at every given `verbose_eval` boosting stage. The last boosting stage \
/ the boosting stage found by using `early_stopping_rounds` is also printed. / the boosting stage found by using `early_stopping_rounds` is also printed.
Example: with verbose_eval=4 and at least one item in evals, an evaluation metric Example: with verbose_eval=4 and at least one item in evals, an evaluation metric \
is printed every 4 boosting stages, instead of every boosting stage. is printed every 4 boosting stages, instead of every boosting stage.
learning_rates: list or function learning_rates: list or function
List of learning rate for each boosting round List of learning rate for each boosting round \
or a customized function that calculates learning_rate in terms of or a customized function that calculates learning_rate in terms of \
current number of round and the total number of boosting round (e.g. yields current number of round and the total number of boosting round \
learning rate decay) (e.g. yields learning rate decay)
- list l: learning_rate = l[current_round] - list l: learning_rate = l[current_round]
- function f: learning_rate = f(current_round, total_boost_round) - function f: learning_rate = f(current_round, total_boost_round)
callbacks : list of callback functions callbacks : list of callback functions
...@@ -131,12 +131,16 @@ def train(params, train_data, num_boost_round=100, ...@@ -131,12 +131,16 @@ def train(params, train_data, num_boost_round=100,
predictor = init_model predictor = init_model
else: else:
predictor = None predictor = None
init_iteration = predictor.num_total_iteration if predictor else 0
"""create dataset""" """create dataset"""
if isinstance(train_data, Dataset): if isinstance(train_data, Dataset):
train_set = train_data train_set = train_data
if train_fields is not None:
for field, data in train_fields.items():
train_set.set_field(field, data)
else: else:
train_set = _construct_dataset(train_data, None, params, train_set = _construct_dataset(train_data, None, params,
other_fields=train_fields, other_fields=train_fields,
feature_name=feature_name, feature_name=feature_name,
categorical_feature=categorical_feature, categorical_feature=categorical_feature,
predictor=predictor) predictor=predictor)
...@@ -150,7 +154,7 @@ def train(params, train_data, num_boost_round=100, ...@@ -150,7 +154,7 @@ def train(params, train_data, num_boost_round=100,
if isinstance(valid_names, str): if isinstance(valid_names, str):
valid_names = [valid_names] valid_names = [valid_names]
for i, valid_data in enumerate(valid_datas): for i, valid_data in enumerate(valid_datas):
other_fields = None if valid_fields is None else valid_fields[i] other_fields = None if valid_fields is None else valid_fields.get(i, None)
"""reduce cost for prediction training data""" """reduce cost for prediction training data"""
if valid_data[0] is train_data[0] and valid_data[1] is train_data[1]: if valid_data[0] is train_data[0] and valid_data[1] is train_data[1]:
is_valid_contain_train = True is_valid_contain_train = True
...@@ -159,6 +163,9 @@ def train(params, train_data, num_boost_round=100, ...@@ -159,6 +163,9 @@ def train(params, train_data, num_boost_round=100,
continue continue
if isinstance(valid_data, Dataset): if isinstance(valid_data, Dataset):
valid_set = valid_data valid_set = valid_data
if other_fields is not None:
for field, data in other_fields.items():
valid_set.set_field(field, data)
else: else:
valid_set = _construct_dataset( valid_set = _construct_dataset(
valid_data, valid_data,
...@@ -169,7 +176,7 @@ def train(params, train_data, num_boost_round=100, ...@@ -169,7 +176,7 @@ def train(params, train_data, num_boost_round=100,
categorical_feature=categorical_feature, categorical_feature=categorical_feature,
predictor=predictor) predictor=predictor)
valid_sets.append(valid_set) valid_sets.append(valid_set)
if valid_names is not None: if valid_names is not None and len(valid_names) > i:
name_valid_sets.append(valid_names[i]) name_valid_sets.append(valid_names[i])
else: else:
name_valid_sets.append('valid_'+str(i)) name_valid_sets.append('valid_'+str(i))
...@@ -179,13 +186,13 @@ def train(params, train_data, num_boost_round=100, ...@@ -179,13 +186,13 @@ def train(params, train_data, num_boost_round=100,
# Most of legacy advanced options becomes callbacks # Most of legacy advanced options becomes callbacks
if isinstance(verbose_eval, bool) and verbose_eval: if isinstance(verbose_eval, bool) and verbose_eval:
callbacks.append(callback.print_evaluation()) callbacks.append(callback.print_evaluation())
else: elif isinstance(verbose_eval, int):
if isinstance(verbose_eval, int): callbacks.append(callback.print_evaluation(verbose_eval))
callbacks.append(callback.print_evaluation(verbose_eval))
if early_stopping_rounds is not None: if early_stopping_rounds is not None:
callbacks.append(callback.early_stop(early_stopping_rounds, callbacks.append(callback.early_stop(early_stopping_rounds,
verbose=bool(verbose_eval))) verbose=bool(verbose_eval)))
if learning_rates is not None: if learning_rates is not None:
callbacks.append(callback.reset_learning_rate(learning_rates)) callbacks.append(callback.reset_learning_rate(learning_rates))
...@@ -197,32 +204,26 @@ def train(params, train_data, num_boost_round=100, ...@@ -197,32 +204,26 @@ def train(params, train_data, num_boost_round=100,
callbacks_after_iter = [ callbacks_after_iter = [
cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)] cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
"""construct booster""" """construct booster"""
if 'metric' in params:
if is_str(params['metric']):
params['metric'] = params['metric'].split(',')
else:
params['metric'] = list(params['metric'])
booster = Booster(params=params, train_set=train_set) booster = Booster(params=params, train_set=train_set)
if is_valid_contain_train: if is_valid_contain_train:
booster.set_train_data_name(train_data_name) booster.set_train_data_name(train_data_name)
for valid_set, name_valid_set in zip(valid_sets, name_valid_sets): for valid_set, name_valid_set in zip(valid_sets, name_valid_sets):
booster.add_valid(valid_set, name_valid_set) booster.add_valid(valid_set, name_valid_set)
"""start training""" """start training"""
for i in range(num_boost_round): for i in range(init_iteration, init_iteration + num_boost_round):
for cb in callbacks_before_iter: for cb in callbacks_before_iter:
cb(callback.CallbackEnv(model=booster, cb(callback.CallbackEnv(model=booster,
cvfolds=None, cvfolds=None,
iteration=i, iteration=i,
begin_iteration=0, begin_iteration=init_iteration,
end_iteration=num_boost_round, end_iteration=init_iteration + num_boost_round,
evaluation_result_list=None)) evaluation_result_list=None))
booster.update(fobj=fobj) booster.update(fobj=fobj)
evaluation_result_list = [] evaluation_result_list = []
# check evaluation result. # check evaluation result.
if len(valid_sets) != 0: if valid_sets:
if is_valid_contain_train: if is_valid_contain_train:
evaluation_result_list.extend(booster.eval_train(feval)) evaluation_result_list.extend(booster.eval_train(feval))
evaluation_result_list.extend(booster.eval_valid(feval)) evaluation_result_list.extend(booster.eval_valid(feval))
...@@ -231,8 +232,8 @@ def train(params, train_data, num_boost_round=100, ...@@ -231,8 +232,8 @@ def train(params, train_data, num_boost_round=100,
cb(callback.CallbackEnv(model=booster, cb(callback.CallbackEnv(model=booster,
cvfolds=None, cvfolds=None,
iteration=i, iteration=i,
begin_iteration=0, begin_iteration=init_iteration,
end_iteration=num_boost_round, end_iteration=init_iteration + num_boost_round,
evaluation_result_list=evaluation_result_list)) evaluation_result_list=evaluation_result_list))
except callback.EarlyStopException: except callback.EarlyStopException:
break break
...@@ -347,24 +348,24 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False, ...@@ -347,24 +348,24 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
feval : function feval : function
Custom evaluation function. Custom evaluation function.
train_fields : dict train_fields : dict
other data file in training data. e.g. train_fields['weight'] is weight data Other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score Support fields: weight, group, init_score
feature_name : list of str feature_name : list of str
feature names Feature names
categorical_feature : list of str/int categorical_feature : list of str or int
categorical features , int type to use index, Categorical features, type int represents index, \
str type to use feature names (feature_name cannot be None) type str represents feature names (need to specify feature_name as well)
early_stopping_rounds: int early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least Activates early stopping. CV error needs to decrease at least \
every <early_stopping_rounds> round(s) to continue. every <early_stopping_rounds> round(s) to continue.
Last entry in evaluation history is the one from best iteration. Last entry in evaluation history is the one from best iteration.
fpreproc : function fpreproc : function
Preprocessing function that takes (dtrain, dtest, param) and returns Preprocessing function that takes (dtrain, dtest, param) and returns \
transformed versions of those. transformed versions of those.
verbose_eval : bool, int, or None, default None verbose_eval : bool, int, or None, default None
Whether to display the progress. If None, progress will be displayed Whether to display the progress. If None, progress will be displayed \
when np.ndarray is returned. If True, progress will be displayed at when np.ndarray is returned. If True, progress will be displayed at \
boosting stage. If an integer is given, progress will be displayed boosting stage. If an integer is given, progress will be displayed \
at every given `verbose_eval` boosting stage. at every given `verbose_eval` boosting stage.
show_stdv : bool, default True show_stdv : bool, default True
Whether to display the standard deviation in progress. Whether to display the standard deviation in progress.
...@@ -378,25 +379,14 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False, ...@@ -378,25 +379,14 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
------- -------
evaluation history : list(string) evaluation history : list(string)
""" """
if metrics:
if isinstance(metrics, str): params.setdefault('metric', [])
metrics = [metrics] if is_str(metrics):
params['metric'].append(metrics)
if isinstance(params, list):
params = dict(params)
if 'metric' not in params:
params['metric'] = []
else:
if is_str(params['metric']):
params['metric'] = params['metric'].split(',')
else: else:
params['metric'] = list(params['metric']) params['metric'].extend(metrics)
if metrics is not None and len(metrics) > 0:
params['metric'].extend(metrics)
train_set = _construct_dataset(train_data, None, params, train_set = _construct_dataset(train_data, None, params,
other_fields=train_fields, other_fields=train_fields,
feature_name=feature_name, feature_name=feature_name,
categorical_feature=categorical_feature) categorical_feature=categorical_feature)
...@@ -411,9 +401,8 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False, ...@@ -411,9 +401,8 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
verbose=False)) verbose=False))
if isinstance(verbose_eval, bool) and verbose_eval: if isinstance(verbose_eval, bool) and verbose_eval:
callbacks.append(callback.print_evaluation(show_stdv=show_stdv)) callbacks.append(callback.print_evaluation(show_stdv=show_stdv))
else: elif isinstance(verbose_eval, int):
if isinstance(verbose_eval, int): callbacks.append(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))
callbacks.append(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))
callbacks_before_iter = [ callbacks_before_iter = [
cb for cb in callbacks if cb.__dict__.get('before_iteration', False)] cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
......
# coding: utf-8 # coding: utf-8
# pylint: disable = invalid-name, W0105 # pylint: disable = invalid-name, W0105, C0111
"""Scikit-Learn Wrapper interface for LightGBM.""" """Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import from __future__ import absolute_import
import numpy as np import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str from .basic import LightGBMError, is_str
from .engine import train from .engine import train
# sklearn # sklearn
try: try:
...@@ -66,7 +66,7 @@ def _point_wise_objective(func): ...@@ -66,7 +66,7 @@ def _point_wise_objective(func):
num_data = len(weight) num_data = len(weight)
num_class = len(grad) // num_data num_class = len(grad) // num_data
if num_class * num_data != len(grad): if num_class * num_data != len(grad):
raise ValueError("length of grad and hess should equal with num_class * num_data") raise ValueError("length of grad and hess should equal to num_class * num_data")
for k in range(num_class): for k in range(num_class):
for i in range(num_data): for i in range(num_data):
idx = k * num_data + i idx = k * num_data + i
...@@ -169,6 +169,7 @@ class LGBMModel(LGBMModelBase): ...@@ -169,6 +169,7 @@ class LGBMModel(LGBMModelBase):
self.is_unbalance = is_unbalance self.is_unbalance = is_unbalance
self.seed = seed self.seed = seed
self._Booster = None self._Booster = None
self.best_iteration = -1
if callable(self.objective): if callable(self.objective):
self.fobj = _point_wise_objective(self.objective) self.fobj = _point_wise_objective(self.objective)
else: else:
...@@ -190,7 +191,6 @@ class LGBMModel(LGBMModelBase): ...@@ -190,7 +191,6 @@ class LGBMModel(LGBMModelBase):
def get_params(self, deep=False): def get_params(self, deep=False):
"""Get parameters""" """Get parameters"""
params = super(LGBMModel, self).get_params(deep=deep) params = super(LGBMModel, self).get_params(deep=deep)
params['verbose'] = 0 if self.silent else 1
if self.nthread <= 0: if self.nthread <= 0:
params.pop('nthread', None) params.pop('nthread', None)
return params return params
...@@ -213,30 +213,31 @@ class LGBMModel(LGBMModelBase): ...@@ -213,30 +213,31 @@ class LGBMModel(LGBMModelBase):
A list of (X, y) tuple pairs to use as a validation set for early-stopping A list of (X, y) tuple pairs to use as a validation set for early-stopping
eval_metric : str, list of str, callable, optional eval_metric : str, list of str, callable, optional
If a str, should be a built-in evaluation metric to use. If a str, should be a built-in evaluation metric to use.
If callable, a custom evaluation metric. The call If callable, a custom evaluation metric. The call \
signature is func(y_predicted, dataset) where dataset will be a signature is func(y_predicted, dataset) where dataset will be a \
Dataset fobject such that you may need to call the get_label Dataset fobject such that you may need to call the get_label \
method. And it must return (eval_name->str, eval_result->float, is_bigger_better->Bool) method. And it must return (eval_name->str, eval_result->float, is_bigger_better->Bool)
early_stopping_rounds : int early_stopping_rounds : int
verbose : bool verbose : bool
If `verbose` and an evaluation set is used, writes the evaluation If `verbose` and an evaluation set is used, writes the evaluation
train_fields : dict train_fields : dict
other data file in training data. e.g. train_fields['weight'] is weight data Other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score Support fields: weight, group, init_score
valid_fields : dict valid_fields : dict
other data file in training data. \ Other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score Support fields: weight, group, init_score
feature_name : list of str feature_name : list of str
feature names Feature names
categorical_feature : list of str/int categorical_feature : list of str or int
categorical features , int type to use index, Categorical features, type int represents index, \
str type to use feature names (feature_name cannot be None) type str represents feature names (need to specify feature_name as well)
other_params: dict other_params: dict
other parameters Other parameters
""" """
evals_result = {} evals_result = {}
params = self.get_params() params = self.get_params()
params['verbose'] = 0 if self.silent else 1
if other_params is not None: if other_params is not None:
params.update(other_params) params.update(other_params)
...@@ -317,6 +318,14 @@ class LGBMModel(LGBMModelBase): ...@@ -317,6 +318,14 @@ class LGBMModel(LGBMModelBase):
return evals_result return evals_result
def feature_importance(self):
    """Return the normalized feature importances of the underlying booster.

    The raw importances are read from ``self._Booster.feature_importance()``,
    cast to ``float32`` and divided by their sum.

    Returns
    -------
    Array of normalized feature importances. If the raw importances sum
    to zero, the (all-zero) float32 array is returned unchanged instead
    of dividing by zero.
    """
    importance_array = self._Booster.feature_importance().astype(np.float32)
    total = importance_array.sum()
    # Dividing by a zero sum would yield an all-NaN array (0 / 0) plus a
    # RuntimeWarning; hand back the zeros as they are in that case.
    if total == 0:
        return importance_array
    return importance_array / total
class LGBMRegressor(LGBMModel, LGBMRegressorBase): class LGBMRegressor(LGBMModel, LGBMRegressorBase):
__doc__ = """Implementation of the scikit-learn API for LightGBM regression. __doc__ = """Implementation of the scikit-learn API for LightGBM regression.
...@@ -394,7 +403,7 @@ def _group_wise_objective(func): ...@@ -394,7 +403,7 @@ def _group_wise_objective(func):
y_true: array_like of shape [n_samples] y_true: array_like of shape [n_samples]
The target values The target values
group : array_like of shape group : array_like of shape
group size data of data Group size data of data
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class) y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
The predicted values The predicted values
Returns Returns
......
...@@ -5,7 +5,7 @@ from __future__ import absolute_import ...@@ -5,7 +5,7 @@ from __future__ import absolute_import
import sys import sys
import os import os
from setuptools import setup, find_packages from setuptools import setup, find_packages
# import subprocess
sys.path.insert(0, '.') sys.path.insert(0, '.')
CURRENT_DIR = os.path.dirname(__file__) CURRENT_DIR = os.path.dirname(__file__)
......
...@@ -227,8 +227,6 @@ Tree::Tree(const std::string& str) { ...@@ -227,8 +227,6 @@ Tree::Tree(const std::string& str) {
leaf_parent_ = Common::StringToArray<int>(key_vals["leaf_parent"], ' ', num_leaves_); leaf_parent_ = Common::StringToArray<int>(key_vals["leaf_parent"], ' ', num_leaves_);
leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], ' ', num_leaves_); leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], ' ', num_leaves_);
} }
} // namespace LightGBM } // namespace LightGBM
...@@ -101,6 +101,7 @@ def test_early_stopping(): ...@@ -101,6 +101,7 @@ def test_early_stopping():
from sklearn.datasets import load_boston from sklearn.datasets import load_boston
from sklearn.cross_validation import KFold from sklearn.cross_validation import KFold
from sklearn import datasets, metrics, model_selection from sklearn import datasets, metrics, model_selection
from sklearn.base import clone
boston = load_boston() boston = load_boston()
y = boston['target'] y = boston['target']
...@@ -111,6 +112,7 @@ def test_early_stopping(): ...@@ -111,6 +112,7 @@ def test_early_stopping():
eval_metric='l2', eval_metric='l2',
early_stopping_rounds=10, early_stopping_rounds=10,
verbose=10) verbose=10)
lgb_model_clone = clone(lgb_model)
print(lgb_model.best_iteration) print(lgb_model.best_iteration)
test_binary_classification() test_binary_classification()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment