Commit 2cd024e9 authored by wxchan, committed by Guolin Ke
Browse files

add feature importance in python (#109)

* add feature importances in python; add pandas support

* solve best_iteration issue
parent 6f7669df
...@@ -17,7 +17,11 @@ X_test = df_test.drop(0, axis=1) ...@@ -17,7 +17,11 @@ X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm # create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train) lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# or you can simply use a tuple of length=2 here # ATTENTION: you should carefully use lightgbm.Dataset
# it requires setting up categorical_feature when you init it
# rather than passing from lightgbm.train
# instead, you can simply use a tuple of length=2 like below
# it will help you construct Datasets with parameters in lightgbm.train
lgb_train = (X_train, y_train) lgb_train = (X_train, y_train)
lgb_eval = (X_test, y_test) lgb_eval = (X_test, y_test)
...@@ -26,14 +30,12 @@ params = { ...@@ -26,14 +30,12 @@ params = {
'task' : 'train', 'task' : 'train',
'boosting_type' : 'gbdt', 'boosting_type' : 'gbdt',
'objective' : 'regression', 'objective' : 'regression',
'metric' : 'l2', 'metric' : {'l2', 'auc'},
'num_leaves' : 31, 'num_leaves' : 31,
'learning_rate' : 0.05, 'learning_rate' : 0.05,
'feature_fraction' : 0.9, 'feature_fraction' : 0.9,
'bagging_fraction' : 0.8, 'bagging_fraction' : 0.8,
'bagging_freq': 5, 'bagging_freq': 5,
# 'ndcg_eval_at' : [1, 3, 5, 10],
# this metric is not needed in this task, show as an example
'verbose' : 0 'verbose' : 0
} }
...@@ -49,9 +51,6 @@ gbm = lgb.train(params, ...@@ -49,9 +51,6 @@ gbm = lgb.train(params,
# save model to file # save model to file
gbm.save_model('model.txt') gbm.save_model('model.txt')
# load model from file
gbm = lgb.Booster(model_file='model.txt')
# predict # predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval # eval
...@@ -62,3 +61,7 @@ model_json = gbm.dump_model() ...@@ -62,3 +61,7 @@ model_json = gbm.dump_model()
with open('model.json', 'w+') as f: with open('model.json', 'w+') as f:
json.dump(model_json, f, indent=4) json.dump(model_json, f, indent=4)
# feature importances
print('Feature importances:', gbm.feature_importance())
print('Feature importances:', gbm.feature_importance("gain"))
...@@ -26,3 +26,6 @@ gbm.fit(X_train, y_train, ...@@ -26,3 +26,6 @@ gbm.fit(X_train, y_train,
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval # eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
# feature importances
print('Feature importances:', gbm.feature_importance())
# coding: utf-8 # coding: utf-8
# pylint: disable = invalid-name, C0111, R0912, R0913, R0914, W0105 # pylint: disable = invalid-name, C0111, C0301, R0912, R0913, R0914, W0105
# pylint: disable = E1101
"""Wrapper c_api of LightGBM""" """Wrapper c_api of LightGBM"""
from __future__ import absolute_import from __future__ import absolute_import
import sys import sys
import os
import ctypes import ctypes
import tempfile import tempfile
import json import json
...@@ -19,11 +19,11 @@ try: ...@@ -19,11 +19,11 @@ try:
from pandas import Series, DataFrame from pandas import Series, DataFrame
IS_PANDAS_INSTALLED = True IS_PANDAS_INSTALLED = True
except ImportError: except ImportError:
IS_PANDAS_INSTALLED = False
class Series(object): class Series(object):
pass pass
class DataFrame(object): class DataFrame(object):
pass pass
IS_PANDAS_INSTALLED = False
IS_PY3 = (sys.version_info[0] == 3) IS_PY3 = (sys.version_info[0] == 3)
...@@ -62,18 +62,11 @@ def is_numpy_object(data): ...@@ -62,18 +62,11 @@ def is_numpy_object(data):
return type(data).__module__ == np.__name__ return type(data).__module__ == np.__name__
def is_numpy_1d_array(data): def is_numpy_1d_array(data):
if isinstance(data, np.ndarray) and len(data.shape) == 1: return isinstance(data, np.ndarray) and len(data.shape) == 1
return True
else:
return False
def is_1d_list(data): def is_1d_list(data):
if not isinstance(data, list): return isinstance(data, list) and \
return False (not data or isinstance(data[0], (int, float, bool)))
if len(data) > 0:
if not isinstance(data[0], (int, float, bool)):
return False
return True
def list_to_1d_numpy(data, dtype): def list_to_1d_numpy(data, dtype):
if is_numpy_1d_array(data): if is_numpy_1d_array(data):
...@@ -115,20 +108,19 @@ def c_array(ctype, values): ...@@ -115,20 +108,19 @@ def c_array(ctype, values):
return (ctype * len(values))(*values) return (ctype * len(values))(*values)
def param_dict_to_str(data): def param_dict_to_str(data):
if data is None or len(data) == 0: if not data:
return "" return ""
pairs = [] pairs = []
for key, val in data.items(): for key, val in data.items():
if is_str(val): if is_str(val) or isinstance(val, (int, float, bool)):
pairs.append(str(key)+'='+str(val)) pairs.append(str(key)+'='+str(val))
elif isinstance(val, (list, tuple)): elif isinstance(val, (list, tuple, set)):
pairs.append(str(key)+'='+','.join(map(str, val))) pairs.append(str(key)+'='+','.join(map(str, val)))
elif isinstance(val, (int, float, bool)):
pairs.append(str(key)+'='+str(val))
else: else:
raise TypeError('unknow type of parameter:%s , got:%s' raise TypeError('unknow type of parameter:%s , got:%s'
% (key, type(val).__name__)) % (key, type(val).__name__))
return ' '.join(pairs) return ' '.join(pairs)
"""marco definition of data type in c_api of LightGBM""" """marco definition of data type in c_api of LightGBM"""
C_API_DTYPE_FLOAT32 = 0 C_API_DTYPE_FLOAT32 = 0
C_API_DTYPE_FLOAT64 = 1 C_API_DTYPE_FLOAT64 = 1
...@@ -207,7 +199,7 @@ class Predictor(object): ...@@ -207,7 +199,7 @@ class Predictor(object):
self.handle, self.handle,
ctypes.byref(out_num_class))) ctypes.byref(out_num_class)))
self.num_class = out_num_class.value self.num_class = out_num_class.value
self.__num_total_iteration = out_num_iterations.value self.num_total_iteration = out_num_iterations.value
elif booster_handle is not None: elif booster_handle is not None:
self.__is_manage_handle = is_manage_handle self.__is_manage_handle = is_manage_handle
self.handle = booster_handle self.handle = booster_handle
...@@ -220,7 +212,7 @@ class Predictor(object): ...@@ -220,7 +212,7 @@ class Predictor(object):
_safe_call(_LIB.LGBM_BoosterGetCurrentIteration( _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
self.handle, self.handle,
ctypes.byref(out_num_iterations))) ctypes.byref(out_num_iterations)))
self.__num_total_iteration = out_num_iterations.value self.num_total_iteration = out_num_iterations.value
else: else:
raise TypeError('Need Model file to create a booster') raise TypeError('Need Model file to create a booster')
...@@ -239,9 +231,9 @@ class Predictor(object): ...@@ -239,9 +231,9 @@ class Predictor(object):
---------- ----------
data : string/numpy array/scipy.sparse data : string/numpy array/scipy.sparse
Data source for prediction Data source for prediction
When data is string type, it represents the path of txt file, When data type is string, it represents the path of txt file
num_iteration : int num_iteration : int
used iteration for prediction Used iteration for prediction
raw_score : bool raw_score : bool
True for predict raw score True for predict raw score
pred_leaf : bool pred_leaf : bool
...@@ -249,23 +241,22 @@ class Predictor(object): ...@@ -249,23 +241,22 @@ class Predictor(object):
data_has_header : bool data_has_header : bool
Used for txt data Used for txt data
is_reshape : bool is_reshape : bool
True for reshape to [nrow, ...] Reshape to (nrow, ncol) if true
Returns Returns
------- -------
Prediction result Prediction result
""" """
if isinstance(data, Dataset): if isinstance(data, Dataset):
raise TypeError("cannot use Dataset instance for prediction, \ raise TypeError("cannot use Dataset instance for prediction, please use raw data instead")
please use raw data instead")
predict_type = C_API_PREDICT_NORMAL predict_type = C_API_PREDICT_NORMAL
if raw_score: if raw_score:
predict_type = C_API_PREDICT_RAW_SCORE predict_type = C_API_PREDICT_RAW_SCORE
if pred_leaf: if pred_leaf:
predict_type = C_API_PREDICT_LEAF_INDEX predict_type = C_API_PREDICT_LEAF_INDEX
int_data_has_header = 1 if data_has_header else 0 int_data_has_header = 1 if data_has_header else 0
if num_iteration > self.__num_total_iteration: if num_iteration > self.num_total_iteration:
num_iteration = self.__num_total_iteration num_iteration = self.num_total_iteration
if is_str(data): if is_str(data):
tmp_pred_fname = tempfile.NamedTemporaryFile(prefix="lightgbm_tmp_pred_").name tmp_pred_fname = tempfile.NamedTemporaryFile(prefix="lightgbm_tmp_pred_").name
_safe_call(_LIB.LGBM_BoosterPredictForFile( _safe_call(_LIB.LGBM_BoosterPredictForFile(
...@@ -275,22 +266,20 @@ class Predictor(object): ...@@ -275,22 +266,20 @@ class Predictor(object):
predict_type, predict_type,
num_iteration, num_iteration,
c_str(tmp_pred_fname))) c_str(tmp_pred_fname)))
tmp_file = open(tmp_pred_fname, "r") with open(tmp_pred_fname, "r") as tmp_file:
lines = tmp_file.readlines() lines = tmp_file.readlines()
tmp_file.close()
nrow = len(lines) nrow = len(lines)
preds = [] preds = [float(token) for line in lines for token in line.split('\t')]
for line in lines: preds = np.array(preds, dtype=np.float32, copy=False)
for token in line.split('\t'):
preds.append(float(token))
preds = np.array(preds, copy=False)
os.remove(tmp_pred_fname)
elif isinstance(data, scipy.sparse.csr_matrix): elif isinstance(data, scipy.sparse.csr_matrix):
preds, nrow = self.__pred_for_csr(data, num_iteration, preds, nrow = self.__pred_for_csr(data, num_iteration,
predict_type) predict_type)
elif isinstance(data, np.ndarray): elif isinstance(data, np.ndarray):
preds, nrow = self.__pred_for_np2d(data, num_iteration, preds, nrow = self.__pred_for_np2d(data, num_iteration,
predict_type) predict_type)
elif IS_PANDAS_INSTALLED and isinstance(data, DataFrame):
preds, nrow = self.__pred_for_np2d(data.values, num_iteration,
predict_type)
else: else:
try: try:
csr = scipy.sparse.csr_matrix(data) csr = scipy.sparse.csr_matrix(data)
...@@ -301,12 +290,11 @@ class Predictor(object): ...@@ -301,12 +290,11 @@ class Predictor(object):
format(type(data).__name__)) format(type(data).__name__))
if pred_leaf: if pred_leaf:
preds = preds.astype(np.int32) preds = preds.astype(np.int32)
if preds.size != nrow and is_reshape: if is_reshape and preds.size != nrow:
if preds.size % nrow == 0: if preds.size % nrow == 0:
ncol = int(preds.size / nrow) preds = preds.reshape(nrow, -1)
preds = preds.reshape(nrow, ncol)
else: else:
raise ValueError('len of predict result(%d) cannot be divide nrow (%d)' raise ValueError('length of predict result (%d) cannot be divide nrow (%d)'
% (preds.size, nrow)) % (preds.size, nrow))
return preds return preds
...@@ -314,9 +302,9 @@ class Predictor(object): ...@@ -314,9 +302,9 @@ class Predictor(object):
n_preds = self.num_class * nrow n_preds = self.num_class * nrow
if predict_type == C_API_PREDICT_LEAF_INDEX: if predict_type == C_API_PREDICT_LEAF_INDEX:
if num_iteration > 0: if num_iteration > 0:
n_preds *= min(num_iteration, self.__num_total_iteration) n_preds *= min(num_iteration, self.num_total_iteration)
else: else:
n_preds *= self.__num_total_iteration n_preds *= self.num_total_iteration
return n_preds return n_preds
def __pred_for_np2d(self, mat, num_iteration, predict_type): def __pred_for_np2d(self, mat, num_iteration, predict_type):
...@@ -386,7 +374,7 @@ class Predictor(object): ...@@ -386,7 +374,7 @@ class Predictor(object):
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int', PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'int64': 'int', 'uint8': 'int', 'uint16': 'int', 'int64': 'int', 'uint8': 'int', 'uint16': 'int',
'uint32': 'int', 'uint64': 'int', 'float16': 'float', 'uint32': 'int', 'uint64': 'int', 'float16': 'float',
'float32': 'float', 'float64': 'float', 'bool': 'i'} 'float32': 'float', 'float64': 'float', 'bool': 'int'}
def _data_from_pandas(data): def _data_from_pandas(data):
if isinstance(data, DataFrame): if isinstance(data, DataFrame):
...@@ -427,26 +415,26 @@ class Dataset(object): ...@@ -427,26 +415,26 @@ class Dataset(object):
---------- ----------
data : string/numpy array/scipy.sparse data : string/numpy array/scipy.sparse
Data source of Dataset. Data source of Dataset.
When data is string type, it represents the path of txt file, When data type is string, it represents the path of txt file
label : list or numpy 1-D array, optional label : list or numpy 1-D array, optional
Label of the data Label of the data
max_bin : int, required max_bin : int, required
max number of discrete bin for features Max number of discrete bin for features
reference : Other Dataset, optional reference : Other Dataset, optional
If this dataset validation, need to use training data as reference If this dataset validation, need to use training data as reference
weight : list or numpy 1-D array , optional weight : list or numpy 1-D array , optional
Weight for each instance. Weight for each instance.
group : list or numpy 1-D array , optional group : list or numpy 1-D array , optional
group/query size for dataset Group/query size for dataset
silent : boolean, optional silent : boolean, optional
Whether print messages during construction Whether print messages during construction
feature_name : list of str feature_name : list of str
feature names Feature names
categorical_feature : list of str/int categorical_feature : list of str or int
categorical features , int type to use index, Categorical features, type int represents index, \
str type to use feature names (feature_name cannot be None) type str represents feature names (need to specify feature_name as well)
params: dict, optional params: dict, optional
other parameters Other parameters
""" """
self.__label = None self.__label = None
self.__weight = None self.__weight = None
...@@ -469,17 +457,17 @@ class Dataset(object): ...@@ -469,17 +457,17 @@ class Dataset(object):
params["verbose"] = 1 params["verbose"] = 1
"""get categorical features""" """get categorical features"""
if categorical_feature is not None: if categorical_feature is not None:
categorical_indices = [] categorical_indices = set()
feature_dict = {} feature_dict = {}
if feature_name is not None: if feature_name is not None:
feature_dict =dict((name, i) for i, name in enumerate(feature_name)) feature_dict = {name: i for i, name in enumerate(feature_name)}
for name in categorical_feature: for name in categorical_feature:
if is_str(name) and name in feature_dict: if is_str(name) and name in feature_dict:
categorical_indices.append(feature_dict[name]) categorical_indices.add(feature_dict[name])
elif isinstance(name, int): elif isinstance(name, int):
categorical_indices.append(name) categorical_indices.add(name)
else: else:
raise TypeError("unknown type({}) or unknown name({}) in categorical_feature" raise TypeError("unknown type({}) or unknown name({}) in categorical_feature" \
.format(type(name).__name__, name)) .format(type(name).__name__, name))
params['categorical_column'] = categorical_indices params['categorical_column'] = categorical_indices
...@@ -494,8 +482,8 @@ class Dataset(object): ...@@ -494,8 +482,8 @@ class Dataset(object):
"""start construct data""" """start construct data"""
if is_str(data): if is_str(data):
"""check data has header or not""" """check data has header or not"""
if "has_header" in params or "header" in params: if params.get("has_header", "").lower() == "true" \
if params["has_header"].lower() == "true" or params["header"].lower() == "true": or params.get("header", "").lower() == "true":
self.data_has_header = True self.data_has_header = True
self.handle = ctypes.c_void_p() self.handle = ctypes.c_void_p()
_safe_call(_LIB.LGBM_DatasetCreateFromFile( _safe_call(_LIB.LGBM_DatasetCreateFromFile(
...@@ -548,17 +536,17 @@ class Dataset(object): ...@@ -548,17 +536,17 @@ class Dataset(object):
---------- ----------
data : string/numpy array/scipy.sparse data : string/numpy array/scipy.sparse
Data source of Dataset. Data source of Dataset.
When data is string type, it represents the path of txt file, When data type is string, it represents the path of txt file
label : list or numpy 1-D array, optional label : list or numpy 1-D array, optional
Label of the training data. Label of the training data.
weight : list or numpy 1-D array , optional weight : list or numpy 1-D array , optional
Weight for each instance. Weight for each instance.
group : list or numpy 1-D array , optional group : list or numpy 1-D array , optional
group/query size for dataset Group/query size for dataset
silent : boolean, optional silent : boolean, optional
Whether print messages during construction Whether print messages during construction
params: dict, optional params: dict, optional
other parameters Other parameters
""" """
return Dataset(data, label=label, max_bin=self.max_bin, reference=self, return Dataset(data, label=label, max_bin=self.max_bin, reference=self,
weight=weight, group=group, predictor=self.predictor, weight=weight, group=group, predictor=self.predictor,
...@@ -658,7 +646,7 @@ class Dataset(object): ...@@ -658,7 +646,7 @@ class Dataset(object):
Returns Returns
------- -------
info : array info : array
a numpy array of information of the data A numpy array of information of the data
""" """
tmp_out_len = ctypes.c_int64() tmp_out_len = ctypes.c_int64()
out_type = ctypes.c_int32() out_type = ctypes.c_int32()
...@@ -700,6 +688,9 @@ class Dataset(object): ...@@ -700,6 +688,9 @@ class Dataset(object):
0, 0,
FIELD_TYPE_MAPPER[field_name])) FIELD_TYPE_MAPPER[field_name]))
return return
if IS_PANDAS_INSTALLED and isinstance(data, Series):
dtype = np.int32 if field_name == 'group' else np.float32
data = data.astype(dtype).values
if not is_numpy_1d_array(data): if not is_numpy_1d_array(data):
raise TypeError("Unknow type({})".format(type(data).__name__)) raise TypeError("Unknow type({})".format(type(data).__name__))
if data.dtype == np.float32: if data.dtype == np.float32:
...@@ -719,7 +710,6 @@ class Dataset(object): ...@@ -719,7 +710,6 @@ class Dataset(object):
len(data), len(data),
type_data)) type_data))
def save_binary(self, filename): def save_binary(self, filename):
"""Save Dataset to binary file """Save Dataset to binary file
...@@ -737,7 +727,7 @@ class Dataset(object): ...@@ -737,7 +727,7 @@ class Dataset(object):
Parameters Parameters
---------- ----------
label: array like label: numpy array or list or None
The label information to be set into Dataset The label information to be set into Dataset
""" """
label = list_to_1d_numpy(label, np.float32) label = list_to_1d_numpy(label, np.float32)
...@@ -749,7 +739,7 @@ class Dataset(object): ...@@ -749,7 +739,7 @@ class Dataset(object):
Parameters Parameters
---------- ----------
weight : array like weight : numpy array or list or None
Weight for each data point Weight for each data point
""" """
if weight is not None: if weight is not None:
...@@ -759,10 +749,11 @@ class Dataset(object): ...@@ -759,10 +749,11 @@ class Dataset(object):
def set_init_score(self, score): def set_init_score(self, score):
""" Set init score of booster to start from. """ Set init score of booster to start from.
Parameters Parameters
---------- ----------
score: array like score: numpy array or list or None
Init score for booster
""" """
if score is not None: if score is not None:
score = list_to_1d_numpy(score, np.float32) score = list_to_1d_numpy(score, np.float32)
...@@ -774,7 +765,7 @@ class Dataset(object): ...@@ -774,7 +765,7 @@ class Dataset(object):
Parameters Parameters
---------- ----------
group : array like group : numpy array or list or None
Group size of each group Group size of each group
""" """
if group is not None: if group is not None:
...@@ -782,7 +773,6 @@ class Dataset(object): ...@@ -782,7 +773,6 @@ class Dataset(object):
self.__group = group self.__group = group
self.set_field('group', group) self.set_field('group', group)
def get_label(self): def get_label(self):
"""Get the label of the Dataset. """Get the label of the Dataset.
...@@ -854,7 +844,7 @@ class Dataset(object): ...@@ -854,7 +844,7 @@ class Dataset(object):
return ret.value return ret.value
class Booster(object): class Booster(object):
""""A Booster of of LightGBM. """"A Booster of LightGBM.
""" """
def __init__(self, params=None, train_set=None, model_file=None, silent=False): def __init__(self, params=None, train_set=None, model_file=None, silent=False):
"""Initialize the Booster. """Initialize the Booster.
...@@ -864,7 +854,7 @@ class Booster(object): ...@@ -864,7 +854,7 @@ class Booster(object):
params : dict params : dict
Parameters for boosters. Parameters for boosters.
train_set : Dataset train_set : Dataset
training dataset Training dataset
model_file : string model_file : string
Path to the model file. Path to the model file.
silent : boolean, optional silent : boolean, optional
...@@ -884,7 +874,7 @@ class Booster(object): ...@@ -884,7 +874,7 @@ class Booster(object):
if train_set is not None: if train_set is not None:
"""Training task""" """Training task"""
if not isinstance(train_set, Dataset): if not isinstance(train_set, Dataset):
raise TypeError('training data should be Dataset instance, met{}'.format(type(train_set).__name__)) raise TypeError('training data should be Dataset instance, met {}'.format(type(train_set).__name__))
params_str = param_dict_to_str(params) params_str = param_dict_to_str(params)
"""construct booster object""" """construct booster object"""
_safe_call(_LIB.LGBM_BoosterCreate( _safe_call(_LIB.LGBM_BoosterCreate(
...@@ -938,9 +928,9 @@ class Booster(object): ...@@ -938,9 +928,9 @@ class Booster(object):
Parameters Parameters
---------- ----------
data : Dataset data : Dataset
validation data Validation data
name : String name : String
name of validation data Name of validation data
""" """
if data.predictor is not self.init_predictor: if data.predictor is not self.init_predictor:
raise Exception("Add validation data failed, you should use same predictor for these data") raise Exception("Add validation data failed, you should use same predictor for these data")
...@@ -959,7 +949,7 @@ class Booster(object): ...@@ -959,7 +949,7 @@ class Booster(object):
Parameters Parameters
---------- ----------
params : dict params : dict
params New parameters for boosters
silent : boolean, optional silent : boolean, optional
Whether print messages during construction Whether print messages during construction
""" """
...@@ -977,9 +967,11 @@ class Booster(object): ...@@ -977,9 +967,11 @@ class Booster(object):
Note: for multi-class task, the score is group by class_id first, then group by row_id Note: for multi-class task, the score is group by class_id first, then group by row_id
if you want to get i-th row score in j-th class, the access way is score[j*num_data+i] if you want to get i-th row score in j-th class, the access way is score[j*num_data+i]
and you should group grad and hess in this way as well and you should group grad and hess in this way as well
Parameters Parameters
---------- ----------
train_set : training data, None means use last training data train_set :
Training data, None means use last training data
fobj : function fobj : function
Customized objective function. Customized objective function.
...@@ -1014,6 +1006,7 @@ class Booster(object): ...@@ -1014,6 +1006,7 @@ class Booster(object):
Note: for multi-class task, the score is group by class_id first, then group by row_id Note: for multi-class task, the score is group by class_id first, then group by row_id
if you want to get i-th row score in j-th class, the access way is score[j*num_data+i] if you want to get i-th row score in j-th class, the access way is score[j*num_data+i]
and you should group grad and hess in this way as well and you should group grad and hess in this way as well
Parameters Parameters
---------- ----------
grad : 1d numpy or 1d list grad : 1d numpy or 1d list
...@@ -1036,7 +1029,7 @@ class Booster(object): ...@@ -1036,7 +1029,7 @@ class Booster(object):
else: else:
raise TypeError("hess should be numpy 1d array or 1d list") raise TypeError("hess should be numpy 1d array or 1d list")
if len(grad) != len(hess): if len(grad) != len(hess):
raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess))) raise ValueError('grad / hess lengths mismatch: {} / {}'.format(len(grad), len(hess)))
if grad.dtype != np.float32: if grad.dtype != np.float32:
grad = grad.astype(np.float32, copy=False) grad = grad.astype(np.float32, copy=False)
if hess.dtype != np.float32: if hess.dtype != np.float32:
...@@ -1071,7 +1064,8 @@ class Booster(object): ...@@ -1071,7 +1064,8 @@ class Booster(object):
Parameters Parameters
---------- ----------
data : Dataset object data : Dataset object
name : name of data name :
Name of data
feval : function feval : function
Custom evaluation function. Custom evaluation function.
Returns Returns
...@@ -1124,10 +1118,8 @@ class Booster(object): ...@@ -1124,10 +1118,8 @@ class Booster(object):
result: str result: str
Evaluation result list. Evaluation result list.
""" """
ret = [] return [item for i in range(1, self.__num_dataset) \
for i in range(1, self.__num_dataset): for item in self.__inner_eval(self.name_valid_sets[i-1], i, feval)]
ret.extend(self.__inner_eval(self.name_valid_sets[i-1], i, feval))
return ret
def save_model(self, filename, num_iteration=-1): def save_model(self, filename, num_iteration=-1):
"""Save model of booster to file """Save model of booster to file
...@@ -1135,9 +1127,9 @@ class Booster(object): ...@@ -1135,9 +1127,9 @@ class Booster(object):
Parameters Parameters
---------- ----------
filename : str filename : str
filename to save Filename to save
num_iteration: int num_iteration: int
number of iteration that want to save. < 0 means save all Number of iteration that want to save. < 0 means save all
""" """
_safe_call(_LIB.LGBM_BoosterSaveModel( _safe_call(_LIB.LGBM_BoosterSaveModel(
self.handle, self.handle,
...@@ -1145,8 +1137,7 @@ class Booster(object): ...@@ -1145,8 +1137,7 @@ class Booster(object):
c_str(filename))) c_str(filename)))
def dump_model(self): def dump_model(self):
""" """Dump model to json format
Dump model to json format
Returns Returns
------- -------
...@@ -1162,6 +1153,7 @@ class Booster(object): ...@@ -1162,6 +1153,7 @@ class Booster(object):
ctypes.byref(tmp_out_len), ctypes.byref(tmp_out_len),
ctypes.byref(ptr_string_buffer))) ctypes.byref(ptr_string_buffer)))
actual_len = tmp_out_len.value actual_len = tmp_out_len.value
'''if buffer length is not long enough, reallocate a buffer'''
if actual_len > buffer_len: if actual_len > buffer_len:
string_buffer = ctypes.create_string_buffer(actual_len) string_buffer = ctypes.create_string_buffer(actual_len)
ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
...@@ -1173,16 +1165,15 @@ class Booster(object): ...@@ -1173,16 +1165,15 @@ class Booster(object):
return json.loads(string_buffer.value.decode()) return json.loads(string_buffer.value.decode())
def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True): def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
""" """Predict logic
Predict logic
Parameters Parameters
---------- ----------
data : string/numpy array/scipy.sparse data : string/numpy array/scipy.sparse
Data source for prediction Data source for prediction
When data is string type, it represents the path of txt file, When data type is string, it represents the path of txt file
num_iteration : int num_iteration : int
used iteration for prediction Used iteration for prediction
raw_score : bool raw_score : bool
True for predict raw score True for predict raw score
pred_leaf : bool pred_leaf : bool
...@@ -1190,7 +1181,7 @@ class Booster(object): ...@@ -1190,7 +1181,7 @@ class Booster(object):
data_has_header : bool data_has_header : bool
Used for txt data Used for txt data
is_reshape : bool is_reshape : bool
True for reshape to [nrow, ...] Reshape to (nrow, ncol) if true
Returns Returns
------- -------
...@@ -1207,6 +1198,29 @@ class Booster(object): ...@@ -1207,6 +1198,29 @@ class Booster(object):
self.__is_manage_handle = False self.__is_manage_handle = False
return predictor return predictor
def feature_importance(self, importance_type='split'):
    """Feature importances

    Parameters
    ----------
    importance_type : str
        How the importance of a feature is accumulated over all trees.
        "split" adds 1 for every split node that uses the feature,
        "gain" adds the "split_gain" of every split node that uses the feature.

    Returns
    -------
    Array of feature importances, indexed by feature index
    (its length is "max_feature_idx" of the dumped model plus one)

    Raises
    ------
    KeyError
        If importance_type is neither "split" nor "gain"
    """
    if importance_type not in ["split", "gain"]:
        raise KeyError("importance_type must be split or gain")
    dump_model = self.dump_model()
    ret = [0] * (dump_model["max_feature_idx"] + 1)
    use_gain = importance_type == 'gain'
    for tree in dump_model["tree_info"]:
        # Walk the tree with an explicit stack rather than recursion:
        # a very deep (e.g. heavily unbalanced) tree would otherwise
        # exceed Python's recursion limit and raise RecursionError.
        pending = [tree["tree_structure"]]
        while pending:
            node = pending.pop()
            # a node without "split_feature" has no children to visit
            if "split_feature" not in node:
                continue
            ret[node["split_feature"]] += node["split_gain"] if use_gain else 1
            # push right first so the left subtree is processed first,
            # i.e. the same pre-order as a recursive left-then-right walk
            pending.append(node["right_child"])
            pending.append(node["left_child"])
    return np.array(ret)
def __inner_eval(self, data_name, data_idx, feval=None): def __inner_eval(self, data_name, data_idx, feval=None):
""" """
Evaulate training or validation data Evaulate training or validation data
...@@ -1291,16 +1305,11 @@ class Booster(object): ...@@ -1291,16 +1305,11 @@ class Booster(object):
ptr_string_buffers)) ptr_string_buffers))
if self.__num_inner_eval != tmp_out_len.value: if self.__num_inner_eval != tmp_out_len.value:
raise ValueError("size of eval names doesn't equal with num_evals") raise ValueError("size of eval names doesn't equal with num_evals")
self.__name_inner_eval = [] self.__name_inner_eval = \
for i in range(self.__num_inner_eval): [string_buffers[i].value.decode() for i in range(self.__num_inner_eval)]
self.__name_inner_eval.append(string_buffers[i].value.decode()) self.__higher_better_inner_eval = \
self.__higher_better_inner_eval = [] [name.startswith(('auc', 'ndcg')) for name in self.__name_inner_eval]
higher_better_metric = ['auc', 'ndcg']
for name in self.__name_inner_eval:
if any(name.startswith(x) for x in higher_better_metric):
self.__higher_better_inner_eval.append(True)
else:
self.__higher_better_inner_eval.append(False)
def attr(self, key): def attr(self, key):
"""Get attribute string from the Booster. """Get attribute string from the Booster.
...@@ -1314,10 +1323,7 @@ class Booster(object): ...@@ -1314,10 +1323,7 @@ class Booster(object):
value : str value : str
The attribute value of the key, returns None if attribute do not exist. The attribute value of the key, returns None if attribute do not exist.
""" """
if key in self.__attr: return self.__attr.get(key, None)
return self.__attr[key]
else:
return None
def set_attr(self, **kwargs): def set_attr(self, **kwargs):
"""Set the attribute of the Booster. """Set the attribute of the Booster.
...@@ -1330,7 +1336,7 @@ class Booster(object): ...@@ -1330,7 +1336,7 @@ class Booster(object):
for key, value in kwargs.items(): for key, value in kwargs.items():
if value is not None: if value is not None:
if not is_str(value): if not is_str(value):
raise ValueError("Set Attr only accepts string values") raise ValueError("set_attr only accepts string values")
self.__attr[key] = value self.__attr[key] = value
else: else:
self.__attr.pop(key, None) self.__attr.pop(key, None)
...@@ -55,7 +55,7 @@ def print_evaluation(period=1, show_stdv=True): ...@@ -55,7 +55,7 @@ def print_evaluation(period=1, show_stdv=True):
""" """
def callback(env): def callback(env):
"""internal function""" """internal function"""
if len(env.evaluation_result_list) == 0 or period is False: if not env.evaluation_result_list or period <= 0:
return return
if env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration: if env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration:
result = '\t'.join([_format_eval_result(x, show_stdv) \ result = '\t'.join([_format_eval_result(x, show_stdv) \
...@@ -83,15 +83,12 @@ def record_evaluation(eval_result): ...@@ -83,15 +83,12 @@ def record_evaluation(eval_result):
def init(env): def init(env):
"""internal function""" """internal function"""
for data_name, eval_name, _, _ in env.evaluation_result_list: for data_name, _, _, _ in env.evaluation_result_list:
if data_name not in eval_result: eval_result.setdefault(data_name, collections.defaultdict(list))
eval_result[data_name] = {}
if eval_name not in eval_result[data_name]:
eval_result[data_name][eval_name] = []
def callback(env): def callback(env):
"""internal function""" """internal function"""
if len(eval_result) == 0: if not eval_result:
init(env) init(env)
for data_name, eval_name, result, _ in env.evaluation_result_list: for data_name, eval_name, result, _ in env.evaluation_result_list:
eval_result[data_name][eval_name].append(result) eval_result[data_name][eval_name].append(result)
...@@ -99,17 +96,17 @@ def record_evaluation(eval_result): ...@@ -99,17 +96,17 @@ def record_evaluation(eval_result):
def reset_learning_rate(learning_rates): def reset_learning_rate(learning_rates):
"""Reset learning rate after iteration 1 """Reset learning rate after first iteration
NOTE: the initial learning rate will still take in-effect on first iteration. NOTE: the initial learning rate will still take in-effect on first iteration.
Parameters Parameters
---------- ----------
learning_rates: list or function learning_rates: list or function
List of learning rate for each boosting round List of learning rate for each boosting round \
or a customized function that calculates learning_rate in terms of or a customized function that calculates learning_rate in terms of \
current number of round and the total number of boosting round (e.g. yields current number of round and the total number of boosting round \
learning rate decay) (e.g. yields learning rate decay)
- list l: learning_rate = l[current_round] - list l: learning_rate = l[current_round]
- function f: learning_rate = f(current_round, total_boost_round) - function f: learning_rate = f(current_round, total_boost_round)
...@@ -121,13 +118,13 @@ def reset_learning_rate(learning_rates): ...@@ -121,13 +118,13 @@ def reset_learning_rate(learning_rates):
def callback(env): def callback(env):
"""internal function""" """internal function"""
booster = env.model booster = env.model
i = env.iteration iteration = env.iteration
if isinstance(learning_rates, list): if isinstance(learning_rates, list):
if len(learning_rates) != env.end_iteration: if len(learning_rates) != env.end_iteration:
raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.") raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.")
booster.reset_parameter({'learning_rate':learning_rates[i]}) booster.reset_parameter({'learning_rate':learning_rates[iteration]})
else: else:
booster.reset_parameter({'learning_rate':learning_rates(i, env.end_iteration)}) booster.reset_parameter({'learning_rate':learning_rates(iteration, env.end_iteration)})
callback.before_iteration = True callback.before_iteration = True
return callback return callback
...@@ -157,7 +154,7 @@ def early_stop(stopping_rounds, verbose=True): ...@@ -157,7 +154,7 @@ def early_stop(stopping_rounds, verbose=True):
best_msg = {} best_msg = {}
def init(env): def init(env):
"""internal function""" """internal function"""
if len(env.evaluation_result_list) == 0: if not env.evaluation_result_list:
raise ValueError('For early stopping you need at least one set in evals.') raise ValueError('For early stopping you need at least one set in evals.')
if verbose: if verbose:
...@@ -169,13 +166,11 @@ def early_stop(stopping_rounds, verbose=True): ...@@ -169,13 +166,11 @@ def early_stop(stopping_rounds, verbose=True):
best_iter[i] = 0 best_iter[i] = 0
if verbose: if verbose:
best_msg[i] = "" best_msg[i] = ""
factor_to_bigger_better[i] = -1.0 factor_to_bigger_better[i] = 1.0 if env.evaluation_result_list[i][3] else -1.0
if env.evaluation_result_list[i][3]:
factor_to_bigger_better[i] = 1.0
def callback(env): def callback(env):
"""internal function""" """internal function"""
if len(best_score) == 0: if not best_score:
init(env) init(env)
for i in range(len(env.evaluation_result_list)): for i in range(len(env.evaluation_result_list)):
score = env.evaluation_result_list[i][2] * factor_to_bigger_better[i] score = env.evaluation_result_list[i][2] * factor_to_bigger_better[i]
...@@ -190,6 +185,7 @@ def early_stop(stopping_rounds, verbose=True): ...@@ -190,6 +185,7 @@ def early_stop(stopping_rounds, verbose=True):
if env.model is not None: if env.model is not None:
env.model.set_attr(best_iteration=str(best_iter[i])) env.model.set_attr(best_iteration=str(best_iter[i]))
if verbose: if verbose:
print('early stopping, best iteration is:\n{}'.format(best_msg[i])) print('early stopping, best iteration is:')
print(best_msg[i])
raise EarlyStopException(best_iter[i]) raise EarlyStopException(best_iter[i])
return callback return callback
...@@ -21,9 +21,9 @@ def _construct_dataset(X_y, reference=None, ...@@ -21,9 +21,9 @@ def _construct_dataset(X_y, reference=None,
if other_fields is not None: if other_fields is not None:
if not isinstance(other_fields, dict): if not isinstance(other_fields, dict):
raise TypeError("other filed data should be dict type") raise TypeError("other filed data should be dict type")
weight = None if 'weight' not in other_fields else other_fields['weight'] weight = other_fields.get('weight', None)
group = None if 'group' not in other_fields else other_fields['group'] group = other_fields.get('group', None)
init_score = None if 'init_score' not in other_fields else other_fields['init_score'] init_score = other_fields.get('init_score', None)
if is_str(X_y): if is_str(X_y):
data = X_y data = X_y
label = None label = None
...@@ -58,15 +58,15 @@ def train(params, train_data, num_boost_round=100, ...@@ -58,15 +58,15 @@ def train(params, train_data, num_boost_round=100,
Parameters Parameters
---------- ----------
params : dict params : dict
params. Parameters for training.
train_data : Dataset, tuple (X, y) or filename of data train_data : Dataset, tuple (X, y) or filename of data
Data to be trained. Data to be trained.
num_boost_round: int num_boost_round: int
Number of boosting iterations. Number of boosting iterations.
valid_datas: list of Datasets, tuples (valid_X, valid_y) or filename of data valid_datas: list of Datasets, tuples (valid_X, valid_y) or filenames of data
List of data to be evaluated during training List of data to be evaluated during training
valid_names: list of string valid_names: list of string
names of valid_datas Names of valid_datas
fobj : function fobj : function
Customized objective function. Customized objective function.
feval : function feval : function
...@@ -75,17 +75,17 @@ def train(params, train_data, num_boost_round=100, ...@@ -75,17 +75,17 @@ def train(params, train_data, num_boost_round=100,
init_model : file name of lightgbm model or 'Booster' instance init_model : file name of lightgbm model or 'Booster' instance
model used for continued train model used for continued train
train_fields : dict train_fields : dict
other data file in training data. e.g. train_fields['weight'] is weight data Other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score Support fields: weight, group, init_score
valid_fields : dict valid_fields : dict
other data file in training data. \ Other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score Support fields: weight, group, init_score
feature_name : list of str feature_name : list of str
feature names Feature names
categorical_feature : list of str/int categorical_feature : list of str or int
categorical features , int type to use index, Categorical features, type int represents index, \
str type to use feature names (feature_name cannot be None) type str represents feature names (need to specify feature_name as well)
early_stopping_rounds: int early_stopping_rounds: int
Activates early stopping. Activates early stopping.
Requires at least one validation data and one metric Requires at least one validation data and one metric
...@@ -101,18 +101,18 @@ def train(params, train_data, num_boost_round=100, ...@@ -101,18 +101,18 @@ def train(params, train_data, num_boost_round=100,
passed with None means no using this function passed with None means no using this function
verbose_eval : bool or int verbose_eval : bool or int
Requires at least one item in evals. Requires at least one item in evals.
If `verbose_eval` is True then the evaluation metric on the validation set is If `verbose_eval` is True then the evaluation metric on the validation set is \
printed at each boosting stage. printed at each boosting stage.
If `verbose_eval` is an integer then the evaluation metric on the validation set If `verbose_eval` is an integer then the evaluation metric on the validation set \
is printed at every given `verbose_eval` boosting stage. The last boosting stage is printed at every given `verbose_eval` boosting stage. The last boosting stage \
/ the boosting stage found by using `early_stopping_rounds` is also printed. / the boosting stage found by using `early_stopping_rounds` is also printed.
Example: with verbose_eval=4 and at least one item in evals, an evaluation metric Example: with verbose_eval=4 and at least one item in evals, an evaluation metric \
is printed every 4 boosting stages, instead of every boosting stage. is printed every 4 boosting stages, instead of every boosting stage.
learning_rates: list or function learning_rates: list or function
List of learning rate for each boosting round List of learning rate for each boosting round \
or a customized function that calculates learning_rate in terms of or a customized function that calculates learning_rate in terms of \
current number of round and the total number of boosting round (e.g. yields current number of round and the total number of boosting round \
learning rate decay) (e.g. yields learning rate decay)
- list l: learning_rate = l[current_round] - list l: learning_rate = l[current_round]
- function f: learning_rate = f(current_round, total_boost_round) - function f: learning_rate = f(current_round, total_boost_round)
callbacks : list of callback functions callbacks : list of callback functions
...@@ -131,9 +131,13 @@ def train(params, train_data, num_boost_round=100, ...@@ -131,9 +131,13 @@ def train(params, train_data, num_boost_round=100,
predictor = init_model predictor = init_model
else: else:
predictor = None predictor = None
init_iteration = predictor.num_total_iteration if predictor else 0
"""create dataset""" """create dataset"""
if isinstance(train_data, Dataset): if isinstance(train_data, Dataset):
train_set = train_data train_set = train_data
if train_fields is not None:
for field, data in train_fields.items():
train_set.set_field(field, data)
else: else:
train_set = _construct_dataset(train_data, None, params, train_set = _construct_dataset(train_data, None, params,
other_fields=train_fields, other_fields=train_fields,
...@@ -150,7 +154,7 @@ def train(params, train_data, num_boost_round=100, ...@@ -150,7 +154,7 @@ def train(params, train_data, num_boost_round=100,
if isinstance(valid_names, str): if isinstance(valid_names, str):
valid_names = [valid_names] valid_names = [valid_names]
for i, valid_data in enumerate(valid_datas): for i, valid_data in enumerate(valid_datas):
other_fields = None if valid_fields is None else valid_fields[i] other_fields = None if valid_fields is None else valid_fields.get(i, None)
"""reduce cost for prediction training data""" """reduce cost for prediction training data"""
if valid_data[0] is train_data[0] and valid_data[1] is train_data[1]: if valid_data[0] is train_data[0] and valid_data[1] is train_data[1]:
is_valid_contain_train = True is_valid_contain_train = True
...@@ -159,6 +163,9 @@ def train(params, train_data, num_boost_round=100, ...@@ -159,6 +163,9 @@ def train(params, train_data, num_boost_round=100,
continue continue
if isinstance(valid_data, Dataset): if isinstance(valid_data, Dataset):
valid_set = valid_data valid_set = valid_data
if other_fields is not None:
for field, data in other_fields.items():
valid_set.set_field(field, data)
else: else:
valid_set = _construct_dataset( valid_set = _construct_dataset(
valid_data, valid_data,
...@@ -169,7 +176,7 @@ def train(params, train_data, num_boost_round=100, ...@@ -169,7 +176,7 @@ def train(params, train_data, num_boost_round=100,
categorical_feature=categorical_feature, categorical_feature=categorical_feature,
predictor=predictor) predictor=predictor)
valid_sets.append(valid_set) valid_sets.append(valid_set)
if valid_names is not None: if valid_names is not None and len(valid_names) > i:
name_valid_sets.append(valid_names[i]) name_valid_sets.append(valid_names[i])
else: else:
name_valid_sets.append('valid_'+str(i)) name_valid_sets.append('valid_'+str(i))
...@@ -179,13 +186,13 @@ def train(params, train_data, num_boost_round=100, ...@@ -179,13 +186,13 @@ def train(params, train_data, num_boost_round=100,
# Most of legacy advanced options becomes callbacks # Most of legacy advanced options becomes callbacks
if isinstance(verbose_eval, bool) and verbose_eval: if isinstance(verbose_eval, bool) and verbose_eval:
callbacks.append(callback.print_evaluation()) callbacks.append(callback.print_evaluation())
else: elif isinstance(verbose_eval, int):
if isinstance(verbose_eval, int):
callbacks.append(callback.print_evaluation(verbose_eval)) callbacks.append(callback.print_evaluation(verbose_eval))
if early_stopping_rounds is not None: if early_stopping_rounds is not None:
callbacks.append(callback.early_stop(early_stopping_rounds, callbacks.append(callback.early_stop(early_stopping_rounds,
verbose=bool(verbose_eval))) verbose=bool(verbose_eval)))
if learning_rates is not None: if learning_rates is not None:
callbacks.append(callback.reset_learning_rate(learning_rates)) callbacks.append(callback.reset_learning_rate(learning_rates))
...@@ -197,32 +204,26 @@ def train(params, train_data, num_boost_round=100, ...@@ -197,32 +204,26 @@ def train(params, train_data, num_boost_round=100,
callbacks_after_iter = [ callbacks_after_iter = [
cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)] cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
"""construct booster""" """construct booster"""
if 'metric' in params:
if is_str(params['metric']):
params['metric'] = params['metric'].split(',')
else:
params['metric'] = list(params['metric'])
booster = Booster(params=params, train_set=train_set) booster = Booster(params=params, train_set=train_set)
if is_valid_contain_train: if is_valid_contain_train:
booster.set_train_data_name(train_data_name) booster.set_train_data_name(train_data_name)
for valid_set, name_valid_set in zip(valid_sets, name_valid_sets): for valid_set, name_valid_set in zip(valid_sets, name_valid_sets):
booster.add_valid(valid_set, name_valid_set) booster.add_valid(valid_set, name_valid_set)
"""start training""" """start training"""
for i in range(num_boost_round): for i in range(init_iteration, init_iteration + num_boost_round):
for cb in callbacks_before_iter: for cb in callbacks_before_iter:
cb(callback.CallbackEnv(model=booster, cb(callback.CallbackEnv(model=booster,
cvfolds=None, cvfolds=None,
iteration=i, iteration=i,
begin_iteration=0, begin_iteration=init_iteration,
end_iteration=num_boost_round, end_iteration=init_iteration + num_boost_round,
evaluation_result_list=None)) evaluation_result_list=None))
booster.update(fobj=fobj) booster.update(fobj=fobj)
evaluation_result_list = [] evaluation_result_list = []
# check evaluation result. # check evaluation result.
if len(valid_sets) != 0: if valid_sets:
if is_valid_contain_train: if is_valid_contain_train:
evaluation_result_list.extend(booster.eval_train(feval)) evaluation_result_list.extend(booster.eval_train(feval))
evaluation_result_list.extend(booster.eval_valid(feval)) evaluation_result_list.extend(booster.eval_valid(feval))
...@@ -231,8 +232,8 @@ def train(params, train_data, num_boost_round=100, ...@@ -231,8 +232,8 @@ def train(params, train_data, num_boost_round=100,
cb(callback.CallbackEnv(model=booster, cb(callback.CallbackEnv(model=booster,
cvfolds=None, cvfolds=None,
iteration=i, iteration=i,
begin_iteration=0, begin_iteration=init_iteration,
end_iteration=num_boost_round, end_iteration=init_iteration + num_boost_round,
evaluation_result_list=evaluation_result_list)) evaluation_result_list=evaluation_result_list))
except callback.EarlyStopException: except callback.EarlyStopException:
break break
...@@ -347,24 +348,24 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False, ...@@ -347,24 +348,24 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
feval : function feval : function
Custom evaluation function. Custom evaluation function.
train_fields : dict train_fields : dict
other data file in training data. e.g. train_fields['weight'] is weight data Other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score Support fields: weight, group, init_score
feature_name : list of str feature_name : list of str
feature names Feature names
categorical_feature : list of str/int categorical_feature : list of str or int
categorical features , int type to use index, Categorical features, type int represents index, \
str type to use feature names (feature_name cannot be None) type str represents feature names (need to specify feature_name as well)
early_stopping_rounds: int early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least Activates early stopping. CV error needs to decrease at least \
every <early_stopping_rounds> round(s) to continue. every <early_stopping_rounds> round(s) to continue.
Last entry in evaluation history is the one from best iteration. Last entry in evaluation history is the one from best iteration.
fpreproc : function fpreproc : function
Preprocessing function that takes (dtrain, dtest, param) and returns Preprocessing function that takes (dtrain, dtest, param) and returns \
transformed versions of those. transformed versions of those.
verbose_eval : bool, int, or None, default None verbose_eval : bool, int, or None, default None
Whether to display the progress. If None, progress will be displayed Whether to display the progress. If None, progress will be displayed \
when np.ndarray is returned. If True, progress will be displayed at when np.ndarray is returned. If True, progress will be displayed at \
boosting stage. If an integer is given, progress will be displayed boosting stage. If an integer is given, progress will be displayed \
at every given `verbose_eval` boosting stage. at every given `verbose_eval` boosting stage.
show_stdv : bool, default True show_stdv : bool, default True
Whether to display the standard deviation in progress. Whether to display the standard deviation in progress.
...@@ -378,22 +379,11 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False, ...@@ -378,22 +379,11 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
------- -------
evaluation history : list(string) evaluation history : list(string)
""" """
if metrics:
if isinstance(metrics, str): params.setdefault('metric', [])
metrics = [metrics] if is_str(metrics):
params['metric'].append(metrics)
if isinstance(params, list):
params = dict(params)
if 'metric' not in params:
params['metric'] = []
else:
if is_str(params['metric']):
params['metric'] = params['metric'].split(',')
else: else:
params['metric'] = list(params['metric'])
if metrics is not None and len(metrics) > 0:
params['metric'].extend(metrics) params['metric'].extend(metrics)
train_set = _construct_dataset(train_data, None, params, train_set = _construct_dataset(train_data, None, params,
...@@ -411,8 +401,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False, ...@@ -411,8 +401,7 @@ def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
verbose=False)) verbose=False))
if isinstance(verbose_eval, bool) and verbose_eval: if isinstance(verbose_eval, bool) and verbose_eval:
callbacks.append(callback.print_evaluation(show_stdv=show_stdv)) callbacks.append(callback.print_evaluation(show_stdv=show_stdv))
else: elif isinstance(verbose_eval, int):
if isinstance(verbose_eval, int):
callbacks.append(callback.print_evaluation(verbose_eval, show_stdv=show_stdv)) callbacks.append(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))
callbacks_before_iter = [ callbacks_before_iter = [
......
# coding: utf-8 # coding: utf-8
# pylint: disable = invalid-name, W0105 # pylint: disable = invalid-name, W0105, C0111
"""Scikit-Learn Wrapper interface for LightGBM.""" """Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import from __future__ import absolute_import
import numpy as np import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str from .basic import LightGBMError, is_str
from .engine import train from .engine import train
# sklearn # sklearn
try: try:
...@@ -66,7 +66,7 @@ def _point_wise_objective(func): ...@@ -66,7 +66,7 @@ def _point_wise_objective(func):
num_data = len(weight) num_data = len(weight)
num_class = len(grad) // num_data num_class = len(grad) // num_data
if num_class * num_data != len(grad): if num_class * num_data != len(grad):
raise ValueError("length of grad and hess should equal with num_class * num_data") raise ValueError("length of grad and hess should equal to num_class * num_data")
for k in range(num_class): for k in range(num_class):
for i in range(num_data): for i in range(num_data):
idx = k * num_data + i idx = k * num_data + i
...@@ -169,6 +169,7 @@ class LGBMModel(LGBMModelBase): ...@@ -169,6 +169,7 @@ class LGBMModel(LGBMModelBase):
self.is_unbalance = is_unbalance self.is_unbalance = is_unbalance
self.seed = seed self.seed = seed
self._Booster = None self._Booster = None
self.best_iteration = -1
if callable(self.objective): if callable(self.objective):
self.fobj = _point_wise_objective(self.objective) self.fobj = _point_wise_objective(self.objective)
else: else:
...@@ -190,7 +191,6 @@ class LGBMModel(LGBMModelBase): ...@@ -190,7 +191,6 @@ class LGBMModel(LGBMModelBase):
def get_params(self, deep=False): def get_params(self, deep=False):
"""Get parameters""" """Get parameters"""
params = super(LGBMModel, self).get_params(deep=deep) params = super(LGBMModel, self).get_params(deep=deep)
params['verbose'] = 0 if self.silent else 1
if self.nthread <= 0: if self.nthread <= 0:
params.pop('nthread', None) params.pop('nthread', None)
return params return params
...@@ -213,30 +213,31 @@ class LGBMModel(LGBMModelBase): ...@@ -213,30 +213,31 @@ class LGBMModel(LGBMModelBase):
A list of (X, y) tuple pairs to use as a validation set for early-stopping A list of (X, y) tuple pairs to use as a validation set for early-stopping
eval_metric : str, list of str, callable, optional eval_metric : str, list of str, callable, optional
If a str, should be a built-in evaluation metric to use. If a str, should be a built-in evaluation metric to use.
If callable, a custom evaluation metric. The call If callable, a custom evaluation metric. The call \
signature is func(y_predicted, dataset) where dataset will be a signature is func(y_predicted, dataset) where dataset will be a \
Dataset fobject such that you may need to call the get_label Dataset fobject such that you may need to call the get_label \
method. And it must return (eval_name->str, eval_result->float, is_bigger_better->Bool) method. And it must return (eval_name->str, eval_result->float, is_bigger_better->Bool)
early_stopping_rounds : int early_stopping_rounds : int
verbose : bool verbose : bool
If `verbose` and an evaluation set is used, writes the evaluation If `verbose` and an evaluation set is used, writes the evaluation
train_fields : dict train_fields : dict
other data file in training data. e.g. train_fields['weight'] is weight data Other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score Support fields: weight, group, init_score
valid_fields : dict valid_fields : dict
other data file in training data. \ Other data file in training data. \
e.g. valid_fields[0]['weight'] is weight data for first valid data e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score Support fields: weight, group, init_score
feature_name : list of str feature_name : list of str
feature names Feature names
categorical_feature : list of str/int categorical_feature : list of str or int
categorical features , int type to use index, Categorical features, type int represents index, \
str type to use feature names (feature_name cannot be None) type str represents feature names (need to specify feature_name as well)
other_params: dict other_params: dict
other parameters Other parameters
""" """
evals_result = {} evals_result = {}
params = self.get_params() params = self.get_params()
params['verbose'] = 0 if self.silent else 1
if other_params is not None: if other_params is not None:
params.update(other_params) params.update(other_params)
...@@ -317,6 +318,14 @@ class LGBMModel(LGBMModelBase): ...@@ -317,6 +318,14 @@ class LGBMModel(LGBMModelBase):
return evals_result return evals_result
def feature_importance(self):
    """Return the normalized feature importances of the model.

    The raw importances reported by ``self._Booster.feature_importance()``
    are cast to float32 and divided by their sum.

    Requires ``self._Booster`` to hold a Booster; the returned object is
    assumed to be a numpy array (it must support ``astype`` and ``sum``).

    Returns
    -------
    Array of normalized feature importances. If the raw importances sum
    to zero, the float32 array is returned as-is instead of being divided,
    so the result never contains NaN from a 0 / 0 division.
    """
    importance_array = self._Booster.feature_importance().astype(np.float32)
    total = importance_array.sum()
    # Guard against 0 / 0: an all-zero importance vector would otherwise
    # be turned into an array of NaNs (with a numpy RuntimeWarning).
    if total == 0:
        return importance_array
    return importance_array / total
class LGBMRegressor(LGBMModel, LGBMRegressorBase): class LGBMRegressor(LGBMModel, LGBMRegressorBase):
__doc__ = """Implementation of the scikit-learn API for LightGBM regression. __doc__ = """Implementation of the scikit-learn API for LightGBM regression.
...@@ -394,7 +403,7 @@ def _group_wise_objective(func): ...@@ -394,7 +403,7 @@ def _group_wise_objective(func):
y_true: array_like of shape [n_samples] y_true: array_like of shape [n_samples]
The target values The target values
group : array_like of shape group : array_like of shape
group size data of data Group size data of data
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class) y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
The predicted values The predicted values
Returns Returns
......
...@@ -5,7 +5,7 @@ from __future__ import absolute_import ...@@ -5,7 +5,7 @@ from __future__ import absolute_import
import sys import sys
import os import os
from setuptools import setup, find_packages from setuptools import setup, find_packages
# import subprocess
sys.path.insert(0, '.') sys.path.insert(0, '.')
CURRENT_DIR = os.path.dirname(__file__) CURRENT_DIR = os.path.dirname(__file__)
......
...@@ -227,8 +227,6 @@ Tree::Tree(const std::string& str) { ...@@ -227,8 +227,6 @@ Tree::Tree(const std::string& str) {
leaf_parent_ = Common::StringToArray<int>(key_vals["leaf_parent"], ' ', num_leaves_); leaf_parent_ = Common::StringToArray<int>(key_vals["leaf_parent"], ' ', num_leaves_);
leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], ' ', num_leaves_); leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], ' ', num_leaves_);
} }
} // namespace LightGBM } // namespace LightGBM
...@@ -101,6 +101,7 @@ def test_early_stopping(): ...@@ -101,6 +101,7 @@ def test_early_stopping():
from sklearn.datasets import load_boston from sklearn.datasets import load_boston
from sklearn.cross_validation import KFold from sklearn.cross_validation import KFold
from sklearn import datasets, metrics, model_selection from sklearn import datasets, metrics, model_selection
from sklearn.base import clone
boston = load_boston() boston = load_boston()
y = boston['target'] y = boston['target']
...@@ -111,6 +112,7 @@ def test_early_stopping(): ...@@ -111,6 +112,7 @@ def test_early_stopping():
eval_metric='l2', eval_metric='l2',
early_stopping_rounds=10, early_stopping_rounds=10,
verbose=10) verbose=10)
lgb_model_clone = clone(lgb_model)
print(lgb_model.best_iteration) print(lgb_model.best_iteration)
test_binary_classification() test_binary_classification()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment