"...git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "f38f118ce2fc6451b75363689fc06e011d69cf33"
Commit 54963bb7 authored by Guolin Ke

Merge branch 'master' of https://github.com/Microsoft/LightGBM

parents 762b5707 ced64bad
##Basic Data Structure API
----
###Dataset
####__init__(data, label=None, max_bin=255, reference=None, weight=None, group=None, silent=False, feature_name=None, categorical_feature=None, params=None, free_raw_data=True)
Parameters
----------
data : string/numpy array/scipy.sparse
Data source of Dataset.
When the data type is string, it represents the path of a txt file
label : list or numpy 1-D array, optional
Label of the data
max_bin : int, optional
Max number of discrete bins for features
reference : Other Dataset, optional
If this is a validation Dataset, the training data should be used as reference
weight : list or numpy 1-D array , optional
Weight for each instance.
group : list or numpy 1-D array , optional
Group/query size for dataset
silent : boolean, optional
Whether to print messages during construction
feature_name : list of str
Feature names
categorical_feature : list of str or int
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
params: dict, optional
Other parameters
free_raw_data : bool
Whether to free the raw data after constructing the inner dataset
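A minimal construction sketch (the arrays, feature names, and the 'train.txt' path below are illustrative, not part of the API):

```python
import numpy as np
import lightgbm as lgb

# In-memory construction from a feature matrix and a label vector.
X_train = np.random.rand(500, 10)
y_train = np.random.randint(2, size=500)
train_data = lgb.Dataset(X_train, label=y_train, max_bin=255,
                         feature_name=['f%d' % i for i in range(10)])

# Construction from a file path: LightGBM parses the text file itself.
train_from_file = lgb.Dataset('train.txt')  # placeholder path
```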
####construct()
Lazy initialization; the inner dataset is constructed on first use
####create_valid(data, label=None, weight=None, group=None, silent=False, params=None)
Create validation data aligned with the current Dataset
Parameters
----------
data : string/numpy array/scipy.sparse
Data source of _InnerDataset.
When the data type is string, it represents the path of a txt file
label : list or numpy 1-D array, optional
Label of the data
weight : list or numpy 1-D array , optional
Weight for each instance.
group : list or numpy 1-D array , optional
Group/query size for dataset
silent : boolean, optional
Whether to print messages during construction
params: dict, optional
Other parameters
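A sketch of creating validation data, reusing `train_data` from the __init__ example so the validation rows are binned with the training data's bin boundaries:

```python
import numpy as np

X_val = np.random.rand(100, 10)
y_val = np.random.randint(2, size=100)
# Equivalent to lgb.Dataset(..., reference=train_data)
valid_data = train_data.create_valid(X_val, label=y_val)
```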
####get_group()
Get the group of the Dataset.
Returns
-------
group : array
####get_init_score()
Get the initial score of the Dataset.
Returns
-------
init_score : array
####get_label()
Get the label of the Dataset.
Returns
-------
label : array
####get_weight()
Get the weight of the Dataset.
Returns
-------
weight : array
####num_data()
Get the number of rows in the Dataset.
Returns
-------
number of rows : int
####num_feature()
Get the number of columns (features) in the Dataset.
Returns
-------
number of columns : int
####save_binary(filename)
Save Dataset to binary file
Parameters
----------
filename : string
Name of the output file.
####set_categorical_feature(categorical_feature)
Set categorical features
Parameters
----------
categorical_feature : list of int or str
Name/index of categorical features
####set_feature_name(feature_name)
Set feature name
Parameters
----------
feature_name : list of str
Feature names
####set_group(group)
Set group size of Dataset (used for ranking).
Parameters
----------
group : numpy array or list or None
Group size of each group
####set_init_score(init_score)
Set init score of booster to start from.
Parameters
----------
init_score: numpy array or list or None
Init score for booster
####set_label(label)
Set label of Dataset
Parameters
----------
label: numpy array or list or None
The label information to be set into Dataset
####set_reference(reference)
Set reference dataset
Parameters
----------
reference : Dataset
Will use reference as template to construct the current dataset
####set_weight(weight)
Set weight of each instance.
Parameters
----------
weight : numpy array or list or None
Weight for each data point
####subset(used_indices, params=None)
Get subset of current dataset
Parameters
----------
used_indices : list of int
Used indices of this subset
params : dict
Other parameters
###Booster
####__init__(params=None, train_set=None, model_file=None, silent=False)
Initialize the Booster.
Parameters
----------
params : dict
Parameters for boosters.
train_set : Dataset
Training dataset
model_file : string
Path to the model file.
silent : boolean, optional
Whether to print messages during construction
####add_valid(data, name)
Add a validation dataset
Parameters
----------
data : Dataset
Validation data
name : string
Name of the validation data
####attr(key)
Get attribute string from the Booster.
Parameters
----------
key : str
The key to get attribute from.
Returns
-------
value : str
The attribute value of the key; returns None if the attribute does not exist.
####current_iteration()
Get the current boosting iteration of this Booster.
####dump_model()
Dump model to JSON format
Returns
-------
JSON format of model
####eval(data, name, feval=None)
Evaluate for the given data
Parameters
----------
data : _InnerDataset object
Data to evaluate
name : string
Name of the data
feval : function
Custom evaluation function.
Returns
-------
result: list
Evaluation result list.
####eval_train(feval=None)
Evaluate for training data
Parameters
----------
feval : function
Custom evaluation function.
Returns
-------
result : list
Evaluation result list.
####eval_valid(feval=None)
Evaluate for validation data
Parameters
----------
feval : function
Custom evaluation function.
Returns
-------
result : list
Evaluation result list.
####feature_importance(importance_type='split')
Feature importances
Returns
-------
Array of feature importances
####predict(data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True)
Make predictions
Parameters
----------
data : string/numpy array/scipy.sparse
Data source for prediction
When the data type is string, it represents the path of a txt file
num_iteration : int
Number of iterations used in the prediction; < 0 means use all
raw_score : bool
Whether to predict raw scores
pred_leaf : bool
Whether to predict leaf indices
data_has_header : bool
Whether the txt data file has a header (used for txt data only)
is_reshape : bool
Whether to reshape the result to (nrow, ncol)
Returns
-------
Prediction result
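A prediction sketch, assuming `booster` is an already trained Booster (see the train() example further below):

```python
import numpy as np

X_new = np.random.rand(5, 10)
scores = booster.predict(X_new, num_iteration=-1)  # use all iterations
raw = booster.predict(X_new, raw_score=True)       # raw scores
leaves = booster.predict(X_new, pred_leaf=True)    # per-tree leaf indices
```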
####reset_parameter(params)
Reset parameters for booster
Parameters
----------
params : dict
New parameters for boosters
####rollback_one_iter()
Rollback one iteration
####save_model(filename, num_iteration=-1)
Save model of booster to file
Parameters
----------
filename : str
Filename to save
num_iteration : int
Number of iterations to save; < 0 means save all
####set_attr(**kwargs)
Set the attribute of the Booster.
Parameters
----------
**kwargs
The attributes to set. Setting a value to None deletes an attribute.
####set_train_data_name(name)
Set the name of the training data (used in evaluation output).
####update(train_set=None, fobj=None)
Update for one iteration
Note: for multi-class tasks, the score is grouped by class_id first, then by row_id.
If you want to get the i-th row score in the j-th class, the access way is score[j * num_data + i],
and you should group grad and hess in this way as well
Parameters
----------
train_set : Dataset
Training data; None means use the last training data
fobj : function
Customized objective function.
Returns
-------
is_finished : bool
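A sketch of a manual training loop with a custom objective. The fobj signature assumed here — a callable receiving the current predictions and the training Dataset, returning (grad, hess) — is an assumption, and for multi-class objectives the arrays would follow the score[j * num_data + i] layout described above:

```python
import numpy as np
import lightgbm as lgb

def l2_objective(preds, dataset):
    # Gradient and hessian of 0.5 * (pred - label)^2, one entry per row.
    # (fobj signature assumed; see lead-in above.)
    labels = dataset.get_label()
    return preds - labels, np.ones_like(labels)

# train_data is the Dataset from the earlier sketch.
booster = lgb.Booster(params={'num_leaves': 31}, train_set=train_data)
for _ in range(10):
    if booster.update(fobj=l2_objective):  # update returns is_finished
        break
```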
##Training API
----
####train(params, train_set, num_boost_round=100, valid_sets=None, valid_names=None, fobj=None, feval=None, init_model=None, feature_name=None, categorical_feature=None, early_stopping_rounds=None, evals_result=None, verbose_eval=True, learning_rates=None, callbacks=None)
Train with given parameters.
Parameters
----------
params : dict
Parameters for training.
train_set : Dataset
Data to be trained.
num_boost_round: int
Number of boosting iterations.
valid_sets: list of Datasets
List of data to be evaluated during training
valid_names: list of string
Names of valid_sets
fobj : function
Customized objective function.
feval : function
Customized evaluation function.
Note: should return (eval_name, eval_result, is_higher_better) or a list of these
init_model : file name of lightgbm model or 'Booster' instance
Model used for continued training
feature_name : list of str
Feature names
categorical_feature : list of str or int
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
early_stopping_rounds: int
Activates early stopping.
Requires at least one validation set and one metric.
If there is more than one, it will check all of them.
Returns the model with (best_iter + early_stopping_rounds) rounds.
If early stopping occurs, the model will add a 'best_iteration' field
evals_result: dict or None
This dictionary is used to store all evaluation results of all the items in valid_sets.
Example: with a valid_sets containing [valid_set, train_set],
valid_names containing ['eval', 'train'],
and a parameter containing ('metric':'logloss'),
Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
'eval': {'logloss': ['0.480385', '0.357756', ...]}}
Passing None means this dictionary is not used
verbose_eval : bool or int
Requires at least one item in evals.
If `verbose_eval` is True,
the eval metric on the valid set is printed at each boosting stage.
If `verbose_eval` is int,
the eval metric on the valid set is printed at every `verbose_eval` boosting stage.
The last boosting stage
or the boosting stage found by using `early_stopping_rounds` is also printed.
Example: with verbose_eval=4 and at least one item in evals,
an evaluation metric is printed every 4 (instead of 1) boosting stages.
learning_rates: list or function
List of learning rate for each boosting round
or a customized function that calculates learning_rate in terms of
current number of round (and the total number of boosting round)
(e.g. yields learning rate decay)
- list l: learning_rate = l[current_round]
- function f: learning_rate = f(current_round, total_boost_round)
or learning_rate = f(current_round)
callbacks : list of callback functions
List of callback functions that are applied at end of each iteration.
Returns
-------
booster : a trained booster model
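A usage sketch tying the pieces together ('binary' and 'binary_logloss' are standard LightGBM parameter values; train_data and valid_data come from the Dataset sketches above). The lambda shows the f(current_round) form of learning_rates:

```python
import lightgbm as lgb

evals_result = {}
params = {'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 31}
booster = lgb.train(params, train_data,
                    num_boost_round=100,
                    valid_sets=[valid_data, train_data],
                    valid_names=['eval', 'train'],
                    early_stopping_rounds=10,
                    evals_result=evals_result,  # filled in-place during training
                    verbose_eval=4,             # print metrics every 4 rounds
                    learning_rates=lambda cur_round: 0.1 * (0.99 ** cur_round))
```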
####cv(params, train_set, num_boost_round=10, nfold=5, stratified=False, metrics=None, fobj=None, feval=None, init_model=None, feature_name=None, categorical_feature=None, early_stopping_rounds=None, fpreproc=None, verbose_eval=None, show_stdv=True, seed=0, callbacks=None)
Cross-validation with given parameters.
Parameters
----------
params : dict
Booster params.
train_set : Dataset
Data to be trained.
num_boost_round : int
Number of boosting iterations.
nfold : int
Number of folds in CV.
stratified : bool
Whether to perform stratified sampling.
folds : a KFold or StratifiedKFold instance
Sklearn KFolds or StratifiedKFolds.
metrics : string or list of strings
Evaluation metrics to be watched in CV.
fobj : function
Custom objective function.
feval : function
Custom evaluation function.
init_model : file name of lightgbm model or 'Booster' instance
Model used for continued training
feature_name : list of str
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index,
type str represents feature names (need to specify feature_name as well)
early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue.
Last entry in evaluation history is the one from best iteration.
fpreproc : function
Preprocessing function that takes (dtrain, dtest, param)
and returns transformed versions of those.
verbose_eval : bool, int, or None, default None
Whether to display the progress.
If None, progress will be displayed when np.ndarray is returned.
If True, progress will be displayed at every boosting stage.
If an integer is given,
progress will be displayed at every given `verbose_eval` boosting stage.
show_stdv : bool, default True
Whether to display the standard deviation in progress.
Results are not affected and always contain std.
seed : int
Seed used to generate the folds (passed to numpy.random.seed).
callbacks : list of callback functions
List of callback functions that are applied at end of each iteration.
Returns
-------
evaluation history : list(string)
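A matching cv() sketch, reusing params and train_data from the train() example:

```python
import lightgbm as lgb

cv_hist = lgb.cv(params, train_data,
                 num_boost_round=100, nfold=5, stratified=True,
                 metrics='binary_logloss',
                 early_stopping_rounds=10,
                 verbose_eval=20, seed=0)
```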
##Scikit-learn API
----
###Common Methods
####__init__(num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=10, max_bin=255, silent=True, objective='regression', nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10, subsample=1, subsample_freq=1, colsample_bytree=1, reg_alpha=0, reg_lambda=0, scale_pos_weight=1, is_unbalance=False, seed=0)
Implementation of the Scikit-Learn API for LightGBM.
Parameters
----------
num_leaves : int
Maximum tree leaves for base learners.
max_depth : int
Maximum tree depth for base learners, -1 means no limit.
learning_rate : float
Boosting learning rate
n_estimators : int
Number of boosted trees to fit.
max_bin : int
Max number of discrete bins for features.
silent : boolean
Whether to print messages while running boosting.
objective : string or callable
Specify the learning task and the corresponding learning objective or
a custom objective function to be used (see note below).
default: binary for LGBMClassifier, lambdarank for LGBMRanker
nthread : int
Number of parallel threads
min_split_gain : float
Minimum loss reduction required to make a further partition on a leaf node of the tree.
min_child_weight : int
Minimum sum of instance weight (hessian) needed in a child (leaf)
min_child_samples : int
Minimum number of data needed in a child (leaf)
subsample : float
Subsample ratio of the training instance.
subsample_freq : int
Frequency of subsample; <= 0 means disabled
colsample_bytree : float
Subsample ratio of columns when constructing each tree.
reg_alpha : float
L1 regularization term on weights
reg_lambda : float
L2 regularization term on weights
scale_pos_weight : float
Balancing of positive and negative weights.
is_unbalance : bool
Whether the training data is unbalanced (used for binary classification)
seed : int
Random number seed.
Note
----
A custom objective function can be provided for the ``objective``
parameter. In this case, it should have the signature
``objective(y_true, y_pred) -> grad, hess``
or ``objective(y_true, y_pred, group) -> grad, hess``:
y_true: array_like of shape [n_samples]
The target values
y_pred: array_like of shape [n_samples] or shape [n_samples * n_class]
The predicted values
group: array_like
Group/query data, used for ranking tasks
grad: array_like of shape [n_samples] or shape [n_samples * n_class]
The value of the gradient for each sample point.
hess: array_like of shape [n_samples] or shape [n_samples * n_class]
The value of the second derivative for each sample point.
For multi-class tasks, y_pred is grouped by class_id first, then by row_id;
to get the i-th row y_pred in the j-th class, the access way is y_pred[j * num_data + i],
and you should group grad and hess in this way as well.
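A sketch of a custom objective in the documented (y_true, y_pred) form, here a plain L2 loss:

```python
import numpy as np
import lightgbm as lgb

def l2_loss(y_true, y_pred):
    grad = y_pred - y_true       # first derivative of 0.5 * (y_pred - y_true)^2
    hess = np.ones_like(y_true)  # second derivative is constant 1
    return grad, hess

model = lgb.LGBMRegressor(objective=l2_loss, n_estimators=50)
```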
####apply(X, num_iteration=0)
Return the predicted leaf of every tree for each sample.
Parameters
----------
X : array_like, shape=[n_samples, n_features]
Input features matrix.
num_iteration : int
Limit number of iterations in the prediction; defaults to 0 (use all trees).
Returns
-------
X_leaves : array_like, shape=[n_samples, n_trees]
####booster()
Get the underlying lightgbm Booster of this model.
This will raise an exception if fit has not been called
Returns
-------
booster : a lightgbm booster of underlying model
####evals_result()
Return the evaluation results.
Returns
-------
evals_result : dictionary
####feature_importance()
Feature importances
Returns
-------
Array of normalized feature importances
####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None, other_params=None)
Fit the gradient boosting model
Parameters
----------
X : array_like
Feature matrix
y : array_like
Labels
sample_weight : array_like
Weight of training data
init_score : array_like
Init score of training data
group : array_like
Group data of training data
eval_set : list, optional
A list of (X, y) tuple pairs to use as a validation set for early-stopping
eval_sample_weight : list of arrays
Weight of eval data
eval_init_score : list of arrays
Init score of eval data
eval_group : list of arrays
Group data of eval data
eval_metric : str, list of str, callable, optional
If a str, should be a built-in evaluation metric to use.
If callable, a custom evaluation metric, see note for more details.
early_stopping_rounds : int
Activates early stopping.
verbose : bool
If `verbose` and an evaluation set is used, the evaluation results are printed during training
feature_name : list of str
Feature names
categorical_feature : list of str or int
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
other_params: dict
Other parameters
Note
----
A custom eval function expects a callable with one of the following signatures:
``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)``
or ``func(y_true, y_pred, weight, group)``.
It should return (eval_name, eval_result, is_bigger_better)
or a list of (eval_name, eval_result, is_bigger_better)
y_true: array_like of shape [n_samples]
The target values
y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
The predicted values
weight: array_like of shape [n_samples]
The weight of samples
group: array_like
group/query data, used for ranking task
eval_name: str
Name of the evaluation
eval_result: float
Evaluation result
is_bigger_better: bool
Whether a bigger eval result is better, e.g. AUC is bigger_better.
For multi-class tasks, y_pred is grouped by class_id first, then by row_id;
to get the i-th row y_pred in the j-th class, the access way is y_pred[j * num_data + i]
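A matching custom eval sketch in the ``func(y_true, y_pred)`` form (RMSE, where a smaller result is better, hence is_bigger_better=False); `model` and the arrays reuse names from the sketches above:

```python
import numpy as np

def rmse(y_true, y_pred):
    return 'rmse', np.sqrt(np.mean((y_pred - y_true) ** 2)), False

model.fit(X_train, y_train,
          eval_set=[(X_val, y_val)],
          eval_metric=rmse,
          early_stopping_rounds=10)
```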
####get_params(deep=False)
Get parameters
####predict(data, raw_score=False, num_iteration=0)
Return the predicted value for each sample.
Parameters
----------
X : array_like, shape=[n_samples, n_features]
Input features matrix.
num_iteration : int
Limit number of iterations in the prediction; defaults to 0 (use all trees).
Returns
-------
predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]
###LGBMClassifier
####predict_proba(data, raw_score=False, num_iteration=0)
Return the predicted probability for each class for each sample.
Parameters
----------
X : array_like, shape=[n_samples, n_features]
Input features matrix.
num_iteration : int
Limit number of iterations in the prediction; defaults to 0 (use all trees).
Returns
-------
predicted_probability : array_like, shape=[n_samples, n_classes]
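A classifier sketch (arrays reused from the earlier Dataset examples):

```python
import lightgbm as lgb

clf = lgb.LGBMClassifier(num_leaves=31, n_estimators=100)
clf.fit(X_train, y_train)
proba = clf.predict_proba(X_val)  # shape [n_samples, n_classes]
labels = clf.predict(X_val)       # hard class predictions
```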
###LGBMRegressor
###LGBMRanker
####fit(X, y, sample_weight=None, init_score=None, group=None, eval_set=None, eval_sample_weight=None, eval_init_score=None, eval_group=None, eval_metric=None, eval_at=None, early_stopping_rounds=None, verbose=True, feature_name=None, categorical_feature=None, other_params=None)
Most arguments are the same as for the common fit method above, except the following:
eval_at : list of int
The evaluation positions of NDCG
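A ranking sketch; the graded relevance labels and the two-query group layout (60 + 40 rows) are illustrative:

```python
import numpy as np
import lightgbm as lgb

X_rank = np.random.rand(100, 10)
relevance = np.random.randint(4, size=100)  # graded relevance per row
ranker = lgb.LGBMRanker(n_estimators=50)
ranker.fit(X_rank, relevance,
           group=[60, 40],                  # query sizes, summing to 100 rows
           eval_set=[(X_rank, relevance)],
           eval_group=[[60, 40]],
           eval_at=[1, 3, 5])               # report NDCG@1, @3, @5
```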
# coding: utf-8
# pylint: disable = C0103, C0111, C0301, C0321, C0330, W0621
import inspect

import lightgbm as lgb

file_api = open('Python_API.md', 'w+')

def write_func(func, leftSpace=0):
    # Heading: '####name(arg1, arg2=default, ...)', skipping the 'self' parameter.
    file_api.write('####' + func.__name__ + '('
                   + ', '.join([
                       v.name + ('=' + str(v.default) if v.default != v.empty else '')
                       for _, v in inspect.signature(func).parameters.items() if v.name != 'self'
                   ])
                   + ')\n')
    # Copy the docstring, stripping `leftSpace` leading characters from each line.
    if func.__doc__:
        for line in func.__doc__.splitlines():
            if line: file_api.write(line[leftSpace:])
            file_api.write('\n')
    file_api.write('\n')

def write_class(class_):
    file_api.write('###' + class_.__name__ + '\n')
    # Document __init__ and every public method, sorted by name.
    for name, members in sorted(class_.__dict__.items(), key=lambda x: x[0]):
        if name == '__init__' or not name.startswith('_'): write_func(members, leftSpace=4)

def write_module(name, members):
    file_api.write('##' + name + '\n----\n')
    for member in members:
        if inspect.isclass(member): write_class(member)
        else: write_func(member)

write_module('Basic Data Structure API', [
    lgb.Dataset,
    lgb.Booster
])

write_module('Training API', [
    lgb.train,
    lgb.cv
])

write_module('Scikit-learn API', [
    lgb.LGBMModel,
    lgb.LGBMClassifier,
    lgb.LGBMRegressor,
    lgb.LGBMRanker
])

file_api.close()
.toctree-l4{
padding: 0.4045em 2.427em 0.4045em 3.227em !important;
}
site_name: LightGBM
theme: readthedocs
extra_css:
- css/extra.css
@@ -758,7 +758,7 @@ class _InnerDataset(object):
        self.set_field('weight', weight)
    def set_init_score(self, score):
-        """ Set init score of booster to start from.
+        """Set init score of booster to start from.
        Parameters
        ----------
@@ -869,7 +869,8 @@ class Dataset(object):
        feature_name : list of str
            Feature names
        categorical_feature : list of str or int
-            Categorical features, type int represents index, \
+            Categorical features,
+            type int represents index,
            type str represents feature names (need to specify feature_name as well)
        params: dict, optional
            Other parameters
@@ -919,7 +920,9 @@ class Dataset(object):
        return ret
    def construct(self):
-        """Lazy init"""
+        """
+        Lazy init
+        """
        if self.inner_dataset is None:
            if self.reference is not None:
                if self.used_indices is None:
@@ -1036,7 +1039,8 @@ class Dataset(object):
        return ret
    def save_binary(self, filename):
-        """Save Dataset to binary file
+        """
+        Save Dataset to binary file
        Parameters
        ----------
@@ -1047,7 +1051,8 @@ class Dataset(object):
    def set_label(self, label):
-        """Set label of Dataset
+        """
+        Set label of Dataset
        Parameters
        ----------
@@ -1059,7 +1064,8 @@ class Dataset(object):
        self.inner_dataset.set_label(self.label)
    def set_weight(self, weight):
-        """ Set weight of each instance.
+        """
+        Set weight of each instance.
        Parameters
        ----------
@@ -1071,7 +1077,8 @@ class Dataset(object):
        self.inner_dataset.set_weight(self.weight)
    def set_init_score(self, init_score):
-        """ Set init score of booster to start from.
+        """
+        Set init score of booster to start from.
        Parameters
        ----------
@@ -1083,7 +1090,8 @@ class Dataset(object):
        self.inner_dataset.set_init_score(self.init_score)
    def set_group(self, group):
-        """Set group size of Dataset (used for ranking).
+        """
+        Set group size of Dataset (used for ranking).
        Parameters
        ----------
@@ -1095,7 +1103,8 @@ class Dataset(object):
        self.inner_dataset.set_group(self.group)
    def get_label(self):
-        """Get the label of the Dataset.
+        """
+        Get the label of the Dataset.
        Returns
        -------
@@ -1106,7 +1115,8 @@ class Dataset(object):
        return self.label
    def get_weight(self):
-        """Get the weight of the Dataset.
+        """
+        Get the weight of the Dataset.
        Returns
        -------
@@ -1117,7 +1127,8 @@ class Dataset(object):
        return self.weight
    def get_init_score(self):
-        """Get the initial score of the Dataset.
+        """
+        Get the initial score of the Dataset.
        Returns
        -------
@@ -1128,7 +1139,8 @@ class Dataset(object):
        return self.init_score
    def get_group(self):
-        """Get the initial score of the Dataset.
+        """
+        Get the initial score of the Dataset.
        Returns
        -------
@@ -1139,7 +1151,8 @@ class Dataset(object):
        return self.group
    def num_data(self):
-        """Get the number of rows in the Dataset.
+        """
+        Get the number of rows in the Dataset.
        Returns
        -------
@@ -1151,7 +1164,8 @@ class Dataset(object):
            raise LightGBMError("Cannot call num_data before construct, please call it explicitly")
    def num_feature(self):
-        """Get the number of columns (features) in the Dataset.
+        """
+        Get the number of columns (features) in the Dataset.
        Returns
        -------
@@ -1166,7 +1180,8 @@ class Booster(object):
    """"A Booster of LightGBM.
    """
    def __init__(self, params=None, train_set=None, model_file=None, silent=False):
-        """Initialize the Booster.
+        """
+        Initialize the Booster.
        Parameters
        ----------
@@ -1241,7 +1256,8 @@ class Booster(object):
        self.__train_data_name = name
    def add_valid(self, data, name):
-        """Add an validation data
+        """
+        Add an validation data
        Parameters
        ----------
@@ -1262,7 +1278,8 @@ class Booster(object):
        self.__is_predicted_cur_iter.append(False)
    def reset_parameter(self, params):
-        """Reset parameters for booster
+        """
+        Reset parameters for booster
        Parameters
        ----------
@@ -1365,7 +1382,8 @@ class Booster(object):
        return out_cur_iter.value
    def eval(self, data, name, feval=None):
-        """Evaluate for data
+        """
+        Evaluate for data
        Parameters
        ----------
@@ -1397,7 +1415,8 @@ class Booster(object):
        return self.__inner_eval(name, data_idx, feval)
    def eval_train(self, feval=None):
-        """Evaluate for training data
+        """
+        Evaluate for training data
        Parameters
        ----------
@@ -1412,7 +1431,8 @@ class Booster(object):
        return self.__inner_eval(self.__train_data_name, 0, feval)
    def eval_valid(self, feval=None):
-        """Evaluate for validation data
+        """
+        Evaluate for validation data
        Parameters
        ----------
@@ -1428,7 +1448,8 @@ class Booster(object):
                for item in self.__inner_eval(self.name_valid_sets[i-1], i, feval)]
    def save_model(self, filename, num_iteration=-1):
-        """Save model of booster to file
+        """
+        Save model of booster to file
        Parameters
        ----------
@@ -1443,7 +1464,8 @@ class Booster(object):
            c_str(filename)))
    def dump_model(self):
-        """Dump model to json format
+        """
+        Dump model to json format
        Returns
        -------
@@ -1471,7 +1493,8 @@ class Booster(object):
        return json.loads(string_buffer.value.decode())
    def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
-        """Predict logic
+        """
+        Predict logic
        Parameters
        ----------
@@ -1503,7 +1526,8 @@ class Booster(object):
        return predictor
    def feature_importance(self, importance_type='split'):
-        """Feature importances
+        """
+        Feature importances
        Returns
        -------
@@ -1615,7 +1639,8 @@ class Booster(object):
            [name.startswith(('auc', 'ndcg')) for name in self.__name_inner_eval]
    def attr(self, key):
-        """Get attribute string from the Booster.
+        """
+        Get attribute string from the Booster.
        Parameters
        ----------
@@ -1630,7 +1655,8 @@ class Booster(object):
        return self.__attr.get(key, None)
    def set_attr(self, **kwargs):
-        """Set the attribute of the Booster.
+        """
+        Set the attribute of the Booster.
        Parameters
        ----------
...
@@ -15,7 +15,8 @@ def train(params, train_set, num_boost_round=100,
          feature_name=None, categorical_feature=None,
          early_stopping_rounds=None, evals_result=None,
          verbose_eval=True, learning_rates=None, callbacks=None):
-    """Train with given parameters.
+    """
+    Train with given parameters.
    Parameters
    ----------
@@ -39,7 +40,8 @@ def train(params, train_set, num_boost_round=100,
    feature_name : list of str
        Feature names
    categorical_feature : list of str or int
-        Categorical features, type int represents index, \
+        Categorical features,
+        type int represents index,
        type str represents feature names (need to specify feature_name as well)
    early_stopping_rounds: int
        Activates early stopping.
@@ -49,27 +51,29 @@ def train(params, train_set, num_boost_round=100,
        If early stopping occurs, the model will add 'best_iteration' field
    evals_result: dict or None
        This dictionary used to store all evaluation results of all the items in valid_sets.
-        Example: with a valid_sets containing [valid_set, train_set] \
-        and valid_names containing ['eval', 'train'] and a paramater containing ('metric':'logloss')
+        Example: with a valid_sets containing [valid_set, train_set]
+        and valid_names containing ['eval', 'train']
+        and a paramater containing ('metric':'logloss')
        Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
                  'eval': {'logloss': ['0.480385', '0.357756', ...]}}
        passed with None means no using this function
    verbose_eval : bool or int
        Requires at least one item in evals.
-        If `verbose_eval` is True then the evaluation metric on the validation set is \
-        printed at each boosting stage.
-        If `verbose_eval` is an integer then the evaluation metric on the validation set \
-        is printed at every given `verbose_eval` boosting stage. The last boosting stage \
-        / the boosting stage found by using `early_stopping_rounds` is also printed.
-        Example: with verbose_eval=4 and at least one item in evals, an evaluation metric \
-        is printed every 4 boosting stages, instead of every boosting stage.
+        If `verbose_eval` is True,
+        the eval metric on the valid set is printed at each boosting stage.
+        If `verbose_eval` is int,
+        the eval metric on the valid set is printed at every `verbose_eval` boosting stage.
+        The last boosting stage
+        or the boosting stage found by using `early_stopping_rounds` is also printed.
+        Example: with verbose_eval=4 and at least one item in evals,
+        an evaluation metric is printed every 4 (instead of 1) boosting stages.
    learning_rates: list or function
-        List of learning rate for each boosting round \
-        or a customized function that calculates learning_rate in terms of \
-        current number of round and the total number of boosting round \
+        List of learning rate for each boosting round
+        or a customized function that calculates learning_rate in terms of
+        current number of round (and the total number of boosting round)
        (e.g. yields learning rate decay)
        - list l: learning_rate = l[current_round]
-        - function f: learning_rate = f(current_round, total_boost_round) \
+        - function f: learning_rate = f(current_round, total_boost_round)
        or learning_rate = f(current_round)
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.
@@ -259,12 +263,13 @@ def _agg_cv_result(raw_results):
    return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()]
def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
-       metrics=(), fobj=None, feval=None, init_model=None,
+       metrics=None, fobj=None, feval=None, init_model=None,
       feature_name=None, categorical_feature=None,
       early_stopping_rounds=None, fpreproc=None,
       verbose_eval=None, show_stdv=True, seed=0,
       callbacks=None):
-    """Cross-validation with given paramaters.
+    """
+    Cross-validation with given paramaters.
    Parameters
    ----------
@@ -291,20 +296,21 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
    feature_name : list of str
        Feature names
    categorical_feature : list of str or int
-        Categorical features, type int represents index, \
+        Categorical features, type int represents index,
        type str represents feature names (need to specify feature_name as well)
    early_stopping_rounds: int
-        Activates early stopping. CV error needs to decrease at least \
+        Activates early stopping. CV error needs to decrease at least
        every <early_stopping_rounds> round(s) to continue.
        Last entry in evaluation history is the one from best iteration.
    fpreproc : function
-        Preprocessing function that takes (dtrain, dtest, param) and returns \
-        transformed versions of those.
+        Preprocessing function that takes (dtrain, dtest, param)
+        and returns transformed versions of those.
    verbose_eval : bool, int, or None, default None
-        Whether to display the progress. If None, progress will be displayed \
-        when np.ndarray is returned. If True, progress will be displayed at \
-        boosting stage. If an integer is given, progress will be displayed \
-        at every given `verbose_eval` boosting stage.
+        Whether to display the progress.
+        If None, progress will be displayed when np.ndarray is returned.
+        If True, progress will be displayed at boosting stage.
+        If an integer is given,
+        progress will be displayed at every given `verbose_eval` boosting stage.
    show_stdv : bool, default True
        Whether to display the standard deviation in progress.
        Results are not affected, and always contains std.
...
@@ -128,7 +128,16 @@ def _eval_function_wrapper(func):
    return inner
class LGBMModel(LGBMModelBase):
-    """Implementation of the Scikit-Learn API for LightGBM.
+    def __init__(self, num_leaves=31, max_depth=-1,
+                 learning_rate=0.1, n_estimators=10, max_bin=255,
+                 silent=True, objective="regression",
+                 nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
+                 subsample=1, subsample_freq=1, colsample_bytree=1,
+                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
+                 is_unbalance=False, seed=0):
+        """
+        Implementation of the Scikit-Learn API for LightGBM.
        Parameters
        ----------
@@ -145,6 +154,7 @@ class LGBMModel(LGBMModelBase):
        objective : string or callable
            Specify the learning task and the corresponding learning objective or
            a custom objective function to be used (see note below).
+            default: binary for LGBMClassifier, lambdarank for LGBMRanker
        nthread : int
            Number of parallel threads
        min_split_gain : float
@@ -174,7 +184,8 @@ class LGBMModel(LGBMModelBase):
        ----
        A custom objective function can be provided for the ``objective``
        parameter. In this case, it should have the signature
-        ``objective(y_true, y_pred) -> grad, hess`` or ``objective(y_true, y_pred, group) -> grad, hess``:
+        ``objective(y_true, y_pred) -> grad, hess``
+        or ``objective(y_true, y_pred, group) -> grad, hess``:
        y_true: array_like of shape [n_samples]
            The target values
@@ -191,14 +202,6 @@ class LGBMModel(LGBMModelBase):
        if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
        and you should group grad and hess in this way as well
        """
-    def __init__(self, num_leaves=31, max_depth=-1,
-                 learning_rate=0.1, n_estimators=10, max_bin=255,
-                 silent=True, objective="regression",
-                 nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
-                 subsample=1, subsample_freq=1, colsample_bytree=1,
-                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
-                 is_unbalance=False, seed=0):
        if not SKLEARN_INSTALLED:
            raise LightGBMError('Scikit-learn is required for this module')
@@ -229,8 +232,8 @@ class LGBMModel(LGBMModelBase):
        self.fobj = None
    def booster(self):
-        """Get the underlying lightgbm Booster of this model.
+        """
+        Get the underlying lightgbm Booster of this model.
        This will raise an exception when fit was not called
        Returns
@@ -242,7 +245,9 @@ class LGBMModel(LGBMModelBase):
        return self._Booster
    def get_params(self, deep=False):
-        """Get parameters"""
+        """
+        Get parameters
+        """
        params = super(LGBMModel, self).get_params(deep=deep)
        if self.nthread <= 0:
            params.pop('nthread', None)
@@ -288,20 +293,23 @@ class LGBMModel(LGBMModelBase):
        feature_name : list of str
            Feature names
        categorical_feature : list of str or int
-            Categorical features, type int represents index, \
+            Categorical features,
+            type int represents index,
            type str represents feature names (need to specify feature_name as well)
        other_params: dict
            Other parameters
        Note
        ----
-        Custom eval function expects a callable with following functions: ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)``
+        Custom eval function expects a callable with following functions:
+        ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)``
        or ``func(y_true, y_pred, weight, group)``.
-        return (eval_name, eval_result, is_bigger_better) or list of (eval_name, eval_result, is_bigger_better)
+        return (eval_name, eval_result, is_bigger_better)
+        or list of (eval_name, eval_result, is_bigger_better)
        y_true: array_like of shape [n_samples]
            The target values
-        y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
+        y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
            The predicted values
        weight: array_like of shape [n_samples]
            The weight of samples
@@ -383,20 +391,36 @@ class LGBMModel(LGBMModelBase):
        return self
    def predict(self, data, raw_score=False, num_iteration=0):
+        """
+        Return the predicted value for each sample.
+        Parameters
+        ----------
+        X : array_like, shape=[n_samples, n_features]
+            Input features matrix.
+        num_iteration : int
+            Limit number of iterations in the prediction; defaults to 0 (use all trees).
+        Returns
+        -------
+        predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]
+        """
        return self.booster().predict(data,
                                      raw_score=raw_score,
                                      num_iteration=num_iteration)
    def apply(self, X, num_iteration=0):
-        """Return the predicted leaf every tree for each sample.
+        """
+        Return the predicted leaf every tree for each sample.
        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.
-        ntree_limit : int
-            Limit number of trees in the prediction; defaults to 0 (use all trees).
+        num_iteration : int
+            Limit number of iterations in the prediction; defaults to 0 (use all trees).
        Returns
        -------
@@ -407,7 +431,9 @@ class LGBMModel(LGBMModelBase):
                                    num_iteration=num_iteration)
    def evals_result(self):
-        """Return the evaluation results.
+        """
+        Return the evaluation results.
        Returns
        -------
        evals_result : dictionary
@@ -420,7 +446,9 @@ class LGBMModel(LGBMModelBase):
        return evals_result
    def feature_importance(self):
-        """Feature importances
+        """
+        Feature importances
        Returns
        -------
        Array of normailized feature importances
@@ -429,8 +457,6 @@ class LGBMModel(LGBMModelBase):
        return importace_array / importace_array.sum()
class LGBMRegressor(LGBMModel, LGBMRegressorBase):
-    __doc__ = """Implementation of the scikit-learn API for LightGBM regression.
-    """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
    def fit(self, X, y,
            sample_weight=None, init_score=None,
@@ -449,9 +475,6 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
        return self
class LGBMClassifier(LGBMModel, LGBMClassifierBase):
-    __doc__ = """Implementation of the scikit-learn API for LightGBM classification.
-    """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
    def __init__(self, num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
@@ -511,6 +534,21 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
        return self._le.inverse_transform(column_indexes)
    def predict_proba(self, data, raw_score=False, num_iteration=0):
+        """
+        Return the predicted probability for each class for each sample.
+        Parameters
+        ----------
+        X : array_like, shape=[n_samples, n_features]
+            Input features matrix.
+        num_iteration : int
+            Limit number of iterations in the prediction; defaults to 0 (use all trees).
+        Returns
+        -------
+        predicted_probability : array_like, shape=[n_samples, n_classes]
+        """
        class_probs = self.booster().predict(data,
                                             raw_score=raw_score,
                                             num_iteration=num_iteration)
@@ -522,9 +560,6 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
        return np.vstack((classzero_probs, classone_probs)).transpose()
class LGBMRanker(LGBMModel):
-    __doc__ = """Implementation of the scikit-learn API for LightGBM ranking application.
-    """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
    def __init__(self, num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
@@ -550,7 +585,7 @@ class LGBMRanker(LGBMModel):
                feature_name=None, categorical_feature=None,
                other_params=None):
        """
-        Most arguments like LGBMModel.fit except following:
+        Most arguments like common methods except following:
        eval_at : list of int
            The evaulation positions of NDCG
...