Commit 54963bb7 authored by Guolin Ke's avatar Guolin Ke
Browse files

Merge branch 'master' of https://github.com/Microsoft/LightGBM

parents 762b5707 ced64bad
This diff is collapsed.
# coding: utf-8
# pylint: disable = C0103, C0111, C0301, C0321, C0330, W0621
import inspect
import lightgbm as lgb
file_api = open('Python_API.md', 'w+')
def write_func(func, leftSpace=0, out=None):
    """Write markdown documentation for one function or method.

    Emits a level-4 heading with the callable's signature (parameter names
    and default values, ``self`` excluded), then the docstring with
    ``leftSpace`` leading columns stripped from each non-blank line.

    Parameters
    ----------
    func : callable
        Function or method to document.
    leftSpace : int
        Number of leading columns to strip from docstring lines
        (4 for class methods, whose docstrings are indented one level).
    out : file-like or None
        Destination stream; defaults to the module-level ``file_api``.
    """
    target = file_api if out is None else out
    # Parameter.empty is a sentinel: compare by identity, not equality,
    # so defaults with unusual __eq__ (e.g. arrays) cannot break this.
    params = [
        v.name + ('=' + str(v.default) if v.default is not v.empty else '')
        for _, v in inspect.signature(func).parameters.items()
        if v.name != 'self'
    ]
    target.write('####' + func.__name__ + '(' + ', '.join(params) + ')\n')
    if func.__doc__:
        for line in func.__doc__.splitlines():
            if line:
                target.write(line[leftSpace:])
            # newline written for every source line, so blank docstring
            # lines survive as blank markdown lines
            target.write('\n')
    target.write('\n')
def write_class(class_):
    """Write markdown documentation for a class.

    Emits a level-3 heading with the class name, then documents
    ``__init__`` and every public attribute of the class, in
    alphabetical order, via ``write_func``.
    """
    file_api.write('###' + class_.__name__ + '\n')
    for attr_name in sorted(class_.__dict__):
        member = class_.__dict__[attr_name]
        is_public = not attr_name.startswith('_')
        if is_public or attr_name == '__init__':
            write_func(member, leftSpace=4)
def write_module(name, members):
    """Write one documentation section.

    Emits a level-2 heading with a horizontal rule, then documents each
    member: classes go through ``write_class``, everything else through
    ``write_func``.
    """
    file_api.write('##' + name + '\n----\n')
    for member in members:
        handler = write_class if inspect.isclass(member) else write_func
        handler(member)
# Emit each section of Python_API.md in order, then close the output file.
write_module('Basic Data Structure API', [
    lgb.Dataset,
    lgb.Booster
])
write_module('Training API', [
    lgb.train,
    lgb.cv
])
write_module('Scikit-learn API', [
    lgb.LGBMModel,
    lgb.LGBMClassifier,
    lgb.LGBMRegressor,
    lgb.LGBMRanker
])
file_api.close()
/* Extra left padding for 4th-level table-of-contents entries in the
   ReadTheDocs theme sidebar; !important overrides the theme default. */
.toctree-l4{
padding: 0.4045em 2.427em 0.4045em 3.227em !important;
}
# MkDocs configuration: site title, ReadTheDocs theme, and the extra
# stylesheet (css/extra.css) that tweaks the sidebar layout.
site_name: LightGBM
theme: readthedocs
extra_css:
- css/extra.css
......@@ -758,7 +758,7 @@ class _InnerDataset(object):
self.set_field('weight', weight)
def set_init_score(self, score):
""" Set init score of booster to start from.
"""Set init score of booster to start from.
Parameters
----------
......@@ -869,7 +869,8 @@ class Dataset(object):
feature_name : list of str
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index, \
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
params: dict, optional
Other parameters
......@@ -919,7 +920,9 @@ class Dataset(object):
return ret
def construct(self):
"""Lazy init"""
"""
Lazy init
"""
if self.inner_dataset is None:
if self.reference is not None:
if self.used_indices is None:
......@@ -1036,7 +1039,8 @@ class Dataset(object):
return ret
def save_binary(self, filename):
"""Save Dataset to binary file
"""
Save Dataset to binary file
Parameters
----------
......@@ -1047,7 +1051,8 @@ class Dataset(object):
def set_label(self, label):
"""Set label of Dataset
"""
Set label of Dataset
Parameters
----------
......@@ -1059,7 +1064,8 @@ class Dataset(object):
self.inner_dataset.set_label(self.label)
def set_weight(self, weight):
""" Set weight of each instance.
"""
Set weight of each instance.
Parameters
----------
......@@ -1071,7 +1077,8 @@ class Dataset(object):
self.inner_dataset.set_weight(self.weight)
def set_init_score(self, init_score):
""" Set init score of booster to start from.
"""
Set init score of booster to start from.
Parameters
----------
......@@ -1083,7 +1090,8 @@ class Dataset(object):
self.inner_dataset.set_init_score(self.init_score)
def set_group(self, group):
"""Set group size of Dataset (used for ranking).
"""
Set group size of Dataset (used for ranking).
Parameters
----------
......@@ -1095,7 +1103,8 @@ class Dataset(object):
self.inner_dataset.set_group(self.group)
def get_label(self):
"""Get the label of the Dataset.
"""
Get the label of the Dataset.
Returns
-------
......@@ -1106,7 +1115,8 @@ class Dataset(object):
return self.label
def get_weight(self):
"""Get the weight of the Dataset.
"""
Get the weight of the Dataset.
Returns
-------
......@@ -1117,7 +1127,8 @@ class Dataset(object):
return self.weight
def get_init_score(self):
"""Get the initial score of the Dataset.
"""
Get the initial score of the Dataset.
Returns
-------
......@@ -1128,7 +1139,8 @@ class Dataset(object):
return self.init_score
def get_group(self):
"""Get the initial score of the Dataset.
"""
Get the initial score of the Dataset.
Returns
-------
......@@ -1139,7 +1151,8 @@ class Dataset(object):
return self.group
def num_data(self):
"""Get the number of rows in the Dataset.
"""
Get the number of rows in the Dataset.
Returns
-------
......@@ -1151,7 +1164,8 @@ class Dataset(object):
raise LightGBMError("Cannot call num_data before construct, please call it explicitly")
def num_feature(self):
"""Get the number of columns (features) in the Dataset.
"""
Get the number of columns (features) in the Dataset.
Returns
-------
......@@ -1166,7 +1180,8 @@ class Booster(object):
""""A Booster of LightGBM.
"""
def __init__(self, params=None, train_set=None, model_file=None, silent=False):
"""Initialize the Booster.
"""
Initialize the Booster.
Parameters
----------
......@@ -1241,7 +1256,8 @@ class Booster(object):
self.__train_data_name = name
def add_valid(self, data, name):
"""Add an validation data
"""
Add an validation data
Parameters
----------
......@@ -1262,7 +1278,8 @@ class Booster(object):
self.__is_predicted_cur_iter.append(False)
def reset_parameter(self, params):
"""Reset parameters for booster
"""
Reset parameters for booster
Parameters
----------
......@@ -1365,7 +1382,8 @@ class Booster(object):
return out_cur_iter.value
def eval(self, data, name, feval=None):
"""Evaluate for data
"""
Evaluate for data
Parameters
----------
......@@ -1397,7 +1415,8 @@ class Booster(object):
return self.__inner_eval(name, data_idx, feval)
def eval_train(self, feval=None):
"""Evaluate for training data
"""
Evaluate for training data
Parameters
----------
......@@ -1412,7 +1431,8 @@ class Booster(object):
return self.__inner_eval(self.__train_data_name, 0, feval)
def eval_valid(self, feval=None):
"""Evaluate for validation data
"""
Evaluate for validation data
Parameters
----------
......@@ -1428,7 +1448,8 @@ class Booster(object):
for item in self.__inner_eval(self.name_valid_sets[i-1], i, feval)]
def save_model(self, filename, num_iteration=-1):
"""Save model of booster to file
"""
Save model of booster to file
Parameters
----------
......@@ -1443,7 +1464,8 @@ class Booster(object):
c_str(filename)))
def dump_model(self):
"""Dump model to json format
"""
Dump model to json format
Returns
-------
......@@ -1471,7 +1493,8 @@ class Booster(object):
return json.loads(string_buffer.value.decode())
def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
"""Predict logic
"""
Predict logic
Parameters
----------
......@@ -1503,7 +1526,8 @@ class Booster(object):
return predictor
def feature_importance(self, importance_type='split'):
"""Feature importances
"""
Feature importances
Returns
-------
......@@ -1615,7 +1639,8 @@ class Booster(object):
[name.startswith(('auc', 'ndcg')) for name in self.__name_inner_eval]
def attr(self, key):
"""Get attribute string from the Booster.
"""
Get attribute string from the Booster.
Parameters
----------
......@@ -1630,7 +1655,8 @@ class Booster(object):
return self.__attr.get(key, None)
def set_attr(self, **kwargs):
"""Set the attribute of the Booster.
"""
Set the attribute of the Booster.
Parameters
----------
......
......@@ -15,7 +15,8 @@ def train(params, train_set, num_boost_round=100,
feature_name=None, categorical_feature=None,
early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None, callbacks=None):
"""Train with given parameters.
"""
Train with given parameters.
Parameters
----------
......@@ -39,7 +40,8 @@ def train(params, train_set, num_boost_round=100,
feature_name : list of str
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index, \
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
early_stopping_rounds: int
Activates early stopping.
......@@ -49,27 +51,29 @@ def train(params, train_set, num_boost_round=100,
If early stopping occurs, the model will add 'best_iteration' field
evals_result: dict or None
This dictionary used to store all evaluation results of all the items in valid_sets.
Example: with a valid_sets containing [valid_set, train_set] \
and valid_names containing ['eval', 'train'] and a paramater containing ('metric':'logloss')
Example: with a valid_sets containing [valid_set, train_set]
and valid_names containing ['eval', 'train']
and a paramater containing ('metric':'logloss')
Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
'eval': {'logloss': ['0.480385', '0.357756', ...]}}
passed with None means no using this function
verbose_eval : bool or int
Requires at least one item in evals.
If `verbose_eval` is True then the evaluation metric on the validation set is \
printed at each boosting stage.
If `verbose_eval` is an integer then the evaluation metric on the validation set \
is printed at every given `verbose_eval` boosting stage. The last boosting stage \
/ the boosting stage found by using `early_stopping_rounds` is also printed.
Example: with verbose_eval=4 and at least one item in evals, an evaluation metric \
is printed every 4 boosting stages, instead of every boosting stage.
If `verbose_eval` is True,
the eval metric on the valid set is printed at each boosting stage.
If `verbose_eval` is int,
the eval metric on the valid set is printed at every `verbose_eval` boosting stage.
The last boosting stage
or the boosting stage found by using `early_stopping_rounds` is also printed.
Example: with verbose_eval=4 and at least one item in evals,
an evaluation metric is printed every 4 (instead of 1) boosting stages.
learning_rates: list or function
List of learning rate for each boosting round \
or a customized function that calculates learning_rate in terms of \
current number of round and the total number of boosting round \
List of learning rate for each boosting round
or a customized function that calculates learning_rate in terms of
current number of round (and the total number of boosting round)
(e.g. yields learning rate decay)
- list l: learning_rate = l[current_round]
- function f: learning_rate = f(current_round, total_boost_round) \
- function f: learning_rate = f(current_round, total_boost_round)
or learning_rate = f(current_round)
callbacks : list of callback functions
List of callback functions that are applied at end of each iteration.
......@@ -259,12 +263,13 @@ def _agg_cv_result(raw_results):
return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()]
def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
metrics=(), fobj=None, feval=None, init_model=None,
metrics=None, fobj=None, feval=None, init_model=None,
feature_name=None, categorical_feature=None,
early_stopping_rounds=None, fpreproc=None,
verbose_eval=None, show_stdv=True, seed=0,
callbacks=None):
"""Cross-validation with given paramaters.
"""
Cross-validation with given paramaters.
Parameters
----------
......@@ -291,20 +296,21 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
feature_name : list of str
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index, \
Categorical features, type int represents index,
type str represents feature names (need to specify feature_name as well)
early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least \
Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue.
Last entry in evaluation history is the one from best iteration.
fpreproc : function
Preprocessing function that takes (dtrain, dtest, param) and returns \
transformed versions of those.
Preprocessing function that takes (dtrain, dtest, param)
and returns transformed versions of those.
verbose_eval : bool, int, or None, default None
Whether to display the progress. If None, progress will be displayed \
when np.ndarray is returned. If True, progress will be displayed at \
boosting stage. If an integer is given, progress will be displayed \
at every given `verbose_eval` boosting stage.
Whether to display the progress.
If None, progress will be displayed when np.ndarray is returned.
If True, progress will be displayed at boosting stage.
If an integer is given,
progress will be displayed at every given `verbose_eval` boosting stage.
show_stdv : bool, default True
Whether to display the standard deviation in progress.
Results are not affected, and always contains std.
......
......@@ -128,7 +128,16 @@ def _eval_function_wrapper(func):
return inner
class LGBMModel(LGBMModelBase):
"""Implementation of the Scikit-Learn API for LightGBM.
def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
silent=True, objective="regression",
nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
"""
Implementation of the Scikit-Learn API for LightGBM.
Parameters
----------
......@@ -145,6 +154,7 @@ class LGBMModel(LGBMModelBase):
objective : string or callable
Specify the learning task and the corresponding learning objective or
a custom objective function to be used (see note below).
default: binary for LGBMClassifier, lambdarank for LGBMRanker
nthread : int
Number of parallel threads
min_split_gain : float
......@@ -174,7 +184,8 @@ class LGBMModel(LGBMModelBase):
----
A custom objective function can be provided for the ``objective``
parameter. In this case, it should have the signature
``objective(y_true, y_pred) -> grad, hess`` or ``objective(y_true, y_pred, group) -> grad, hess``:
``objective(y_true, y_pred) -> grad, hess``
or ``objective(y_true, y_pred, group) -> grad, hess``:
y_true: array_like of shape [n_samples]
The target values
......@@ -191,14 +202,6 @@ class LGBMModel(LGBMModelBase):
if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
and you should group grad and hess in this way as well
"""
def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
silent=True, objective="regression",
nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
if not SKLEARN_INSTALLED:
raise LightGBMError('Scikit-learn is required for this module')
......@@ -229,8 +232,8 @@ class LGBMModel(LGBMModelBase):
self.fobj = None
def booster(self):
"""Get the underlying lightgbm Booster of this model.
"""
Get the underlying lightgbm Booster of this model.
This will raise an exception when fit was not called
Returns
......@@ -242,7 +245,9 @@ class LGBMModel(LGBMModelBase):
return self._Booster
def get_params(self, deep=False):
"""Get parameters"""
"""
Get parameters
"""
params = super(LGBMModel, self).get_params(deep=deep)
if self.nthread <= 0:
params.pop('nthread', None)
......@@ -288,20 +293,23 @@ class LGBMModel(LGBMModelBase):
feature_name : list of str
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index, \
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
other_params: dict
Other parameters
Note
----
Custom eval function expects a callable with following functions: ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)``
Custom eval function expects a callable with following functions:
``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)``
or ``func(y_true, y_pred, weight, group)``.
return (eval_name, eval_result, is_bigger_better) or list of (eval_name, eval_result, is_bigger_better)
return (eval_name, eval_result, is_bigger_better)
or list of (eval_name, eval_result, is_bigger_better)
y_true: array_like of shape [n_samples]
The target values
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
The predicted values
weight: array_like of shape [n_samples]
The weight of samples
......@@ -383,20 +391,36 @@ class LGBMModel(LGBMModelBase):
return self
def predict(self, data, raw_score=False, num_iteration=0):
"""
Return the predicted value for each sample.
Parameters
----------
X : array_like, shape=[n_samples, n_features]
Input features matrix.
num_iteration : int
Limit number of iterations in the prediction; defaults to 0 (use all trees).
Returns
-------
predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]
"""
return self.booster().predict(data,
raw_score=raw_score,
num_iteration=num_iteration)
def apply(self, X, num_iteration=0):
"""Return the predicted leaf every tree for each sample.
"""
Return the predicted leaf every tree for each sample.
Parameters
----------
X : array_like, shape=[n_samples, n_features]
Input features matrix.
ntree_limit : int
Limit number of trees in the prediction; defaults to 0 (use all trees).
num_iteration : int
Limit number of iterations in the prediction; defaults to 0 (use all trees).
Returns
-------
......@@ -407,7 +431,9 @@ class LGBMModel(LGBMModelBase):
num_iteration=num_iteration)
def evals_result(self):
"""Return the evaluation results.
"""
Return the evaluation results.
Returns
-------
evals_result : dictionary
......@@ -420,7 +446,9 @@ class LGBMModel(LGBMModelBase):
return evals_result
def feature_importance(self):
"""Feature importances
"""
Feature importances
Returns
-------
Array of normailized feature importances
......@@ -429,8 +457,6 @@ class LGBMModel(LGBMModelBase):
return importace_array / importace_array.sum()
class LGBMRegressor(LGBMModel, LGBMRegressorBase):
__doc__ = """Implementation of the scikit-learn API for LightGBM regression.
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def fit(self, X, y,
sample_weight=None, init_score=None,
......@@ -449,9 +475,6 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
return self
class LGBMClassifier(LGBMModel, LGBMClassifierBase):
__doc__ = """Implementation of the scikit-learn API for LightGBM classification.
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
......@@ -511,6 +534,21 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
return self._le.inverse_transform(column_indexes)
def predict_proba(self, data, raw_score=False, num_iteration=0):
"""
Return the predicted probability for each class for each sample.
Parameters
----------
X : array_like, shape=[n_samples, n_features]
Input features matrix.
num_iteration : int
Limit number of iterations in the prediction; defaults to 0 (use all trees).
Returns
-------
predicted_probability : array_like, shape=[n_samples, n_classes]
"""
class_probs = self.booster().predict(data,
raw_score=raw_score,
num_iteration=num_iteration)
......@@ -522,9 +560,6 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
return np.vstack((classzero_probs, classone_probs)).transpose()
class LGBMRanker(LGBMModel):
__doc__ = """Implementation of the scikit-learn API for LightGBM ranking application.
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
......@@ -550,7 +585,7 @@ class LGBMRanker(LGBMModel):
feature_name=None, categorical_feature=None,
other_params=None):
"""
Most arguments like LGBMModel.fit except following:
Most arguments like common methods except following:
eval_at : list of int
The evaulation positions of NDCG
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.