Commit de51990e authored by Nikita Titov's avatar Nikita Titov Committed by Guolin Ke
Browse files

[docs] [python] docstrings improving (#894)

* fixed description of Dataset

* fixed description of Booster

* fixed description of train

* fixed description of cv

* fixed description of callbacks

* fixed description of plotting

* hotfixes
parent a494d7b7
......@@ -551,36 +551,37 @@ class Dataset(object):
weight=None, group=None, silent=False,
feature_name='auto', categorical_feature='auto', params=None,
free_raw_data=True):
"""
"""Constract Dataset.
Parameters
----------
data : string/numpy array/scipy.sparse
data : string, numpy array or scipy.sparse
Data source of Dataset.
When data type is string, it represents the path of txt file
label : list or numpy 1-D array, optional
Label of the data
max_bin : int, required
Max number of discrete bin for features
reference : Other Dataset, optional
If this dataset validation, need to use training data as reference
weight : list or numpy 1-D array , optional
If string, it represents the path to txt file.
label : list or numpy 1-D array, optional (default=None)
Label of the data.
max_bin : int, optional (default=255)
Max number of discrete bins for features.
reference : Dataset or None, optional (default=None)
If this is Dataset for validation, training data should be used as reference.
weight : list, numpy 1-D array or None, optional (default=None)
Weight for each instance.
group : list or numpy 1-D array , optional
Group/query size for dataset
silent : boolean, optional
Whether print messages during construction
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
params: dict, optional
Other parameters
free_raw_data: Bool
True if need to free raw data after construct inner dataset
group : list, numpy 1-D array or None, optional (default=None)
Group/query size for Dataset.
silent : bool, optional (default=False)
Whether to print messages during construction.
feature_name : list of strings or 'auto', optional (default="auto")
Feature names.
If 'auto' and data is pandas DataFrame, data columns names are used.
categorical_feature : list of strings or int, or 'auto', optional (default="auto")
Categorical features.
If list of int, interpreted as indices.
If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
params: dict or None, optional (default=None)
Other parameters.
free_raw_data: bool, optional (default=True)
If True, raw data is freed after constructing inner Dataset.
"""
self.handle = None
self.data = data
......@@ -778,7 +779,13 @@ class Dataset(object):
ctypes.byref(self.handle)))
def construct(self):
"""Lazy init"""
"""Lazy init.
Returns
-------
self : Dataset
Returns self.
"""
if self.handle is None:
if self.reference is not None:
if self.used_indices is None:
......@@ -811,24 +818,28 @@ class Dataset(object):
def create_valid(self, data, label=None, weight=None, group=None,
silent=False, params=None):
"""
Create validation data align with current dataset
"""Create validation data align with current Dataset.
Parameters
----------
data : string/numpy array/scipy.sparse
data : string, numpy array or scipy.sparse
Data source of Dataset.
When data type is string, it represents the path of txt file
label : list or numpy 1-D array, optional
If string, it represents the path to txt file.
label : list or numpy 1-D array, optional (default=None)
Label of the training data.
weight : list or numpy 1-D array , optional
weight : list, numpy 1-D array or None, optional (default=None)
Weight for each instance.
group : list or numpy 1-D array , optional
Group/query size for dataset
silent : boolean, optional
Whether print messages during construction
params: dict, optional
Other parameters
group : list, numpy 1-D array or None, optional (default=None)
Group/query size for Dataset.
silent : bool, optional (default=False)
Whether to print messages during construction.
params: dict or None, optional (default=None)
Other parameters.
Returns
-------
self : Dataset
Returns self.
"""
ret = Dataset(data, label=label, max_bin=self.max_bin, reference=self,
weight=weight, group=group, silent=silent, params=params,
......@@ -838,15 +849,19 @@ class Dataset(object):
return ret
def subset(self, used_indices, params=None):
"""
Get subset of current dataset
"""Get subset of current Dataset.
Parameters
----------
used_indices : list of int
Used indices of this subset
params : dict
Other parameters
Indices used to create the subset.
params: dict or None, optional (default=None)
Other parameters.
Returns
-------
subset : Dataset
Subset of the current Dataset.
"""
if params is None:
params = self.params
......@@ -858,8 +873,7 @@ class Dataset(object):
return ret
def save_binary(self, filename):
"""
Save Dataset to binary file
"""Save Dataset to binary file.
Parameters
----------
......@@ -886,11 +900,10 @@ class Dataset(object):
Parameters
----------
field_name: str
The field name of the information
data: numpy array or list or None
The array ofdata to be set
field_name: string
The field name of the information.
data: list, numpy array or None
The array of data to be set.
"""
if self.handle is None:
raise Exception("Cannot set %s before construct dataset" % field_name)
......@@ -934,13 +947,13 @@ class Dataset(object):
Parameters
----------
field_name: str
The field name of the information
field_name: string
The field name of the information.
Returns
-------
info : array
A numpy array of information of the data
info : numpy array
A numpy array with information from the Dataset.
"""
if self.handle is None:
raise Exception("Cannot get %s before construct dataset" % field_name)
......@@ -967,14 +980,12 @@ class Dataset(object):
raise TypeError("Unknown type")
def set_categorical_feature(self, categorical_feature):
"""
Set categorical features
"""Set categorical features.
Parameters
----------
categorical_feature : list of int or str
Name/index of categorical features
categorical_feature : list of int or strings
Names or indices of categorical features.
"""
if self.categorical_feature == categorical_feature:
return
......@@ -1005,13 +1016,12 @@ class Dataset(object):
raise LightGBMError("Cannot set predictor after freed raw data, set free_raw_data=False when construct Dataset to avoid this.")
def set_reference(self, reference):
"""
Set reference dataset
"""Set reference Dataset.
Parameters
----------
reference : Dataset
Will use reference as template to consturct current dataset
Reference that is used as a template to construct the current Dataset.
"""
self.set_categorical_feature(reference.categorical_feature)
self.set_feature_name(reference.feature_name)
......@@ -1026,13 +1036,12 @@ class Dataset(object):
raise LightGBMError("Cannot set reference after freed raw data, set free_raw_data=False when construct Dataset to avoid this.")
def set_feature_name(self, feature_name):
"""
Set feature name
"""Set feature name.
Parameters
----------
feature_name : list of str
Feature names
feature_name : list of strings
Feature names.
"""
if feature_name != 'auto':
self.feature_name = feature_name
......@@ -1046,13 +1055,12 @@ class Dataset(object):
ctypes.c_int(len(feature_name))))
def set_label(self, label):
"""
Set label of Dataset
"""Set label of Dataset
Parameters
----------
label: numpy array or list or None
The label information to be set into Dataset
label: list, numpy array or None
The label information to be set into Dataset.
"""
self.label = label
if self.handle is not None:
......@@ -1060,13 +1068,12 @@ class Dataset(object):
self.set_field('label', label)
def set_weight(self, weight):
"""
Set weight of each instance.
"""Set weight of each instance.
Parameters
----------
weight : numpy array or list or None
Weight for each data point
weight : list, numpy array or None
Weight to be set for each data point.
"""
self.weight = weight
if self.handle is not None and weight is not None:
......@@ -1074,13 +1081,12 @@ class Dataset(object):
self.set_field('weight', weight)
def set_init_score(self, init_score):
"""
Set init score of booster to start from.
"""Set init score of Booster to start from.
Parameters
----------
init_score: numpy array or list or None
Init score for booster
init_score : list, numpy array or None
Init score for Booster.
"""
self.init_score = init_score
if self.handle is not None and init_score is not None:
......@@ -1088,13 +1094,12 @@ class Dataset(object):
self.set_field('init_score', init_score)
def set_group(self, group):
"""
Set group size of Dataset (used for ranking).
"""Set group size of Dataset (used for ranking).
Parameters
----------
group : numpy array or list or None
Group size of each group
group : list, numpy array or None
Group size of each group.
"""
self.group = group
if self.handle is not None and group is not None:
......@@ -1102,48 +1107,48 @@ class Dataset(object):
self.set_field('group', group)
def get_label(self):
"""
Get the label of the Dataset.
"""Get the label of the Dataset.
Returns
-------
label : array
label : numpy array
The label information from the Dataset.
"""
if self.label is None and self.handle is not None:
self.label = self.get_field('label')
return self.label
def get_weight(self):
"""
Get the weight of the Dataset.
"""Get the weight of the Dataset.
Returns
-------
weight : array
weight : numpy array
Weight for each data point from the Dataset.
"""
if self.weight is None and self.handle is not None:
self.weight = self.get_field('weight')
return self.weight
def get_init_score(self):
"""
Get the initial score of the Dataset.
"""Get the initial score of the Dataset.
Returns
-------
init_score : array
init_score : numpy array
Init score of Booster.
"""
if self.init_score is None and self.handle is not None:
self.init_score = self.get_field('init_score')
return self.init_score
def get_group(self):
"""
Get the group of the Dataset.
"""Get the group of the Dataset.
Returns
-------
init_score : array
group : numpy array
Group size of each group.
"""
if self.group is None and self.handle is not None:
self.group = self.get_field('group')
......@@ -1156,12 +1161,12 @@ class Dataset(object):
return self.group
def num_data(self):
"""
Get the number of rows in the Dataset.
"""Get the number of rows in the Dataset.
Returns
-------
number of rows : int
number_of_rows : int
The number of rows in the Dataset.
"""
if self.handle is not None:
ret = ctypes.c_int()
......@@ -1172,12 +1177,12 @@ class Dataset(object):
raise LightGBMError("Cannot get num_data before construct dataset")
def num_feature(self):
"""
Get the number of columns (features) in the Dataset.
"""Get the number of columns (features) in the Dataset.
Returns
-------
number of columns : int
number_of_columns : int
The number of columns (features) in the Dataset.
"""
if self.handle is not None:
ret = ctypes.c_int()
......@@ -1188,14 +1193,19 @@ class Dataset(object):
raise LightGBMError("Cannot get num_feature before construct dataset")
def get_ref_chain(self, ref_limit=100):
'''
Gets a chain of Dataset objects, starting with r, then going to r.reference if exists,
then to r.reference.reference, etc. until we hit ref_limit or a reference loop
"""Get a chain of Dataset objects, starting with r, then going to r.reference if exists,
then to r.reference.reference, etc. until we hit ``ref_limit`` or a reference loop.
Parameters
----------
ref_limit : int, optional (default=100)
The limit number of references.
Returns
-------
chain of references of self : set of Dataset objects
'''
ref_chain : set of Dataset
Chain of references of the Datasets.
"""
head = self
ref_chain = set()
while len(ref_chain) < ref_limit:
......@@ -1211,21 +1221,20 @@ class Dataset(object):
class Booster(object):
""""Booster in LightGBM."""
"""Booster in LightGBM."""
def __init__(self, params=None, train_set=None, model_file=None, silent=False):
"""
Initialize the Booster.
"""Initialize the Booster.
Parameters
----------
params : dict
Parameters for boosters.
train_set : Dataset
Training dataset
model_file : string
params: dict or None, optional (default=None)
Parameters for Booster.
train_set : Dataset or None, optional (default=None)
Training dataset.
model_file : string or None, optional (default=None)
Path to the model file.
silent : boolean, optional
Whether print messages during construction
silent : bool, optional (default=False)
Whether to print messages during construction.
"""
self.handle = None
self.__need_reload_eval_info = True
......@@ -1325,6 +1334,7 @@ class Booster(object):
self.__dict__.update(state)
def free_dataset(self):
"""Free Booster's Datasets."""
self.__dict__.pop('train_set', None)
self.__dict__.pop('valid_sets', None)
self.__num_dataset = 0
......@@ -1334,21 +1344,27 @@ class Booster(object):
self.__is_predicted_cur_iter = []
def set_train_data_name(self, name):
"""Set the name to the training Dataset.
Parameters
----------
name: string
Name for training Dataset.
"""
self.__train_data_name = name
def add_valid(self, data, name):
"""
Add an validation data
"""Add validation data.
Parameters
----------
data : Dataset
Validation data
name : String
Name of validation data
Validation data.
name : string
Name of validation data.
"""
if not isinstance(data, Dataset):
raise TypeError('valid data should be Dataset instance, met {}'.format(type(data).__name__))
raise TypeError('Validation data should be Dataset instance, met {}'.format(type(data).__name__))
if data._predictor is not self.__init_predictor:
raise LightGBMError("Add validation data failed, you should use same predictor for these data")
_safe_call(_LIB.LGBM_BoosterAddValidData(
......@@ -1361,15 +1377,12 @@ class Booster(object):
self.__is_predicted_cur_iter.append(False)
def reset_parameter(self, params):
"""
Reset parameters for booster
"""Reset parameters of Booster.
Parameters
----------
params : dict
New parameters for boosters
silent : boolean, optional
Whether print messages during construction
New parameters for Booster.
"""
if 'metric' in params:
self.__need_reload_eval_info = True
......@@ -1380,22 +1393,24 @@ class Booster(object):
c_str(params_str)))
def update(self, train_set=None, fobj=None):
"""
Update for one iteration
Note: for multi-class task, the score is group by class_id first, then group by row_id
if you want to get i-th row score in j-th class, the access way is score[j*num_data+i]
and you should group grad and hess in this way as well
"""Update for one iteration.
Parameters
----------
train_set :
Training data, None means use last training data
fobj : function
train_set : Dataset or None, optional (default=None)
Training data.
If None, last training data is used.
fobj : callable or None, optional (default=None)
Customized objective function.
For multi-class task, the score is group by class_id first, then group by row_id.
If you want to get i-th row score in j-th class, the access way is score[j * num_data + i]
and you should group grad and hess in this way as well.
Returns
-------
is_finished, bool
is_finished : bool
Whether the update was successfully finished.
"""
"""need reset training data"""
......@@ -1452,14 +1467,19 @@ class Booster(object):
return is_finished.value == 1
def rollback_one_iter(self):
"""
Rollback one iteration
"""
"""Rollback one iteration."""
_safe_call(_LIB.LGBM_BoosterRollbackOneIter(
self.handle))
self.__is_predicted_cur_iter = [False for _ in range_(self.__num_dataset)]
def current_iteration(self):
"""Get the index of the current iteration.
Returns
-------
cur_iter : int
The index of the current iteration.
"""
out_cur_iter = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
self.handle,
......@@ -1467,20 +1487,21 @@ class Booster(object):
return out_cur_iter.value
def eval(self, data, name, feval=None):
"""
Evaluate for data
"""Evaluate for data.
Parameters
----------
data : Dataset object
name :
Name of data
feval : function
data : Dataset
Data for the evaluating.
name : string
Name of the data.
feval : callable or None, optional (default=None)
Custom evaluation function.
Returns
-------
result: list
Evaluation result list.
List with evaluation results.
"""
if not isinstance(data, Dataset):
raise TypeError("Can only eval for Dataset instance")
......@@ -1500,48 +1521,46 @@ class Booster(object):
return self.__inner_eval(name, data_idx, feval)
def eval_train(self, feval=None):
"""
Evaluate for training data
"""Evaluate for training data.
Parameters
----------
feval : function
feval : callable or None, optional (default=None)
Custom evaluation function.
Returns
-------
result: str
Evaluation result list.
result: list
List with evaluation results.
"""
return self.__inner_eval(self.__train_data_name, 0, feval)
def eval_valid(self, feval=None):
"""
Evaluate for validation data
"""Evaluate for validation data.
Parameters
----------
feval : function
feval : callable or None, optional (default=None)
Custom evaluation function.
Returns
-------
result: str
Evaluation result list.
result: list
List with evaluation results.
"""
return [item for i in range_(1, self.__num_dataset)
for item in self.__inner_eval(self.name_valid_sets[i - 1], i, feval)]
def save_model(self, filename, num_iteration=-1):
"""
Save model of booster to file
"""Save Booster to file.
Parameters
----------
filename : str
Filename to save
num_iteration: int
Number of iteration that want to save. < 0 means save the best iteration(if have)
filename : string
Filename to save Booster.
num_iteration: int, optional (default=-1)
Index of the iteration that should be saved.
If <0, the best iteration (if exists) is saved.
"""
if num_iteration <= 0:
num_iteration = self.best_iteration
......@@ -1596,17 +1615,18 @@ class Booster(object):
return string_buffer.value.decode()
def dump_model(self, num_iteration=-1):
"""
Dump model to json format
"""Dump Booster to json format.
Parameters
----------
num_iteration: int
Number of iteration that want to dump. < 0 means dump to best iteration(if have)
num_iteration: int, optional (default=-1)
Index of the iteration that should be dumped.
If <0, the best iteration (if exists) is dumped.
Returns
-------
Json format of model
json_repr : dict
Json format of Booster.
"""
if num_iteration <= 0:
num_iteration = self.best_iteration
......@@ -1633,32 +1653,34 @@ class Booster(object):
ptr_string_buffer))
return json.loads(string_buffer.value.decode())
def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True,
pred_parameter=None):
"""
Predict logic
def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False,
data_has_header=False, is_reshape=True, pred_parameter=None):
"""Make a prediction.
Parameters
----------
data : string/numpy array/scipy.sparse
Data source for prediction
When data type is string, it represents the path of txt file
num_iteration : int
Used iteration for prediction, < 0 means predict for best iteration(if have)
raw_score : bool
True for predict raw score
pred_leaf : bool
True for predict leaf index
data_has_header : bool
Used for txt data
is_reshape : bool
Reshape to (nrow, ncol) if true
pred_parameter: dict
Other parameters for the prediction
data : string, numpy array or scipy.sparse
Data source for prediction.
If string, it represents the path to txt file.
num_iteration : int, optional (default=-1)
Iteration used for prediction.
If <0, the best iteration (if exists) is used for prediction.
raw_score : bool, optional (default=False)
Whether to predict raw scores.
pred_leaf : bool, optional (default=False)
Whether to predict leaf index.
data_has_header : bool, optional (default=False)
Whether the data has header.
Used only if data is string.
is_reshape : bool, optional (default=True)
If True, result is reshaped to [nrow, ncol].
pred_parameter: dict or None, optional (default=None)
Other parameters for the prediction.
Returns
-------
Prediction result
result : numpy array
Prediction result.
"""
predictor = self._to_predictor(pred_parameter)
if num_iteration <= 0:
......@@ -1666,6 +1688,20 @@ class Booster(object):
return predictor.predict(data, num_iteration, raw_score, pred_leaf, data_has_header, is_reshape)
def get_leaf_output(self, tree_id, leaf_id):
"""Get the output of a leaf.
Parameters
----------
tree_id : int
The index of the tree.
leaf_id : int
The index of the leaf in the tree.
Returns
-------
result : float
The output of the leaf.
"""
ret = ctypes.c_double(0)
_safe_call(_LIB.LGBM_BoosterGetLeafValue(
self.handle,
......@@ -1681,7 +1717,13 @@ class Booster(object):
return predictor
def num_feature(self):
"""Get num of features"""
"""Get number of features.
Returns
-------
num_feature : int
The number of features.
"""
out_num_feature = ctypes.c_int(0)
_safe_call(_LIB.LGBM_BoosterGetNumFeature(
self.handle,
......@@ -1689,13 +1731,12 @@ class Booster(object):
return out_num_feature.value
def feature_name(self):
"""
Get feature names.
"""Get names of features.
Returns
-------
result : array
Array of feature names.
result : list
List with names of features.
"""
num_feature = self.num_feature()
"""Get name of features"""
......@@ -1711,20 +1752,19 @@ class Booster(object):
return [string_buffers[i].value.decode() for i in range_(num_feature)]
def feature_importance(self, importance_type='split', iteration=-1):
"""
Get feature importances
"""Get feature importances.
Parameters
----------
importance_type : str, default "split"
How the importance is calculated: "split" or "gain"
"split" is the number of times a feature is used in a model
"gain" is the total gain of splits which use the feature
importance_type : string, optional (default="split")
How the importance is calculated.
If "split", result contains numbers of times the feature is used in a model.
If "gain", result contains total gains of splits which use the feature.
Returns
-------
result : array
Array of feature importances.
result : numpy array
Array with feature importances.
"""
if importance_type == "split":
importance_type_int = 0
......@@ -1834,29 +1874,29 @@ class Booster(object):
[name.startswith(('auc', 'ndcg', 'map')) for name in self.__name_inner_eval]
def attr(self, key):
"""
Get attribute string from the Booster.
"""Get attribute string from the Booster.
Parameters
----------
key : str
The key to get attribute from.
key : string
The name of the attribute.
Returns
-------
value : str
The attribute value of the key, returns None if attribute do not exist.
value : string or None
The attribute value.
Returns None if attribute do not exist.
"""
return self.__attr.get(key, None)
def set_attr(self, **kwargs):
"""
Set the attribute of the Booster.
"""Set the attribute of the Booster.
Parameters
----------
**kwargs
The attributes to set. Setting a value to None deletes an attribute.
The attributes to set.
Setting a value to None deletes an attribute.
"""
for key, value in kwargs.items():
if value is not None:
......
......@@ -10,6 +10,7 @@ from .compat import range_
class EarlyStopException(Exception):
"""Exception of early stopping.
Parameters
----------
best_iteration : int
......@@ -46,20 +47,19 @@ def _format_eval_result(value, show_stdv=True):
def print_evaluation(period=1, show_stdv=True):
"""Create a callback that print evaluation result.
"""Create a callback that prints the evaluation results.
Parameters
----------
period : int
The period to log the evaluation results
show_stdv : bool, optional
Whether show stdv if provided
period : int, optional (default=1)
The period to print the evaluation results.
show_stdv : bool, optional (default=True)
Whether to show stdv (if provided).
Returns
-------
callback : function
A callback that print evaluation every period iterations.
The callback that prints the evaluation results every ``period`` iteration(s).
"""
def callback(env):
"""internal function"""
......@@ -71,7 +71,7 @@ def print_evaluation(period=1, show_stdv=True):
def record_evaluation(eval_result):
"""Create a call back that records the evaluation history into eval_result.
"""Create a callback that records the evaluation history into ``eval_result``.
Parameters
----------
......@@ -81,7 +81,7 @@ def record_evaluation(eval_result):
Returns
-------
callback : function
The requested callback function.
The callback that records the evaluation history into the passed dictionary.
"""
if not isinstance(eval_result, dict):
raise TypeError('Eval_result should be a dictionary')
......@@ -103,22 +103,25 @@ def record_evaluation(eval_result):
def reset_parameter(**kwargs):
"""Reset parameter after first iteration
"""Create a callback that resets the parameter after the first iteration.
NOTE: the initial parameter will still take in-effect on first iteration.
Note
----
The initial parameter will still take in-effect on first iteration.
Parameters
----------
**kwargs: value should be list or function
List of parameters for each boosting round
or a customized function that calculates learning_rate in terms of
current number of round (e.g. yields learning rate decay)
- list l: parameter = l[current_round]
- function f: parameter = f(current_round)
or a customized function that calculates the parameter in terms of
current number of round (e.g. yields learning rate decay).
If list lst, parameter = lst[current_round].
If function func, parameter = func(current_round).
Returns
-------
callback : function
The requested callback function.
The callback that resets the parameter after the first iteration.
"""
def callback(env):
"""internal function"""
......@@ -144,22 +147,25 @@ def reset_parameter(**kwargs):
def early_stopping(stopping_rounds, verbose=True):
"""Create a callback that activates early stopping.
Note
----
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them
Requires at least one validation data and one metric.
If there's more than one, will check all of them.
Parameters
----------
stopping_rounds : int
The stopping rounds before the trend occur.
The possible number of rounds without the trend occurrence.
verbose : optional, bool
Whether to print message about early stopping information.
verbose : bool, optional (default=True)
Whether to print message with early stopping information.
Returns
-------
callback : function
The requested callback function.
The callback that activates early stopping.
"""
best_score = []
best_iter = []
......
......@@ -22,8 +22,7 @@ def train(params, train_set, num_boost_round=100,
early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None,
keep_training_booster=False, callbacks=None):
"""
Train with given parameters.
"""Perform the training with given parameters.
Parameters
----------
......@@ -31,68 +30,67 @@ def train(params, train_set, num_boost_round=100,
Parameters for training.
train_set : Dataset
Data to be trained.
num_boost_round: int
num_boost_round: int, optional (default=100)
Number of boosting iterations.
valid_sets: list of Datasets
List of data to be evaluated during training
valid_names: list of string
Names of valid_sets
fobj : function
valid_sets: list of Datasets or None, optional (default=None)
List of data to be evaluated during training.
valid_names: list of string or None, optional (default=None)
Names of ``valid_sets``.
fobj : callable or None, optional (default=None)
Customized objective function.
feval : function
feval : callable or None, optional (default=None)
Customized evaluation function.
Note: should return (eval_name, eval_result, is_higher_better) of list of this
init_model : file name of lightgbm model or 'Booster' instance
model used for continued train
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will add 'best_iteration' field
evals_result: dict or None
This dictionary used to store all evaluation results of all the items in valid_sets.
Example: with a valid_sets containing [valid_set, train_set]
and valid_names containing ['eval', 'train']
and a paramater containing ('metric':'logloss')
Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
'eval': {'logloss': ['0.480385', '0.357756', ...]}}
passed with None means no using this function
verbose_eval : bool or int
Requires at least one item in evals.
If `verbose_eval` is True,
the eval metric on the valid set is printed at each boosting stage.
If `verbose_eval` is int,
the eval metric on the valid set is printed at every `verbose_eval` boosting stage.
The last boosting stage
or the boosting stage found by using `early_stopping_rounds` is also printed.
Example: with verbose_eval=4 and at least one item in evals,
an evaluation metric is printed every 4 (instead of 1) boosting stages.
learning_rates: list or function
List of learning rate for each boosting round
or a customized function that calculates learning_rate
in terms of current number of round (e.g. yields learning rate decay)
- list l: learning_rate = l[current_round]
- function f: learning_rate = f(current_round)
keep_training_booster : boolean
Whether the return booster will be used to keep training.
If false, will convert into _InnerPredictor before return.
You can still use _InnerPredictor as init_model for future continue training.
callbacks : list of callback functions
Note: should return (eval_name, eval_result, is_higher_better) or list of such tuples.
init_model : string or None, optional (default=None)
Filename of LightGBM model or Booster instance used for continue training.
feature_name : list of strings or 'auto', optional (default="auto")
Feature names.
If 'auto' and data is pandas DataFrame, data columns names are used.
categorical_feature : list of strings or int, or 'auto', optional (default="auto")
Categorical features.
If list of int, interpreted as indices.
If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
early_stopping_rounds: int or None, optional (default=None)
Activates early stopping. The model will train until the validation score stops improving.
Requires at least one validation data and one metric. If there's more than one, will check all of them.
If early stopping occurs, the model will add ``best_iteration`` field.
evals_result: dict or None, optional (default=None)
This dictionary used to store all evaluation results of all the items in ``valid_sets``.
Example
-------
With a ``valid_sets`` = [valid_set, train_set],
``valid_names`` = ['eval', 'train']
and a ``params`` = ('metric':'logloss')
returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
'eval': {'logloss': ['0.480385', '0.357756', ...]}}.
verbose_eval : bool or int, optional (default=True)
Requires at least one validation data.
If True, the eval metric on the valid set is printed at each boosting stage.
If int, the eval metric on the valid set is printed at every ``verbose_eval`` boosting stage.
The last boosting stage or the boosting stage found by using ``early_stopping_rounds`` is also printed.
Example
-------
With ``verbose_eval`` = 4 and at least one item in evals,
an evaluation metric is printed every 4 (instead of 1) boosting stages.
learning_rates: list, callable or None, optional (default=None)
List of learning rates for each boosting round
or a customized function that calculates ``learning_rate``
in terms of current number of round (e.g. yields learning rate decay).
keep_training_booster : bool, optional (default=False)
Whether the returned Booster will be used to keep training.
If False, the returned value will be converted into _InnerPredictor before returning.
You can still use _InnerPredictor as ``init_model`` for future continue training.
callbacks : list of callables or None, optional (default=None)
List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information.
Returns
-------
booster : a trained booster model
booster : Booster
The trained Booster model.
"""
"""create predictor first"""
for alias in ["num_boost_round", "num_iterations", "num_iteration", "num_tree", "num_trees", "num_round", "num_rounds"]:
......@@ -316,68 +314,71 @@ def cv(params, train_set, num_boost_round=10,
early_stopping_rounds=None, fpreproc=None,
verbose_eval=None, show_stdv=True, seed=0,
callbacks=None):
"""
Cross-validation with given paramaters.
"""Perform the cross-validation with given paramaters.
Parameters
----------
params : dict
Booster params.
Parameters for Booster.
train_set : Dataset
Data to be trained.
num_boost_round : int
Data to be trained on.
num_boost_round : int, optional (default=10)
Number of boosting iterations.
folds : a generator or iterator of (train_idx, test_idx) tuples
The train indices and test indices for each folds.
folds : a generator or iterator of (train_idx, test_idx) tuples or None, optional (default=None)
The train and test indices for each fold.
This argument has highest priority over other data split arguments.
nfold : int
nfold : int, optional (default=5)
Number of folds in CV.
stratified : bool
Perform stratified sampling.
shuffle: bool
Whether shuffle before split data
metrics : string or list of strings
Evaluation metrics to be watched in CV.
If `metrics` is not None, the metric in `params` will be overridden.
fobj : function
stratified : bool, optional (default=True)
Whether to perform stratified sampling.
shuffle: bool, optional (default=True)
Whether to shuffle before splitting data.
metrics : string, list of strings or None, optional (default=None)
Evaluation metrics to be monitored while CV.
If not None, the metric in ``params`` will be overridden.
fobj : callable or None, optional (default=None)
Custom objective function.
feval : function
feval : callable or None, optional (default=None)
Custom evaluation function.
init_model : file name of lightgbm model or 'Booster' instance
model used for continued train
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
init_model : string or None, optional (default=None)
Filename of LightGBM model or Booster instance used for continue training.
feature_name : list of strings or 'auto', optional (default="auto")
Feature names.
If 'auto' and data is pandas DataFrame, data columns names are used.
categorical_feature : list of strings or int, or 'auto', optional (default="auto")
Categorical features.
If list of int, interpreted as indices.
If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
early_stopping_rounds: int or None, optional (default=None)
Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue.
every ``early_stopping_rounds`` round(s) to continue.
Last entry in evaluation history is the one from best iteration.
fpreproc : function
Preprocessing function that takes (dtrain, dtest, param)
fpreproc : callable or None, optional (default=None)
Preprocessing function that takes (dtrain, dtest, params)
and returns transformed versions of those.
verbose_eval : bool, int, or None, default None
verbose_eval : bool, int, or None, optional (default=None)
Whether to display the progress.
If None, progress will be displayed when np.ndarray is returned.
If True, progress will be displayed at boosting stage.
If an integer is given,
progress will be displayed at every given `verbose_eval` boosting stage.
show_stdv : bool, default True
If True, progress will be displayed at every boosting stage.
If int, progress will be displayed at every given ``verbose_eval`` boosting stage.
show_stdv : bool, optional (default=True)
Whether to display the standard deviation in progress.
Results are not affected, and always contains std.
seed : int
Results are not affected by this parameter, and always contain std.
seed : int, optional (default=0)
Seed used to generate the folds (passed to numpy.random.seed).
callbacks : list of callback functions
callbacks : list of callables or None, optional (default=None)
List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information.
Returns
-------
evaluation history : list(string)
eval_hist : dict
Evaluation history.
The dictionary has the following format:
{'metric1-mean': [values], 'metric1-stdv': [values],
'metric2-mean': [values], 'metric2-stdv': [values],
...}.
"""
if not isinstance(train_set, Dataset):
raise TypeError("Traninig only accepts Dataset object")
......
......@@ -24,45 +24,50 @@ def plot_importance(booster, ax=None, height=0.2,
xlabel='Feature importance', ylabel='Features',
importance_type='split', max_num_features=None,
ignore_zero=True, figsize=None, grid=True, **kwargs):
"""Plot model feature importances.
"""Plot model's feature importances.
Parameters
----------
booster : Booster or LGBMModel
Booster or LGBMModel instance
ax : matplotlib Axes
Target axes instance. If None, new figure and axes will be created.
height : float
Bar height, passed to ax.barh()
xlim : tuple of 2 elements
Tuple passed to axes.xlim()
ylim : tuple of 2 elements
Tuple passed to axes.ylim()
title : str
Axes title. Pass None to disable.
xlabel : str
X axis title label. Pass None to disable.
ylabel : str
Y axis title label. Pass None to disable.
importance_type : str
How the importance is calculated: "split" or "gain"
"split" is the number of times a feature is used in a model
"gain" is the total gain of splits which use the feature
max_num_features : int
Booster or LGBMModel instance which feature importance should be plotted.
ax : matplotlib.axes.Axes or None, optional (default=None)
Target axes instance.
If None, new figure and axes will be created.
height : float, optional (default=0.2)
Bar height, passed to ``ax.barh()``.
xlim : tuple of 2 elements or None, optional (default=None)
Tuple passed to ``ax.xlim()``.
ylim : tuple of 2 elements or None, optional (default=None)
Tuple passed to ``ax.ylim()``.
title : string or None, optional (default="Feature importance")
Axes title.
If None, title is disabled.
xlabel : string or None, optional (default="Feature importance")
X-axis title label.
If None, title is disabled.
ylabel : string or None, optional (default="Features")
Y-axis title label.
If None, title is disabled.
importance_type : string, optional (default="split")
How the importance is calculated.
If "split", result contains numbers of times the feature is used in a model.
If "gain", result contains total gains of splits which use the feature.
max_num_features : int or None, optional (default=None)
Max number of top features displayed on plot.
If None or smaller than 1, all features will be displayed.
ignore_zero : bool
Ignore features with zero importance
figsize : tuple of 2 elements
Figure size
grid : bool
Whether add grid for axes
**kwargs :
Other keywords passed to ax.barh()
If None or <1, all features will be displayed.
ignore_zero : bool, optional (default=True)
Whether to ignore features with zero importance.
figsize : tuple of 2 elements or None, optional (default=None)
Figure size.
grid : bool, optional (default=True)
Whether to add a grid for axes.
**kwargs : other parameters
Other parameters passed to ``ax.barh()``.
Returns
-------
ax : matplotlib Axes
ax : matplotlib.axes.Axes
The plot with model's feature importances.
"""
try:
import matplotlib.pyplot as plt
......@@ -133,34 +138,40 @@ def plot_metric(booster, metric=None, dataset_names=None,
Parameters
----------
booster : dict or LGBMModel
Evals_result recorded by lightgbm.train() or LGBMModel instance
metric : str or None
Dictionary returned from ``lightgbm.train()`` or LGBMModel instance.
metric : string or None, optional (default=None)
The metric name to plot.
Only one metric supported because different metrics have various scales.
Pass None to pick `first` one (according to dict hashcode).
dataset_names : None or list of str
List of the dataset names to plot.
Pass None to plot all datasets.
ax : matplotlib Axes
Target axes instance. If None, new figure and axes will be created.
xlim : tuple of 2 elements
Tuple passed to axes.xlim()
ylim : tuple of 2 elements
Tuple passed to axes.ylim()
title : str
Axes title. Pass None to disable.
xlabel : str
X axis title label. Pass None to disable.
ylabel : str
Y axis title label. Pass None to disable. Pass 'auto' to use `metric`.
figsize : tuple of 2 elements
Figure size
grid : bool
Whether add grid for axes
If None, first metric picked from dictionary (according to hashcode).
dataset_names : list of strings or None, optional (default=None)
List of the dataset names which are used to calculate metric to plot.
If None, all datasets are used.
ax : matplotlib.axes.Axes or None, optional (default=None)
Target axes instance.
If None, new figure and axes will be created.
xlim : tuple of 2 elements or None, optional (default=None)
Tuple passed to ``ax.xlim()``.
ylim : tuple of 2 elements or None, optional (default=None)
Tuple passed to ``ax.ylim()``.
title : string or None, optional (default="Metric during training")
Axes title.
If None, title is disabled.
xlabel : string or None, optional (default="Iterations")
X-axis title label.
If None, title is disabled.
ylabel : string or None, optional (default="auto")
Y-axis title label.
If 'auto', metric name is used.
If None, title is disabled.
figsize : tuple of 2 elements or None, optional (default=None)
Figure size.
grid : bool, optional (default=True)
Whether to add a grid for axes.
Returns
-------
ax : matplotlib Axes
ax : matplotlib.axes.Axes
The plot with metric's history over the training.
"""
try:
import matplotlib.pyplot as plt
......@@ -298,48 +309,52 @@ def create_tree_digraph(booster, tree_index=0, show_info=None,
name=None, comment=None, filename=None, directory=None,
format=None, engine=None, encoding=None, graph_attr=None,
node_attr=None, edge_attr=None, body=None, strict=False):
"""Create a digraph of specified tree.
"""Create a digraph representation of specified tree.
See:
- http://graphviz.readthedocs.io/en/stable/api.html#digraph
Note
----
For more information please visit
http://graphviz.readthedocs.io/en/stable/api.html#digraph.
Parameters
----------
booster : Booster, LGBMModel
booster : Booster or LGBMModel
Booster or LGBMModel instance.
tree_index : int, default 0
Specify tree index of target tree.
show_info : list
Information shows on nodes.
options: 'split_gain', 'internal_value', 'internal_count' or 'leaf_count'.
name : str
tree_index : int, optional (default=0)
The index of a target tree to convert.
show_info : list or None, optional (default=None)
What information should be shown on nodes.
Possible values of list items: 'split_gain', 'internal_value', 'internal_count', 'leaf_count'.
name : string or None, optional (default=None)
Graph name used in the source code.
comment : str
comment : string or None, optional (default=None)
Comment added to the first line of the source.
filename : str
Filename for saving the source (defaults to name + '.gv').
directory : str
filename : string or None, optional (default=None)
Filename for saving the source.
If None, ``name`` + '.gv' is used.
directory : string or None, optional (default=None)
(Sub)directory for source saving and rendering.
format : str
format : string or None, optional (default=None)
Rendering output format ('pdf', 'png', ...).
engine : str
engine : string or None, optional (default=None)
Layout command used ('dot', 'neato', ...).
encoding : str
encoding : string or None, optional (default=None)
Encoding for saving the source.
graph_attr : dict
Mapping of (attribute, value) pairs for the graph.
node_attr : dict
graph_attr : dict or None, optional (default=None)
Mapping of (attribute, value) pairs set for the graph.
node_attr : dict or None, optional (default=None)
Mapping of (attribute, value) pairs set for all nodes.
edge_attr : dict
edge_attr : dict or None, optional (default=None)
Mapping of (attribute, value) pairs set for all edges.
body : list of str
Iterable of lines to add to the graph body.
strict : bool
Iterable of lines to add to the graph body.
body : list of strings or None, optional (default=None)
Lines to add to the graph body.
strict : bool, optional (default=False)
Whether rendering should merge multi-edges.
Returns
-------
graph : graphviz Digraph
graph : graphviz.Digraph
The digraph representation of specified tree.
"""
if isinstance(booster, LGBMModel):
booster = booster.booster_
......@@ -376,27 +391,29 @@ def plot_tree(booster, ax=None, tree_index=0, figsize=None,
Parameters
----------
booster : Booster, LGBMModel
Booster or LGBMModel instance.
ax : matplotlib Axes
Target axes instance. If None, new figure and axes will be created.
tree_index : int, default 0
Specify tree index of target tree.
figsize : tuple of 2 elements
booster : Booster or LGBMModel
Booster or LGBMModel instance to be plotted.
ax : matplotlib.axes.Axes or None, optional (default=None)
Target axes instance.
If None, new figure and axes will be created.
tree_index : int, optional (default=0)
The index of a target tree to plot.
figsize : tuple of 2 elements or None, optional (default=None)
Figure size.
graph_attr : dict
Mapping of (attribute, value) pairs for the graph.
node_attr : dict
graph_attr : dict or None, optional (default=None)
Mapping of (attribute, value) pairs set for the graph.
node_attr : dict or None, optional (default=None)
Mapping of (attribute, value) pairs set for all nodes.
edge_attr : dict
edge_attr : dict or None, optional (default=None)
Mapping of (attribute, value) pairs set for all edges.
show_info : list
Information shows on nodes.
options: 'split_gain', 'internal_value', 'internal_count' or 'leaf_count'.
show_info : list or None, optional (default=None)
What information should be shown on nodes.
Possible values of list items: 'split_gain', 'internal_value', 'internal_count', 'leaf_count'.
Returns
-------
ax : matplotlib Axes
ax : matplotlib.axes.Axes
The plot with single tree.
"""
try:
import matplotlib.pyplot as plt
......
......@@ -188,7 +188,10 @@ class LGBMModel(_LGBMModelBase):
Whether to print messages while running boosting.
**kwargs : other parameters
Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more parameters.
Note: **kwargs is not supported in sklearn, it may cause unexpected issues.
Note
----
**kwargs is not supported in sklearn, it may cause unexpected issues.
Attributes
----------
......@@ -201,13 +204,13 @@ class LGBMModel(_LGBMModelBase):
best_score_ : dict or None
The best score of fitted model.
best_iteration_ : int or None
The best iteration of fitted model if `early_stopping_rounds` has been specified.
The best iteration of fitted model if ``early_stopping_rounds`` has been specified.
objective_ : string or callable
The concrete objective used while fitting this model.
booster_ : Booster
The underlying Booster of this model.
evals_result_ : dict or None
The evaluation results if `early_stopping_rounds` has been specified.
The evaluation results if ``early_stopping_rounds`` has been specified.
feature_importances_ : array of shape = [n_features]
The feature importances (the higher, the more important the feature).
......@@ -319,7 +322,7 @@ class LGBMModel(_LGBMModelBase):
If callable, it should be a custom evaluation metric, see note for more details.
early_stopping_rounds : int or None, optional (default=None)
Activates early stopping. The model will train until the validation score stops improving.
Validation error needs to decrease at least every `early_stopping_rounds` round(s)
Validation error needs to decrease at least every ``early_stopping_rounds`` round(s)
to continue training.
verbose : bool, optional (default=True)
If True and an evaluation set is used, writes the evaluation progress.
......@@ -560,7 +563,9 @@ class LGBMModel(_LGBMModelBase):
def feature_importances_(self):
"""Get feature importances.
Note: feature importance in sklearn interface used to normalize to 1,
Note
----
Feature importance in sklearn interface used to normalize to 1,
it's deprecated after 2.0.4 and same as Booster.feature_importance() now.
"""
if self._n_features is None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment