Commit 54963bb7 authored by Guolin Ke's avatar Guolin Ke
Browse files

Merge branch 'master' of https://github.com/Microsoft/LightGBM

parents 762b5707 ced64bad
This diff is collapsed.
# coding: utf-8
# pylint: disable = C0103, C0111, C0301, C0321, C0330, W0621
import inspect
import lightgbm as lgb
file_api = open('Python_API.md', 'w+')
def write_func(func, leftSpace=0, out=None):
    """Write markdown documentation for one function or method.

    Emits a level-4 heading with the callable's signature (parameter names
    and default values, ``self`` excluded), then the docstring with
    ``leftSpace`` leading columns stripped from each non-blank line.

    Parameters
    ----------
    func : callable
        Function or method to document.
    leftSpace : int
        Number of leading columns to strip from docstring lines
        (4 for class methods, whose docstrings are indented one level).
    out : file-like or None
        Destination stream; defaults to the module-level ``file_api``.
    """
    target = file_api if out is None else out
    # Parameter.empty is a sentinel: compare by identity, not equality,
    # so defaults with unusual __eq__ (e.g. arrays) cannot break this.
    params = [
        v.name + ('=' + str(v.default) if v.default is not v.empty else '')
        for _, v in inspect.signature(func).parameters.items()
        if v.name != 'self'
    ]
    target.write('####' + func.__name__ + '(' + ', '.join(params) + ')\n')
    if func.__doc__:
        for line in func.__doc__.splitlines():
            if line:
                target.write(line[leftSpace:])
            # newline written for every source line, so blank docstring
            # lines survive as blank markdown lines
            target.write('\n')
    target.write('\n')
def write_class(class_):
    """Write markdown documentation for a class.

    Emits a level-3 heading with the class name, then documents
    ``__init__`` and every public attribute of the class, in
    alphabetical order, via ``write_func``.
    """
    file_api.write('###' + class_.__name__ + '\n')
    for attr_name in sorted(class_.__dict__):
        member = class_.__dict__[attr_name]
        is_public = not attr_name.startswith('_')
        if is_public or attr_name == '__init__':
            write_func(member, leftSpace=4)
def write_module(name, members):
    """Write one documentation section.

    Emits a level-2 heading with a horizontal rule, then documents each
    member: classes go through ``write_class``, everything else through
    ``write_func``.
    """
    file_api.write('##' + name + '\n----\n')
    for member in members:
        handler = write_class if inspect.isclass(member) else write_func
        handler(member)
# Emit each section of Python_API.md in order, then close the output file.
write_module('Basic Data Structure API', [
    lgb.Dataset,
    lgb.Booster
])
write_module('Training API', [
    lgb.train,
    lgb.cv
])
write_module('Scikit-learn API', [
    lgb.LGBMModel,
    lgb.LGBMClassifier,
    lgb.LGBMRegressor,
    lgb.LGBMRanker
])
file_api.close()
/* Extra left padding for 4th-level table-of-contents entries in the
   ReadTheDocs theme sidebar; !important overrides the theme default. */
.toctree-l4{
padding: 0.4045em 2.427em 0.4045em 3.227em !important;
}
# MkDocs configuration: site title, ReadTheDocs theme, and the extra
# stylesheet (css/extra.css) that tweaks the sidebar layout.
site_name: LightGBM
theme: readthedocs
extra_css:
- css/extra.css
......@@ -758,7 +758,7 @@ class _InnerDataset(object):
self.set_field('weight', weight)
def set_init_score(self, score):
""" Set init score of booster to start from.
"""Set init score of booster to start from.
Parameters
----------
......@@ -869,7 +869,8 @@ class Dataset(object):
feature_name : list of str
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index, \
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
params: dict, optional
Other parameters
......@@ -919,7 +920,9 @@ class Dataset(object):
return ret
def construct(self):
"""Lazy init"""
"""
Lazy init
"""
if self.inner_dataset is None:
if self.reference is not None:
if self.used_indices is None:
......@@ -1036,7 +1039,8 @@ class Dataset(object):
return ret
def save_binary(self, filename):
"""Save Dataset to binary file
"""
Save Dataset to binary file
Parameters
----------
......@@ -1047,7 +1051,8 @@ class Dataset(object):
def set_label(self, label):
"""Set label of Dataset
"""
Set label of Dataset
Parameters
----------
......@@ -1059,7 +1064,8 @@ class Dataset(object):
self.inner_dataset.set_label(self.label)
def set_weight(self, weight):
""" Set weight of each instance.
"""
Set weight of each instance.
Parameters
----------
......@@ -1071,7 +1077,8 @@ class Dataset(object):
self.inner_dataset.set_weight(self.weight)
def set_init_score(self, init_score):
""" Set init score of booster to start from.
"""
Set init score of booster to start from.
Parameters
----------
......@@ -1083,7 +1090,8 @@ class Dataset(object):
self.inner_dataset.set_init_score(self.init_score)
def set_group(self, group):
"""Set group size of Dataset (used for ranking).
"""
Set group size of Dataset (used for ranking).
Parameters
----------
......@@ -1095,7 +1103,8 @@ class Dataset(object):
self.inner_dataset.set_group(self.group)
def get_label(self):
"""Get the label of the Dataset.
"""
Get the label of the Dataset.
Returns
-------
......@@ -1106,7 +1115,8 @@ class Dataset(object):
return self.label
def get_weight(self):
"""Get the weight of the Dataset.
"""
Get the weight of the Dataset.
Returns
-------
......@@ -1117,7 +1127,8 @@ class Dataset(object):
return self.weight
def get_init_score(self):
"""Get the initial score of the Dataset.
"""
Get the initial score of the Dataset.
Returns
-------
......@@ -1128,7 +1139,8 @@ class Dataset(object):
return self.init_score
def get_group(self):
"""Get the initial score of the Dataset.
"""
Get the initial score of the Dataset.
Returns
-------
......@@ -1139,7 +1151,8 @@ class Dataset(object):
return self.group
def num_data(self):
"""Get the number of rows in the Dataset.
"""
Get the number of rows in the Dataset.
Returns
-------
......@@ -1151,7 +1164,8 @@ class Dataset(object):
raise LightGBMError("Cannot call num_data before construct, please call it explicitly")
def num_feature(self):
"""Get the number of columns (features) in the Dataset.
"""
Get the number of columns (features) in the Dataset.
Returns
-------
......@@ -1166,7 +1180,8 @@ class Booster(object):
""""A Booster of LightGBM.
"""
def __init__(self, params=None, train_set=None, model_file=None, silent=False):
"""Initialize the Booster.
"""
Initialize the Booster.
Parameters
----------
......@@ -1241,7 +1256,8 @@ class Booster(object):
self.__train_data_name = name
def add_valid(self, data, name):
"""Add an validation data
"""
Add an validation data
Parameters
----------
......@@ -1262,7 +1278,8 @@ class Booster(object):
self.__is_predicted_cur_iter.append(False)
def reset_parameter(self, params):
"""Reset parameters for booster
"""
Reset parameters for booster
Parameters
----------
......@@ -1365,7 +1382,8 @@ class Booster(object):
return out_cur_iter.value
def eval(self, data, name, feval=None):
"""Evaluate for data
"""
Evaluate for data
Parameters
----------
......@@ -1397,7 +1415,8 @@ class Booster(object):
return self.__inner_eval(name, data_idx, feval)
def eval_train(self, feval=None):
"""Evaluate for training data
"""
Evaluate for training data
Parameters
----------
......@@ -1412,7 +1431,8 @@ class Booster(object):
return self.__inner_eval(self.__train_data_name, 0, feval)
def eval_valid(self, feval=None):
"""Evaluate for validation data
"""
Evaluate for validation data
Parameters
----------
......@@ -1428,7 +1448,8 @@ class Booster(object):
for item in self.__inner_eval(self.name_valid_sets[i-1], i, feval)]
def save_model(self, filename, num_iteration=-1):
"""Save model of booster to file
"""
Save model of booster to file
Parameters
----------
......@@ -1443,7 +1464,8 @@ class Booster(object):
c_str(filename)))
def dump_model(self):
"""Dump model to json format
"""
Dump model to json format
Returns
-------
......@@ -1471,7 +1493,8 @@ class Booster(object):
return json.loads(string_buffer.value.decode())
def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
"""Predict logic
"""
Predict logic
Parameters
----------
......@@ -1503,7 +1526,8 @@ class Booster(object):
return predictor
def feature_importance(self, importance_type='split'):
"""Feature importances
"""
Feature importances
Returns
-------
......@@ -1615,7 +1639,8 @@ class Booster(object):
[name.startswith(('auc', 'ndcg')) for name in self.__name_inner_eval]
def attr(self, key):
"""Get attribute string from the Booster.
"""
Get attribute string from the Booster.
Parameters
----------
......@@ -1630,7 +1655,8 @@ class Booster(object):
return self.__attr.get(key, None)
def set_attr(self, **kwargs):
"""Set the attribute of the Booster.
"""
Set the attribute of the Booster.
Parameters
----------
......
......@@ -15,7 +15,8 @@ def train(params, train_set, num_boost_round=100,
feature_name=None, categorical_feature=None,
early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None, callbacks=None):
"""Train with given parameters.
"""
Train with given parameters.
Parameters
----------
......@@ -39,7 +40,8 @@ def train(params, train_set, num_boost_round=100,
feature_name : list of str
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index, \
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
early_stopping_rounds: int
Activates early stopping.
......@@ -49,27 +51,29 @@ def train(params, train_set, num_boost_round=100,
If early stopping occurs, the model will add 'best_iteration' field
evals_result: dict or None
This dictionary used to store all evaluation results of all the items in valid_sets.
Example: with a valid_sets containing [valid_set, train_set] \
and valid_names containing ['eval', 'train'] and a paramater containing ('metric':'logloss')
Example: with a valid_sets containing [valid_set, train_set]
and valid_names containing ['eval', 'train']
and a paramater containing ('metric':'logloss')
Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
'eval': {'logloss': ['0.480385', '0.357756', ...]}}
passed with None means no using this function
verbose_eval : bool or int
Requires at least one item in evals.
If `verbose_eval` is True then the evaluation metric on the validation set is \
printed at each boosting stage.
If `verbose_eval` is an integer then the evaluation metric on the validation set \
is printed at every given `verbose_eval` boosting stage. The last boosting stage \
/ the boosting stage found by using `early_stopping_rounds` is also printed.
Example: with verbose_eval=4 and at least one item in evals, an evaluation metric \
is printed every 4 boosting stages, instead of every boosting stage.
If `verbose_eval` is True,
the eval metric on the valid set is printed at each boosting stage.
If `verbose_eval` is int,
the eval metric on the valid set is printed at every `verbose_eval` boosting stage.
The last boosting stage
or the boosting stage found by using `early_stopping_rounds` is also printed.
Example: with verbose_eval=4 and at least one item in evals,
an evaluation metric is printed every 4 (instead of 1) boosting stages.
learning_rates: list or function
List of learning rate for each boosting round \
or a customized function that calculates learning_rate in terms of \
current number of round and the total number of boosting round \
List of learning rate for each boosting round
or a customized function that calculates learning_rate in terms of
current number of round (and the total number of boosting round)
(e.g. yields learning rate decay)
- list l: learning_rate = l[current_round]
- function f: learning_rate = f(current_round, total_boost_round) \
- function f: learning_rate = f(current_round, total_boost_round)
or learning_rate = f(current_round)
callbacks : list of callback functions
List of callback functions that are applied at end of each iteration.
......@@ -259,12 +263,13 @@ def _agg_cv_result(raw_results):
return [('cv_agg', k, np.mean(v), metric_type[k], np.std(v)) for k, v in cvmap.items()]
def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
metrics=(), fobj=None, feval=None, init_model=None,
metrics=None, fobj=None, feval=None, init_model=None,
feature_name=None, categorical_feature=None,
early_stopping_rounds=None, fpreproc=None,
verbose_eval=None, show_stdv=True, seed=0,
callbacks=None):
"""Cross-validation with given paramaters.
"""
Cross-validation with given paramaters.
Parameters
----------
......@@ -291,20 +296,21 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
feature_name : list of str
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index, \
Categorical features, type int represents index,
type str represents feature names (need to specify feature_name as well)
early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least \
Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue.
Last entry in evaluation history is the one from best iteration.
fpreproc : function
Preprocessing function that takes (dtrain, dtest, param) and returns \
transformed versions of those.
Preprocessing function that takes (dtrain, dtest, param)
and returns transformed versions of those.
verbose_eval : bool, int, or None, default None
Whether to display the progress. If None, progress will be displayed \
when np.ndarray is returned. If True, progress will be displayed at \
boosting stage. If an integer is given, progress will be displayed \
at every given `verbose_eval` boosting stage.
Whether to display the progress.
If None, progress will be displayed when np.ndarray is returned.
If True, progress will be displayed at boosting stage.
If an integer is given,
progress will be displayed at every given `verbose_eval` boosting stage.
show_stdv : bool, default True
Whether to display the standard deviation in progress.
Results are not affected, and always contains std.
......
......@@ -128,7 +128,16 @@ def _eval_function_wrapper(func):
return inner
class LGBMModel(LGBMModelBase):
"""Implementation of the Scikit-Learn API for LightGBM.
def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
silent=True, objective="regression",
nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
"""
Implementation of the Scikit-Learn API for LightGBM.
Parameters
----------
......@@ -145,6 +154,7 @@ class LGBMModel(LGBMModelBase):
objective : string or callable
Specify the learning task and the corresponding learning objective or
a custom objective function to be used (see note below).
default: binary for LGBMClassifier, lambdarank for LGBMRanker
nthread : int
Number of parallel threads
min_split_gain : float
......@@ -174,7 +184,8 @@ class LGBMModel(LGBMModelBase):
----
A custom objective function can be provided for the ``objective``
parameter. In this case, it should have the signature
``objective(y_true, y_pred) -> grad, hess`` or ``objective(y_true, y_pred, group) -> grad, hess``:
``objective(y_true, y_pred) -> grad, hess``
or ``objective(y_true, y_pred, group) -> grad, hess``:
y_true: array_like of shape [n_samples]
The target values
......@@ -191,14 +202,6 @@ class LGBMModel(LGBMModelBase):
if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
and you should group grad and hess in this way as well
"""
def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
silent=True, objective="regression",
nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
if not SKLEARN_INSTALLED:
raise LightGBMError('Scikit-learn is required for this module')
......@@ -229,8 +232,8 @@ class LGBMModel(LGBMModelBase):
self.fobj = None
def booster(self):
"""Get the underlying lightgbm Booster of this model.
"""
Get the underlying lightgbm Booster of this model.
This will raise an exception when fit was not called
Returns
......@@ -242,7 +245,9 @@ class LGBMModel(LGBMModelBase):
return self._Booster
def get_params(self, deep=False):
"""Get parameters"""
"""
Get parameters
"""
params = super(LGBMModel, self).get_params(deep=deep)
if self.nthread <= 0:
params.pop('nthread', None)
......@@ -288,20 +293,23 @@ class LGBMModel(LGBMModelBase):
feature_name : list of str
Feature names
categorical_feature : list of str or int
Categorical features, type int represents index, \
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
other_params: dict
Other parameters
Note
----
Custom eval function expects a callable with following functions: ``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)``
Custom eval function expects a callable with following functions:
``func(y_true, y_pred)``, ``func(y_true, y_pred, weight)``
or ``func(y_true, y_pred, weight, group)``.
return (eval_name, eval_result, is_bigger_better) or list of (eval_name, eval_result, is_bigger_better)
return (eval_name, eval_result, is_bigger_better)
or list of (eval_name, eval_result, is_bigger_better)
y_true: array_like of shape [n_samples]
The target values
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
y_pred: array_like of shape [n_samples] or shape[n_samples * n_class] (for multi-class)
The predicted values
weight: array_like of shape [n_samples]
The weight of samples
......@@ -383,20 +391,36 @@ class LGBMModel(LGBMModelBase):
return self
def predict(self, data, raw_score=False, num_iteration=0):
"""
Return the predicted value for each sample.
Parameters
----------
X : array_like, shape=[n_samples, n_features]
Input features matrix.
num_iteration : int
Limit number of iterations in the prediction; defaults to 0 (use all trees).
Returns
-------
predicted_result : array_like, shape=[n_samples] or [n_samples, n_classes]
"""
return self.booster().predict(data,
raw_score=raw_score,
num_iteration=num_iteration)
def apply(self, X, num_iteration=0):
"""Return the predicted leaf every tree for each sample.
"""
Return the predicted leaf every tree for each sample.
Parameters
----------
X : array_like, shape=[n_samples, n_features]
Input features matrix.
ntree_limit : int
Limit number of trees in the prediction; defaults to 0 (use all trees).
num_iteration : int
Limit number of iterations in the prediction; defaults to 0 (use all trees).
Returns
-------
......@@ -407,7 +431,9 @@ class LGBMModel(LGBMModelBase):
num_iteration=num_iteration)
def evals_result(self):
"""Return the evaluation results.
"""
Return the evaluation results.
Returns
-------
evals_result : dictionary
......@@ -420,7 +446,9 @@ class LGBMModel(LGBMModelBase):
return evals_result
def feature_importance(self):
"""Feature importances
"""
Feature importances
Returns
-------
Array of normailized feature importances
......@@ -429,8 +457,6 @@ class LGBMModel(LGBMModelBase):
return importace_array / importace_array.sum()
class LGBMRegressor(LGBMModel, LGBMRegressorBase):
__doc__ = """Implementation of the scikit-learn API for LightGBM regression.
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def fit(self, X, y,
sample_weight=None, init_score=None,
......@@ -449,9 +475,6 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
return self
class LGBMClassifier(LGBMModel, LGBMClassifierBase):
__doc__ = """Implementation of the scikit-learn API for LightGBM classification.
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
......@@ -511,6 +534,21 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
return self._le.inverse_transform(column_indexes)
def predict_proba(self, data, raw_score=False, num_iteration=0):
"""
Return the predicted probability for each class for each sample.
Parameters
----------
X : array_like, shape=[n_samples, n_features]
Input features matrix.
num_iteration : int
Limit number of iterations in the prediction; defaults to 0 (use all trees).
Returns
-------
predicted_probability : array_like, shape=[n_samples, n_classes]
"""
class_probs = self.booster().predict(data,
raw_score=raw_score,
num_iteration=num_iteration)
......@@ -522,9 +560,6 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
return np.vstack((classzero_probs, classone_probs)).transpose()
class LGBMRanker(LGBMModel):
__doc__ = """Implementation of the scikit-learn API for LightGBM ranking application.
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
......@@ -550,7 +585,7 @@ class LGBMRanker(LGBMModel):
feature_name=None, categorical_feature=None,
other_params=None):
"""
Most arguments like LGBMModel.fit except following:
Most arguments like common methods except following:
eval_at : list of int
The evaulation positions of NDCG
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.