"tests/git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "b60068c810cbcac9cf4e1a8e678d8d531c40eb72"
Commit ebfc8521 authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

add an advanced example; add guide-python README.md details; clean error messages (#117)

parent b51c7be4
......@@ -16,3 +16,23 @@ Now you can run examples in this folder, for example:
```
python simple_example.py
```
Examples include:
- [simple_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py)
- Construct Dataset
- Basic train and predict
- Eval during training
- Early stopping
- Save model to file
- Dump model to json format
- Feature importances
- [sklearn_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/sklearn_example.py)
- Basic train and predict with sklearn interface
- Feature importances with sklearn interface
- [advanced_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py)
- Set feature names
- Directly use categorical features without one-hot encoding
- Load model file to continue training
- Change learning rates during training
- Self-defined objective function
- Self-defined eval metric
- Callback function
\ No newline at end of file
# coding: utf-8
# pylint: disable = invalid-name, C0111
# Advanced LightGBM example: named/categorical features, continued training,
# learning-rate decay, self-defined objective/metric, and callbacks.
import lightgbm as lgb
import pandas as pd
import numpy as np
# load or create your dataset
print('Load data...')
# Tab-separated binary-classification data: column 0 is the label,
# the remaining columns are features.
df_train = pd.read_csv('../binary_classification/binary.train', header=None, sep='\t')
df_test = pd.read_csv('../binary_classification/binary.test', header=None, sep='\t')
# Per-row sample weights (one value per line).
W_train = pd.read_csv('../binary_classification/binary.train.weight', header=None)[0]
W_test = pd.read_csv('../binary_classification/binary.test.weight', header=None)[0]
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
num_train, num_feature = X_train.shape
# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train,
weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
weight=W_test, free_raw_data=False)
# specify your configurations as a dict
params = {
'boosting_type' : 'gbdt',
'objective' : 'binary',
'metric' : 'binary_logloss',
'num_leaves' : 31,
'learning_rate' : 0.05,
'feature_fraction' : 0.9,
'bagging_fraction' : 0.8,
'bagging_freq': 5,
'verbose' : 0
}
# generate a feature name for each of the num_feature columns
feature_name = ['feature_' + str(col) for col in range(num_feature)]
print('Start training...')
# feature_name and categorical_feature
# column index 21 is treated as categorical without one-hot encoding
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train, # eval training data
feature_name=feature_name,
categorical_feature=[21])
# check feature name
print('Finish first 10 rounds...')
print('7th feature name is:', repr(lgb_train.feature_name[6]))
# save model to file
gbm.save_model('model.txt')
# continue training
# init_model accepts:
# 1. model file name
# 2. Booster()
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model='model.txt',
valid_sets=lgb_eval)
print('Finish 10 - 20 rounds with model file...')
# decay learning rates
# learning_rates accepts:
# 1. list/tuple with length = num_boost_round
# 2. function(curr_iter)
# 3. function(curr_iter, total_iter)
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model=gbm,
learning_rates=lambda iter: 0.05 * (0.99 ** iter),
valid_sets=lgb_eval)
print('Finish 20 - 30 rounds with decay learning rates...')
# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelood(preds, train_data):
    """Log-likelihood (binary cross-entropy) objective for lgb.train(fobj=...).

    preds are raw scores; labels come from the Dataset. Returns the
    gradient and hessian of the loss w.r.t. the raw scores.
    """
    labels = train_data.get_label()
    # sigmoid maps raw scores to probabilities
    prob = 1. / (1. + np.exp(-preds))
    gradient = prob - labels
    hessian = prob * (1. - prob)
    return gradient, hessian
# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
# binary error
def binary_error(preds, train_data):
    """Binary-error metric for lgb.train(feval=...).

    Returns (metric_name, value, is_higher_better): a prediction is
    counted as class 1 when its score exceeds 0.5; lower error is better.
    """
    y_true = train_data.get_label()
    predicted_class = preds > 0.5
    return 'error', np.mean(y_true != predicted_class), False
# continue training with the self-defined objective (fobj) and
# eval metric (feval); init_model=gbm resumes from the booster above
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model=gbm,
fobj=loglikelood,
feval=binary_error,
valid_sets=lgb_eval)
print('Finish 30 - 40 rounds with self-defined objective function and eval metric...')
print('Start a new training job...')
# callback
def reset_metrics():
    """Build a training callback that registers an extra valid set.

    The returned callback runs before each iteration (before_iteration=True)
    and, at the 5th iteration of the current job, attaches a fresh
    evaluation Dataset to the booster under the name 'new valid'.
    """
    def _callback(env):
        fresh_valid = lgb.Dataset(X_test, y_test, reference=lgb_train)
        if env.iteration - env.begin_iteration == 5:
            print('Add a new valid dataset at iteration 5...')
            env.model.add_valid(fresh_valid, 'new valid')
    _callback.before_iteration = True
    _callback.order = 0
    return _callback
# start a fresh training job driven by the self-defined callback,
# which adds a new valid dataset partway through training
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train,
callbacks=[reset_metrics()])
print('Finish first 10 rounds with callback function...')
......@@ -6,6 +6,7 @@ import pandas as pd
from sklearn.metrics import mean_squared_error
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
......@@ -18,7 +19,6 @@ X_test = df_test.drop(0, axis=1)
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params = {
'task' : 'train',
......@@ -33,27 +33,32 @@ params = {
'verbose' : 0
}
print('Start training...')
# train
gbm = lgb.train(params,
lgb_train,
num_boost_round=100,
num_boost_round=20,
valid_sets=lgb_eval,
early_stopping_rounds=10)
early_stopping_rounds=5)
print('Save model...')
# save model to file
gbm.save_model('model.txt')
print('Start predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('Dump model to JSON...')
# dump model to json (and save to file)
model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
json.dump(model_json, f, indent=4)
print('Calculate feature importances...')
# feature importances
print('Feature importances:', gbm.feature_importance())
print('Feature importances:', gbm.feature_importance("gain"))
print('Feature importances:', list(gbm.feature_importance()))
# print('Feature importances:', list(gbm.feature_importance("gain")))
......@@ -5,6 +5,7 @@ import pandas as pd
from sklearn.metrics import mean_squared_error
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
......@@ -13,19 +14,23 @@ y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
print('Start training...')
# train
gbm = lgb.LGBMRegressor(objective='regression',
num_leaves=31,
learning_rate=0.05,
n_estimators=100)
n_estimators=20)
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
early_stopping_rounds=10)
eval_metric='l1',
early_stopping_rounds=5)
print('Start predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('Calculate feature importances...')
# feature importances
print('Feature importances:', gbm.feature_importance())
print('Feature importances:', list(gbm.feature_importance()))
This diff is collapsed.
......@@ -35,7 +35,7 @@ def _format_eval_result(value, show_stdv=True):
else:
return '%s\'s %s:%g' % (value[0], value[1], value[2])
else:
raise ValueError("wrong metric value")
raise ValueError("Wrong metric value")
def print_evaluation(period=1, show_stdv=True):
......@@ -80,7 +80,7 @@ def record_evaluation(eval_result):
The requested callback function.
"""
if not isinstance(eval_result, dict):
raise TypeError('eval_result has to be a dictionary')
raise TypeError('Eval_result should be a dictionary')
eval_result.clear()
def init(env):
......@@ -164,7 +164,7 @@ def early_stop(stopping_rounds, verbose=True):
def init(env):
"""internal function"""
if not env.evaluation_result_list:
raise ValueError('For early stopping you need at least one set in evals.')
raise ValueError('For early stopping, at least one dataset is required for evaluation')
if verbose:
msg = "Train until valid scores didn't improve in {} rounds."
......@@ -194,7 +194,7 @@ def early_stop(stopping_rounds, verbose=True):
if env.model is not None:
env.model.set_attr(best_iteration=str(best_iter[i]))
if verbose:
print('early stopping, best iteration is:')
print('Early stopping, best iteration is:')
print(best_msg[i])
raise EarlyStopException(best_iter[i])
callback.order = 30
......
......@@ -85,10 +85,10 @@ def train(params, train_set, num_boost_round=100,
predictor = init_model._to_predictor()
else:
predictor = None
init_iteration = predictor.num_total_iteration if predictor else 0
init_iteration = predictor.num_total_iteration if predictor is not None else 0
"""check dataset"""
if not isinstance(train_set, Dataset):
raise TypeError("only can accept Dataset instance for traninig")
raise TypeError("Traninig only accepts Dataset object")
train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name)
......@@ -98,7 +98,7 @@ def train(params, train_set, num_boost_round=100,
train_data_name = "training"
reduced_valid_sets = []
name_valid_sets = []
if valid_sets:
if valid_sets is not None:
if isinstance(valid_sets, Dataset):
valid_sets = [valid_sets]
if isinstance(valid_names, str):
......@@ -111,7 +111,7 @@ def train(params, train_set, num_boost_round=100,
train_data_name = valid_names[i]
continue
if not isinstance(valid_data, Dataset):
raise TypeError("only can accept Dataset instance for traninig")
raise TypeError("Traninig only accepts Dataset object")
valid_data.set_reference(train_set)
reduced_valid_sets.append(valid_data)
if valid_names is not None and len(valid_names) > i:
......@@ -120,7 +120,7 @@ def train(params, train_set, num_boost_round=100,
name_valid_sets.append('valid_'+str(i))
"""process callbacks"""
if not callbacks:
if callbacks is None:
callbacks = set()
else:
for i, cb in enumerate(callbacks):
......@@ -133,7 +133,7 @@ def train(params, train_set, num_boost_round=100,
elif isinstance(verbose_eval, int):
callbacks.add(callback.print_evaluation(verbose_eval))
if early_stopping_rounds:
if early_stopping_rounds is not None:
callbacks.add(callback.early_stop(early_stopping_rounds,
verbose=bool(verbose_eval)))
......@@ -169,7 +169,7 @@ def train(params, train_set, num_boost_round=100,
evaluation_result_list = []
# check evaluation result.
if valid_sets:
if valid_sets is not None:
if is_valid_contain_train:
evaluation_result_list.extend(booster.eval_train(feval))
evaluation_result_list.extend(booster.eval_valid(feval))
......@@ -227,7 +227,7 @@ def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=Fals
sfk = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
idset = [x[1] for x in sfk.split(X=full_data.get_label(), y=full_data.get_label())]
else:
raise LightGBMError('sklearn needs to be installed in order to use stratified cv')
raise LightGBMError('Scikit-learn is required for stratified cv')
else:
full_data.construct()
randidx = np.random.permutation(full_data.num_data())
......@@ -318,7 +318,7 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
evaluation history : list(string)
"""
if not isinstance(train_set, Dataset):
raise TypeError("only can accept Dataset instance for traninig")
raise TypeError("Traninig only accepts Dataset object")
if is_str(init_model):
predictor = _InnerPredictor(model_file=init_model)
......@@ -342,13 +342,13 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified)
# setup callbacks
if not callbacks:
if callbacks is None:
callbacks = set()
else:
for i, cb in enumerate(callbacks):
cb.__dict__.setdefault('order', i - len(callbacks))
callbacks = set(callbacks)
if early_stopping_rounds:
if early_stopping_rounds is not None:
callbacks.add(callback.early_stop(early_stopping_rounds, verbose=False))
if verbose_eval is True:
callbacks.add(callback.print_evaluation(show_stdv=show_stdv))
......
......@@ -6,7 +6,7 @@ from __future__ import absolute_import
import numpy as np
from .basic import LightGBMError, Dataset, is_str
from .engine import train
# sklearn
'''sklearn'''
try:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin, ClassifierMixin
......@@ -38,7 +38,6 @@ def _point_wise_objective(func):
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
The predicted values
Returns
-------
new_func: callable
......@@ -66,7 +65,7 @@ def _point_wise_objective(func):
num_data = len(weight)
num_class = len(grad) // num_data
if num_class * num_data != len(grad):
raise ValueError("length of grad and hess should equal to num_class * num_data")
raise ValueError("Length of grad and hess should equal to num_class * num_data")
for k in range(num_class):
for i in range(num_data):
idx = k * num_data + i
......@@ -147,7 +146,7 @@ class LGBMModel(LGBMModelBase):
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
if not SKLEARN_INSTALLED:
raise LightGBMError('sklearn needs to be installed in order to use this module')
raise LightGBMError('Scikit-learn is required for this module')
self.num_leaves = num_leaves
self.max_depth = max_depth
......@@ -185,7 +184,7 @@ class LGBMModel(LGBMModelBase):
booster : a lightgbm booster of underlying model
"""
if self._Booster is None:
raise LightGBMError('need to call fit beforehand')
raise LightGBMError('Need to call fit beforehand')
return self._Booster
def get_params(self, deep=False):
......@@ -196,8 +195,8 @@ class LGBMModel(LGBMModelBase):
return params
def fit(self, X, y,
sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None,
sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None, eval_group=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
......@@ -343,7 +342,7 @@ class LGBMModel(LGBMModelBase):
if self.evals_result_:
evals_result = self.evals_result_
else:
raise LightGBMError('No results.')
raise LightGBMError('No results found.')
return evals_result
......@@ -362,7 +361,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
def fit(self, X, y,
sample_weight=None, init_score=None,
eval_set=None, eval_sample_weight=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
......@@ -370,10 +369,10 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
other_params=None):
super(LGBMRegressor, self).fit(X, y, sample_weight, init_score, None,
eval_set, eval_sample_weight, eval_init_score, None,
eval_metric, early_stopping_rounds,
verbose, feature_name, categorical_feature,
other_params)
eval_set, eval_sample_weight, eval_init_score, None,
eval_metric, early_stopping_rounds,
verbose, feature_name, categorical_feature,
other_params)
return self
class LGBMClassifier(LGBMModel, LGBMClassifierBase):
......@@ -390,15 +389,15 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
is_unbalance=False, seed=0):
super(LGBMClassifier, self).__init__(num_leaves, max_depth,
learning_rate, n_estimators, max_bin,
silent, objective,
nthread, min_split_gain, min_child_weight, min_child_samples,
silent, objective, nthread,
min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
def fit(self, X, y,
sample_weight=None, init_score=None,
eval_set=None, eval_sample_weight=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
......@@ -480,7 +479,7 @@ def _group_wise_objective(func):
labels = dataset.get_label()
group = dataset.get_group()
if group is None:
raise ValueError("group should not be None for ranking task")
raise ValueError("Group should not be None for ranking task")
grad, hess = func(labels, group, preds)
"""weighted for objective"""
weight = dataset.get_weight()
......@@ -490,7 +489,7 @@ def _group_wise_objective(func):
grad = np.multiply(grad, weight)
hess = np.multiply(hess, weight)
else:
raise ValueError("lenght of grad and hess should equal with num_data")
raise ValueError("Length of grad and hess should equal with num_data")
return grad, hess
return inner
......@@ -507,20 +506,20 @@ class LGBMRanker(LGBMModel):
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
super(LGBMRanker, self).__init__(num_leaves, max_depth,
learning_rate, n_estimators, max_bin,
silent, objective,
nthread, min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
learning_rate, n_estimators, max_bin,
silent, objective, nthread,
min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
if callable(self.objective):
self.fobj = _group_wise_objective(self.objective)
else:
self.fobj = None
def fit(self, X, y,
sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None,
sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None, eval_group=None,
eval_metric=None, eval_at=None,
early_stopping_rounds=None, verbose=True,
......@@ -535,17 +534,18 @@ class LGBMRanker(LGBMModel):
"""check group data"""
if group is None:
raise ValueError("should use group for ranking task")
raise ValueError("Should set group for ranking task")
if eval_set is not None:
if eval_group is None:
raise ValueError("eval_group cannot be None when eval_set is not None")
raise ValueError("Eval_group cannot be None when eval_set is not None")
elif len(eval_group) != len(eval_set):
raise ValueError("length of eval_group should equal with eval_set")
raise ValueError("Length of eval_group should equal to eval_set")
else:
for inner_group in eval_group:
if inner_group is None:
raise ValueError("should set group for all eval data for ranking task")
raise ValueError("Should set group for all eval dataset for ranking task")
if eval_at is not None:
other_params = {} if other_params is None else other_params
other_params['ndcg_eval_at'] = list(eval_at)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment