"tests/git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "b60068c810cbcac9cf4e1a8e678d8d531c40eb72"
Commit ebfc8521 authored by wxchan's avatar wxchan Committed by Guolin Ke
Browse files

add an advanced example; add guide-python README.md details; clean error messages (#117)

parent b51c7be4
......@@ -16,3 +16,23 @@ Now you can run examples in this folder, for example:
```
python simple_example.py
```
Examples include:
- [simple_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py)
- Construct Dataset
- Basic train and predict
- Eval during training
- Early stopping
- Save model to file
- Dump model to json format
- Feature importances
- [sklearn_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/sklearn_example.py)
- Basic train and predict with sklearn interface
- Feature importances with sklearn interface
- [advanced_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py)
- Set feature names
- Directly use categorical features without one-hot encoding
- Load model file to continue training
- Change learning rates during training
- Self-defined objective function
- Self-defined eval metric
- Callback function
\ No newline at end of file
# coding: utf-8
# pylint: disable = invalid-name, C0111
# Advanced LightGBM example: named/categorical features, continued training,
# learning-rate decay, self-defined objective/metric, and callbacks.
import lightgbm as lgb
import pandas as pd
import numpy as np
# load or create your dataset
print('Load data...')
# Tab-separated binary-classification data: column 0 is the label,
# the remaining columns are features.
df_train = pd.read_csv('../binary_classification/binary.train', header=None, sep='\t')
df_test = pd.read_csv('../binary_classification/binary.test', header=None, sep='\t')
# Per-row sample weights (one value per line).
W_train = pd.read_csv('../binary_classification/binary.train.weight', header=None)[0]
W_test = pd.read_csv('../binary_classification/binary.test.weight', header=None)[0]
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
num_train, num_feature = X_train.shape
# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train,
weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
weight=W_test, free_raw_data=False)
# specify your configurations as a dict
params = {
'boosting_type' : 'gbdt',
'objective' : 'binary',
'metric' : 'binary_logloss',
'num_leaves' : 31,
'learning_rate' : 0.05,
'feature_fraction' : 0.9,
'bagging_fraction' : 0.8,
'bagging_freq': 5,
'verbose' : 0
}
# generate a feature name for each of the num_feature columns
feature_name = ['feature_' + str(col) for col in range(num_feature)]
print('Start training...')
# feature_name and categorical_feature
# column index 21 is treated as categorical without one-hot encoding
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train, # eval training data
feature_name=feature_name,
categorical_feature=[21])
# check feature name
print('Finish first 10 rounds...')
print('7th feature name is:', repr(lgb_train.feature_name[6]))
# save model to file
gbm.save_model('model.txt')
# continue training
# init_model accepts:
# 1. model file name
# 2. Booster()
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model='model.txt',
valid_sets=lgb_eval)
print('Finish 10 - 20 rounds with model file...')
# decay learning rates
# learning_rates accepts:
# 1. list/tuple with length = num_boost_round
# 2. function(curr_iter)
# 3. function(curr_iter, total_iter)
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model=gbm,
learning_rates=lambda iter: 0.05 * (0.99 ** iter),
valid_sets=lgb_eval)
print('Finish 20 - 30 rounds with decay learning rates...')
# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelood(preds, train_data):
    """Log-likelihood (binary cross-entropy) objective for lgb.train(fobj=...).

    preds are raw scores; labels come from the Dataset. Returns the
    gradient and hessian of the loss w.r.t. the raw scores.
    """
    labels = train_data.get_label()
    # sigmoid maps raw scores to probabilities
    prob = 1. / (1. + np.exp(-preds))
    gradient = prob - labels
    hessian = prob * (1. - prob)
    return gradient, hessian
# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
# binary error
def binary_error(preds, train_data):
    """Binary-error metric for lgb.train(feval=...).

    Returns (metric_name, value, is_higher_better): a prediction is
    counted as class 1 when its score exceeds 0.5; lower error is better.
    """
    y_true = train_data.get_label()
    predicted_class = preds > 0.5
    return 'error', np.mean(y_true != predicted_class), False
# continue training with the self-defined objective (fobj) and
# eval metric (feval); init_model=gbm resumes from the booster above
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model=gbm,
fobj=loglikelood,
feval=binary_error,
valid_sets=lgb_eval)
print('Finish 30 - 40 rounds with self-defined objective function and eval metric...')
print('Start a new training job...')
# callback
def reset_metrics():
    """Build a training callback that registers an extra valid set.

    The returned callback runs before each iteration (before_iteration=True)
    and, at the 5th iteration of the current job, attaches a fresh
    evaluation Dataset to the booster under the name 'new valid'.
    """
    def _callback(env):
        fresh_valid = lgb.Dataset(X_test, y_test, reference=lgb_train)
        if env.iteration - env.begin_iteration == 5:
            print('Add a new valid dataset at iteration 5...')
            env.model.add_valid(fresh_valid, 'new valid')
    _callback.before_iteration = True
    _callback.order = 0
    return _callback
# start a fresh training job driven by the self-defined callback,
# which adds a new valid dataset partway through training
gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
valid_sets=lgb_train,
callbacks=[reset_metrics()])
print('Finish first 10 rounds with callback function...')
......@@ -6,6 +6,7 @@ import pandas as pd
from sklearn.metrics import mean_squared_error
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
......@@ -18,7 +19,6 @@ X_test = df_test.drop(0, axis=1)
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params = {
'task' : 'train',
......@@ -33,27 +33,32 @@ params = {
'verbose' : 0
}
print('Start training...')
# train
gbm = lgb.train(params,
lgb_train,
num_boost_round=100,
num_boost_round=20,
valid_sets=lgb_eval,
early_stopping_rounds=10)
early_stopping_rounds=5)
print('Save model...')
# save model to file
gbm.save_model('model.txt')
print('Start predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('Dump model to JSON...')
# dump model to json (and save to file)
model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
json.dump(model_json, f, indent=4)
print('Calculate feature importances...')
# feature importances
print('Feature importances:', gbm.feature_importance())
print('Feature importances:', gbm.feature_importance("gain"))
print('Feature importances:', list(gbm.feature_importance()))
# print('Feature importances:', list(gbm.feature_importance("gain")))
......@@ -5,6 +5,7 @@ import pandas as pd
from sklearn.metrics import mean_squared_error
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
......@@ -13,19 +14,23 @@ y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
print('Start training...')
# train
gbm = lgb.LGBMRegressor(objective='regression',
num_leaves=31,
learning_rate=0.05,
n_estimators=100)
n_estimators=20)
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
early_stopping_rounds=10)
eval_metric='l1',
early_stopping_rounds=5)
print('Start predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print('Calculate feature importances...')
# feature importances
print('Feature importances:', gbm.feature_importance())
print('Feature importances:', list(gbm.feature_importance()))
This diff is collapsed.
......@@ -35,7 +35,7 @@ def _format_eval_result(value, show_stdv=True):
else:
return '%s\'s %s:%g' % (value[0], value[1], value[2])
else:
raise ValueError("wrong metric value")
raise ValueError("Wrong metric value")
def print_evaluation(period=1, show_stdv=True):
......@@ -80,7 +80,7 @@ def record_evaluation(eval_result):
The requested callback function.
"""
if not isinstance(eval_result, dict):
raise TypeError('eval_result has to be a dictionary')
raise TypeError('Eval_result should be a dictionary')
eval_result.clear()
def init(env):
......@@ -164,7 +164,7 @@ def early_stop(stopping_rounds, verbose=True):
def init(env):
"""internal function"""
if not env.evaluation_result_list:
raise ValueError('For early stopping you need at least one set in evals.')
raise ValueError('For early stopping, at least one dataset is required for evaluation')
if verbose:
msg = "Train until valid scores didn't improve in {} rounds."
......@@ -194,7 +194,7 @@ def early_stop(stopping_rounds, verbose=True):
if env.model is not None:
env.model.set_attr(best_iteration=str(best_iter[i]))
if verbose:
print('early stopping, best iteration is:')
print('Early stopping, best iteration is:')
print(best_msg[i])
raise EarlyStopException(best_iter[i])
callback.order = 30
......
......@@ -85,10 +85,10 @@ def train(params, train_set, num_boost_round=100,
predictor = init_model._to_predictor()
else:
predictor = None
init_iteration = predictor.num_total_iteration if predictor else 0
init_iteration = predictor.num_total_iteration if predictor is not None else 0
"""check dataset"""
if not isinstance(train_set, Dataset):
raise TypeError("only can accept Dataset instance for traninig")
raise TypeError("Traninig only accepts Dataset object")
train_set._set_predictor(predictor)
train_set.set_feature_name(feature_name)
......@@ -98,7 +98,7 @@ def train(params, train_set, num_boost_round=100,
train_data_name = "training"
reduced_valid_sets = []
name_valid_sets = []
if valid_sets:
if valid_sets is not None:
if isinstance(valid_sets, Dataset):
valid_sets = [valid_sets]
if isinstance(valid_names, str):
......@@ -111,7 +111,7 @@ def train(params, train_set, num_boost_round=100,
train_data_name = valid_names[i]
continue
if not isinstance(valid_data, Dataset):
raise TypeError("only can accept Dataset instance for traninig")
raise TypeError("Traninig only accepts Dataset object")
valid_data.set_reference(train_set)
reduced_valid_sets.append(valid_data)
if valid_names is not None and len(valid_names) > i:
......@@ -120,7 +120,7 @@ def train(params, train_set, num_boost_round=100,
name_valid_sets.append('valid_'+str(i))
"""process callbacks"""
if not callbacks:
if callbacks is None:
callbacks = set()
else:
for i, cb in enumerate(callbacks):
......@@ -133,7 +133,7 @@ def train(params, train_set, num_boost_round=100,
elif isinstance(verbose_eval, int):
callbacks.add(callback.print_evaluation(verbose_eval))
if early_stopping_rounds:
if early_stopping_rounds is not None:
callbacks.add(callback.early_stop(early_stopping_rounds,
verbose=bool(verbose_eval)))
......@@ -169,7 +169,7 @@ def train(params, train_set, num_boost_round=100,
evaluation_result_list = []
# check evaluation result.
if valid_sets:
if valid_sets is not None:
if is_valid_contain_train:
evaluation_result_list.extend(booster.eval_train(feval))
evaluation_result_list.extend(booster.eval_valid(feval))
......@@ -227,7 +227,7 @@ def _make_n_folds(full_data, nfold, params, seed, fpreproc=None, stratified=Fals
sfk = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
idset = [x[1] for x in sfk.split(X=full_data.get_label(), y=full_data.get_label())]
else:
raise LightGBMError('sklearn needs to be installed in order to use stratified cv')
raise LightGBMError('Scikit-learn is required for stratified cv')
else:
full_data.construct()
randidx = np.random.permutation(full_data.num_data())
......@@ -318,7 +318,7 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
evaluation history : list(string)
"""
if not isinstance(train_set, Dataset):
raise TypeError("only can accept Dataset instance for traninig")
raise TypeError("Traninig only accepts Dataset object")
if is_str(init_model):
predictor = _InnerPredictor(model_file=init_model)
......@@ -342,13 +342,13 @@ def cv(params, train_set, num_boost_round=10, nfold=5, stratified=False,
cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified)
# setup callbacks
if not callbacks:
if callbacks is None:
callbacks = set()
else:
for i, cb in enumerate(callbacks):
cb.__dict__.setdefault('order', i - len(callbacks))
callbacks = set(callbacks)
if early_stopping_rounds:
if early_stopping_rounds is not None:
callbacks.add(callback.early_stop(early_stopping_rounds, verbose=False))
if verbose_eval is True:
callbacks.add(callback.print_evaluation(show_stdv=show_stdv))
......
......@@ -6,7 +6,7 @@ from __future__ import absolute_import
import numpy as np
from .basic import LightGBMError, Dataset, is_str
from .engine import train
# sklearn
'''sklearn'''
try:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin, ClassifierMixin
......@@ -38,7 +38,6 @@ def _point_wise_objective(func):
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
The predicted values
Returns
-------
new_func: callable
......@@ -66,7 +65,7 @@ def _point_wise_objective(func):
num_data = len(weight)
num_class = len(grad) // num_data
if num_class * num_data != len(grad):
raise ValueError("length of grad and hess should equal to num_class * num_data")
raise ValueError("Length of grad and hess should equal to num_class * num_data")
for k in range(num_class):
for i in range(num_data):
idx = k * num_data + i
......@@ -147,7 +146,7 @@ class LGBMModel(LGBMModelBase):
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
if not SKLEARN_INSTALLED:
raise LightGBMError('sklearn needs to be installed in order to use this module')
raise LightGBMError('Scikit-learn is required for this module')
self.num_leaves = num_leaves
self.max_depth = max_depth
......@@ -185,7 +184,7 @@ class LGBMModel(LGBMModelBase):
booster : a lightgbm booster of underlying model
"""
if self._Booster is None:
raise LightGBMError('need to call fit beforehand')
raise LightGBMError('Need to call fit beforehand')
return self._Booster
def get_params(self, deep=False):
......@@ -196,8 +195,8 @@ class LGBMModel(LGBMModelBase):
return params
def fit(self, X, y,
sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None,
sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None, eval_group=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
......@@ -343,7 +342,7 @@ class LGBMModel(LGBMModelBase):
if self.evals_result_:
evals_result = self.evals_result_
else:
raise LightGBMError('No results.')
raise LightGBMError('No results found.')
return evals_result
......@@ -362,7 +361,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
def fit(self, X, y,
sample_weight=None, init_score=None,
eval_set=None, eval_sample_weight=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
......@@ -370,10 +369,10 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
other_params=None):
super(LGBMRegressor, self).fit(X, y, sample_weight, init_score, None,
eval_set, eval_sample_weight, eval_init_score, None,
eval_metric, early_stopping_rounds,
verbose, feature_name, categorical_feature,
other_params)
eval_set, eval_sample_weight, eval_init_score, None,
eval_metric, early_stopping_rounds,
verbose, feature_name, categorical_feature,
other_params)
return self
class LGBMClassifier(LGBMModel, LGBMClassifierBase):
......@@ -390,15 +389,15 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
is_unbalance=False, seed=0):
super(LGBMClassifier, self).__init__(num_leaves, max_depth,
learning_rate, n_estimators, max_bin,
silent, objective,
nthread, min_split_gain, min_child_weight, min_child_samples,
silent, objective, nthread,
min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
def fit(self, X, y,
sample_weight=None, init_score=None,
eval_set=None, eval_sample_weight=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None,
eval_metric=None,
early_stopping_rounds=None, verbose=True,
......@@ -480,7 +479,7 @@ def _group_wise_objective(func):
labels = dataset.get_label()
group = dataset.get_group()
if group is None:
raise ValueError("group should not be None for ranking task")
raise ValueError("Group should not be None for ranking task")
grad, hess = func(labels, group, preds)
"""weighted for objective"""
weight = dataset.get_weight()
......@@ -490,7 +489,7 @@ def _group_wise_objective(func):
grad = np.multiply(grad, weight)
hess = np.multiply(hess, weight)
else:
raise ValueError("lenght of grad and hess should equal with num_data")
raise ValueError("Length of grad and hess should equal with num_data")
return grad, hess
return inner
......@@ -507,20 +506,20 @@ class LGBMRanker(LGBMModel):
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
super(LGBMRanker, self).__init__(num_leaves, max_depth,
learning_rate, n_estimators, max_bin,
silent, objective,
nthread, min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
learning_rate, n_estimators, max_bin,
silent, objective, nthread,
min_split_gain, min_child_weight, min_child_samples,
subsample, subsample_freq, colsample_bytree,
reg_alpha, reg_lambda, scale_pos_weight,
is_unbalance, seed)
if callable(self.objective):
self.fobj = _group_wise_objective(self.objective)
else:
self.fobj = None
def fit(self, X, y,
sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None,
sample_weight=None, init_score=None, group=None,
eval_set=None, eval_sample_weight=None,
eval_init_score=None, eval_group=None,
eval_metric=None, eval_at=None,
early_stopping_rounds=None, verbose=True,
......@@ -535,17 +534,18 @@ class LGBMRanker(LGBMModel):
"""check group data"""
if group is None:
raise ValueError("should use group for ranking task")
raise ValueError("Should set group for ranking task")
if eval_set is not None:
if eval_group is None:
raise ValueError("eval_group cannot be None when eval_set is not None")
raise ValueError("Eval_group cannot be None when eval_set is not None")
elif len(eval_group) != len(eval_set):
raise ValueError("length of eval_group should equal with eval_set")
raise ValueError("Length of eval_group should equal to eval_set")
else:
for inner_group in eval_group:
if inner_group is None:
raise ValueError("should set group for all eval data for ranking task")
raise ValueError("Should set group for all eval dataset for ranking task")
if eval_at is not None:
other_params = {} if other_params is None else other_params
other_params['ndcg_eval_at'] = list(eval_at)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment