Commit 3ad9cba0 authored by Nikita Titov's avatar Nikita Titov Committed by Tsukasa OMOTO
Browse files

[python] refined examples (#1769)

parent 0312ecde
...@@ -32,11 +32,13 @@ Examples include: ...@@ -32,11 +32,13 @@ Examples include:
- Self-defined eval metric with sklearn interface - Self-defined eval metric with sklearn interface
- Find best parameters for the model with sklearn's GridSearchCV - Find best parameters for the model with sklearn's GridSearchCV
- [advanced_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py) - [advanced_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py)
- Construct Dataset
- Set feature names - Set feature names
- Directly use categorical features without one-hot encoding - Directly use categorical features without one-hot encoding
- Dump model to json format - Save model to file
- Get feature importances - Dump model to JSON format
- Get feature names - Get feature names
- Get feature importances
- Load model to predict - Load model to predict
- Dump and load model with pickle - Dump and load model with pickle
- Load model file to continue training - Load model file to continue training
......
...@@ -11,17 +11,17 @@ try: ...@@ -11,17 +11,17 @@ try:
except BaseException: except BaseException:
import pickle import pickle
print('Loading data...')
# load or create your dataset # load or create your dataset
print('Load data...')
df_train = pd.read_csv('../binary_classification/binary.train', header=None, sep='\t') df_train = pd.read_csv('../binary_classification/binary.train', header=None, sep='\t')
df_test = pd.read_csv('../binary_classification/binary.test', header=None, sep='\t') df_test = pd.read_csv('../binary_classification/binary.test', header=None, sep='\t')
W_train = pd.read_csv('../binary_classification/binary.train.weight', header=None)[0] W_train = pd.read_csv('../binary_classification/binary.train.weight', header=None)[0]
W_test = pd.read_csv('../binary_classification/binary.test.weight', header=None)[0] W_test = pd.read_csv('../binary_classification/binary.test.weight', header=None)[0]
y_train = df_train[0].values y_train = df_train[0]
y_test = df_test[0].values y_test = df_test[0]
X_train = df_train.drop(0, axis=1).values X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1).values X_test = df_test.drop(0, axis=1)
num_train, num_feature = X_train.shape num_train, num_feature = X_train.shape
...@@ -45,10 +45,10 @@ params = { ...@@ -45,10 +45,10 @@ params = {
'verbose': 0 'verbose': 0
} }
# generate a feature name # generate feature names
feature_name = ['feature_' + str(col) for col in range(num_feature)] feature_name = ['feature_' + str(col) for col in range(num_feature)]
print('Start training...') print('Starting training...')
# feature_name and categorical_feature # feature_name and categorical_feature
gbm = lgb.train(params, gbm = lgb.train(params,
lgb_train, lgb_train,
...@@ -57,15 +57,16 @@ gbm = lgb.train(params, ...@@ -57,15 +57,16 @@ gbm = lgb.train(params,
feature_name=feature_name, feature_name=feature_name,
categorical_feature=[21]) categorical_feature=[21])
print('Finished first 10 rounds...')
# check feature name # check feature name
print('Finish first 10 rounds...') print('7th feature name is:', lgb_train.feature_name[6])
print('7th feature name is:', repr(lgb_train.feature_name[6]))
print('Saving model...')
# save model to file # save model to file
gbm.save_model('model.txt') gbm.save_model('model.txt')
print('Dumping model to JSON...')
# dump model to JSON (and save to file) # dump model to JSON (and save to file)
print('Dump model to JSON...')
model_json = gbm.dump_model() model_json = gbm.dump_model()
with open('model.json', 'w+') as f: with open('model.json', 'w+') as f:
...@@ -77,14 +78,15 @@ print('Feature names:', gbm.feature_name()) ...@@ -77,14 +78,15 @@ print('Feature names:', gbm.feature_name())
# feature importances # feature importances
print('Feature importances:', list(gbm.feature_importance())) print('Feature importances:', list(gbm.feature_importance()))
print('Loading model to predict...')
# load model to predict # load model to predict
print('Load model to predict')
bst = lgb.Booster(model_file='model.txt') bst = lgb.Booster(model_file='model.txt')
# can only predict with the best iteration (or the saving iteration) # can only predict with the best iteration (or the saving iteration)
y_pred = bst.predict(X_test) y_pred = bst.predict(X_test)
# eval with loaded model # eval with loaded model
print('The rmse of loaded model\'s prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) print("The rmse of loaded model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)
print('Dumping and loading model with pickle...')
# dump model with pickle # dump model with pickle
with open('model.pkl', 'wb') as fout: with open('model.pkl', 'wb') as fout:
pickle.dump(gbm, fout) pickle.dump(gbm, fout)
...@@ -94,7 +96,7 @@ with open('model.pkl', 'rb') as fin: ...@@ -94,7 +96,7 @@ with open('model.pkl', 'rb') as fin:
# can predict with any iteration when loaded in pickle way # can predict with any iteration when loaded in pickle way
y_pred = pkl_bst.predict(X_test, num_iteration=7) y_pred = pkl_bst.predict(X_test, num_iteration=7)
# eval with loaded model # eval with loaded model
print('The rmse of pickled model\'s prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) print("The rmse of pickled model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)
# continue training # continue training
# init_model accepts: # init_model accepts:
...@@ -106,7 +108,7 @@ gbm = lgb.train(params, ...@@ -106,7 +108,7 @@ gbm = lgb.train(params,
init_model='model.txt', init_model='model.txt',
valid_sets=lgb_eval) valid_sets=lgb_eval)
print('Finish 10 - 20 rounds with model file...') print('Finished 10 - 20 rounds with model file...')
# decay learning rates # decay learning rates
# learning_rates accepts: # learning_rates accepts:
...@@ -119,7 +121,7 @@ gbm = lgb.train(params, ...@@ -119,7 +121,7 @@ gbm = lgb.train(params,
learning_rates=lambda iter: 0.05 * (0.99 ** iter), learning_rates=lambda iter: 0.05 * (0.99 ** iter),
valid_sets=lgb_eval) valid_sets=lgb_eval)
print('Finish 20 - 30 rounds with decay learning rates...') print('Finished 20 - 30 rounds with decay learning rates...')
# change other parameters during training # change other parameters during training
gbm = lgb.train(params, gbm = lgb.train(params,
...@@ -129,13 +131,13 @@ gbm = lgb.train(params, ...@@ -129,13 +131,13 @@ gbm = lgb.train(params,
valid_sets=lgb_eval, valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)]) callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])
print('Finish 30 - 40 rounds with changing bagging_fraction...') print('Finished 30 - 40 rounds with changing bagging_fraction...')
# self-defined objective function # self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array # f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss # log likelihood loss
def loglikelood(preds, train_data): def loglikelihood(preds, train_data):
labels = train_data.get_label() labels = train_data.get_label()
preds = 1. / (1. + np.exp(-preds)) preds = 1. / (1. + np.exp(-preds))
grad = preds - labels grad = preds - labels
...@@ -155,13 +157,13 @@ gbm = lgb.train(params, ...@@ -155,13 +157,13 @@ gbm = lgb.train(params,
lgb_train, lgb_train,
num_boost_round=10, num_boost_round=10,
init_model=gbm, init_model=gbm,
fobj=loglikelood, fobj=loglikelihood,
feval=binary_error, feval=binary_error,
valid_sets=lgb_eval) valid_sets=lgb_eval)
print('Finish 40 - 50 rounds with self-defined objective function and eval metric...') print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')
print('Start a new training job...') print('Starting a new training job...')
# callback # callback
...@@ -170,7 +172,7 @@ def reset_metrics(): ...@@ -170,7 +172,7 @@ def reset_metrics():
lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train) lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
if env.iteration - env.begin_iteration == 5: if env.iteration - env.begin_iteration == 5:
print('Add a new valid dataset at iteration 5...') print('Add a new valid dataset at iteration 5...')
env.model.add_valid(lgb_eval_new, 'new valid') env.model.add_valid(lgb_eval_new, 'new_valid')
callback.before_iteration = True callback.before_iteration = True
callback.order = 0 callback.order = 0
return callback return callback
...@@ -182,4 +184,4 @@ gbm = lgb.train(params, ...@@ -182,4 +184,4 @@ gbm = lgb.train(params,
valid_sets=lgb_train, valid_sets=lgb_train,
callbacks=[reset_metrics()]) callbacks=[reset_metrics()])
print('Finish first 10 rounds with callback function...') print('Finished first 10 rounds with callback function...')
...@@ -8,15 +8,15 @@ if lgb.compat.MATPLOTLIB_INSTALLED: ...@@ -8,15 +8,15 @@ if lgb.compat.MATPLOTLIB_INSTALLED:
else: else:
raise ImportError('You need to install matplotlib for plot_example.py.') raise ImportError('You need to install matplotlib for plot_example.py.')
print('Loading data...')
# load or create your dataset # load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t') df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t') df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
y_train = df_train[0].values y_train = df_train[0]
y_test = df_test[0].values y_test = df_test[0]
X_train = df_train.drop(0, axis=1).values X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1).values X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm # create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train) lgb_train = lgb.Dataset(X_train, y_train)
...@@ -31,29 +31,29 @@ params = { ...@@ -31,29 +31,29 @@ params = {
evals_result = {} # to record eval results for plotting evals_result = {} # to record eval results for plotting
print('Start training...') print('Starting training...')
# train # train
gbm = lgb.train(params, gbm = lgb.train(params,
lgb_train, lgb_train,
num_boost_round=100, num_boost_round=100,
valid_sets=[lgb_train, lgb_test], valid_sets=[lgb_train, lgb_test],
feature_name=['f' + str(i + 1) for i in range(28)], feature_name=['f' + str(i + 1) for i in range(X_train.shape[-1])],
categorical_feature=[21], categorical_feature=[21],
evals_result=evals_result, evals_result=evals_result,
verbose_eval=10) verbose_eval=10)
print('Plot metrics recorded during training...') print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1') ax = lgb.plot_metric(evals_result, metric='l1')
plt.show() plt.show()
print('Plot feature importances...') print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10) ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show() plt.show()
print('Plot 84th tree...') # one tree use categorical feature to split print('Plotting 84th tree...') # one tree use categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain']) ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
plt.show() plt.show()
print('Plot 84th tree with graphviz...') print('Plotting 84th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84') graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
graph.render(view=True) graph.render(view=True)
...@@ -4,16 +4,15 @@ import lightgbm as lgb ...@@ -4,16 +4,15 @@ import lightgbm as lgb
import pandas as pd import pandas as pd
from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_squared_error
print('Loading data...')
# load or create your dataset # load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t') df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t') df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
y_train = df_train[0].values y_train = df_train[0]
y_test = df_test[0].values y_test = df_test[0]
X_train = df_train.drop(0, axis=1).values X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1).values X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm # create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train) lgb_train = lgb.Dataset(X_train, y_train)
...@@ -21,10 +20,9 @@ lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) ...@@ -21,10 +20,9 @@ lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict # specify your configurations as a dict
params = { params = {
'task': 'train',
'boosting_type': 'gbdt', 'boosting_type': 'gbdt',
'objective': 'regression', 'objective': 'regression',
'metric': {'l2', 'auc'}, 'metric': {'l2', 'l1'},
'num_leaves': 31, 'num_leaves': 31,
'learning_rate': 0.05, 'learning_rate': 0.05,
'feature_fraction': 0.9, 'feature_fraction': 0.9,
...@@ -33,7 +31,7 @@ params = { ...@@ -33,7 +31,7 @@ params = {
'verbose': 0 'verbose': 0
} }
print('Start training...') print('Starting training...')
# train # train
gbm = lgb.train(params, gbm = lgb.train(params,
lgb_train, lgb_train,
...@@ -41,11 +39,11 @@ gbm = lgb.train(params, ...@@ -41,11 +39,11 @@ gbm = lgb.train(params,
valid_sets=lgb_eval, valid_sets=lgb_eval,
early_stopping_rounds=5) early_stopping_rounds=5)
print('Save model...') print('Saving model...')
# save model to file # save model to file
gbm.save_model('model.txt') gbm.save_model('model.txt')
print('Start predicting...') print('Starting predicting...')
# predict # predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval # eval
......
...@@ -7,20 +7,19 @@ import lightgbm as lgb ...@@ -7,20 +7,19 @@ import lightgbm as lgb
from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV from sklearn.model_selection import GridSearchCV
print('Loading data...')
# load or create your dataset # load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t') df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t') df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
y_train = df_train[0].values y_train = df_train[0]
y_test = df_test[0].values y_test = df_test[0]
X_train = df_train.drop(0, axis=1).values X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1).values X_test = df_test.drop(0, axis=1)
print('Start training...') print('Starting training...')
# train # train
gbm = lgb.LGBMRegressor(objective='regression', gbm = lgb.LGBMRegressor(num_leaves=31,
num_leaves=31,
learning_rate=0.05, learning_rate=0.05,
n_estimators=20) n_estimators=20)
gbm.fit(X_train, y_train, gbm.fit(X_train, y_train,
...@@ -28,7 +27,7 @@ gbm.fit(X_train, y_train, ...@@ -28,7 +27,7 @@ gbm.fit(X_train, y_train,
eval_metric='l1', eval_metric='l1',
early_stopping_rounds=5) early_stopping_rounds=5)
print('Start predicting...') print('Starting predicting...')
# predict # predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_) y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval # eval
...@@ -45,14 +44,14 @@ def rmsle(y_true, y_pred): ...@@ -45,14 +44,14 @@ def rmsle(y_true, y_pred):
return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False
print('Start training with custom eval function...') print('Starting training with custom eval function...')
# train # train
gbm.fit(X_train, y_train, gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)], eval_set=[(X_test, y_test)],
eval_metric=rmsle, eval_metric=rmsle,
early_stopping_rounds=5) early_stopping_rounds=5)
print('Start predicting...') print('Starting predicting...')
# predict # predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_) y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval # eval
...@@ -67,7 +66,6 @@ param_grid = { ...@@ -67,7 +66,6 @@ param_grid = {
} }
gbm = GridSearchCV(estimator, param_grid, cv=3) gbm = GridSearchCV(estimator, param_grid, cv=3)
gbm.fit(X_train, y_train) gbm.fit(X_train, y_train)
print('Best parameters found by grid search are:', gbm.best_params_) print('Best parameters found by grid search are:', gbm.best_params_)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment