Commit 3ad9cba0 authored by Nikita Titov, committed by Tsukasa OMOTO
Browse files

[python] refined examples (#1769)

parent 0312ecde
......@@ -32,11 +32,13 @@ Examples include:
- Self-defined eval metric with sklearn interface
- Find best parameters for the model with sklearn's GridSearchCV
- [advanced_example.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py)
- Construct Dataset
- Set feature names
- Directly use categorical features without one-hot encoding
- Dump model to json format
- Get feature importances
- Save model to file
- Dump model to JSON format
- Get feature names
- Get feature importances
- Load model to predict
- Dump and load model with pickle
- Load model file to continue training
......
......@@ -11,17 +11,17 @@ try:
except BaseException:
import pickle
print('Loading data...')
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../binary_classification/binary.train', header=None, sep='\t')
df_test = pd.read_csv('../binary_classification/binary.test', header=None, sep='\t')
W_train = pd.read_csv('../binary_classification/binary.train.weight', header=None)[0]
W_test = pd.read_csv('../binary_classification/binary.test.weight', header=None)[0]
y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
num_train, num_feature = X_train.shape
......@@ -45,10 +45,10 @@ params = {
'verbose': 0
}
# generate a feature name
# generate feature names
feature_name = ['feature_' + str(col) for col in range(num_feature)]
print('Start training...')
print('Starting training...')
# feature_name and categorical_feature
gbm = lgb.train(params,
lgb_train,
......@@ -57,15 +57,16 @@ gbm = lgb.train(params,
feature_name=feature_name,
categorical_feature=[21])
print('Finished first 10 rounds...')
# check feature name
print('Finish first 10 rounds...')
print('7th feature name is:', repr(lgb_train.feature_name[6]))
print('7th feature name is:', lgb_train.feature_name[6])
print('Saving model...')
# save model to file
gbm.save_model('model.txt')
print('Dumping model to JSON...')
# dump model to JSON (and save to file)
print('Dump model to JSON...')
model_json = gbm.dump_model()
with open('model.json', 'w+') as f:
......@@ -77,14 +78,15 @@ print('Feature names:', gbm.feature_name())
# feature importances
print('Feature importances:', list(gbm.feature_importance()))
print('Loading model to predict...')
# load model to predict
print('Load model to predict')
bst = lgb.Booster(model_file='model.txt')
# can only predict with the best iteration (or the saving iteration)
y_pred = bst.predict(X_test)
# eval with loaded model
print('The rmse of loaded model\'s prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print("The rmse of loaded model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)
print('Dumping and loading model with pickle...')
# dump model with pickle
with open('model.pkl', 'wb') as fout:
pickle.dump(gbm, fout)
......@@ -94,7 +96,7 @@ with open('model.pkl', 'rb') as fin:
# can predict with any iteration when loaded in pickle way
y_pred = pkl_bst.predict(X_test, num_iteration=7)
# eval with loaded model
print('The rmse of pickled model\'s prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
print("The rmse of pickled model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)
# continue training
# init_model accepts:
......@@ -106,7 +108,7 @@ gbm = lgb.train(params,
init_model='model.txt',
valid_sets=lgb_eval)
print('Finish 10 - 20 rounds with model file...')
print('Finished 10 - 20 rounds with model file...')
# decay learning rates
# learning_rates accepts:
......@@ -119,7 +121,7 @@ gbm = lgb.train(params,
learning_rates=lambda iter: 0.05 * (0.99 ** iter),
valid_sets=lgb_eval)
print('Finish 20 - 30 rounds with decay learning rates...')
print('Finished 20 - 30 rounds with decay learning rates...')
# change other parameters during training
gbm = lgb.train(params,
......@@ -129,13 +131,13 @@ gbm = lgb.train(params,
valid_sets=lgb_eval,
callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])
print('Finish 30 - 40 rounds with changing bagging_fraction...')
print('Finished 30 - 40 rounds with changing bagging_fraction...')
# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelood(preds, train_data):
def loglikelihood(preds, train_data):
labels = train_data.get_label()
preds = 1. / (1. + np.exp(-preds))
grad = preds - labels
......@@ -155,13 +157,13 @@ gbm = lgb.train(params,
lgb_train,
num_boost_round=10,
init_model=gbm,
fobj=loglikelood,
fobj=loglikelihood,
feval=binary_error,
valid_sets=lgb_eval)
print('Finish 40 - 50 rounds with self-defined objective function and eval metric...')
print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')
print('Start a new training job...')
print('Starting a new training job...')
# callback
......@@ -170,7 +172,7 @@ def reset_metrics():
lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
if env.iteration - env.begin_iteration == 5:
print('Add a new valid dataset at iteration 5...')
env.model.add_valid(lgb_eval_new, 'new valid')
env.model.add_valid(lgb_eval_new, 'new_valid')
callback.before_iteration = True
callback.order = 0
return callback
......@@ -182,4 +184,4 @@ gbm = lgb.train(params,
valid_sets=lgb_train,
callbacks=[reset_metrics()])
print('Finish first 10 rounds with callback function...')
print('Finished first 10 rounds with callback function...')
......@@ -8,15 +8,15 @@ if lgb.compat.MATPLOTLIB_INSTALLED:
else:
raise ImportError('You need to install matplotlib for plot_example.py.')
print('Loading data...')
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
......@@ -31,29 +31,29 @@ params = {
evals_result = {} # to record eval results for plotting
print('Start training...')
print('Starting training...')
# train
gbm = lgb.train(params,
lgb_train,
num_boost_round=100,
valid_sets=[lgb_train, lgb_test],
feature_name=['f' + str(i + 1) for i in range(28)],
feature_name=['f' + str(i + 1) for i in range(X_train.shape[-1])],
categorical_feature=[21],
evals_result=evals_result,
verbose_eval=10)
print('Plot metrics recorded during training...')
print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='l1')
plt.show()
print('Plot feature importances...')
print('Plotting feature importances...')
ax = lgb.plot_importance(gbm, max_num_features=10)
plt.show()
print('Plot 84th tree...') # one tree use categorical feature to split
print('Plotting 84th tree...') # one tree use categorical feature to split
ax = lgb.plot_tree(gbm, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
plt.show()
print('Plot 84th tree with graphviz...')
print('Plotting 84th tree with graphviz...')
graph = lgb.create_tree_digraph(gbm, tree_index=83, name='Tree84')
graph.render(view=True)
......@@ -4,16 +4,15 @@ import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
print('Loading data...')
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
......@@ -21,10 +20,9 @@ lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'regression',
'metric': {'l2', 'auc'},
'metric': {'l2', 'l1'},
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
......@@ -33,7 +31,7 @@ params = {
'verbose': 0
}
print('Start training...')
print('Starting training...')
# train
gbm = lgb.train(params,
lgb_train,
......@@ -41,11 +39,11 @@ gbm = lgb.train(params,
valid_sets=lgb_eval,
early_stopping_rounds=5)
print('Save model...')
print('Saving model...')
# save model to file
gbm.save_model('model.txt')
print('Start predicting...')
print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval
......
......@@ -7,20 +7,19 @@ import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
print('Loading data...')
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
print('Start training...')
print('Starting training...')
# train
gbm = lgb.LGBMRegressor(objective='regression',
num_leaves=31,
gbm = lgb.LGBMRegressor(num_leaves=31,
learning_rate=0.05,
n_estimators=20)
gbm.fit(X_train, y_train,
......@@ -28,7 +27,7 @@ gbm.fit(X_train, y_train,
eval_metric='l1',
early_stopping_rounds=5)
print('Start predicting...')
print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
......@@ -45,14 +44,14 @@ def rmsle(y_true, y_pred):
return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False
print('Start training with custom eval function...')
print('Starting training with custom eval function...')
# train
gbm.fit(X_train, y_train,
eval_set=[(X_test, y_test)],
eval_metric=rmsle,
early_stopping_rounds=5)
print('Start predicting...')
print('Starting predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
......@@ -67,7 +66,6 @@ param_grid = {
}
gbm = GridSearchCV(estimator, param_grid, cv=3)
gbm.fit(X_train, y_train)
print('Best parameters found by grid search are:', gbm.best_params_)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment