Commit eba6d200 authored by wxchan's avatar wxchan
Browse files

Squash into one commit:

1. merge python-package
2. add dump model to json
3. fix bugs
4. clean code with pylint
5. update python examples
parent 19e085c9
......@@ -21,9 +21,13 @@ script:
- cd $TRAVIS_BUILD_DIR
- mkdir build && cd build && cmake .. && make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py
- cd $TRAVIS_BUILD_DIR
- rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py
notifications:
email: false
......
LightGBM, Light Gradient Boosting Machine
==========
=========================================
[![Build Status](https://travis-ci.org/Microsoft/LightGBM.svg?branch=master)](https://travis-ci.org/Microsoft/LightGBM)
LightGBM is a gradient boosting framework that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages:
......@@ -14,6 +14,11 @@ For more details, please refer to [Features](https://github.com/Microsoft/LightG
[Experiments](https://github.com/Microsoft/LightGBM/wiki/Experiments#comparison-experiment) on public datasets show that LightGBM can outperform other existing boosting framework on both efficiency and accuracy, with significant lower memory consumption. What's more, the [experiments](https://github.com/Microsoft/LightGBM/wiki/Experiments#parallel-experiment) show that LightGBM can achieve a linear speed-up by using multiple machines for training in specific settings.
News
----
12/02/2016 : Release [python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package) beta version; you are welcome to try it out and provide issues and feedback.
Get Started
------------
To get started, please follow the [Installation Guide](https://github.com/Microsoft/LightGBM/wiki/Installation-Guide) and [Quick Start](https://github.com/Microsoft/LightGBM/wiki/Quick-Start).
......
# coding: utf-8
# pylint: disable = invalid-name, C0111
"""Regression example using the low-level lgb.train() API.

Trains a GBDT regressor, saves/reloads the model, evaluates RMSE on the
test split, and dumps the model structure to a JSON file.
"""
import json

import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error

# Load the demo dataset: tab-separated, no header, label in column 0.
train_df = pd.read_csv('../regression/regression.train', header=None, sep='\t')
test_df = pd.read_csv('../regression/regression.test', header=None, sep='\t')

y_train = train_df[0]
y_test = test_df[0]
X_train = train_df.drop(0, axis=1)
X_test = test_df.drop(0, axis=1)

# Wrap the data in LightGBM Dataset objects.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
# A plain (data, label) tuple of length 2 works as well:
lgb_train = (X_train, y_train)
lgb_eval = (X_test, y_test)

# Training configuration, passed straight through to the core library.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    # 'ndcg_eval_at': [1, 3, 5, 10],
    # this metric is not needed for this task; shown only as an example
    'verbose': 0
}

# Train with early stopping on the held-out data.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_datas=lgb_eval,
                # a list may be used for multiple valid_datas/valid_names;
                # a tuple always denotes a single dataset
                early_stopping_rounds=10)

# Round-trip the model through a text file.
gbm.save_model('model.txt')
gbm = lgb.Booster(model_file='model.txt')

# Predict with the best iteration found by early stopping and report RMSE.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('The rmse of prediction is:', rmse)

# Serialize the model structure as JSON.
model_json = gbm.dump_model()
with open('model.json', 'w+') as model_file:
    json.dump(model_json, model_file, indent=4)
# coding: utf-8
# pylint: disable = invalid-name, C0111
"""Regression example using the scikit-learn style LGBMRegressor wrapper."""
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error

# Load the demo dataset: tab-separated, no header, label in column 0.
train_df = pd.read_csv('../regression/regression.train', header=None, sep='\t')
test_df = pd.read_csv('../regression/regression.test', header=None, sep='\t')

y_train = train_df[0]
y_test = test_df[0]
X_train = train_df.drop(0, axis=1)
X_test = test_df.drop(0, axis=1)

# Fit the sklearn-style estimator, early-stopping on the test split.
gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=100)
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=10)

# Predict with the best iteration and report RMSE.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print('The rmse of prediction is:', rmse)
......@@ -51,6 +51,18 @@ public:
explicit BinMapper(const void* memory);
~BinMapper();
bool CheckAlign(const BinMapper& other) const {
  // Two mappers are aligned only when they discretize a feature identically:
  // same number of bins and an identical upper bound for every bin.
  if (num_bin_ != other.num_bin_) {
    return false;
  }
  int idx = 0;
  while (idx < num_bin_) {
    if (bin_upper_bound_[idx] != other.bin_upper_bound_[idx]) {
      return false;
    }
    ++idx;
  }
  return true;
}
/*! \brief Get number of bins */
inline int num_bin() const { return num_bin_; }
/*! \brief True if bin is trivial (contains only one bin) */
......
......@@ -35,12 +35,34 @@ public:
const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) = 0;
/*!
* \brief Merge model from other boosting object
Will insert to the front of current boosting object
* \param other
*/
virtual void MergeFrom(const Boosting* other) = 0;
/*!
* \brief Reset training data for current boosting
* \param config Configs for boosting
* \param train_data Training data
* \param object_function Training objective function
* \param training_metrics Training metric
*/
virtual void ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function, const std::vector<const Metric*>& training_metrics) = 0;
/*!
* \brief Reset shrinkage_rate data for current boosting
* \param shrinkage_rate Configs for boosting
*/
virtual void ResetShrinkageRate(double shrinkage_rate) = 0;
/*!
* \brief Add a validation data
* \param valid_data Validation data
* \param valid_metrics Metric for validation data
*/
virtual void AddDataset(const Dataset* valid_data,
virtual void AddValidDataset(const Dataset* valid_data,
const std::vector<const Metric*>& valid_metrics) = 0;
/*!
......@@ -52,6 +74,19 @@ public:
*/
virtual bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) = 0;
/*!
* \brief Rollback one iteration
*/
virtual void RollbackOneIter() = 0;
/*!
* \brief return current iteration
*/
virtual int GetCurrentIteration() const = 0;
/*!
* \brief Eval metrics and check is met early stopping or not
*/
virtual bool EvalAndCheckEarlyStopping() = 0;
/*!
* \brief Get evaluation result at data_idx data
......@@ -73,7 +108,7 @@ public:
* \param result used to store prediction result, should allocate memory before call this function
* \param out_len length of returned score
*/
virtual void GetPredictAt(int data_idx, score_t* result, data_size_t* out_len) const = 0;
virtual void GetPredictAt(int data_idx, score_t* result, data_size_t* out_len) = 0;
/*!
* \brief Prediction for one record, not sigmoid transform
......@@ -98,12 +133,18 @@ public:
const double* feature_values) const = 0;
/*!
* \brief save model to file
* \param num_used_model number of model that want to save, -1 means save all
* \param is_finish is training finished or not
* \param filename filename that want to save to
* \brief Dump model to json format string
* \return Json format string of model
*/
virtual std::string DumpModel() const = 0;
/*!
* \brief Save model to file
* \param num_used_model Number of model that want to save, -1 means save all
* \param is_finish Is training finished or not
* \param filename Filename that want to save to
*/
virtual void SaveModelToFile(int num_used_model, bool is_finish, const char* filename) = 0;
virtual void SaveModelToFile(int num_iterations, const char* filename) const = 0;
/*!
* \brief Restore from a serialized string
......@@ -127,7 +168,7 @@ public:
* \brief Get number of weak sub-models
* \return Number of weak sub-models
*/
virtual int NumberOfSubModels() const = 0;
virtual int NumberOfTotalModel() const = 0;
/*!
* \brief Get number of classes
......@@ -138,7 +179,7 @@ public:
/*!
* \brief Set number of used model for prediction
*/
virtual void SetNumUsedModel(int num_used_model) = 0;
virtual void SetNumIterationForPred(int num_iteration) = 0;
/*!
* \brief Get Type name of this boosting object
......@@ -151,6 +192,8 @@ public:
/*! \brief Disable copy */
Boosting(const Boosting&) = delete;
static void LoadFileToBoosting(Boosting* boosting, const char* filename);
/*!
* \brief Create boosting object
* \param type Type of boosting
......
......@@ -3,13 +3,15 @@
#include <cstdint>
#include <exception>
#include <stdexcept>
#include <cstring>
#include <string>
/*!
* To avoid type conversion on large data, most of our expose interface support both for float_32 and float_64.
* Except following:
* 1. gradients and hessians.
* 1. gradients and hessians.
* 2. Get current score for training data and validation
* The reason is because they are called frequently, the type-conversion on them maybe time cost.
* The reason is because they are called frequently, the type-conversion on them maybe time cost.
*/
#ifdef __cplusplus
......@@ -38,7 +40,7 @@ typedef void* BoosterHandle;
/*!
* \brief get string message of the last error
* all function in this file will return 0 when success
* all function in this file will return 0 when succeed
* and -1 when an error occurred,
* \return const char* error information
*/
......@@ -53,38 +55,29 @@ DllExport const char* LGBM_GetLastError();
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means don't used
* \param out a loaded dataset
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromFile(const char* filename,
DllExport int LGBM_DatasetCreateFromFile(const char* filename,
const char* parameters,
const DatesetHandle* reference,
DatesetHandle* out);
/*!
* \brief load data set from binary file like the command_line LightGBM do
* \param filename the name of the file
* \param out a loaded dataset
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromBinaryFile(const char* filename,
DatesetHandle* out);
/*!
* \brief create a dataset from CSR format
* \param indptr pointer to row headers
* \param indptr_type
* \param indptr_type type of indptr, can be C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices findex
* \param data fvalue
* \param data_type
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nindptr number of rows in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_col number of columns; when it's set to 0, then guess from data
* \param num_col number of columns
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means don't used
* \param out created dataset
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,
DllExport int LGBM_DatasetCreateFromCSR(const void* indptr,
int indptr_type,
const int32_t* indices,
const void* data,
......@@ -99,19 +92,19 @@ DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,
/*!
* \brief create a dataset from CSC format
* \param col_ptr pointer to col headers
* \param col_ptr_type
* \param col_ptr_type type of col_ptr, can be C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices findex
* \param data fvalue
* \param data_type
* \param ncol_ptr number of rows in the matrix + 1
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param ncol_ptr number of cols in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_row number of rows; when it's set to 0, then guess from data
* \param num_row number of rows
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means don't used
* \param out created dataset
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,
DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr,
int col_ptr_type,
const int32_t* indices,
const void* data,
......@@ -126,16 +119,16 @@ DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,
/*!
* \brief create dataset from dense matrix
* \param data pointer to the data space
* \param data_type 0
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nrow number of rows
* \param ncol number columns
* \param is_row_major 1 for row major, 0 for column major
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means don't used
* \param out created dataset
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromMat(const void* data,
DllExport int LGBM_DatasetCreateFromMat(const void* data,
int data_type,
int32_t nrow,
int32_t ncol,
......@@ -144,9 +137,25 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
const DatesetHandle* reference,
DatesetHandle* out);
/*!
* \brief Create subset of a data
* \param handle handle of full dataset
* \param used_row_indices Indices used in subset
* \param num_used_row_indices len of used_row_indices
* \param parameters additional parameters
* \param out subset of data
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetGetSubset(
const DatesetHandle* handle,
const int32_t* used_row_indices,
int32_t num_used_row_indices,
const char* parameters,
DatesetHandle* out);
/*!
* \brief free space for dataset
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetFree(DatesetHandle handle);
......@@ -154,19 +163,21 @@ DllExport int LGBM_DatasetFree(DatesetHandle handle);
* \brief save dateset to binary file
* \param handle a instance of dataset
* \param filename file name
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetSaveBinary(DatesetHandle handle,
const char* filename);
/*!
* \brief set vector to a content in info
* Note: group and group only work for C_API_DTYPE_INT32
* label and weight only work for C_API_DTYPE_FLOAT32
* \param handle a instance of dataset
* \param field_name field name, can be label, weight, group
* \param field_name field name, can be label, weight, group, group_id
* \param field_data pointer to vector
* \param num_element number of element in field_data
* \param type float_32:0, int32_t:1
* \return 0 when success, -1 when failure happens
* \param type C_API_DTYPE_FLOAT32 or C_API_DTYPE_INT32
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetSetField(DatesetHandle handle,
const char* field_name,
......@@ -180,8 +191,8 @@ DllExport int LGBM_DatasetSetField(DatesetHandle handle,
* \param field_name field name
* \param out_len used to set result length
* \param out_ptr pointer to the result
* \param out_type float_32:0, int32_t:1
* \return 0 when success, -1 when failure happens
* \param out_type C_API_DTYPE_FLOAT32 or C_API_DTYPE_INT32
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetGetField(DatesetHandle handle,
const char* field_name,
......@@ -193,7 +204,7 @@ DllExport int LGBM_DatasetGetField(DatesetHandle handle,
* \brief get number of data.
* \param handle the handle to the dataset
* \param out The address to hold number of data
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetGetNumData(DatesetHandle handle,
int64_t* out);
......@@ -202,7 +213,7 @@ DllExport int LGBM_DatasetGetNumData(DatesetHandle handle,
* \brief get number of features
* \param handle the handle to the dataset
* \param out The output of number of features
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetGetNumFeature(DatesetHandle handle,
int64_t* out);
......@@ -212,42 +223,82 @@ DllExport int LGBM_DatasetGetNumFeature(DatesetHandle handle,
/*!
* \brief create an new boosting learner
* \param train_data training data set
* \param valid_datas validation data sets
* \param valid_names names of validation data sets
* \param n_valid_datas number of validation set
* \param parameters format: 'key1=value1 key2=value2'
* \param out handle of created Booster
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterCreate(const DatesetHandle train_data,
const DatesetHandle valid_datas[],
const char* valid_names[],
int n_valid_datas,
const char* parameters,
BoosterHandle* out);
/*!
* \brief load an existing boosting from model file
* \param filename filename of model
* \param out_num_iterations number of iterations of this booster
* \param out handle of created Booster
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterLoadFromModelfile(
DllExport int LGBM_BoosterCreateFromModelfile(
const char* filename,
int64_t* out_num_iterations,
BoosterHandle* out);
/*!
* \brief free obj in handle
* \param handle handle to be freed
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterFree(BoosterHandle handle);
/*!
* \brief Merge model in two booster to first handle
* \param handle handle, will merge other handle to this
* \param other_handle
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterMerge(BoosterHandle handle,
BoosterHandle other_handle);
/*!
* \brief Add new validation to booster
* \param handle handle
* \param valid_data validation data set
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterAddValidData(BoosterHandle handle,
const DatesetHandle valid_data);
/*!
* \brief Reset training data for booster
* \param handle handle
* \param train_data training data set
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterResetTrainingData(BoosterHandle handle,
const DatesetHandle train_data);
/*!
* \brief Reset config for current booster
* \param handle handle
* \param parameters format: 'key1=value1 key2=value2'
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterResetParameter(BoosterHandle handle, const char* parameters);
/*!
* \brief Get number of class
* \param handle handle
* \param out_len number of class
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetNumClasses(BoosterHandle handle, int64_t* out_len);
/*!
* \brief update the model in one round
* \param handle handle
* \param is_finished 1 means finished (cannot split any more)
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished);
......@@ -258,7 +309,7 @@ DllExport int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished);
* \param grad gradient statistics
* \param hess second order gradient statistics
* \param is_finished 1 means finished (cannot split any more)
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle,
const float* grad,
......@@ -266,81 +317,106 @@ DllExport int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle,
int* is_finished);
/*!
* \brief get evaluation for training data and validation data
* \brief Rollback one iteration
* \param handle handle
* \param data 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param out_len len of output result
* \param out_result the string containing evaluation statistics, should allocate memory before call this function
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterEval(BoosterHandle handle,
int data,
int64_t* out_len,
float* out_results);
DllExport int LGBM_BoosterRollbackOneIter(BoosterHandle handle);
/*!
* \brief Get iteration of current boosting rounds
* \param out_iteration iteration of boosting rounds
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetCurrentIteration(BoosterHandle handle, int64_t* out_iteration);
/*!
* \brief get raw score for training data, used to calculate gradients outside
* \brief Get number of eval
* \param out_len total number of eval results
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetEvalCounts(BoosterHandle handle, int64_t* out_len);
/*!
* \brief Get Name of eval
* \param out_len total number of eval results
* \param out_strs names of eval result
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetEvalNames(BoosterHandle handle, int64_t* out_len, char** out_strs);
/*!
* \brief get evaluation for training data and validation data
Note: 1. you should call LGBM_BoosterGetEvalNames first to get the name of evaluation results
2. should pre-allocate memory for out_results, you can get its length by LGBM_BoosterGetEvalCounts
* \param handle handle
* \param data_idx 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param out_len len of output result
* \param out_result used to set a pointer to array
* \return 0 when success, -1 when failure happens
* \param out_result float array containing the results
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetScore(BoosterHandle handle,
DllExport int LGBM_BoosterGetEval(BoosterHandle handle,
int data_idx,
int64_t* out_len,
const float** out_result);
float* out_results);
/*!
* \brief Get prediction for training data and validation data
this can be used to support customized eval function
this can be used to support customized eval function
Note: should pre-allocate memory for out_result, its length is equal to num_class * num_data
* \param handle handle
* \param data 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param data_idx 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param out_len len of output result
* \param out_result used to set a pointer to array, should allocate memory before call this function
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetPredict(BoosterHandle handle,
int data,
int data_idx,
int64_t* out_len,
float* out_result);
/*!
* \brief make prediction for file
* \param handle handle
* \param predict_type
* 0:raw score
* 1:with transform(if needed)
* 2:leaf index
* \param n_used_trees number of used tree
* \param data_has_header data file has header or not
* \param data_filename filename of data file
* \param data_has_header data file has header or not
* \param predict_type
* C_API_PREDICT_NORMAL: normal prediction, with transform (if needed)
* C_API_PREDICT_RAW_SCORE: raw score
* C_API_PREDICT_LEAF_INDEX: leaf index
* \param num_iteration number of iteration for prediction, <= 0 means no limit
* \param result_filename filename of result file
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForFile(BoosterHandle handle,
int predict_type,
int64_t n_used_trees,
int data_has_header,
const char* data_filename,
int data_has_header,
int predict_type,
int64_t num_iteration,
const char* result_filename);
/*!
* \brief make prediction for an new data set
* Note: should pre-allocate memory for out_result,
* for normal and raw score: its length is equal to num_class * num_data
* for leaf index, its length is equal to num_class * num_data * num_iteration
* \param handle handle
* \param indptr pointer to row headers
* \param indptr_type
* \param indptr_type type of indptr, can be C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices findex
* \param data fvalue
* \param data_type
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nindptr number of rows in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_col number of columns; when it's set to 0, then guess from data
* \param predict_type
* 0:raw score
* 1:with transform(if needed)
* 2:leaf index
* \param n_used_trees number of used tree
* C_API_PREDICT_NORMAL: normal prediction, with transform (if needed)
* C_API_PREDICT_RAW_SCORE: raw score
* C_API_PREDICT_LEAF_INDEX: leaf index
* \param num_iteration number of iteration for prediction, <= 0 means no limit
* \param out_len len of output result
* \param out_result used to set a pointer to array, should allocate memory before call this function
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
const void* indptr,
......@@ -352,24 +428,29 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
int64_t nelem,
int64_t num_col,
int predict_type,
int64_t n_used_trees,
double* out_result);
int64_t num_iteration,
int64_t* out_len,
float* out_result);
/*!
* \brief make prediction for an new data set
* Note: should pre-allocate memory for out_result,
* for normal and raw score: its length is equal to num_class * num_data
* for leaf index, its length is equal to num_class * num_data * num_iteration
* \param handle handle
* \param data pointer to the data space
* \param data_type
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nrow number of rows
* \param ncol number columns
* \param is_row_major 1 for row major, 0 for column major
* \param predict_type
* 0:raw score
* 1:with transform(if needed)
* 2:leaf index
* \param n_used_trees number of used tree
* C_API_PREDICT_NORMAL: normal prediction, with transform (if needed)
* C_API_PREDICT_RAW_SCORE: raw score
* C_API_PREDICT_LEAF_INDEX: leaf index
* \param num_iteration number of iteration for prediction, <= 0 means no limit
* \param out_len len of output result
* \param out_result used to set a pointer to array, should allocate memory before call this function
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
const void* data,
......@@ -378,21 +459,33 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
int32_t ncol,
int is_row_major,
int predict_type,
int64_t n_used_trees,
double* out_result);
int64_t num_iteration,
int64_t* out_len,
float* out_result);
/*!
* \brief save model into file
* \param handle handle
* \param num_used_model
* \param num_iteration, <= 0 means save all
* \param filename file name
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
int num_used_model,
int num_iteration,
const char* filename);
/*!
* \brief dump model to json
* \param handle handle
* \param buffer_len string buffer length, if buffer_len < out_len, re-allocate buffer
* \param out_len actual output length
* \param out_str json format string of model
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterDumpModel(BoosterHandle handle,
int buffer_len,
int64_t* out_len,
char** out_str);
// some help functions used to convert data
......@@ -403,23 +496,25 @@ std::function<std::vector<std::pair<int, double>>(int row_idx)>
RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major);
std::function<std::vector<std::pair<int, double>>(int idx)>
RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices,
const void* data, int data_type, int64_t nindptr, int64_t nelem);
std::function<std::vector<std::pair<int, double>>(int idx)>
ColumnFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* indices,
ColumnFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* indices,
const void* data, int data_type, int64_t ncol_ptr, int64_t nelem);
std::vector<double>
std::vector<double>
SampleFromOneColumn(const std::vector<std::pair<int, double>>& data, const std::vector<int>& indices);
#if defined(_MSC_VER)
// exception handle and error msg
static std::string& LastErrorMsg() { static std::string err_msg("Everything is fine"); return err_msg; }
static char* LastErrorMsg() { static __declspec(thread) char err_msg[512] = "Everything is fine"; return err_msg; }
#else
static char* LastErrorMsg() { static thread_local char err_msg[512] = "Everything is fine"; return err_msg; }
#endif
inline void LGBM_SetLastError(const char* msg) {
LastErrorMsg() = msg;
std::strcpy(LastErrorMsg(), msg);
}
inline int LGBM_APIHandleException(const std::exception& ex) {
......@@ -437,6 +532,6 @@ inline int LGBM_APIHandleException(const std::string& ex) {
catch(std::exception& ex) { return LGBM_APIHandleException(ex); } \
catch(std::string& ex) { return LGBM_APIHandleException(ex); } \
catch(...) { return LGBM_APIHandleException("unknown exception"); } \
return 0;
return 0;
#endif // LIGHTGBM_C_API_H_
......@@ -72,6 +72,8 @@ public:
inline bool GetBool(
const std::unordered_map<std::string, std::string>& params,
const std::string& name, bool* out);
static std::unordered_map<std::string, std::string> Str2Map(const char* parameters);
};
/*! \brief Types of boosting */
......@@ -97,7 +99,7 @@ public:
std::string output_result = "LightGBM_predict_result.txt";
std::string input_model = "";
int verbosity = 1;
int num_model_predict = NO_LIMIT;
int num_iteration_predict = -1;
bool is_pre_partition = false;
bool is_enable_sparse = true;
bool use_two_round_loading = false;
......@@ -136,6 +138,8 @@ public:
bool is_unbalance = false;
// for multiclass
int num_class = 1;
// Balancing of positive and negative weights
double scale_pos_weight = 1.0f;
void Set(const std::unordered_map<std::string, std::string>& params) override;
};
......@@ -164,12 +168,12 @@ public:
int feature_fraction_seed = 2;
double feature_fraction = 1.0f;
// max cache size(unit:MB) for historical histogram. < 0 means not limit
double histogram_pool_size = NO_LIMIT;
double histogram_pool_size = -1.0f;
// max depth of tree model.
// Still grow tree by leaf-wise, but limit the max depth to avoid over-fitting
// And the max leaves will be min(num_leaves, pow(2, max_depth - 1))
// max_depth < 0 means not limit
int max_depth = NO_LIMIT;
int max_depth = -1;
void Set(const std::unordered_map<std::string, std::string>& params) override;
};
......@@ -231,7 +235,7 @@ public:
MetricConfig metric_config;
void Set(const std::unordered_map<std::string, std::string>& params) override;
void LoadFromString(const char* str);
private:
void GetBoostingType(const std::unordered_map<std::string, std::string>& params);
......@@ -328,17 +332,22 @@ struct ParameterAlias {
{ "ndcg_at", "ndcg_eval_at" },
{ "min_data_per_leaf", "min_data_in_leaf" },
{ "min_data", "min_data_in_leaf" },
{ "min_child_samples", "min_data_in_leaf" },
{ "min_sum_hessian_per_leaf", "min_sum_hessian_in_leaf" },
{ "min_sum_hessian", "min_sum_hessian_in_leaf" },
{ "min_hessian", "min_sum_hessian_in_leaf" },
{ "min_child_weight", "min_sum_hessian_in_leaf" },
{ "num_leaf", "num_leaves" },
{ "sub_feature", "feature_fraction" },
{ "colsample_bytree", "feature_fraction" },
{ "num_iteration", "num_iterations" },
{ "num_tree", "num_iterations" },
{ "num_round", "num_iterations" },
{ "num_trees", "num_iterations" },
{ "num_rounds", "num_iterations" },
{ "sub_row", "bagging_fraction" },
{ "subsample", "bagging_fraction" },
{ "subsample_freq", "bagging_freq" },
{ "shrinkage_rate", "learning_rate" },
{ "tree", "tree_learner" },
{ "num_machine", "num_machines" },
......@@ -361,6 +370,9 @@ struct ParameterAlias {
{ "blacklist", "ignore_column" },
{ "predict_raw_score", "is_predict_raw_score" },
{ "predict_leaf_index", "is_predict_leaf_index" },
{ "min_split_gain", "min_gain_to_split" },
{ "reg_alpha", "lambda_l1" },
{ "reg_lambda", "lambda_l2" },
{ "num_classes", "num_class" }
});
std::unordered_map<std::string, std::string> tmp_map;
......
......@@ -13,6 +13,7 @@
#include <functional>
#include <string>
#include <unordered_set>
#include <mutex>
namespace LightGBM {
......@@ -46,6 +47,13 @@ public:
*/
void Init(const char* data_filename, const int num_class);
/*!
* \brief init as subset
* \param metadata Filename of data
* \param used_indices
* \param num_used_indices
*/
void Init(const Metadata& metadata, const data_size_t* used_indices, data_size_t num_used_indices);
/*!
* \brief Initial with binary memory
* \param memory Pointer to memory
*/
......@@ -76,13 +84,14 @@ public:
void CheckOrPartition(data_size_t num_all_data,
const std::vector<data_size_t>& used_data_indices);
void SetLabel(const float* label, data_size_t len);
void SetWeights(const float* weights, data_size_t len);
void SetQueryBoundaries(const data_size_t* query_boundaries, data_size_t len);
void SetQueryId(const data_size_t* query_id, data_size_t len);
/*!
* \brief Set initial scores
* \param init_score Initial scores, this class will manage memory for init_score.
......@@ -141,8 +150,13 @@ public:
* \brief Get weights, if not exists, will return nullptr
* \return Pointer of weights
*/
inline const float* weights()
const { return weights_.data(); }
/*! \brief Get per-record weights; returns nullptr when no weights were loaded */
inline const float* weights() const {
  // an empty vector means the dataset is unweighted
  if (weights_.size() > 0) {
    return weights_.data();
  } else {
    return nullptr;
  }
}
/*!
* \brief Get data boundaries on queries, if not exists, will return nullptr
......@@ -151,8 +165,13 @@ public:
* is the data indices for query i.
* \return Pointer of data boundaries on queries
*/
inline const data_size_t* query_boundaries()
const { return query_boundaries_.data(); }
/*! \brief Get query boundary indices; returns nullptr when no query data was loaded */
inline const data_size_t* query_boundaries() const {
  // an empty vector means no query/group information exists
  if (query_boundaries_.size() > 0) {
    return query_boundaries_.data();
  } else {
    return nullptr;
  }
}
/*!
* \brief Get Number of queries
......@@ -164,13 +183,25 @@ public:
* \brief Get weights for queries, if not exists, will return nullptr
* \return Pointer of weights for queries
*/
inline const float* query_weights() const { return query_weights_.data(); }
/*! \brief Get per-query weights; returns nullptr when none were loaded */
inline const float* query_weights() const {
  // an empty vector means queries are unweighted
  if (query_weights_.size() > 0) {
    return query_weights_.data();
  } else {
    return nullptr;
  }
}
/*!
* \brief Get initial scores, if not exists, will return nullptr
* \return Pointer of initial scores
*/
inline const float* init_score() const { return init_score_.data(); }
/*! \brief Get initial scores; returns nullptr when no initial scores were set */
inline const float* init_score() const {
  // an empty vector means no initial scores exist
  if (init_score_.size() > 0) {
    return init_score_.data();
  } else {
    return nullptr;
  }
}
/*! \brief Disable copy */
Metadata& operator=(const Metadata&) = delete;
......@@ -210,6 +241,8 @@ private:
std::vector<float> init_score_;
/*! \brief Queries data */
std::vector<data_size_t> queries_;
/*! \brief mutex for threading safe call */
std::mutex mutex_;
};
......@@ -253,6 +286,27 @@ public:
/*! \brief Destructor */
~Dataset();
/*! \brief Check whether another Dataset has an identical schema:
 *  same feature counts, class count, label column index, and
 *  pairwise-aligned features. Returns false on the first mismatch. */
bool CheckAlign(const Dataset& other) const {
  if (num_features_ != other.num_features_) {
    return false;
  }
  if (num_total_features_ != other.num_total_features_) {
    return false;
  }
  if (num_class_ != other.num_class_) {
    return false;
  }
  if (label_idx_ != other.label_idx_) {
    return false;
  }
  // every used feature must align with its counterpart as well
  for (int i = 0; i < num_features_; ++i) {
    if (!features_[i]->CheckAlign(*(other.features_[i].get()))) {
      return false;
    }
  }
  return true;
}
inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<double>& feature_values) {
for (size_t i = 0; i < feature_values.size() && i < static_cast<size_t>(num_total_features_); ++i) {
int feature_idx = used_feature_map_[i];
......@@ -282,6 +336,8 @@ public:
}
}
Dataset* Subset(const data_size_t* used_indices, data_size_t num_used_indices, bool is_enable_sparse) const;
void FinishLoad();
bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
......@@ -348,12 +404,12 @@ private:
int num_class_;
/*! \brief Store some label level data*/
Metadata metadata_;
/*! \brief True if dataset is loaded from binary file */
bool is_loading_from_binfile_;
/*! \brief index of label column */
int label_idx_ = 0;
/*! \brief store feature names */
std::vector<std::string> feature_names_;
/*! \brief store feature names */
static const char* binary_file_token;
};
} // namespace LightGBM
......
......@@ -49,7 +49,7 @@ private:
void ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset);
/*! \brief Check can load from binary file */
bool CheckCanLoadFromBin(const char* filename);
std::string CheckCanLoadFromBin(const char* filename);
const IOConfig& io_config_;
/*! \brief Random generator*/
......
......@@ -63,6 +63,13 @@ public:
~Feature() {
}
/*! \brief Check whether another Feature has the same index and an
 *  aligned bin mapper (delegates to BinMapper::CheckAlign) */
bool CheckAlign(const Feature& other) const {
  if (feature_index_ != other.feature_index_) {
    return false;
  }
  return bin_mapper_->CheckAlign(*(other.bin_mapper_.get()));
}
/*!
* \brief Push one record, will auto convert to bin and push to bin data
* \param tid Thread id
......@@ -73,6 +80,9 @@ public:
unsigned int bin = bin_mapper_->ValueToBin(value);
bin_data_->Push(tid, line_idx, bin);
}
/*! \brief Push an already-binned value directly into the bin data
 *  (skips the value-to-bin conversion done by the raw-value push) */
inline void PushBin(int tid, data_size_t line_idx, unsigned int bin) {
  bin_data_->Push(tid, line_idx, bin);
}
inline void FinishLoad() { bin_data_->FinishLoad(); }
/*! \brief Index of this feature */
inline int feature_index() const { return feature_index_; }
......
......@@ -24,7 +24,6 @@ using ReduceFunction = std::function<void(const char*, char*, int)>;
using PredictFunction =
std::function<std::vector<double>(const std::vector<std::pair<int, double>>&)>;
#define NO_LIMIT (-1)
#define NO_SPECIFIC (-1)
} // namespace LightGBM
......
......@@ -24,8 +24,7 @@ public:
* \param metadata Label data
* \param num_data Number of data
*/
virtual void Init(const char* test_name,
const Metadata& metadata, data_size_t num_data) = 0;
virtual void Init(const Metadata& metadata, data_size_t num_data) = 0;
virtual const std::vector<std::string>& GetName() const = 0;
......
......@@ -98,13 +98,12 @@ public:
}
}
/*! \brief Serialize this object by string*/
/*! \brief Serialize this object to string*/
std::string ToString();
/*! \brief Disable copy */
Tree& operator=(const Tree&) = delete;
/*! \brief Disable copy */
Tree(const Tree&) = delete;
/*! \brief Serialize this object to json*/
std::string ToJSON();
private:
/*!
* \brief Find leaf index of which record belongs by data
......@@ -122,6 +121,9 @@ private:
*/
inline int GetLeaf(const double* feature_values) const;
/*! \brief Serialize one node to json*/
inline std::string NodeToJSON(int index);
/*! \brief Number of max leaves*/
int max_leaves_;
/*! \brief Number of current levas*/
......@@ -141,13 +143,13 @@ private:
std::vector<double> threshold_;
/*! \brief A non-leaf node's split gain */
std::vector<double> split_gain_;
/*! \brief Output of internal nodes(save internal output for per inference feature importance calc) */
std::vector<double> internal_value_;
// used for leaf node
/*! \brief The parent of leaf */
std::vector<int> leaf_parent_;
/*! \brief Output of leaves */
std::vector<double> leaf_value_;
/*! \brief Output of internal nodes(save internal output for per inference feature importance calc) */
std::vector<double> internal_value_;
/*! \brief Depth for leaves */
std::vector<int> leaf_depth_;
};
......
......@@ -89,7 +89,11 @@ private:
// a trick to use static variable in header file.
// May be not good, but avoid to use an additional cpp file
static LogLevel& GetLevel() { static LogLevel level; return level; }
#if defined(_MSC_VER)
static LogLevel& GetLevel() { static __declspec(thread) LogLevel level = LogLevel::Info; return level; }
#else
static LogLevel& GetLevel() { static thread_local LogLevel level = LogLevel::Info; return level; }
#endif
};
......
LightGBM Python Package
=======================
Installation
------------
1. Follow the `Installation Guide <https://github.com/Microsoft/LightGBM/wiki/Installation-Guide>`__ to build LightGBM first.
   For Windows users, please change the build config to ``DLL``.
2. Install with ``cd python-package; python setup.py install``
Note: make sure you have `setuptools <https://pypi.python.org/pypi/setuptools>`__ installed.
Examples
--------
- Refer also to the walk-through examples in the `python-guide
  folder <https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide>`__.
# coding: utf-8
"""LightGBM, Light Gradient Boosting Machine.

Contributors: https://github.com/Microsoft/LightGBM/graphs/contributors
"""
from __future__ import absolute_import

import os

from .basic import Predictor, Dataset, Booster
from .engine import train, cv
try:
    # the scikit-learn wrappers are optional: skip them when sklearn is absent
    from .sklearn import LGBMModel, LGBMRegressor, LGBMClassifier, LGBMRanker
except ImportError:
    pass

# version is a string, per packaging convention (was the float 0.1)
__version__ = '0.1'

__all__ = ['Dataset', 'Booster',
           'train', 'cv',
           'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker']
# coding: utf-8
# pylint: disable = invalid-name, C0111, R0912, R0913, R0914, W0105
"""Wrapper c_api of LightGBM"""
from __future__ import absolute_import
import sys
import os
import ctypes
import tempfile
import json
import numpy as np
import scipy.sparse
from .libpath import find_lib_path
# pandas
# Optional dependency: when pandas is missing, define empty stand-in classes
# so that isinstance(..., Series/DataFrame) checks elsewhere always work.
try:
    from pandas import Series, DataFrame
    IS_PANDAS_INSTALLED = True
except ImportError:
    IS_PANDAS_INSTALLED = False

    class Series(object):
        """Dummy placeholder used when pandas is not installed."""
        pass

    class DataFrame(object):
        """Dummy placeholder used when pandas is not installed."""
        pass

# True when running under Python 3 (drives str vs. basestring checks)
IS_PY3 = (sys.version_info[0] == 3)
def _load_lib():
    """Locate and load the LightGBM shared library.

    Returns
    -------
    ctypes.CDLL
        Loaded library, with ``LGBM_GetLastError`` configured to return
        a C string.

    Raises
    ------
    Exception
        If no library file can be found.
    """
    candidates = find_lib_path()
    if not candidates:
        raise Exception("cannot find LightGBM library")
    # the first discovered path wins
    lib = ctypes.cdll.LoadLibrary(candidates[0])
    # error messages come back as C strings, not the default int
    lib.LGBM_GetLastError.restype = ctypes.c_char_p
    return lib
_LIB = _load_lib()
class LightGBMError(Exception):
    """Error raised from the LightGBM C API."""
    pass
def _safe_call(ret):
"""Check the return value of C API call
Parameters
----------
ret : int
return value from API calls
"""
if ret != 0:
raise LightGBMError(_LIB.LGBM_GetLastError())
def is_str(s):
    """Check whether *s* is a string (``str``, or ``basestring`` under Python 2)."""
    string_type = str if IS_PY3 else basestring
    return isinstance(s, string_type)
def is_numpy_object(data):
    """Check whether *data*'s type is defined by the numpy module."""
    module_of_type = type(data).__module__
    return module_of_type == np.__name__
def is_numpy_1d_array(data):
    """Check whether *data* is a one-dimensional numpy array."""
    return isinstance(data, np.ndarray) and len(data.shape) == 1
def is_1d_list(data):
    """Check whether *data* is a flat list of scalars.

    Only the first element is inspected (cheap heuristic, as in the
    rest of this module).
    """
    if not isinstance(data, list):
        return False
    if data and not isinstance(data[0], (int, float, bool)):
        return False
    return True
def list_to_1d_numpy(data, dtype):
    """Convert a 1-D list / numpy array / pandas Series to a numpy array of *dtype*."""
    if is_numpy_1d_array(data):
        # already an ndarray: avoid a copy when the dtype already matches
        if data.dtype == dtype:
            return data
        return data.astype(dtype=dtype, copy=False)
    if is_1d_list(data):
        return np.array(data, dtype=dtype, copy=False)
    if IS_PANDAS_INSTALLED and isinstance(data, Series):
        return data.astype(dtype).values
    raise TypeError("Unknow type({})".format(type(data).__name__))
def cfloat32_array_to_numpy(cptr, length):
    """Copy *length* values from a ctypes float pointer into a float32 numpy array."""
    if not isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
        raise RuntimeError('expected float pointer')
    return np.fromiter(cptr, dtype=np.float32, count=length)
def cint32_array_to_numpy(cptr, length):
    """Copy *length* values from a ctypes int32 pointer into an int32 numpy array."""
    if not isinstance(cptr, ctypes.POINTER(ctypes.c_int32)):
        raise RuntimeError('expected int pointer')
    return np.fromiter(cptr, dtype=np.int32, count=length)
def c_str(string):
    """Encode a python string as a UTF-8 C char pointer."""
    encoded = string.encode('utf-8')
    return ctypes.c_char_p(encoded)
def c_array(ctype, values):
    """Build a fixed-length ctypes array of *ctype* holding *values*."""
    array_type = ctype * len(values)
    return array_type(*values)
def param_dict_to_str(data):
    """Serialize a parameter dict into the space-separated 'key=value' string
    format the LightGBM C API expects.  List/tuple values become
    comma-separated; None or an empty dict yields the empty string.
    """
    if data is None or len(data) == 0:
        return ""
    pairs = []
    for key, val in data.items():
        if is_str(val):
            pairs.append('{0}={1}'.format(key, val))
        elif isinstance(val, (list, tuple)):
            joined = ','.join(map(str, val))
            pairs.append('{0}={1}'.format(key, joined))
        elif isinstance(val, (int, float, bool)):
            pairs.append('{0}={1}'.format(key, val))
        else:
            raise TypeError('unknow type of parameter:%s , got:%s'
                            % (key, type(val).__name__))
    return ' '.join(pairs)
"""marco definition of data type in c_api of LightGBM"""
C_API_DTYPE_FLOAT32 = 0  # 32-bit float
C_API_DTYPE_FLOAT64 = 1  # 64-bit float
C_API_DTYPE_INT32 = 2    # 32-bit int
C_API_DTYPE_INT64 = 3    # 64-bit int

"""Matric is row major in python"""
C_API_IS_ROW_MAJOR = 1

# prediction task types accepted by the C API
C_API_PREDICT_NORMAL = 0      # ordinary (transformed) prediction
C_API_PREDICT_RAW_SCORE = 1   # raw, untransformed scores
C_API_PREDICT_LEAF_INDEX = 2  # predicted leaf index per tree

# expected C data type for each settable/gettable Dataset field
FIELD_TYPE_MAPPER = {"label": C_API_DTYPE_FLOAT32,
                     "weight": C_API_DTYPE_FLOAT32,
                     "init_score": C_API_DTYPE_FLOAT32,
                     "group": C_API_DTYPE_INT32}
def c_float_array(data):
    """Convert a 1-D list or float numpy array to a C float pointer.

    Returns a ``(pointer, type_code)`` pair where *type_code* is the
    matching ``C_API_DTYPE_*`` constant.
    """
    if is_1d_list(data):
        data = np.array(data, copy=False)
    if not is_numpy_1d_array(data):
        raise TypeError("Unknow type({})".format(type(data).__name__))
    if data.dtype == np.float32:
        pointer = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
        return (pointer, C_API_DTYPE_FLOAT32)
    if data.dtype == np.float64:
        pointer = data.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
        return (pointer, C_API_DTYPE_FLOAT64)
    raise TypeError("expected np.float32 or np.float64, met type({})"
                    .format(data.dtype))
def c_int_array(data):
    """Convert a 1-D list or integer numpy array to a C int pointer.

    Returns a ``(pointer, type_code)`` pair where *type_code* is the
    matching ``C_API_DTYPE_*`` constant.
    """
    if is_1d_list(data):
        data = np.array(data, copy=False)
    if not is_numpy_1d_array(data):
        raise TypeError("Unknow type({})".format(type(data).__name__))
    if data.dtype == np.int32:
        pointer = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
        return (pointer, C_API_DTYPE_INT32)
    if data.dtype == np.int64:
        pointer = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
        return (pointer, C_API_DTYPE_INT64)
    raise TypeError("expected np.int32 or np.int64, met type({})"
                    .format(data.dtype))
class Predictor(object):
    """A Predictor of LightGBM.

    Thin prediction-only wrapper around a booster handle from the C API.
    It can be constructed from a model file or wrap an existing handle,
    and predicts from a text file path, a 2-D numpy array, or a
    scipy CSR matrix.
    """
    def __init__(self, model_file=None, booster_handle=None, is_manage_handle=True):
        """Initialize the Predictor.

        Parameters
        ----------
        model_file : string, optional
            Path to the model file; when given, a booster is loaded from it.
        booster_handle : ctypes.c_void_p, optional
            Handle of an existing booster (used when model_file is None).
        is_manage_handle : bool, optional
            Whether this object owns booster_handle and must free it.
        """
        self.handle = ctypes.c_void_p()
        # default to owning the handle; overridden below when wrapping an
        # externally managed one
        self.__is_manage_handle = True
        if model_file is not None:
            """Prediction task"""
            out_num_iterations = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
                c_str(model_file),
                ctypes.byref(out_num_iterations),
                ctypes.byref(self.handle)))
            out_num_class = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_num_class)))
            self.num_class = out_num_class.value
            self.__num_total_iteration = out_num_iterations.value
        elif booster_handle is not None:
            self.__is_manage_handle = is_manage_handle
            self.handle = booster_handle
            out_num_class = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_num_class)))
            self.num_class = out_num_class.value
            out_num_iterations = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
                self.handle,
                ctypes.byref(out_num_iterations)))
            self.__num_total_iteration = out_num_iterations.value
        else:
            raise TypeError('Need Model file to create a booster')

    def __del__(self):
        # free the underlying C booster only when this object owns it
        if self.__is_manage_handle:
            _safe_call(_LIB.LGBM_BoosterFree(self.handle))

    def predict(self, data, num_iteration=-1,
                raw_score=False, pred_leaf=False, data_has_header=False,
                is_reshape=True):
        """
        Predict logic

        Parameters
        ----------
        data : string/numpy array/scipy.sparse
            Data source for prediction
            When data is string type, it represents the path of txt file,
        num_iteration : int
            used iteration for prediction; values <= 0 or larger than the
            model use all iterations
        raw_score : bool
            True for predict raw score
        pred_leaf : bool
            True for predict leaf index
        data_has_header : bool
            Used for txt data
        is_reshape : bool
            True for reshape to [nrow, ...]

        Returns
        -------
        Prediction result
        """
        if isinstance(data, Dataset):
            raise TypeError("cannot use Dataset instance for prediction, \
please use raw data instead")
        # pred_leaf wins over raw_score when both are set
        predict_type = C_API_PREDICT_NORMAL
        if raw_score:
            predict_type = C_API_PREDICT_RAW_SCORE
        if pred_leaf:
            predict_type = C_API_PREDICT_LEAF_INDEX
        int_data_has_header = 1 if data_has_header else 0
        # clamp to the number of iterations actually in the model
        if num_iteration > self.__num_total_iteration:
            num_iteration = self.__num_total_iteration
        if is_str(data):
            # file input: the C API writes predictions to a temp file,
            # which is parsed back and removed
            # NOTE(review): only the generated .name is used, so the file is
            # recreated by the C side — confirm this is safe on all platforms
            tmp_pred_fname = tempfile.NamedTemporaryFile(prefix="lightgbm_tmp_pred_").name
            _safe_call(_LIB.LGBM_BoosterPredictForFile(
                self.handle,
                c_str(data),
                int_data_has_header,
                predict_type,
                num_iteration,
                c_str(tmp_pred_fname)))
            tmp_file = open(tmp_pred_fname, "r")
            lines = tmp_file.readlines()
            tmp_file.close()
            nrow = len(lines)
            preds = []
            for line in lines:
                for token in line.split('\t'):
                    preds.append(float(token))
            preds = np.array(preds, copy=False)
            os.remove(tmp_pred_fname)
        elif isinstance(data, scipy.sparse.csr_matrix):
            preds, nrow = self.__pred_for_csr(data, num_iteration,
                                              predict_type)
        elif isinstance(data, np.ndarray):
            preds, nrow = self.__pred_for_np2d(data, num_iteration,
                                               predict_type)
        else:
            # last resort: try to coerce unknown input into a CSR matrix
            try:
                csr = scipy.sparse.csr_matrix(data)
                preds, nrow = self.__pred_for_csr(csr, num_iteration,
                                                  predict_type)
            except:
                raise TypeError('can not predict data for type {}'.
                                format(type(data).__name__))
        if pred_leaf:
            # leaf indices are integers
            preds = preds.astype(np.int32)
        # reshape flat output to (nrow, ncol) when multiple values per row
        if preds.size != nrow and is_reshape:
            if preds.size % nrow == 0:
                ncol = int(preds.size / nrow)
                preds = preds.reshape(nrow, ncol)
            else:
                raise ValueError('len of predict result(%d) cannot be divide nrow (%d)'
                                 % (preds.size, nrow))
        return preds

    def __get_num_preds(self, num_iteration, nrow, predict_type):
        # base size: one score per class per row
        n_preds = self.num_class * nrow
        if predict_type == C_API_PREDICT_LEAF_INDEX:
            # leaf prediction additionally emits one index per used iteration
            if num_iteration > 0:
                n_preds *= min(num_iteration, self.__num_total_iteration)
            else:
                n_preds *= self.__num_total_iteration
        return n_preds

    def __pred_for_np2d(self, mat, num_iteration, predict_type):
        """
        Predict for a 2-D numpy matrix.
        """
        if len(mat.shape) != 2:
            raise ValueError('Input numpy.ndarray must be 2 dimensional')
        if mat.dtype == np.float32 or mat.dtype == np.float64:
            data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
        else:
            """change non-float data to float data, need to copy"""
            data = np.array(mat.reshape(mat.size), dtype=np.float32)
        ptr_data, type_ptr_data = c_float_array(data)
        # preallocate the result buffer for the C API to fill
        n_preds = self.__get_num_preds(num_iteration, mat.shape[0],
                                       predict_type)
        preds = np.zeros(n_preds, dtype=np.float32)
        out_num_preds = ctypes.c_int64(0)
        _safe_call(_LIB.LGBM_BoosterPredictForMat(
            self.handle,
            ptr_data,
            type_ptr_data,
            mat.shape[0],
            mat.shape[1],
            C_API_IS_ROW_MAJOR,
            predict_type,
            num_iteration,
            ctypes.byref(out_num_preds),
            preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
        ))
        # sanity check: C side must have written exactly what we allocated
        if n_preds != out_num_preds.value:
            raise ValueError("incorrect number for predict result")
        return preds, mat.shape[0]

    def __pred_for_csr(self, csr, num_iteration, predict_type):
        """
        Predict for a csr data
        """
        # number of rows in a CSR matrix is len(indptr) - 1
        nrow = len(csr.indptr) - 1
        n_preds = self.__get_num_preds(num_iteration, nrow, predict_type)
        preds = np.zeros(n_preds, dtype=np.float32)
        out_num_preds = ctypes.c_int64(0)
        ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
        ptr_data, type_ptr_data = c_float_array(csr.data)
        _safe_call(_LIB.LGBM_BoosterPredictForCSR(
            self.handle,
            ptr_indptr,
            type_ptr_indptr,
            csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ptr_data,
            type_ptr_data,
            len(csr.indptr),
            len(csr.data),
            csr.shape[1],
            predict_type,
            num_iteration,
            ctypes.byref(out_num_preds),
            preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
        ))
        # sanity check: C side must have written exactly what we allocated
        if n_preds != out_num_preds.value:
            raise ValueError("incorrect number for predict result")
        return preds, nrow
# pandas dtype name -> coarse category; only the KEYS are consulted in the
# visible code (membership tests in _data_from_pandas / _label_from_pandas).
# NOTE(review): 'bool' maps to 'i' rather than 'int' — looks like a typo,
# but the values appear unused here, so it is harmless; confirm before relying
# on the values.
PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
                       'int64': 'int', 'uint8': 'int', 'uint16': 'int',
                       'uint32': 'int', 'uint64': 'int', 'float16': 'float',
                       'float32': 'float', 'float64': 'float', 'bool': 'i'}
def _data_from_pandas(data):
if isinstance(data, DataFrame):
data_dtypes = data.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
bad_fields = [data.columns[i] for i, dtype in
enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER]
msg = """DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields """
raise ValueError(msg + ', '.join(bad_fields))
data = data.values.astype('float')
return data
def _label_from_pandas(label):
if isinstance(label, DataFrame):
if len(label.columns) > 1:
raise ValueError('DataFrame for label cannot have multiple columns')
label_dtypes = label.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in label_dtypes):
raise ValueError('DataFrame.dtypes for label must be int, float or bool')
label = label.values.astype('float')
return label
class Dataset(object):
    """Dataset used in LightGBM.

    Dataset is an internal data structure used by LightGBM, wrapping a
    handle from the C API.
    """
    def __init__(self, data, label=None, max_bin=255, reference=None,
                 weight=None, group=None, predictor=None,
                 silent=False, params=None):
        """
        Dataset used in LightGBM.

        Parameters
        ----------
        data : string/numpy array/scipy.sparse
            Data source of Dataset.
            When data is string type, it represents the path of txt file,
        label : list or numpy 1-D array, optional
            Label of the data
        max_bin : int, required
            max number of discrete bin for features
        reference : Other Dataset, optional
            If this dataset validation, need to use training data as reference
        weight : list or numpy 1-D array , optional
            Weight for each instance.
        group : list or numpy 1-D array , optional
            group/query size for dataset
        predictor : Predictor, optional
            Used to compute the initial score (for continued training)
        silent : boolean, optional
            Whether print messages during construction
        params: dict, optional
            other parameters
        """
        self.__label = None
        self.__weight = None
        self.__init_score = None
        self.__group = None
        if data is None:
            # empty shell; callers such as subset() fill in the handle later
            self.handle = None
            return
        data = _data_from_pandas(data)
        label = _label_from_pandas(label)
        self.data_has_header = False
        # process for args
        params = {} if params is None else params
        self.max_bin = max_bin
        self.predictor = predictor
        params["max_bin"] = max_bin
        if silent:
            params["verbose"] = 0
        elif "verbose" not in params:
            params["verbose"] = 1
        params_str = param_dict_to_str(params)
        # process for reference dataset
        ref_dataset = None
        if isinstance(reference, Dataset):
            ref_dataset = ctypes.byref(reference.handle)
        elif reference is not None:
            raise TypeError('Reference dataset should be None or dataset instance')
        # start construct data
        if is_str(data):
            # check data has header or not
            # fix: use .get() so that supplying only one of the two alias
            # keys no longer raises KeyError (the original indexed both)
            if "has_header" in params or "header" in params:
                if str(params.get("has_header", "")).lower() == "true" \
                        or str(params.get("header", "")).lower() == "true":
                    self.data_has_header = True
            self.handle = ctypes.c_void_p()
            _safe_call(_LIB.LGBM_DatasetCreateFromFile(
                c_str(data),
                c_str(params_str),
                ref_dataset,
                ctypes.byref(self.handle)))
        elif isinstance(data, scipy.sparse.csr_matrix):
            self.__init_from_csr(data, params_str, ref_dataset)
        elif isinstance(data, np.ndarray):
            self.__init_from_np2d(data, params_str, ref_dataset)
        else:
            # last resort: try to coerce unknown input into a CSR matrix
            try:
                csr = scipy.sparse.csr_matrix(data)
                self.__init_from_csr(csr, params_str, ref_dataset)
            except:
                raise TypeError('can not initialize Dataset from {}'.format(type(data).__name__))
        if label is not None:
            self.set_label(label)
        if self.get_label() is None:
            raise ValueError("label should not be None")
        if weight is not None:
            self.set_weight(weight)
        if group is not None:
            self.set_group(group)
        # load init score: continued training starts from the predictor's
        # raw scores on this data
        if self.predictor is not None and isinstance(self.predictor, Predictor):
            init_score = self.predictor.predict(data,
                                                raw_score=True,
                                                data_has_header=self.data_has_header,
                                                is_reshape=False)
            if self.predictor.num_class > 1:
                # need re group init score: from row-major (per-record classes)
                # to class-major layout expected by the C API
                # fix: ndarray.size is an attribute, not a method
                # (was init_score.size(), which raised TypeError)
                new_init_score = np.zeros(init_score.size, dtype=np.float32)
                num_data = self.num_data()
                for i in range(num_data):
                    for j in range(self.predictor.num_class):
                        new_init_score[j * num_data + i] = init_score[i * self.predictor.num_class + j]
                init_score = new_init_score
            self.set_init_score(init_score)

    def create_valid(self, data, label=None, weight=None, group=None,
                     silent=False, params=None):
        """
        Create validation data align with current dataset

        Parameters
        ----------
        data : string/numpy array/scipy.sparse
            Data source of Dataset.
            When data is string type, it represents the path of txt file,
        label : list or numpy 1-D array, optional
            Label of the training data.
        weight : list or numpy 1-D array , optional
            Weight for each instance.
        group : list or numpy 1-D array , optional
            group/query size for dataset
        silent : boolean, optional
            Whether print messages during construction
        params: dict, optional
            other parameters
        """
        # reuse this dataset's binning and predictor so validation data
        # is mapped into the same bins
        return Dataset(data, label=label, max_bin=self.max_bin, reference=self,
                       weight=weight, group=group, predictor=self.predictor,
                       silent=silent, params=params)

    def subset(self, used_indices, params=None):
        """
        Get subset of current dataset

        Parameters
        ----------
        used_indices : list or numpy 1-D array of int
            Row indices to keep
        params : dict, optional
            other parameters
        """
        used_indices = list_to_1d_numpy(used_indices, np.int32)
        ret = Dataset(None)
        ret.handle = ctypes.c_void_p()
        params_str = param_dict_to_str(params)
        # NOTE(review): this passes a pointer to the handle; confirm
        # LGBM_DatasetGetSubset expects DatasetHandle* here
        _safe_call(_LIB.LGBM_DatasetGetSubset(
            ctypes.byref(self.handle),
            used_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            used_indices.shape[0],
            c_str(params_str),
            ctypes.byref(ret.handle)))
        ret.max_bin = self.max_bin
        ret.predictor = self.predictor
        if ret.get_label() is None:
            raise ValueError("label should not be None")
        return ret

    def __init_from_np2d(self, mat, params_str, ref_dataset):
        """
        Initialize data from a 2-D numpy matrix.
        """
        if len(mat.shape) != 2:
            raise ValueError('Input numpy.ndarray must be 2 dimensional')
        self.handle = ctypes.c_void_p()
        if mat.dtype == np.float32 or mat.dtype == np.float64:
            data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
        else:
            # change non-float data to float data, need to copy
            data = np.array(mat.reshape(mat.size), dtype=np.float32)
        ptr_data, type_ptr_data = c_float_array(data)
        _safe_call(_LIB.LGBM_DatasetCreateFromMat(
            ptr_data,
            type_ptr_data,
            mat.shape[0],
            mat.shape[1],
            C_API_IS_ROW_MAJOR,
            c_str(params_str),
            ref_dataset,
            ctypes.byref(self.handle)))

    def __init_from_csr(self, csr, params_str, ref_dataset):
        """
        Initialize data from a CSR matrix.
        """
        if len(csr.indices) != len(csr.data):
            raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
        self.handle = ctypes.c_void_p()
        ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
        ptr_data, type_ptr_data = c_float_array(csr.data)
        _safe_call(_LIB.LGBM_DatasetCreateFromCSR(
            ptr_indptr,
            type_ptr_indptr,
            csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ptr_data,
            type_ptr_data,
            len(csr.indptr),
            len(csr.data),
            csr.shape[1],
            c_str(params_str),
            ref_dataset,
            ctypes.byref(self.handle)))

    def __del__(self):
        # guard: Dataset(None) (e.g. via subset()) may leave handle as None
        if self.handle is not None:
            _safe_call(_LIB.LGBM_DatasetFree(self.handle))

    def get_field(self, field_name):
        """Get property from the Dataset.

        Parameters
        ----------
        field_name: str
            The field name of the information

        Returns
        -------
        info : array
            a numpy array of information of the data
        """
        tmp_out_len = ctypes.c_int64()
        out_type = ctypes.c_int32()
        ret = ctypes.POINTER(ctypes.c_void_p)()
        _safe_call(_LIB.LGBM_DatasetGetField(
            self.handle,
            c_str(field_name),
            ctypes.byref(tmp_out_len),
            ctypes.byref(ret),
            ctypes.byref(out_type)))
        if out_type.value != FIELD_TYPE_MAPPER[field_name]:
            raise TypeError("Return type error for get_field")
        if tmp_out_len.value == 0:
            return None
        # copy the C-owned buffer into a numpy array of the reported type
        if out_type.value == C_API_DTYPE_INT32:
            return cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value)
        elif out_type.value == C_API_DTYPE_FLOAT32:
            return cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value)
        else:
            raise TypeError("unknow type")

    def set_field(self, field_name, data):
        """Set property into the Dataset.

        Parameters
        ----------
        field_name: str
            The field name of the information
        data: numpy array or list or None
            The array of data to be set
        """
        if data is None:
            # passing None clears the field on the C side
            _safe_call(_LIB.LGBM_DatasetSetField(
                self.handle,
                c_str(field_name),
                None,
                0,
                FIELD_TYPE_MAPPER[field_name]))
            return
        if not is_numpy_1d_array(data):
            raise TypeError("Unknow type({})".format(type(data).__name__))
        if data.dtype == np.float32:
            ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
            type_data = C_API_DTYPE_FLOAT32
        elif data.dtype == np.int32:
            ptr_data = data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
            type_data = C_API_DTYPE_INT32
        else:
            raise TypeError("excepted np.float32 or np.int32, met type({})".format(data.dtype))
        if type_data != FIELD_TYPE_MAPPER[field_name]:
            raise TypeError("type error for set_field")
        _safe_call(_LIB.LGBM_DatasetSetField(
            self.handle,
            c_str(field_name),
            ptr_data,
            len(data),
            type_data))

    def save_binary(self, filename):
        """Save Dataset to binary file

        Parameters
        ----------
        filename : string
            Name of the output file.
        """
        _safe_call(_LIB.LGBM_DatasetSaveBinary(
            self.handle,
            c_str(filename)))

    def set_label(self, label):
        """Set label of Dataset

        Parameters
        ----------
        label: array like
            The label information to be set into Dataset
        """
        label = list_to_1d_numpy(label, np.float32)
        self.__label = label
        self.set_field('label', label)

    def set_weight(self, weight):
        """ Set weight of each instance.

        Parameters
        ----------
        weight : array like
            Weight for each data point
        """
        if weight is not None:
            weight = list_to_1d_numpy(weight, np.float32)
        self.__weight = weight
        self.set_field('weight', weight)

    def set_init_score(self, score):
        """ Set init score of booster to start from.

        Parameters
        ----------
        score: array like
        """
        if score is not None:
            score = list_to_1d_numpy(score, np.float32)
        self.__init_score = score
        self.set_field('init_score', score)

    def set_group(self, group):
        """Set group size of Dataset (used for ranking).

        Parameters
        ----------
        group : array like
            Group size of each group
        """
        if group is not None:
            group = list_to_1d_numpy(group, np.int32)
        self.__group = group
        self.set_field('group', group)

    def get_label(self):
        """Get the label of the Dataset.

        Returns
        -------
        label : array
        """
        if self.__label is None:
            self.__label = self.get_field('label')
        if self.__label is None:
            raise TypeError("label should not be None")
        return self.__label

    def get_weight(self):
        """Get the weight of the Dataset.

        Returns
        -------
        weight : array
        """
        if self.__weight is None:
            self.__weight = self.get_field('weight')
        return self.__weight

    def get_init_score(self):
        """Get the initial score of the Dataset.

        Returns
        -------
        init_score : array
        """
        if self.__init_score is None:
            self.__init_score = self.get_field('init_score')
        return self.__init_score

    def get_group(self):
        """Get the group sizes of the Dataset.

        Returns
        -------
        group : array
        """
        if self.__group is None:
            self.__group = self.get_field('group')
        return self.__group

    def num_data(self):
        """Get the number of rows in the Dataset.

        Returns
        -------
        number of rows : int
        """
        ret = ctypes.c_int64()
        _safe_call(_LIB.LGBM_DatasetGetNumData(self.handle,
                                               ctypes.byref(ret)))
        return ret.value

    def num_feature(self):
        """Get the number of columns (features) in the Dataset.

        Returns
        -------
        number of columns : int
        """
        ret = ctypes.c_int64()
        _safe_call(_LIB.LGBM_DatasetGetNumFeature(self.handle,
                                                  ctypes.byref(ret)))
        return ret.value
class Booster(object):
    """A Booster of LightGBM.

    Thin wrapper around the native LGBM_Booster* C API: owns the booster
    handle and exposes training (update/eval), prediction and model
    serialization entry points.
    """
    def __init__(self, params=None, train_set=None, model_file=None, silent=False):
        """Initialize the Booster.

        Exactly one of ``train_set`` (training task) or ``model_file``
        (prediction task) should be provided.

        Parameters
        ----------
        params : dict
            Parameters for boosters.
        train_set : Dataset
            training dataset
        model_file : string
            Path to the model file.
        silent : boolean, optional
            Whether print messages during construction

        Raises
        ------
        TypeError
            If neither train_set nor model_file is given, or train_set is
            not a Dataset instance.
        """
        self.handle = ctypes.c_void_p()
        self.__need_reload_eval_info = True
        self.__is_manage_handle = True
        self.__train_data_name = "training"
        self.__attr = {}
        self.best_iteration = -1
        params = {} if params is None else params
        if silent:
            params["verbose"] = 0
        elif "verbose" not in params:
            params["verbose"] = 1
        if train_set is not None:
            """Training task"""
            if not isinstance(train_set, Dataset):
                raise TypeError('training data should be Dataset instance, met{}'.format(type(train_set).__name__))
            params_str = param_dict_to_str(params)
            """construct booster object"""
            _safe_call(_LIB.LGBM_BoosterCreate(
                train_set.handle,
                c_str(params_str),
                ctypes.byref(self.handle)))
            """save reference to data"""
            self.train_set = train_set
            self.valid_sets = []
            self.name_valid_sets = []
            self.__num_dataset = 1
            self.init_predictor = train_set.predictor
            if self.init_predictor is not None:
                # continued training: merge the initial model into this booster
                _safe_call(_LIB.LGBM_BoosterMerge(
                    self.handle,
                    self.init_predictor.handle))
            out_num_class = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_num_class)))
            self.__num_class = out_num_class.value
            """buffer for inner predict"""
            self.__inner_predict_buffer = [None]
            self.__is_predicted_cur_iter = [False]
            self.__get_eval_info()
        elif model_file is not None:
            """Prediction task"""
            out_num_iterations = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
                c_str(model_file),
                ctypes.byref(out_num_iterations),
                ctypes.byref(self.handle)))
            out_num_class = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(out_num_class)))
            self.__num_class = out_num_class.value
        else:
            raise TypeError('At least need training dataset or model file to create booster instance')
    def __del__(self):
        # free the native handle only if ownership has not been handed
        # over to a Predictor via to_predictor()
        if self.handle is not None and self.__is_manage_handle:
            _safe_call(_LIB.LGBM_BoosterFree(self.handle))
    def set_train_data_name(self, name):
        """Set the name used for the training data in evaluation output."""
        self.__train_data_name = name
    def add_valid(self, data, name):
        """Add a validation dataset.

        Parameters
        ----------
        data : Dataset
            validation data; must share the predictor used at construction
        name : String
            name of validation data
        """
        if data.predictor is not self.init_predictor:
            raise Exception("Add validation data failed, you should use same predictor for these data")
        _safe_call(_LIB.LGBM_BoosterAddValidData(
            self.handle,
            data.handle))
        self.valid_sets.append(data)
        self.name_valid_sets.append(name)
        self.__num_dataset += 1
        # one inner-predict buffer / cache flag per registered dataset
        self.__inner_predict_buffer.append(None)
        self.__is_predicted_cur_iter.append(False)
    def reset_parameter(self, params):
        """Reset parameters of the booster.

        Parameters
        ----------
        params : dict
            New parameter values. Changing 'metric' forces the cached
            eval names/counts to be reloaded lazily.
        """
        if 'metric' in params:
            self.__need_reload_eval_info = True
        params_str = param_dict_to_str(params)
        if params_str:
            _safe_call(_LIB.LGBM_BoosterResetParameter(
                self.handle,
                c_str(params_str)))
    def update(self, train_set=None, fobj=None):
        """
        Update for one iteration.
        Note: for multi-class task, the score is grouped by class_id first,
        then by row_id; the i-th row score in the j-th class is
        score[j*num_data+i], and grad and hess must be grouped the same way.

        Parameters
        ----------
        train_set : training data, None means use last training data
        fobj : function
            Customized objective function.
        Returns
        -------
        is_finished, bool
        """
        """need reset training data"""
        if train_set is not None and train_set is not self.train_set:
            if train_set.predictor is not self.init_predictor:
                raise Exception("Replace training data failed, you should use same predictor for these data")
            self.train_set = train_set
            _safe_call(_LIB.LGBM_BoosterResetTrainingData(
                self.handle,
                self.train_set.handle))
            # buffer sized for the old training data is now invalid
            self.__inner_predict_buffer[0] = None
        is_finished = ctypes.c_int(0)
        if fobj is None:
            _safe_call(_LIB.LGBM_BoosterUpdateOneIter(
                self.handle,
                ctypes.byref(is_finished)))
            # scores changed: invalidate cached per-dataset predictions
            self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
            return is_finished.value == 1
        else:
            grad, hess = fobj(self.__inner_predict(0), self.train_set)
            return self.__boost(grad, hess)
    def __boost(self, grad, hess):
        """
        Boost the booster for one iteration, with customized gradient statistics.
        Note: for multi-class task, the score is grouped by class_id first,
        then by row_id; the i-th row score in the j-th class is
        score[j*num_data+i], and grad and hess must be grouped the same way.

        Parameters
        ----------
        grad : 1d numpy or 1d list
            The first order of gradient.
        hess : 1d numpy or 1d list
            The second order of gradient.
        Returns
        -------
        is_finished, bool
        """
        if not is_numpy_1d_array(grad):
            if is_1d_list(grad):
                grad = np.array(grad, dtype=np.float32, copy=False)
            else:
                raise TypeError("grad should be numpy 1d array or 1d list")
        if not is_numpy_1d_array(hess):
            if is_1d_list(hess):
                hess = np.array(hess, dtype=np.float32, copy=False)
            else:
                raise TypeError("hess should be numpy 1d array or 1d list")
        if len(grad) != len(hess):
            raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess)))
        # the C API expects float32 buffers
        if grad.dtype != np.float32:
            grad = grad.astype(np.float32, copy=False)
        if hess.dtype != np.float32:
            hess = hess.astype(np.float32, copy=False)
        is_finished = ctypes.c_int(0)
        _safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom(
            self.handle,
            grad.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            hess.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            ctypes.byref(is_finished)))
        # scores changed: invalidate cached per-dataset predictions
        self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
        return is_finished.value == 1
    def rollback_one_iter(self):
        """
        Rollback one iteration.
        """
        _safe_call(_LIB.LGBM_BoosterRollbackOneIter(
            self.handle))
        # scores changed: invalidate cached per-dataset predictions
        self.__is_predicted_cur_iter = [False for _ in range(self.__num_dataset)]
    def current_iteration(self):
        """Get the index of the current boosting iteration (int)."""
        out_cur_iter = ctypes.c_int64(0)
        _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
            self.handle,
            ctypes.byref(out_cur_iter)))
        return out_cur_iter.value
    def eval(self, data, name, feval=None):
        """Evaluate for data.

        Parameters
        ----------
        data : Dataset object
            Data to evaluate; if not seen before it is registered as a
            new validation set under `name`.
        name : name of data
        feval : function
            Custom evaluation function.
        Returns
        -------
        result: list
            Evaluation result list.
        """
        if not isinstance(data, Dataset):
            raise TypeError("Can only eval for Dataset instance")
        data_idx = -1
        if data is self.train_set:
            data_idx = 0
        else:
            for i in range(len(self.valid_sets)):
                if data is self.valid_sets[i]:
                    data_idx = i + 1
                    break
        """need to push new valid data"""
        if data_idx == -1:
            self.add_valid(data, name)
            data_idx = self.__num_dataset - 1
        return self.__inner_eval(name, data_idx, feval)
    def eval_train(self, feval=None):
        """Evaluate for training data.

        Parameters
        ----------
        feval : function
            Custom evaluation function.
        Returns
        -------
        result: list
            Evaluation result list.
        """
        return self.__inner_eval(self.__train_data_name, 0, feval)
    def eval_valid(self, feval=None):
        """Evaluate for all validation data.

        Parameters
        ----------
        feval : function
            Custom evaluation function.
        Returns
        -------
        result: list
            Evaluation result list.
        """
        ret = []
        for i in range(1, self.__num_dataset):
            ret.extend(self.__inner_eval(self.name_valid_sets[i-1], i, feval))
        return ret
    def save_model(self, filename, num_iteration=-1):
        """Save model of booster to file.

        Parameters
        ----------
        filename : str
            filename to save
        num_iteration: int
            number of iteration that want to save. < 0 means save all
        """
        _safe_call(_LIB.LGBM_BoosterSaveModel(
            self.handle,
            num_iteration,
            c_str(filename)))
    def dump_model(self):
        """
        Dump model to json format.

        Returns
        -------
        Json format of model
        """
        buffer_len = 1 << 20
        tmp_out_len = ctypes.c_int64(0)
        string_buffer = ctypes.create_string_buffer(buffer_len)
        ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
        _safe_call(_LIB.LGBM_BoosterDumpModel(
            self.handle,
            buffer_len,
            ctypes.byref(tmp_out_len),
            ctypes.byref(ptr_string_buffer)))
        actual_len = tmp_out_len.value
        # if the default buffer was too small, retry with the reported size
        if actual_len > buffer_len:
            string_buffer = ctypes.create_string_buffer(actual_len)
            ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)])
            _safe_call(_LIB.LGBM_BoosterDumpModel(
                self.handle,
                actual_len,
                ctypes.byref(tmp_out_len),
                ctypes.byref(ptr_string_buffer)))
        return json.loads(string_buffer.value.decode())
    def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
        """
        Predict logic.

        Parameters
        ----------
        data : string/numpy array/scipy.sparse
            Data source for prediction
            When data is string type, it represents the path of txt file,
        num_iteration : int
            used iteration for prediction
        raw_score : bool
            True for predict raw score
        pred_leaf : bool
            True for predict leaf index
        data_has_header : bool
            Used for txt data
        is_reshape : bool
            True for reshape to [nrow, ...]
        Returns
        -------
        Prediction result
        """
        # temporary Predictor; the booster keeps ownership of the handle
        predictor = Predictor(booster_handle=self.handle, is_manage_handle=False)
        return predictor.predict(data, num_iteration, raw_score, pred_leaf, data_has_header, is_reshape)
    def to_predictor(self):
        """Convert to predictor.
        Note: the Predictor will manage the handle after doing this, so
        this booster must no longer free it.
        """
        predictor = Predictor(booster_handle=self.handle, is_manage_handle=True)
        self.__is_manage_handle = False
        return predictor
    def __inner_eval(self, data_name, data_idx, feval=None):
        """
        Evaluate training or validation data.
        Returns a list of (data_name, eval_name, result, is_higher_better).
        """
        if data_idx >= self.__num_dataset:
            raise ValueError("data_idx should be smaller than number of dataset")
        self.__get_eval_info()
        ret = []
        if self.__num_inner_eval > 0:
            result = np.array([0.0 for _ in range(self.__num_inner_eval)], dtype=np.float32)
            tmp_out_len = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetEval(
                self.handle,
                data_idx,
                ctypes.byref(tmp_out_len),
                result.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
            if tmp_out_len.value != self.__num_inner_eval:
                raise ValueError("incorrect number of eval results")
            for i in range(self.__num_inner_eval):
                ret.append((data_name, self.__name_inner_eval[i], result[i], self.__higher_better_inner_eval[i]))
        if feval is not None:
            if data_idx == 0:
                cur_data = self.train_set
            else:
                cur_data = self.valid_sets[data_idx - 1]
            # feval may return one tuple or a list of tuples
            feval_ret = feval(self.__inner_predict(data_idx), cur_data)
            if isinstance(feval_ret, list):
                for eval_name, val, is_higher_better in feval_ret:
                    ret.append((data_name, eval_name, val, is_higher_better))
            else:
                eval_name, val, is_higher_better = feval_ret
                ret.append((data_name, eval_name, val, is_higher_better))
        return ret
    def __inner_predict(self, data_idx):
        """
        Predict for training and validation dataset.
        Results are cached per dataset until the next boosting update.
        """
        if data_idx >= self.__num_dataset:
            raise ValueError("data_idx should be smaller than number of dataset")
        if self.__inner_predict_buffer[data_idx] is None:
            if data_idx == 0:
                n_preds = self.train_set.num_data() * self.__num_class
            else:
                n_preds = self.valid_sets[data_idx - 1].num_data() * self.__num_class
            self.__inner_predict_buffer[data_idx] = \
                np.array([0.0 for _ in range(n_preds)], dtype=np.float32, copy=False)
        """avoid to predict many time in one iteration"""
        if not self.__is_predicted_cur_iter[data_idx]:
            tmp_out_len = ctypes.c_int64(0)
            data_ptr = self.__inner_predict_buffer[data_idx].ctypes.data_as(ctypes.POINTER(ctypes.c_float))
            _safe_call(_LIB.LGBM_BoosterGetPredict(
                self.handle,
                data_idx,
                ctypes.byref(tmp_out_len),
                data_ptr))
            if tmp_out_len.value != len(self.__inner_predict_buffer[data_idx]):
                raise ValueError("incorrect number of predict results for data %d" % (data_idx))
            self.__is_predicted_cur_iter[data_idx] = True
        return self.__inner_predict_buffer[data_idx]
    def __get_eval_info(self):
        """
        Get inner evaluation count and names.
        Cached; reloaded only after reset_parameter changed 'metric'.
        """
        if self.__need_reload_eval_info:
            self.__need_reload_eval_info = False
            out_num_eval = ctypes.c_int64(0)
            """Get num of inner evals"""
            _safe_call(_LIB.LGBM_BoosterGetEvalCounts(
                self.handle,
                ctypes.byref(out_num_eval)))
            self.__num_inner_eval = out_num_eval.value
            if self.__num_inner_eval > 0:
                """Get name of evals"""
                tmp_out_len = ctypes.c_int64(0)
                string_buffers = [ctypes.create_string_buffer(255) for i in range(self.__num_inner_eval)]
                ptr_string_buffers = (ctypes.c_char_p*self.__num_inner_eval)(*map(ctypes.addressof, string_buffers))
                _safe_call(_LIB.LGBM_BoosterGetEvalNames(
                    self.handle,
                    ctypes.byref(tmp_out_len),
                    ptr_string_buffers))
                if self.__num_inner_eval != tmp_out_len.value:
                    raise ValueError("size of eval names doesn't equal with num_evals")
                self.__name_inner_eval = []
                for i in range(self.__num_inner_eval):
                    self.__name_inner_eval.append(string_buffers[i].value.decode())
                # metrics whose values improve upwards (prefix match)
                self.__higher_better_inner_eval = []
                higher_better_metric = ['auc', 'ndcg']
                for name in self.__name_inner_eval:
                    if any(name.startswith(x) for x in higher_better_metric):
                        self.__higher_better_inner_eval.append(True)
                    else:
                        self.__higher_better_inner_eval.append(False)
    def attr(self, key):
        """Get attribute string from the Booster.

        Parameters
        ----------
        key : str
            The key to get attribute from.
        Returns
        -------
        value : str
            The attribute value of the key, returns None if attribute do not exist.
        """
        if key in self.__attr:
            return self.__attr[key]
        else:
            return None
    def set_attr(self, **kwargs):
        """Set the attribute of the Booster.

        Parameters
        ----------
        **kwargs
            The attributes to set. Setting a value to None deletes an attribute.
        """
        for key, value in kwargs.items():
            if value is not None:
                if not is_str(value):
                    raise ValueError("Set Attr only accepts string values")
                self.__attr[key] = value
            else:
                self.__attr.pop(key, None)
# coding: utf-8
# pylint: disable = invalid-name, W0105
from __future__ import absolute_import
import collections
class EarlyStopException(Exception):
    """Exception raised by a callback to terminate training early.

    Parameters
    ----------
    best_iteration : int
        The best iteration stopped.
    """
    def __init__(self, best_iteration):
        super(EarlyStopException, self).__init__()
        # kept as an attribute so train() can report the best round
        self.best_iteration = best_iteration
# Callback environment used by callbacks.
# - model: the Booster being trained (train() passes the booster here)
# - cvfolds: CV fold boosters; train() passes None
# - iteration: current round; begin_iteration/end_iteration: round bounds
# - evaluation_result_list: list of (data_name, eval_name, result,
#   is_higher_better[, stdv]) tuples; None for before-iteration callbacks
CallbackEnv = collections.namedtuple(
    "LightGBMCallbackEnv",
    ["model",
     "cvfolds",
     "iteration",
     "begin_iteration",
     "end_iteration",
     "evaluation_result_list"])
def _format_eval_result(value, show_stdv=True):
"""format metric string"""
if len(value) == 4:
return '%s\'s %s:%g' % (value[0], value[1], value[2])
elif len(value) == 5:
if show_stdv:
return '%s\'s %s:%g+%g' % (value[0], value[1], value[2], value[4])
else:
return '%s\'s %s:%g' % (value[0], value[1], value[2])
else:
raise ValueError("wrong metric value")
def print_evaluation(period=1, show_stdv=True):
    """Create a callback that prints the evaluation results.

    Parameters
    ----------
    period : int
        The period to log the evaluation results
    show_stdv : bool, optional
        Whether show stdv if provided
    Returns
    -------
    callback : function
        A callback that prints evaluation every `period` iterations.
    """
    def callback(env):
        """internal function"""
        if len(env.evaluation_result_list) == 0 or period is False:
            return
        if env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration:
            msgs = [_format_eval_result(res, show_stdv)
                    for res in env.evaluation_result_list]
            print('[%d]\t%s' % (env.iteration, '\t'.join(msgs)))
    return callback
def record_evaluation(eval_result):
    """Create a callback that records the evaluation history into eval_result.

    Parameters
    ----------
    eval_result : dict
        A dictionary to store the evaluation results; it is cleared first.
    Returns
    -------
    callback : function
        The requested callback function.
    """
    if not isinstance(eval_result, dict):
        raise TypeError('eval_result has to be a dictionary')
    eval_result.clear()
    def _init(env):
        """internal function: create the nested result lists lazily"""
        for data_name, eval_name, _, _ in env.evaluation_result_list:
            eval_result.setdefault(data_name, {}).setdefault(eval_name, [])
    def callback(env):
        """internal function"""
        if not eval_result:
            _init(env)
        for data_name, eval_name, result, _ in env.evaluation_result_list:
            eval_result[data_name][eval_name].append(result)
    return callback
def reset_learning_rate(learning_rates):
    """Reset learning rate before each iteration (after the first).

    NOTE: the initial learning rate will still take effect on the first
    iteration.

    Parameters
    ----------
    learning_rates : list or function
        List of learning rate for each boosting round, or a customized
        function computing the rate from the current round and the total
        number of boosting rounds (e.g. yields learning rate decay)
        - list l: learning_rate = l[current_round]
        - function f: learning_rate = f(current_round, total_boost_round)
    Returns
    -------
    callback : function
        The requested callback function.
    """
    def callback(env):
        """internal function"""
        iteration = env.iteration
        if isinstance(learning_rates, list):
            if len(learning_rates) != env.end_iteration:
                raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.")
            new_rate = learning_rates[iteration]
        else:
            new_rate = learning_rates(iteration, env.end_iteration)
        env.model.reset_parameter({'learning_rate': new_rate})
    # marked so train() runs this before the boosting update
    callback.before_iteration = True
    return callback
def early_stop(stopping_rounds, verbose=True):
    """Create a callback that activates early stopping.

    Requires at least one validation data and one metric; if there is
    more than one, all of them are checked.

    Parameters
    ----------
    stopping_rounds : int
        Number of rounds without improvement before stopping.
    verbose : optional, bool
        Whether to print message about early stopping information.
    Returns
    -------
    callback : function
        The requested callback function.
    """
    # closure state, keyed by position in evaluation_result_list
    factor_to_bigger_better = {}
    best_score = {}
    best_iter = {}
    best_msg = {}
    def _init(env):
        """internal function"""
        if len(env.evaluation_result_list) == 0:
            raise ValueError('For early stopping you need at least one set in evals.')
        if verbose:
            msg = "Train until valid scores didn't improve in {} rounds."
            print(msg.format(stopping_rounds))
        for idx, eval_ret in enumerate(env.evaluation_result_list):
            best_score[idx] = float('-inf')
            best_iter[idx] = 0
            if verbose:
                best_msg[idx] = ""
            # flip the sign for lower-is-better metrics so "bigger is better"
            factor_to_bigger_better[idx] = 1.0 if eval_ret[3] else -1.0
    def callback(env):
        """internal function"""
        if len(best_score) == 0:
            _init(env)
        for idx in range(len(env.evaluation_result_list)):
            score = env.evaluation_result_list[idx][2] * factor_to_bigger_better[idx]
            if score > best_score[idx]:
                best_score[idx] = score
                best_iter[idx] = env.iteration
                if verbose:
                    best_msg[idx] = '[%d]\t%s' % (
                        env.iteration,
                        '\t'.join(_format_eval_result(x) for x in env.evaluation_result_list))
            elif env.iteration - best_iter[idx] >= stopping_rounds:
                if env.model is not None:
                    env.model.set_attr(best_iteration=str(best_iter[idx]))
                if verbose:
                    print('early stopping, best iteration is:\n{}'.format(best_msg[idx]))
                raise EarlyStopException(best_iter[idx])
    return callback
# coding: utf-8
# pylint: disable = invalid-name, W0105
"""Training Library containing training routines of LightGBM."""
from __future__ import absolute_import
import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from . import callback
def _construct_dataset(X_y, reference=None,
                       params=None, other_fields=None,
                       predictor=None):
    """Build a Dataset (or validation Dataset) from raw data.

    Parameters
    ----------
    X_y : str or (data, label) pair
        Path of a data file, or a (data, label) tuple.
    reference : Dataset or None
        When given, the new data is created as a validation set of it.
    params : dict or None
        Dataset parameters; 'max_bin' is extracted (default 255).
    other_fields : dict or None
        Optional extra fields: 'weight', 'group', 'init_score'.
    predictor : Predictor or None
        Predictor used for continued training.

    Returns
    -------
    Dataset
    """
    # fix: the default params=None used to crash on "'max_bin' in params"
    params = {} if params is None else params
    max_bin = int(params['max_bin']) if 'max_bin' in params else 255
    weight = None
    group = None
    init_score = None
    if other_fields is not None:
        if not isinstance(other_fields, dict):
            raise TypeError("other field data should be dict type")
        weight = other_fields.get('weight')
        group = other_fields.get('group')
        init_score = other_fields.get('init_score')
    if is_str(X_y):
        data, label = X_y, None
    else:
        if len(X_y) != 2:
            raise TypeError("should pass (data, label) pair")
        data, label = X_y
    if reference is None:
        ret = Dataset(data, label=label, max_bin=max_bin,
                      weight=weight, group=group,
                      predictor=predictor, params=params)
    else:
        ret = reference.create_valid(data, label=label, weight=weight,
                                     group=group, params=params)
    # init_score has no constructor argument, so it is set afterwards
    if init_score is not None:
        ret.set_init_score(init_score)
    return ret
def train(params, train_data, num_boost_round=100,
          valid_datas=None, valid_names=None,
          fobj=None, feval=None, init_model=None,
          train_fields=None, valid_fields=None,
          early_stopping_rounds=None, evals_result=None,
          verbose_eval=True, learning_rates=None, callbacks=None):
    """Train with given parameters.

    Parameters
    ----------
    params : dict
        params.
    train_data : Dataset, tuple (X, y) or filename of data
        Data to be trained.
    num_boost_round: int
        Number of boosting iterations.
    valid_datas: list of Datasets, tuples (valid_X, valid_y) or filename of data
        List of data to be evaluated during training
    valid_names: list of string
        names of valid_datas
    fobj : function
        Customized objective function.
    feval : function
        Customized evaluation function.
        Note: should return (eval_name, eval_result, is_higher_better) or list of this
    init_model : file name of lightgbm model or 'Booster' instance
        model used for continued train
    train_fields : dict
        other data file in training data. e.g. train_fields['weight'] is weight data
        support fields: weight, group, init_score
    valid_fields : dict
        other data file in training data.
        e.g. valid_fields[0]['weight'] is weight data for first valid data
        support fields: weight, group, init_score
    early_stopping_rounds: int
        Activates early stopping.
        Requires at least one validation data and one metric
        If there's more than one, will check all of them
        Returns the model with (best_iter + early_stopping_rounds)
        If early stopping occurs, the model will add 'best_iteration' field
    evals_result: dict or None
        This dictionary is used to store all evaluation results of all the items in valid_datas.
        Example: with a valid_datas containing [valid_set, train_set]
        and valid_names containing ['eval', 'train'] and a parameter containing {'metric': 'logloss'}
        Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
                  'eval': {'logloss': ['0.480385', '0.357756', ...]}}
        passing None means not using this feature
    verbose_eval : bool or int
        Requires at least one item in evals.
        If `verbose_eval` is True then the evaluation metric on the validation set is
        printed at each boosting stage.
        If `verbose_eval` is an integer then the evaluation metric on the validation set
        is printed at every given `verbose_eval` boosting stage. The last boosting stage
        / the boosting stage found by using `early_stopping_rounds` is also printed.
        Example: with verbose_eval=4 and at least one item in evals, an evaluation metric
        is printed every 4 boosting stages, instead of every boosting stage.
    learning_rates: list or function
        List of learning rate for each boosting round
        or a customized function that calculates learning_rate in terms of
        current number of round and the total number of boosting round (e.g. yields
        learning rate decay)
        - list l: learning_rate = l[current_round]
        - function f: learning_rate = f(current_round, total_boost_round)
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.
    Returns
    -------
    booster : a trained booster model
    """
    """create predictor first"""
    if is_str(init_model):
        predictor = Predictor(model_file=init_model)
    elif isinstance(init_model, Booster):
        predictor = init_model.to_predictor()
    elif isinstance(init_model, Predictor):
        predictor = init_model
    else:
        predictor = None
    """create dataset"""
    if isinstance(train_data, Dataset):
        train_set = train_data
    else:
        train_set = _construct_dataset(train_data, None, params, train_fields, predictor)
    is_valid_contain_train = False
    train_data_name = "training"
    valid_sets = []
    name_valid_sets = []
    if valid_datas is not None:
        # normalize a single validation entry / name to a list
        if isinstance(valid_datas, (Dataset, tuple)):
            valid_datas = [valid_datas]
        if isinstance(valid_names, str):
            valid_names = [valid_names]
        for i, valid_data in enumerate(valid_datas):
            other_fields = None if valid_fields is None else valid_fields[i]
            """reduce cost for prediction training data"""
            if valid_data is train_data:
                is_valid_contain_train = True
                if valid_names is not None:
                    train_data_name = valid_names[i]
                continue
            if isinstance(valid_data, Dataset):
                valid_set = valid_data
            else:
                valid_set = _construct_dataset(
                    valid_data,
                    train_set,
                    params,
                    other_fields,
                    predictor)
            valid_sets.append(valid_set)
            if valid_names is not None:
                name_valid_sets.append(valid_names[i])
            else:
                name_valid_sets.append('valid_'+str(i))
    """process callbacks"""
    callbacks = [] if callbacks is None else callbacks
    # Most legacy advanced options become callbacks
    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation())
    else:
        if isinstance(verbose_eval, int):
            callbacks.append(callback.print_evaluation(verbose_eval))
    if early_stopping_rounds is not None:
        callbacks.append(callback.early_stop(early_stopping_rounds,
                                             verbose=bool(verbose_eval)))
    if learning_rates is not None:
        callbacks.append(callback.reset_learning_rate(learning_rates))
    if evals_result is not None:
        callbacks.append(callback.record_evaluation(evals_result))
    # split callbacks by the 'before_iteration' attribute their factory set
    callbacks_before_iter = [
        cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
    callbacks_after_iter = [
        cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
    """construct booster"""
    if 'metric' in params:
        # normalize 'metric' to a list so the booster sees all metrics
        if is_str(params['metric']):
            params['metric'] = params['metric'].split(',')
        else:
            params['metric'] = list(params['metric'])
    booster = Booster(params=params, train_set=train_set)
    if is_valid_contain_train:
        booster.set_train_data_name(train_data_name)
    for valid_set, name_valid_set in zip(valid_sets, name_valid_sets):
        booster.add_valid(valid_set, name_valid_set)
    """start training"""
    for i in range(num_boost_round):
        for cb in callbacks_before_iter:
            cb(callback.CallbackEnv(model=booster,
                                    cvfolds=None,
                                    iteration=i,
                                    begin_iteration=0,
                                    end_iteration=num_boost_round,
                                    evaluation_result_list=None))
        booster.update(fobj=fobj)
        evaluation_result_list = []
        # check evaluation result.
        if len(valid_sets) != 0:
            if is_valid_contain_train:
                evaluation_result_list.extend(booster.eval_train(feval))
            evaluation_result_list.extend(booster.eval_valid(feval))
        try:
            for cb in callbacks_after_iter:
                cb(callback.CallbackEnv(model=booster,
                                        cvfolds=None,
                                        iteration=i,
                                        begin_iteration=0,
                                        end_iteration=num_boost_round,
                                        evaluation_result_list=evaluation_result_list))
        except callback.EarlyStopException:
            break
    # the early-stop callback stores a 0-based best iteration as an attribute
    if booster.attr('best_iteration') is not None:
        booster.best_iteration = int(booster.attr('best_iteration')) + 1
    else:
        booster.best_iteration = num_boost_round
    return booster
class CVBooster(object):
    """Auxiliary data structure to hold one fold of CV."""
    def __init__(self, train_set, valid_test, params):
        """Initialize the CVBooster: build a Booster on the fold's training
        subset and register its validation subset under the name 'valid'."""
        self.train_set = train_set
        self.valid_test = valid_test
        self.booster = Booster(params=params, train_set=train_set)
        self.booster.add_valid(valid_test, 'valid')
    def update(self, fobj):
        """Update the booster for one iteration."""
        self.booster.update(fobj=fobj)
    def eval(self, feval):
        """Evaluate this fold's validation set; returns the result list."""
        return self.booster.eval_valid(feval)
try:
    # sklearn >= 0.18 provides StratifiedKFold in model_selection;
    # fall back to the old cross_validation location for earlier versions.
    try:
        from sklearn.model_selection import StratifiedKFold
    except ImportError:
        from sklearn.cross_validation import StratifiedKFold
    SKLEARN_StratifiedKFold = True
except ImportError:
    # sklearn unavailable: stratified CV raises LightGBMError at use time
    SKLEARN_StratifiedKFold = False
def _make_n_folds(full_data, nfold, param, seed, fpreproc=None, stratified=False):
    """
    Make an n-fold list of CVBooster from random indices.
    """
    np.random.seed(seed)
    if stratified:
        if not SKLEARN_StratifiedKFold:
            raise LightGBMError('sklearn needs to be installed in order to use stratified cv')
        sfk = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
        idset = [test_idx for _, test_idx in sfk.split(X=full_data.get_label(), y=full_data.get_label())]
    else:
        randidx = np.random.permutation(full_data.num_data())
        kstep = int(len(randidx) / nfold)
        idset = [randidx[i * kstep: min(len(randidx), (i + 1) * kstep)]
                 for i in range(nfold)]
    folds = []
    for k in range(nfold):
        # train on every fold except k, validate on fold k
        train_idx = np.concatenate([idset[i] for i in range(nfold) if i != k])
        train_set = full_data.subset(train_idx)
        valid_set = full_data.subset(idset[k])
        # run preprocessing on the data set if needed
        if fpreproc is not None:
            train_set, valid_set, tparam = fpreproc(train_set, valid_set, param.copy())
        else:
            tparam = param
        folds.append(CVBooster(train_set, valid_set, tparam))
    return folds
def _agg_cv_result(raw_results):
"""
Aggregate cross-validation results.
"""
cvmap = {}
metric_type = {}
for one_result in raw_results:
for one_line in one_result:
key = one_line[1]
metric_type[key] = one_line[3]
if key not in cvmap:
cvmap[key] = []
cvmap[key].append(one_line[2])
results = []
for k, v in cvmap.items():
v = np.array(v)
mean, std = np.mean(v), np.std(v)
results.append(('cv_agg', k, mean, metric_type[k], std))
return results
def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
       metrics=(), fobj=None, feval=None, train_fields=None, early_stopping_rounds=None,
       fpreproc=None, verbose_eval=None, show_stdv=True, seed=0,
       callbacks=None):
    """Cross-validation with given parameters.

    Parameters
    ----------
    params : dict
        Booster params.
    train_data : pair, (X, y) or filename of data
        Data to be trained.
    num_boost_round : int
        Number of boosting iterations.
    nfold : int
        Number of folds in CV.
    stratified : bool
        Perform stratified sampling (requires sklearn).
    metrics : string or list of strings
        Evaluation metrics to be watched in CV; appended to params['metric'].
    fobj : function
        Custom objective function.
    feval : function
        Custom evaluation function.
    train_fields : dict
        other data file in training data. e.g. train_fields['weight'] is weight data
        support fields: weight, group, init_score
    early_stopping_rounds: int
        Activates early stopping. CV error needs to decrease at least
        every <early_stopping_rounds> round(s) to continue.
        Last entry in evaluation history is the one from best iteration.
    fpreproc : function
        Preprocessing function that takes (dtrain, dtest, param) and returns
        transformed versions of those.
    verbose_eval : bool, int, or None, default None
        Whether to display the progress. If None, progress is not displayed.
        If True, progress will be displayed at every boosting stage. If an
        integer is given, progress will be displayed at every given
        `verbose_eval` boosting stage.
    show_stdv : bool, default True
        Whether to display the standard deviation in progress.
        Results are not affected, and always contains std.
    seed : int
        Seed used to generate the folds (passed to numpy.random.seed).
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.

    Returns
    -------
    dict
        Evaluation history: maps '<metric>-mean' and '<metric>-std' to a list
        with one entry per boosting round (truncated to the best iteration
        when early stopping fires).
    """
    # Normalize metrics to a list so a single metric name can be passed as str.
    if isinstance(metrics, str):
        metrics = [metrics]
    if isinstance(params, list):
        params = dict(params)
    # Normalize params['metric'] to a list of metric names.
    if 'metric' not in params:
        params['metric'] = []
    else:
        if is_str(params['metric']):
            params['metric'] = params['metric'].split(',')
        else:
            params['metric'] = list(params['metric'])
    if metrics is not None and len(metrics) > 0:
        params['metric'].extend(metrics)
    train_set = _construct_dataset(train_data, None, params, train_fields)
    results = {}
    cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified)
    # setup callbacks
    callbacks = [] if callbacks is None else callbacks
    if early_stopping_rounds is not None:
        # verbose=False: per-fold early-stopping messages would be noisy in CV.
        callbacks.append(callback.early_stop(early_stopping_rounds,
                                             verbose=False))
    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation(show_stdv=show_stdv))
    else:
        # bool is a subclass of int, hence the explicit bool check above.
        if isinstance(verbose_eval, int):
            callbacks.append(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))
    # Split callbacks by whether they run before or after each iteration.
    callbacks_before_iter = [
        cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
    callbacks_after_iter = [
        cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
    for i in range(num_boost_round):
        for cb in callbacks_before_iter:
            cb(callback.CallbackEnv(model=None,
                                    cvfolds=cvfolds,
                                    iteration=i,
                                    begin_iteration=0,
                                    end_iteration=num_boost_round,
                                    evaluation_result_list=None))
        # One boosting round on every fold, then aggregate the fold metrics.
        for fold in cvfolds:
            fold.update(fobj)
        res = _agg_cv_result([f.eval(feval) for f in cvfolds])
        for _, key, mean, _, std in res:
            if key + '-mean' not in results:
                results[key + '-mean'] = []
            if key + '-std' not in results:
                results[key + '-std'] = []
            results[key + '-mean'].append(mean)
            results[key + '-std'].append(std)
        try:
            for cb in callbacks_after_iter:
                cb(callback.CallbackEnv(model=None,
                                        cvfolds=cvfolds,
                                        iteration=i,
                                        begin_iteration=0,
                                        end_iteration=num_boost_round,
                                        evaluation_result_list=res))
        except callback.EarlyStopException as e:
            # Early stop: truncate history to the best iteration and stop.
            for k in results:
                results[k] = results[k][:(e.best_iteration + 1)]
            break
    return results
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment