Commit 16d1853d authored by Guolin Ke's avatar Guolin Ke Committed by GitHub
Browse files

Merge pull request #94 from Microsoft/python-package

Python package (#11)
parents 65e711a2 29cf97e9
......@@ -21,9 +21,13 @@ script:
- cd $TRAVIS_BUILD_DIR
- mkdir build && cd build && cmake .. && make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py
- cd $TRAVIS_BUILD_DIR
- rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON ..&& make -j
- cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
- cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
- cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py
notifications:
email: false
......
import numpy as np
import random
import lightgbm as lgb
from sklearn import datasets, metrics, model_selection
rng = np.random.RandomState(2016)
X, y = datasets.make_classification(n_samples=10000, n_features=100)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)
lgb_model = lgb.LGBMClassifier(n_estimators=100).fit(x_train, y_train, [(x_test, y_test)], eval_metric="auc")
......@@ -51,6 +51,18 @@ public:
explicit BinMapper(const void* memory);
~BinMapper();
bool CheckAlign(const BinMapper& other) const {
if (num_bin_ != other.num_bin_) {
return false;
}
for (int i = 0; i < num_bin_; ++i) {
if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
return false;
}
}
return true;
}
/*! \brief Get number of bins */
inline int num_bin() const { return num_bin_; }
/*! \brief True if bin is trival (contains only one bin) */
......
......@@ -35,12 +35,34 @@ public:
const ObjectiveFunction* object_function,
const std::vector<const Metric*>& training_metrics) = 0;
/*!
* \brief Merge model from other boosting object
Will insert to the front of current boosting object
* \param other
*/
virtual void MergeFrom(const Boosting* other) = 0;
/*!
* \brief Reset training data for current boosting
* \param config Configs for boosting
* \param train_data Training data
* \param object_function Training objective function
* \param training_metrics Training metric
*/
virtual void ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function, const std::vector<const Metric*>& training_metrics) = 0;
/*!
* \brief Reset shrinkage_rate data for current boosting
* \param shrinkage_rate Configs for boosting
*/
virtual void ResetShrinkageRate(double shrinkage_rate) = 0;
/*!
* \brief Add a validation data
* \param valid_data Validation data
* \param valid_metrics Metric for validation data
*/
virtual void AddDataset(const Dataset* valid_data,
virtual void AddValidDataset(const Dataset* valid_data,
const std::vector<const Metric*>& valid_metrics) = 0;
/*!
......@@ -52,6 +74,19 @@ public:
*/
virtual bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) = 0;
/*!
* \brief Rollback one iteration
*/
virtual void RollbackOneIter() = 0;
/*!
* \brief return current iteration
*/
virtual int GetCurrentIteration() const = 0;
/*!
* \brief Eval metrics and check is met early stopping or not
*/
virtual bool EvalAndCheckEarlyStopping() = 0;
/*!
* \brief Get evaluation result at data_idx data
......@@ -73,7 +108,7 @@ public:
* \param result used to store prediction result, should allocate memory before call this function
* \param out_len lenght of returned score
*/
virtual void GetPredictAt(int data_idx, score_t* result, data_size_t* out_len) const = 0;
virtual void GetPredictAt(int data_idx, score_t* result, data_size_t* out_len) = 0;
/*!
* \brief Prediction for one record, not sigmoid transform
......@@ -99,11 +134,10 @@ public:
/*!
* \brief save model to file
* \param num_used_model number of model that want to save, -1 means save all
* \param is_finish is training finished or not
* \param num_iterations Iterations that want to save, -1 means save all
* \param filename filename that want to save to
*/
virtual void SaveModelToFile(int num_used_model, bool is_finish, const char* filename) = 0;
virtual void SaveModelToFile(int num_iterations, const char* filename) const = 0;
/*!
* \brief Restore from a serialized string
......@@ -127,7 +161,7 @@ public:
* \brief Get number of weak sub-models
* \return Number of weak sub-models
*/
virtual int NumberOfSubModels() const = 0;
virtual int NumberOfTotalModel() const = 0;
/*!
* \brief Get number of classes
......@@ -138,7 +172,7 @@ public:
/*!
* \brief Set number of used model for prediction
*/
virtual void SetNumUsedModel(int num_used_model) = 0;
virtual void SetNumIterationForPred(int num_iteration) = 0;
/*!
* \brief Get Type name of this boosting object
......@@ -151,6 +185,8 @@ public:
/*! \brief Disable copy */
Boosting(const Boosting&) = delete;
static void LoadFileToBoosting(Boosting* boosting, const char* filename);
/*!
* \brief Create boosting object
* \param type Type of boosting
......
......@@ -3,7 +3,9 @@
#include <cstdint>
#include <exception>
#include <stdexcept>
#include <cstring>
#include <string>
/*!
* To avoid type conversion on large data, most of our expose interface support both for float_32 and float_64.
* Except following:
......@@ -38,7 +40,7 @@ typedef void* BoosterHandle;
/*!
* \brief get string message of the last error
* all function in this file will return 0 when success
* all function in this file will return 0 when succeed
* and -1 when an error occured,
* \return const char* error inforomation
*/
......@@ -53,38 +55,29 @@ DllExport const char* LGBM_GetLastError();
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means don't used
* \param out a loaded dataset
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromFile(const char* filename,
DllExport int LGBM_DatasetCreateFromFile(const char* filename,
const char* parameters,
const DatesetHandle* reference,
DatesetHandle* out);
/*!
* \brief load data set from binary file like the command_line LightGBM do
* \param filename the name of the file
* \param out a loaded dataset
* \return 0 when success, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromBinaryFile(const char* filename,
DatesetHandle* out);
/*!
* \brief create a dataset from CSR format
* \param indptr pointer to row headers
* \param indptr_type
* \param indptr_type type of indptr, can be C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices findex
* \param data fvalue
* \param data_type
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nindptr number of rows in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_col number of columns; when it's set to 0, then guess from data
* \param num_col number of columns
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means don't used
* \param out created dataset
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,
DllExport int LGBM_DatasetCreateFromCSR(const void* indptr,
int indptr_type,
const int32_t* indices,
const void* data,
......@@ -99,19 +92,19 @@ DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,
/*!
* \brief create a dataset from CSC format
* \param col_ptr pointer to col headers
* \param col_ptr_type
* \param col_ptr_type type of col_ptr, can be C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices findex
* \param data fvalue
* \param data_type
* \param ncol_ptr number of rows in the matrix + 1
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param ncol_ptr number of cols in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_row number of rows; when it's set to 0, then guess from data
* \param num_row number of rows
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means don't used
* \param out created dataset
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,
DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr,
int col_ptr_type,
const int32_t* indices,
const void* data,
......@@ -126,16 +119,16 @@ DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,
/*!
* \brief create dataset from dense matrix
* \param data pointer to the data space
* \param data_type 0
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nrow number of rows
* \param ncol number columns
* \param is_row_major 1 for row major, 0 for column major
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means don't used
* \param out created dataset
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromMat(const void* data,
DllExport int LGBM_DatasetCreateFromMat(const void* data,
int data_type,
int32_t nrow,
int32_t ncol,
......@@ -144,9 +137,25 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
const DatesetHandle* reference,
DatesetHandle* out);
/*!
* \brief Create subset of a data
* \param handle handle of full dataset
* \param used_row_indices Indices used in subset
* \param num_used_row_indices len of used_row_indices
* \param parameters additional parameters
* \param out subset of data
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetGetSubset(
const DatesetHandle* handle,
const int32_t* used_row_indices,
int32_t num_used_row_indices,
const char* parameters,
DatesetHandle* out);
/*!
* \brief free space for dataset
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetFree(DatesetHandle handle);
......@@ -154,19 +163,21 @@ DllExport int LGBM_DatasetFree(DatesetHandle handle);
* \brief save dateset to binary file
* \param handle a instance of dataset
* \param filename file name
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetSaveBinary(DatesetHandle handle,
const char* filename);
/*!
* \brief set vector to a content in info
* Note: group and group only work for C_API_DTYPE_INT32
* label and weight only work for C_API_DTYPE_FLOAT32
* \param handle a instance of dataset
* \param field_name field name, can be label, weight, group
* \param field_name field name, can be label, weight, group, group_id
* \param field_data pointer to vector
* \param num_element number of element in field_data
* \param type float_32:0, int32_t:1
* \return 0 when success, -1 when failure happens
* \param type C_API_DTYPE_FLOAT32 or C_API_DTYPE_INT32
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetSetField(DatesetHandle handle,
const char* field_name,
......@@ -180,8 +191,8 @@ DllExport int LGBM_DatasetSetField(DatesetHandle handle,
* \param field_name field name
* \param out_len used to set result length
* \param out_ptr pointer to the result
* \param out_type float_32:0, int32_t:1
* \return 0 when success, -1 when failure happens
* \param out_type C_API_DTYPE_FLOAT32 or C_API_DTYPE_INT32
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetGetField(DatesetHandle handle,
const char* field_name,
......@@ -193,7 +204,7 @@ DllExport int LGBM_DatasetGetField(DatesetHandle handle,
* \brief get number of data.
* \param handle the handle to the dataset
* \param out The address to hold number of data
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetGetNumData(DatesetHandle handle,
int64_t* out);
......@@ -202,7 +213,7 @@ DllExport int LGBM_DatasetGetNumData(DatesetHandle handle,
* \brief get number of features
* \param handle the handle to the dataset
* \param out The output of number of features
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetGetNumFeature(DatesetHandle handle,
int64_t* out);
......@@ -212,42 +223,82 @@ DllExport int LGBM_DatasetGetNumFeature(DatesetHandle handle,
/*!
* \brief create an new boosting learner
* \param train_data training data set
* \param valid_datas validation data sets
* \param valid_names names of validation data sets
* \param n_valid_datas number of validation set
* \param parameters format: 'key1=value1 key2=value2'
* \prama out handle of created Booster
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterCreate(const DatesetHandle train_data,
const DatesetHandle valid_datas[],
const char* valid_names[],
int n_valid_datas,
const char* parameters,
BoosterHandle* out);
/*!
* \brief load an existing boosting from model file
* \param filename filename of model
* \param out_num_iterations number of iterations of this booster
* \param out handle of created Booster
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterLoadFromModelfile(
DllExport int LGBM_BoosterCreateFromModelfile(
const char* filename,
int64_t* out_num_iterations,
BoosterHandle* out);
/*!
* \brief free obj in handle
* \param handle handle to be freed
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterFree(BoosterHandle handle);
/*!
* \brief Merge model in two booster to first handle
* \param handle handle, will merge other handle to this
* \param other_handle
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterMerge(BoosterHandle handle,
BoosterHandle other_handle);
/*!
* \brief Add new validation to booster
* \param handle handle
* \param valid_data validation data set
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterAddValidData(BoosterHandle handle,
const DatesetHandle valid_data);
/*!
* \brief Reset training data for booster
* \param handle handle
* \param train_data training data set
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterResetTrainingData(BoosterHandle handle,
const DatesetHandle train_data);
/*!
* \brief Reset config for current booster
* \param handle handle
* \param parameters format: 'key1=value1 key2=value2'
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterResetParameter(BoosterHandle handle, const char* parameters);
/*!
* \brief Get number of class
* \param handle handle
* \param out_len number of class
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetNumClasses(BoosterHandle handle, int64_t* out_len);
/*!
* \brief update the model in one round
* \param handle handle
* \param is_finished 1 means finised(cannot split any more)
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished);
......@@ -258,7 +309,7 @@ DllExport int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished);
* \param grad gradient statistics
* \param hess second order gradient statistics
* \param is_finished 1 means finised(cannot split any more)
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle,
const float* grad,
......@@ -266,81 +317,106 @@ DllExport int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle,
int* is_finished);
/*!
* \brief get evaluation for training data and validation data
* \brief Rollback one iteration
* \param handle handle
* \param data 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param out_len len of output result
* \param out_result the string containing evaluation statistics, should allocate memory before call this function
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterEval(BoosterHandle handle,
int data,
int64_t* out_len,
float* out_results);
DllExport int LGBM_BoosterRollbackOneIter(BoosterHandle handle);
/*!
* \brief get raw score for training data, used to calculate gradients outside
* \brief Get iteration of current boosting rounds
* \param out_iteration iteration of boosting rounds
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetCurrentIteration(BoosterHandle handle, int64_t* out_iteration);
/*!
* \brief Get number of eval
* \param out_len total number of eval results
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetEvalCounts(BoosterHandle handle, int64_t* out_len);
/*!
* \brief Get Name of eval
* \param out_len total number of eval results
* \param out_strs names of eval result
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetEvalNames(BoosterHandle handle, int64_t* out_len, char** out_strs);
/*!
* \brief get evaluation for training data and validation data
Note: 1. you should call LGBM_BoosterGetEvalNames first to get the name of evaluation results
2. should pre-allocate memory for out_results, you can get its length by LGBM_BoosterGetEvalCounts
* \param handle handle
* \param data_idx 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param out_len len of output result
* \param out_result used to set a pointer to array
* \return 0 when success, -1 when failure happens
* \param out_result float arrary contains result
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetScore(BoosterHandle handle,
DllExport int LGBM_BoosterGetEval(BoosterHandle handle,
int data_idx,
int64_t* out_len,
const float** out_result);
float* out_results);
/*!
* \brief Get prediction for training data and validation data
this can be used to support customized eval function
this can be used to support customized eval function
Note: should pre-allocate memory for out_result, its length is equal to num_class * num_data
* \param handle handle
* \param data 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param data_idx 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param out_len len of output result
* \param out_result used to set a pointer to array, should allocate memory before call this function
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetPredict(BoosterHandle handle,
int data,
int data_idx,
int64_t* out_len,
float* out_result);
/*!
* \brief make prediction for file
* \param handle handle
* \param predict_type
* 0:normal, with transform (if needed)
* 1:raw score
* 2:leaf index
* \param n_used_trees number of used tree, < 0 means no limit
* \param data_has_header data file has header or not
* \param data_filename filename of data file
* \param data_has_header data file has header or not
* \param predict_type
* C_API_PREDICT_NORMAL: normal prediction, with transform (if needed)
* C_API_PREDICT_RAW_SCORE: raw score
* C_API_PREDICT_LEAF_INDEX: leaf index
* \param num_iteration number of iteration for prediction, <= 0 means no limit
* \param result_filename filename of result file
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForFile(BoosterHandle handle,
int predict_type,
int64_t n_used_trees,
int data_has_header,
const char* data_filename,
int data_has_header,
int predict_type,
int64_t num_iteration,
const char* result_filename);
/*!
* \brief make prediction for an new data set
* Note: should pre-allocate memory for out_result,
* for noraml and raw score: its length is equal to num_class * num_data
* for leaf index, its length is equal to num_class * num_data * num_iteration
* \param handle handle
* \param indptr pointer to row headers
* \param indptr_type
* \param indptr_type type of indptr, can be C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices findex
* \param data fvalue
* \param data_type
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nindptr number of rows in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_col number of columns; when it's set to 0, then guess from data
* \param predict_type
* 0:normal, with transform (if needed)
* 1:raw score
* 2:leaf index
* \param n_used_trees number of used tree, < 0 means no limit
* C_API_PREDICT_NORMAL: normal prediction, with transform (if needed)
* C_API_PREDICT_RAW_SCORE: raw score
* C_API_PREDICT_LEAF_INDEX: leaf index
* \param num_iteration number of iteration for prediction, <= 0 means no limit
* \param out_len len of output result
* \param out_result used to set a pointer to array, should allocate memory before call this function
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
const void* indptr,
......@@ -352,24 +428,29 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
int64_t nelem,
int64_t num_col,
int predict_type,
int64_t n_used_trees,
double* out_result);
int64_t num_iteration,
int64_t* out_len,
float* out_result);
/*!
* \brief make prediction for an new data set
* Note: should pre-allocate memory for out_result,
* for noraml and raw score: its length is equal to num_class * num_data
* for leaf index, its length is equal to num_class * num_data * num_iteration
* \param handle handle
* \param data pointer to the data space
* \param data_type
* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nrow number of rows
* \param ncol number columns
* \param is_row_major 1 for row major, 0 for column major
* \param predict_type
* 0:normal, with transform (if needed)
* 1:raw score
* 2:leaf index
* \param n_used_trees number of used tree, < 0 means no limit
* C_API_PREDICT_NORMAL: normal prediction, with transform (if needed)
* C_API_PREDICT_RAW_SCORE: raw score
* C_API_PREDICT_LEAF_INDEX: leaf index
* \param num_iteration number of iteration for prediction, <= 0 means no limit
* \param out_len len of output result
* \param out_result used to set a pointer to array, should allocate memory before call this function
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
const void* data,
......@@ -378,18 +459,19 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
int32_t ncol,
int is_row_major,
int predict_type,
int64_t n_used_trees,
double* out_result);
int64_t num_iteration,
int64_t* out_len,
float* out_result);
/*!
* \brief save model into file
* \param handle handle
* \param num_used_model, < 0 means no limit
* \param num_iteration, <= 0 means save all
* \param filename file name
* \return 0 when success, -1 when failure happens
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
int num_used_model,
int num_iteration,
const char* filename);
......@@ -413,13 +495,15 @@ ColumnFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* indi
std::vector<double>
SampleFromOneColumn(const std::vector<std::pair<int, double>>& data, const std::vector<int>& indices);
#if defined(_MSC_VER)
// exception handle and error msg
static std::string& LastErrorMsg() { static std::string err_msg("Everything is fine"); return err_msg; }
static char* LastErrorMsg() { static __declspec(thread) char err_msg[512] = "Everything is fine"; return err_msg; }
#else
static char* LastErrorMsg() { static thread_local char err_msg[512] = "Everything is fine"; return err_msg; }
#endif
inline void LGBM_SetLastError(const char* msg) {
LastErrorMsg() = msg;
std::strcpy(LastErrorMsg(), msg);
}
inline int LGBM_APIHandleException(const std::exception& ex) {
......
......@@ -72,6 +72,8 @@ public:
inline bool GetBool(
const std::unordered_map<std::string, std::string>& params,
const std::string& name, bool* out);
static std::unordered_map<std::string, std::string> Str2Map(const char* parameters);
};
/*! \brief Types of boosting */
......@@ -97,7 +99,7 @@ public:
std::string output_result = "LightGBM_predict_result.txt";
std::string input_model = "";
int verbosity = 1;
int num_model_predict = NO_LIMIT;
int num_iteration_predict = -1;
bool is_pre_partition = false;
bool is_enable_sparse = true;
bool use_two_round_loading = false;
......@@ -136,6 +138,8 @@ public:
bool is_unbalance = false;
// for multiclass
int num_class = 1;
// Balancing of positive and negative weights
double scale_pos_weight = 1.0f;
void Set(const std::unordered_map<std::string, std::string>& params) override;
};
......@@ -164,12 +168,12 @@ public:
int feature_fraction_seed = 2;
double feature_fraction = 1.0f;
// max cache size(unit:MB) for historical histogram. < 0 means not limit
double histogram_pool_size = NO_LIMIT;
double histogram_pool_size = -1.0f;
// max depth of tree model.
// Still grow tree by leaf-wise, but limit the max depth to avoid over-fitting
// And the max leaves will be min(num_leaves, pow(2, max_depth - 1))
// max_depth < 0 means not limit
int max_depth = NO_LIMIT;
int max_depth = -1;
void Set(const std::unordered_map<std::string, std::string>& params) override;
};
......@@ -231,7 +235,7 @@ public:
MetricConfig metric_config;
void Set(const std::unordered_map<std::string, std::string>& params) override;
void LoadFromString(const char* str);
private:
void GetBoostingType(const std::unordered_map<std::string, std::string>& params);
......@@ -328,17 +332,22 @@ struct ParameterAlias {
{ "ndcg_at", "ndcg_eval_at" },
{ "min_data_per_leaf", "min_data_in_leaf" },
{ "min_data", "min_data_in_leaf" },
{ "min_child_samples", "min_data_in_leaf" },
{ "min_sum_hessian_per_leaf", "min_sum_hessian_in_leaf" },
{ "min_sum_hessian", "min_sum_hessian_in_leaf" },
{ "min_hessian", "min_sum_hessian_in_leaf" },
{ "min_child_weight", "min_sum_hessian_in_leaf" },
{ "num_leaf", "num_leaves" },
{ "sub_feature", "feature_fraction" },
{ "colsample_bytree", "feature_fraction" },
{ "num_iteration", "num_iterations" },
{ "num_tree", "num_iterations" },
{ "num_round", "num_iterations" },
{ "num_trees", "num_iterations" },
{ "num_rounds", "num_iterations" },
{ "sub_row", "bagging_fraction" },
{ "subsample", "bagging_fraction" },
{ "subsample_freq", "bagging_freq" },
{ "shrinkage_rate", "learning_rate" },
{ "tree", "tree_learner" },
{ "num_machine", "num_machines" },
......@@ -361,6 +370,9 @@ struct ParameterAlias {
{ "blacklist", "ignore_column" },
{ "predict_raw_score", "is_predict_raw_score" },
{ "predict_leaf_index", "is_predict_leaf_index" },
{ "min_split_gain", "min_gain_to_split" },
{ "reg_alpha", "lambda_l1" },
{ "reg_lambda", "lambda_l2" },
{ "num_classes", "num_class" }
});
std::unordered_map<std::string, std::string> tmp_map;
......
......@@ -13,6 +13,7 @@
#include <functional>
#include <string>
#include <unordered_set>
#include <mutex>
namespace LightGBM {
......@@ -46,6 +47,13 @@ public:
*/
void Init(const char* data_filename, const int num_class);
/*!
* \brief init as subset
* \param metadata Filename of data
* \param used_indices
* \param num_used_indices
*/
void Init(const Metadata& metadata, const data_size_t* used_indices, data_size_t num_used_indices);
/*!
* \brief Initial with binary memory
* \param memory Pointer to memory
*/
......@@ -76,13 +84,14 @@ public:
void CheckOrPartition(data_size_t num_all_data,
const std::vector<data_size_t>& used_data_indices);
void SetLabel(const float* label, data_size_t len);
void SetWeights(const float* weights, data_size_t len);
void SetQueryBoundaries(const data_size_t* query_boundaries, data_size_t len);
void SetQueryId(const data_size_t* query_id, data_size_t len);
/*!
* \brief Set initial scores
* \param init_score Initial scores, this class will manage memory for init_score.
......@@ -141,8 +150,13 @@ public:
* \brief Get weights, if not exists, will return nullptr
* \return Pointer of weights
*/
inline const float* weights()
const { return weights_.data(); }
inline const float* weights() const {
if (weights_.size() > 0) {
return weights_.data();
} else {
return nullptr;
}
}
/*!
* \brief Get data boundaries on queries, if not exists, will return nullptr
......@@ -151,8 +165,13 @@ public:
* is the data indices for query i.
* \return Pointer of data boundaries on queries
*/
inline const data_size_t* query_boundaries()
const { return query_boundaries_.data(); }
inline const data_size_t* query_boundaries() const {
if (query_boundaries_.size() > 0) {
return query_boundaries_.data();
} else {
return nullptr;
}
}
/*!
* \brief Get Number of queries
......@@ -164,13 +183,25 @@ public:
* \brief Get weights for queries, if not exists, will return nullptr
* \return Pointer of weights for queries
*/
inline const float* query_weights() const { return query_weights_.data(); }
inline const float* query_weights() const {
if (query_weights_.size() > 0) {
return query_weights_.data();
} else {
return nullptr;
}
}
/*!
* \brief Get initial scores, if not exists, will return nullptr
* \return Pointer of initial scores
*/
inline const float* init_score() const { return init_score_.data(); }
inline const float* init_score() const {
if (init_score_.size() > 0) {
return init_score_.data();
} else {
return nullptr;
}
}
/*! \brief Disable copy */
Metadata& operator=(const Metadata&) = delete;
......@@ -210,6 +241,8 @@ private:
std::vector<float> init_score_;
/*! \brief Queries data */
std::vector<data_size_t> queries_;
/*! \brief mutex for threading safe call */
std::mutex mutex_;
};
......@@ -253,6 +286,27 @@ public:
/*! \brief Destructor */
~Dataset();
bool CheckAlign(const Dataset& other) const {
if (num_features_ != other.num_features_) {
return false;
}
if (num_total_features_ != other.num_total_features_) {
return false;
}
if (num_class_ != other.num_class_) {
return false;
}
if (label_idx_ != other.label_idx_) {
return false;
}
for (int i = 0; i < num_features_; ++i) {
if (!features_[i]->CheckAlign(*(other.features_[i].get()))) {
return false;
}
}
return true;
}
inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<double>& feature_values) {
for (size_t i = 0; i < feature_values.size() && i < static_cast<size_t>(num_total_features_); ++i) {
int feature_idx = used_feature_map_[i];
......@@ -282,6 +336,8 @@ public:
}
}
Dataset* Subset(const data_size_t* used_indices, data_size_t num_used_indices, bool is_enable_sparse) const;
void FinishLoad();
bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);
......@@ -348,12 +404,12 @@ private:
int num_class_;
/*! \brief Store some label level data*/
Metadata metadata_;
/*! \brief True if dataset is loaded from binary file */
bool is_loading_from_binfile_;
/*! \brief index of label column */
int label_idx_ = 0;
/*! \brief store feature names */
std::vector<std::string> feature_names_;
/*! \brief store feature names */
static const char* binary_file_token;
};
} // namespace LightGBM
......
......@@ -49,7 +49,7 @@ private:
void ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset);
/*! \brief Check can load from binary file */
bool CheckCanLoadFromBin(const char* filename);
std::string CheckCanLoadFromBin(const char* filename);
const IOConfig& io_config_;
/*! \brief Random generator*/
......
......@@ -63,6 +63,13 @@ public:
~Feature() {
}
bool CheckAlign(const Feature& other) const {
if (feature_index_ != other.feature_index_) {
return false;
}
return bin_mapper_->CheckAlign(*(other.bin_mapper_.get()));
}
/*!
* \brief Push one record, will auto convert to bin and push to bin data
* \param tid Thread id
......@@ -73,6 +80,9 @@ public:
unsigned int bin = bin_mapper_->ValueToBin(value);
bin_data_->Push(tid, line_idx, bin);
}
inline void PushBin(int tid, data_size_t line_idx, unsigned int bin) {
bin_data_->Push(tid, line_idx, bin);
}
inline void FinishLoad() { bin_data_->FinishLoad(); }
/*! \brief Index of this feature */
inline int feature_index() const { return feature_index_; }
......
......@@ -24,7 +24,6 @@ using ReduceFunction = std::function<void(const char*, char*, int)>;
using PredictFunction =
std::function<std::vector<double>(const std::vector<std::pair<int, double>>&)>;
#define NO_LIMIT (-1)
#define NO_SPECIFIC (-1)
} // namespace LightGBM
......
......@@ -24,8 +24,7 @@ public:
* \param metadata Label data
* \param num_data Number of data
*/
virtual void Init(const char* test_name,
const Metadata& metadata, data_size_t num_data) = 0;
virtual void Init(const Metadata& metadata, data_size_t num_data) = 0;
virtual const std::vector<std::string>& GetName() const = 0;
......
......@@ -101,10 +101,6 @@ public:
/*! \brief Serialize this object by string*/
std::string ToString();
/*! \brief Disable copy */
Tree& operator=(const Tree&) = delete;
/*! \brief Disable copy */
Tree(const Tree&) = delete;
private:
/*!
* \brief Find leaf index of which record belongs by data
......
......@@ -89,7 +89,11 @@ private:
// a trick to use static variable in header file.
// May be not good, but avoid to use an additional cpp file
static LogLevel& GetLevel() { static LogLevel level; return level; }
#if defined(_MSC_VER)
static LogLevel& GetLevel() { static __declspec(thread) LogLevel level = LogLevel::Info; return level; }
#else
static LogLevel& GetLevel() { static thread_local LogLevel level = LogLevel::Info; return level; }
#endif
};
......
LightGBM Python Package
=======================
Installation
------------
1. Following `Installation Guide <https://github.com/Microsoft/LightGBM/wiki/Installation-Guide>`__ to build first.
For the windows user, please change the build config to ``DLL``.
2. Install with ``cd python-package; python setpy.py install``
Note: Make sure you have `setuptools <https://pypi.python.org/pypi/setuptools>`__
Examples
--------
- Refer also to the walk through examples in `python-guide
folder <https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide>`__
# coding: utf-8
"""LightGBM, Light Gradient Boosting Machine.
Contributors: https://github.com/Microsoft/LightGBM/graphs/contributors
"""
from __future__ import absolute_import
import os
from .basic import Predictor, Dataset, Booster
from .engine import train, cv
try:
from .sklearn import LGBMModel, LGBMRegressor, LGBMClassifier, LGBMRanker
except ImportError:
pass
__version__ = 0.1
__all__ = ['Dataset', 'Booster',
'train', 'cv',
'LGBMModel','LGBMRegressor', 'LGBMClassifier', 'LGBMRanker']
\ No newline at end of file
This diff is collapsed.
from __future__ import absolute_import
import collections
class EarlyStopException(Exception):
"""Exception of early stopping.
Parameters
----------
best_iteration : int
The best iteration stopped.
"""
def __init__(self, best_iteration):
super(EarlyStopException, self).__init__()
self.best_iteration = best_iteration
# Callback environment used by callbacks
CallbackEnv = collections.namedtuple(
"LightGBMCallbackEnv",
["model",
"cvfolds",
"iteration",
"begin_iteration",
"end_iteration",
"evaluation_result_list"])
def _format_eval_result(value, show_stdv=True):
"""format metric string"""
if len(value) == 4:
return '%s_%s:%g' % (value[0], value[1], value[2])
elif len(value) == 5:
if show_stdv:
return '%s_%s:%g+%g' % (value[0], value[1], value[2], value[4])
else:
return '%s_%s:%g' % (value[0], value[1], value[2])
else:
raise ValueError("wrong metric value")
def print_evaluation(period=1, show_stdv=True):
"""Create a callback that print evaluation result.
Parameters
----------
period : int
The period to log the evaluation results
show_stdv : bool, optional
Whether show stdv if provided
Returns
-------
callback : function
A callback that print evaluation every period iterations.
"""
def callback(env):
"""internal function"""
if len(env.evaluation_result_list) == 0 or period is False:
return
if (env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration):
result = '\t'.join([_format_eval_result(x, show_stdv) for x in env.evaluation_result_list])
print('[%d]\t%s\n' % (env.iteration, result))
return callback
def record_evaluation(eval_result):
"""Create a call back that records the evaluation history into eval_result.
Parameters
----------
eval_result : dict
A dictionary to store the evaluation results.
Returns
-------
callback : function
The requested callback function.
"""
if not isinstance(eval_result, dict):
raise TypeError('eval_result has to be a dictionary')
eval_result.clear()
def init(env):
"""internal function"""
for data_name, eval_name, _, _ in env.evaluation_result_list:
if data_name not in eval_result:
eval_result[data_name] = {}
if eval_name not in eval_result[data_name]:
eval_result[data_name][eval_name] = []
def callback(env):
"""internal function"""
if len(eval_result) == 0:
init(env)
for data_name, eval_name, result, _ in env.evaluation_result_list:
eval_result[data_name][eval_name].append(result)
return callback
def reset_learning_rate(learning_rates):
"""Reset learning rate after iteration 1
NOTE: the initial learning rate will still take in-effect on first iteration.
Parameters
----------
learning_rates: list or function
List of learning rate for each boosting round
or a customized function that calculates learning_rate in terms of
current number of round and the total number of boosting round (e.g. yields
learning rate decay)
- list l: learning_rate = l[current_round]
- function f: learning_rate = f(current_round, total_boost_round)
Returns
-------
callback : function
The requested callback function.
"""
def callback(env):
"""internal function"""
booster = env.model
i = env.iteration
if isinstance(learning_rates, list):
if len(learning_rates) != env.end_iteration:
raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.")
booster.reset_parameter({'learning_rate':learning_rates[i]})
else:
booster.reset_parameter({'learning_rate':learning_rates(i, env.end_iteration)})
callback.before_iteration = True
return callback
def early_stop(stopping_rounds, verbose=True):
"""Create a callback that activates early stopping.
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them
Parameters
----------
stopp_rounds : int
The stopping rounds before the trend occur.
verbose : optional, bool
Whether to print message about early stopping information.
Returns
-------
callback : function
The requested callback function.
"""
factor_to_bigger_better = {}
best_score = {}
best_iter = {}
best_msg = {}
def init(env):
"""internal function"""
bst = env.model
if len(env.evaluation_result_list) == 0:
raise ValueError('For early stopping you need at least one set in evals.')
if verbose:
msg = "Will train until hasn't improved in {} rounds.\n"
print(msg.format(stopping_rounds))
for i in range(len(env.evaluation_result_list)):
best_score[i] = float('-inf')
best_iter[i] = 0
if verbose:
best_msg[i] = ""
factor_to_bigger_better[i] = -1.0
if env.evaluation_result_list[i][3]:
factor_to_bigger_better[i] = 1.0
def callback(env):
"""internal function"""
if len(best_score) == 0:
init(env)
for i in range(len(env.evaluation_result_list)):
score = env.evaluation_result_list[i][2] * factor_to_bigger_better[i]
if score > best_score[i]:
best_score[i] = score
best_iter[i] = env.iteration
if verbose:
best_msg[i] = '[%d]\t%s' % ( env.iteration,
'\t'.join([_format_eval_result(x) for x in env.evaluation_result_list]))
else:
if env.iteration - best_iter[i] >= stopping_rounds:
if env.model is not None:
env.model.set_attr(best_iteration=str(best_iter[i]))
if verbose:
print('early stopping, best message is:\n {} '.format(best_msg[i]))
raise EarlyStopException(best_iter[i])
return callback
"""Training Library containing training routines of LightGBM."""
from __future__ import absolute_import
import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from . import callback
def _construct_dataset(X_y, reference=None,
params=None, other_fields=None,
predictor=None):
if 'max_bin' in params:
max_bin = int(params['max_bin'])
else:
max_bin = 255
weight = None
group = None
init_score = None
if other_fields is not None:
if not isinstance(other_fields, dict):
raise TypeError("other filed data should be dict type")
weight = None if 'weight' not in other_fields else other_fields['weight']
group = None if 'group' not in other_fields else other_fields['group']
init_score = None if 'init_score' not in other_fields else other_fields['init_score']
if is_str(X_y):
data = X_y
label = None
else:
if len(X_y) != 2:
raise TypeError("should pass (data, label) pair")
data = X_y[0]
label = X_y[1]
if reference is None:
ret = Dataset(data, label=label, max_bin=max_bin,
weight=weight, group=group,
predictor=predictor, params=params)
else:
ret = reference.create_valid(data, label=label, weight=weight,
group=group, params=params)
if init_score is not None:
ret.set_init_score(init_score)
return ret
def train(params, train_data, num_boost_round=100,
valid_datas=None, valid_names=None,
fobj=None, feval=None, init_model=None,
train_fields=None, valid_fields=None,
early_stopping_rounds=None, evals_result=None,
verbose_eval=True, learning_rates=None, callbacks=None):
"""Train with given parameters.
Parameters
----------
params : dict
params.
train_data : pair, (X, y) or filename of data
Data to be trained.
num_boost_round: int
Number of boosting iterations.
valid_datas: list of pairs (valid_X, valid_y) or filename of data
List of data to be evaluated during training
valid_names: list of string
names of valid_datas
fobj : function
Customized objective function.
feval : function
Customized evaluation function.
Note: should return (eval_name, eval_result, is_higher_better) of list of this
init_model : file name of lightgbm model or 'Booster' instance
model used for continued train
train_fields : dict
other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score
valid_fields : dict
other data file in training data. e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score
early_stopping_rounds: int
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will add 'best_iteration' field
evals_result: dict or None
This dictionary used to store all evaluation results of all the items in valid_datas.
Example: with a valid_datas containing [valid_set, train_set] and valid_names containing ['eval', 'train'] and
a paramater containing ('metric':'logloss')
Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
'eval': {'logloss': ['0.480385', '0.357756', ...]}}
passed with None means no using this function
verbose_eval : bool or int
Requires at least one item in evals.
If `verbose_eval` is True then the evaluation metric on the validation set is
printed at each boosting stage.
If `verbose_eval` is an integer then the evaluation metric on the validation set
is printed at every given `verbose_eval` boosting stage. The last boosting stage
/ the boosting stage found by using `early_stopping_rounds` is also printed.
Example: with verbose_eval=4 and at least one item in evals, an evaluation metric
is printed every 4 boosting stages, instead of every boosting stage.
learning_rates: list or function
List of learning rate for each boosting round
or a customized function that calculates learning_rate in terms of
current number of round and the total number of boosting round (e.g. yields
learning rate decay)
- list l: learning_rate = l[current_round]
- function f: learning_rate = f(current_round, total_boost_round)
callbacks : list of callback functions
List of callback functions that are applied at end of each iteration.
Returns
-------
booster : a trained booster model
"""
"""create predictor first"""
if is_str(init_model):
predictor = Predictor(model_file=init_model)
elif isinstance(init_model, Booster):
predictor = init_model.to_predictor()
elif isinstance(init_model, Predictor):
predictor = init_model
else:
predictor = None
"""create dataset"""
train_set = _construct_dataset(train_data, None, params, train_fields, predictor)
is_valid_contain_train = False
train_data_name = "training"
valid_sets = []
name_valid_sets = []
if valid_datas is not None:
for i in range(len(valid_datas)):
other_fields = None if valid_fields is None else valid_fields[i]
"""reduce cost for prediction training data"""
if valid_datas[i] is train_data:
is_valid_contain_train = True
if valid_names is not None:
train_data_name = valid_names[i]
continue
valid_set = _construct_dataset(
valid_datas[i],
train_set,
params,
other_fields,
predictor)
valid_sets.append(valid_set)
if valid_names is not None:
name_valid_sets.append(valid_names[i])
else:
name_valid_sets.append('valid_'+str(i))
"""process callbacks"""
callbacks = [] if callbacks is None else callbacks
# Most of legacy advanced options becomes callbacks
if isinstance(verbose_eval, bool) and verbose_eval:
callbacks.append(callback.print_evaluation())
else:
if isinstance(verbose_eval, int):
callbacks.append(callback.print_evaluation(verbose_eval))
if early_stopping_rounds is not None:
callbacks.append(callback.early_stop(early_stopping_rounds,
verbose=bool(verbose_eval)))
if learning_rates is not None:
callbacks.append(callback.reset_learning_rate(learning_rates))
if evals_result is not None:
callbacks.append(callback.record_evaluation(evals_result))
callbacks_before_iter = [
cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
callbacks_after_iter = [
cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
"""construct booster"""
if 'metric' in params:
if is_str(params['metric']):
params['metric'] = params['metric'].split(',')
else:
params['metric'] = list(params['metric'])
booster = Booster(params=params, train_set=train_set)
if is_valid_contain_train:
booster.set_train_data_name(train_data_name)
for i in range(len(valid_sets)):
booster.add_valid(valid_sets[i], name_valid_sets[i])
"""start training"""
for i in range(num_boost_round):
for cb in callbacks_before_iter:
cb(callback.CallbackEnv(model=booster,
cvfolds=None,
iteration=i,
begin_iteration=0,
end_iteration=num_boost_round,
evaluation_result_list=None))
booster.update(fobj=fobj)
evaluation_result_list = []
# check evaluation result.
if len(valid_sets) != 0:
if is_valid_contain_train:
evaluation_result_list.extend(booster.eval_train(feval))
evaluation_result_list.extend(booster.eval_valid(feval))
try:
for cb in callbacks_after_iter:
cb(callback.CallbackEnv(model=booster,
cvfolds=None,
iteration=i,
begin_iteration=0,
end_iteration=num_boost_round,
evaluation_result_list=evaluation_result_list))
except callback.EarlyStopException:
break
if booster.attr('best_iteration') is not None:
booster.best_iteration = int(booster.attr('best_iteration'))
else:
booster.best_iteration = num_boost_round - 1
return booster
class CVBooster(object):
""""Auxiliary datastruct to hold one fold of CV."""
def __init__(self, train_set, valid_test, params):
""""Initialize the CVBooster"""
self.train_set = train_set
self.valid_test = valid_test
self.booster = Booster(params=params, train_set=train_set)
self.booster.add_valid(valid_test, 'valid')
def update(self, fobj):
""""Update the boosters for one iteration"""
self.booster.update(fobj=fobj)
def eval(self, feval):
""""Evaluate the CVBooster for one iteration."""
return self.booster.eval_valid(feval)
try:
try:
from sklearn.model_selection import KFold, StratifiedKFold
except ImportError:
from sklearn.cross_validation import KFold, StratifiedKFold
SKLEARN_StratifiedKFold = True
except ImportError:
SKLEARN_StratifiedKFold = False
def _make_n_folds(full_data, nfold, param, seed, fpreproc=None, stratified=False):
"""
Make an n-fold list of CVBooster from random indices.
"""
np.random.seed(seed)
if stratified:
if SKLEARN_StratifiedKFold:
sfk = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
idset = [x[1] for x in sfk.split(X=full_data.get_label(), y=full_data.get_label())]
else:
raise LightGBMError('sklearn needs to be installed in order to use stratified cv')
else:
randidx = np.random.permutation(full_data.num_data())
kstep = int(len(randidx) / nfold)
idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
ret = []
for k in range(nfold):
train_set = full_data.subset(np.concatenate([idset[i] for i in range(nfold) if k != i]))
valid_set = full_data.subset(idset[k])
# run preprocessing on the data set if needed
if fpreproc is not None:
train_set, valid_set, tparam = fpreproc(train_set, valid_set, param.copy())
else:
tparam = param
ret.append(CVBooster(train_set, valid_set, tparam))
return ret
def _agg_cv_result(raw_results):
# pylint: disable=invalid-name
"""
Aggregate cross-validation results.
"""
cvmap = {}
metric_type = {}
for one_result in raw_results:
for one_line in one_result:
key = one_line[1]
metric_type[key] = one_line[3]
if key not in cvmap:
cvmap[key] = []
cvmap[key].append(one_line[2])
results = []
for k, v in cvmap.items():
v = np.array(v)
mean, std = np.mean(v), np.std(v)
results.append(('cv_agg', k, mean, metric_type[k], std))
return results
def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
metrics=(), fobj=None, feval=None, train_fields=None, early_stopping_rounds=None,
fpreproc=None, verbose_eval=None, show_stdv=True, seed=0,
callbacks=None):
# pylint: disable = invalid-name
"""Cross-validation with given paramaters.
Parameters
----------
params : dict
Booster params.
train_data : pair, (X, y) or filename of data
Data to be trained.
num_boost_round : int
Number of boosting iterations.
nfold : int
Number of folds in CV.
stratified : bool
Perform stratified sampling.
folds : a KFold or StratifiedKFold instance
Sklearn KFolds or StratifiedKFolds.
metrics : string or list of strings
Evaluation metrics to be watched in CV.
fobj : function
Custom objective function.
feval : function
Custom evaluation function.
train_fields : dict
other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score
early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue.
Last entry in evaluation history is the one from best iteration.
fpreproc : function
Preprocessing function that takes (dtrain, dtest, param) and returns
transformed versions of those.
verbose_eval : bool, int, or None, default None
Whether to display the progress. If None, progress will be displayed
when np.ndarray is returned. If True, progress will be displayed at
boosting stage. If an integer is given, progress will be displayed
at every given `verbose_eval` boosting stage.
show_stdv : bool, default True
Whether to display the standard deviation in progress.
Results are not affected, and always contains std.
seed : int
Seed used to generate the folds (passed to numpy.random.seed).
callbacks : list of callback functions
List of callback functions that are applied at end of each iteration.
Returns
-------
evaluation history : list(string)
"""
if isinstance(metrics, str):
metrics = [metrics]
if isinstance(params, list):
params = dict(params)
if not 'metric' in params:
params['metric'] = []
else:
if is_str(params['metric']):
params['metric'] = params['metric'].split(',')
else:
params['metric'] = list(params['metric'])
if metrics is not None and len(metrics) > 0:
params['metric'].extend(metrics)
train_set = _construct_dataset(train_data, None, params, train_fields)
results = {}
cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified)
# setup callbacks
callbacks = [] if callbacks is None else callbacks
if early_stopping_rounds is not None:
callbacks.append(callback.early_stop(early_stopping_rounds,
verbose=False))
if isinstance(verbose_eval, bool) and verbose_eval:
callbacks.append(callback.print_evaluation(show_stdv=show_stdv))
else:
if isinstance(verbose_eval, int):
callbacks.append(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))
callbacks_before_iter = [
cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
callbacks_after_iter = [
cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]
for i in range(num_boost_round):
for cb in callbacks_before_iter:
cb(callback.CallbackEnv(model=None,
cvfolds=cvfolds,
iteration=i,
begin_iteration=0,
end_iteration=num_boost_round,
evaluation_result_list=None))
for fold in cvfolds:
fold.update(fobj)
res = _agg_cv_result([f.eval(feval) for f in cvfolds])
for _, key, mean, _, std in res:
if key + '-mean' not in results:
results[key + '-mean'] = []
if key + '-std' not in results:
results[key + '-std'] = []
results[key + '-mean'].append(mean)
results[key + '-std'].append(std)
try:
for cb in callbacks_after_iter:
cb(callback.CallbackEnv(model=None,
cvfolds=cvfolds,
iteration=i,
begin_iteration=0,
end_iteration=num_boost_round,
evaluation_result_list=res))
except callback.EarlyStopException as e:
for k in results.keys():
results[k] = results[k][:(e.best_iteration + 1)]
break
return results
import os
import platform
import sys
def find_lib_path():
"""Find the path to LightGBM library files.
Returns
-------
lib_path: list(string)
List of all found library path to LightGBM
"""
curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
dll_path = [curr_path, os.path.join(curr_path, '../../lib/'),
os.path.join(curr_path, '../../'),
os.path.join(curr_path, './lib/'),
os.path.join(sys.prefix, 'lightgbm')]
if os.name == 'nt':
dll_path.append(os.path.join(curr_path, '../../windows/x64/Dll/'))
dll_path.append(os.path.join(curr_path, './windows/x64/Dll/'))
dll_path = [os.path.join(p, 'lib_lightgbm.dll') for p in dll_path]
else:
dll_path = [os.path.join(p, 'lib_lightgbm.so') for p in dll_path]
lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
if not lib_path:
raise Exception('Cannot find lightgbm Library')
return lib_path
"""Scikit-Learn Wrapper interface for LightGBM."""
from __future__ import absolute_import
import numpy as np
from .basic import LightGBMError, Predictor, Dataset, Booster, is_str
from .engine import train
# sklearn
try:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
SKLEARN_INSTALLED = True
LGBMModelBase = BaseEstimator
LGBMRegressorBase = RegressorMixin
LGBMClassifierBase = ClassifierMixin
LGBMLabelEncoder = LabelEncoder
except ImportError:
SKLEARN_INSTALLED = False
LGBMModelBase = object
LGBMClassifierBase = object
LGBMRegressorBase = object
LGBMLabelEncoder = None
def _point_wise_objective(func):
"""Decorate an objective function
Note: for multi-class task, the y_pred is group by class_id first, then group by row_id
if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
and you should group grad and hess in this way as well
Parameters
----------
func: callable
Expects a callable with signature ``func(y_true, y_pred)``:
y_true: array_like of shape [n_samples]
The target values
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
The predicted values
Returns
-------
new_func: callable
The new objective function as expected by ``lightgbm.engine.train``.
The signature is ``new_func(preds, dataset)``:
preds: array_like, shape [n_samples] or shape[n_samples* n_class]
The predicted values
dataset: ``dataset``
The training set from which the labels will be extracted using
``dataset.get_label()``
"""
def inner(preds, dataset):
"""internal function"""
labels = dataset.get_label()
grad, hess = func(labels, preds)
"""weighted for objective"""
weight = dataset.get_weight()
if weight is not None:
"""only one class"""
if len(weight) == len(grad):
grad = np.multiply(grad, weight)
hess = np.multiply(hess, weight)
else:
num_data = len(weight)
num_class = len(grad) // num_data
if num_class * num_data != len(grad):
raise ValueError("length of grad and hess should equal with num_class * num_data")
for k in range(num_class):
for i in range(num_data):
idx = k * num_data + i
grad[idx] *= weight[i]
hess[idx] *= weight[i]
return grad, hess
return inner
class LGBMModel(LGBMModelBase):
"""Implementation of the Scikit-Learn API for LightGBM.
Parameters
----------
num_leaves : int
Maximum tree leaves for base learners.
max_depth : int
Maximum tree depth for base learners, -1 means no limit.
learning_rate : float
Boosting learning rate
n_estimators : int
Number of boosted trees to fit.
silent : boolean
Whether to print messages while running boosting.
objective : string or callable
Specify the learning task and the corresponding learning objective or
a custom objective function to be used (see note below).
nthread : int
Number of parallel threads
min_split_gain : float
Minimum loss reduction required to make a further partition on a leaf node of the tree.
min_child_weight : int
Minimum sum of instance weight(hessian) needed in a child(leaf)
min_child_samples : int
Minimum number of data need in a child(leaf)
subsample : float
Subsample ratio of the training instance.
subsample_freq : int
frequence of subsample, <=0 means no enable
colsample_bytree : float
Subsample ratio of columns when constructing each tree.
reg_alpha : float
L1 regularization term on weights
reg_lambda : float
L2 regularization term on weights
scale_pos_weight : float
Balancing of positive and negative weights.
is_unbalance : bool
Is unbalance for binary classification
seed : int
Random number seed.
Note
----
A custom objective function can be provided for the ``objective``
parameter. In this case, it should have the signature
``objective(y_true, y_pred) -> grad, hess``:
y_true: array_like of shape [n_samples]
The target values
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class]
The predicted values
grad: array_like of shape [n_samples] or shape[n_samples* n_class]
The value of the gradient for each sample point.
hess: array_like of shape [n_samples] or shape[n_samples* n_class]
The value of the second derivative for each sample point
for multi-class task, the y_pred is group by class_id first, then group by row_id
if you want to get i-th row y_pred in j-th class, the access way is y_pred[j*num_data+i]
and you should group grad and hess in this way as well
"""
def __init__(self, num_leaves=31, max_depth=-1,
learning_rate=0.1, n_estimators=10, max_bin=255,
silent=True, objective="regression",
nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
subsample=1, subsample_freq=1, colsample_bytree=1,
reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
is_unbalance=False, seed=0):
if not SKLEARN_INSTALLED:
raise LightGBMError('sklearn needs to be installed in order to use this module')
self.num_leaves = num_leaves
self.max_depth = max_depth
self.learning_rate = learning_rate
self.n_estimators = n_estimators
self.max_bin = max_bin
self.silent = silent
self.objective = objective
self.nthread = nthread
self.min_split_gain = min_split_gain
self.min_child_weight = min_child_weight
self.min_child_samples = min_child_samples
self.subsample = subsample
self.subsample_freq = subsample_freq
self.colsample_bytree = colsample_bytree
self.reg_alpha = reg_alpha
self.reg_lambda = reg_lambda
self.scale_pos_weight = scale_pos_weight
self.is_unbalance = is_unbalance
self.seed = seed
self._Booster = None
if callable(self.objective):
self.fobj = _point_wise_objective(self.objective)
else:
self.fobj = None
def booster(self):
"""Get the underlying lightgbm Booster of this model.
This will raise an exception when fit was not called
Returns
-------
booster : a lightgbm booster of underlying model
"""
if self._Booster is None:
raise LightGBMError('need to call fit beforehand')
return self._Booster
def get_params(self, deep=False):
"""Get parameters"""
params = super(LGBMModel, self).get_params(deep=deep)
params['verbose'] = 0 if self.silent else 1
if self.nthread <= 0:
params.pop('nthread', None)
return params
def fit(self, X, y, eval_set=None, eval_metric=None,
early_stopping_rounds=None, verbose=True,
train_fields=None, valid_fields=None, other_params=None):
"""
Fit the gradient boosting model
Parameters
----------
X : array_like
Feature matrix
y : array_like
Labels
eval_set : list, optional
A list of (X, y) tuple pairs to use as a validation set for early-stopping
eval_metric : str, list of str, callable, optional
If a str, should be a built-in evaluation metric to use.
If callable, a custom evaluation metric. The call
signature is func(y_predicted, dataset) where dataset will be a
Dataset fobject such that you may need to call the get_label
method. And it must return (eval_name->str, eval_result->float, is_bigger_better->Bool)
early_stopping_rounds : int
verbose : bool
If `verbose` and an evaluation set is used, writes the evaluation
train_fields : dict
other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score
valid_fields : dict
other data file in training data. e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score
other_params: dict
other parameters
"""
evals_result = {}
params = self.get_params()
if other_params is not None:
params.update(other_params)
if self.fobj:
params["objective"] = "None"
else:
params["objective"] = self.objective
if callable(eval_metric):
feval = eval_metric
elif is_str(eval_metric) or isinstance(eval_metric, list):
feval = None
params.update({'metric': eval_metric})
else:
feval = None
feval = eval_metric if callable(eval_metric) else None
self._Booster = train(params, (X, y),
self.n_estimators, valid_datas=eval_set,
early_stopping_rounds=early_stopping_rounds,
evals_result=evals_result, fobj=self.fobj, feval=feval,
verbose_eval=verbose, train_fields=train_fields, valid_fields=valid_fields)
if evals_result:
for val in evals_result.items():
evals_result_key = list(val[1].keys())[0]
evals_result[val[0]][evals_result_key] = val[1][evals_result_key]
self.evals_result_ = evals_result
if early_stopping_rounds is not None:
self.best_iteration = self._Booster.best_iteration
return self
def predict(self, data, raw_score=False, num_iteration=0):
return self.booster().predict(data,
raw_score=raw_score,
num_iteration=num_iteration)
def apply(self, X, num_iteration=0):
"""Return the predicted leaf every tree for each sample.
Parameters
----------
X : array_like, shape=[n_samples, n_features]
Input features matrix.
ntree_limit : int
Limit number of trees in the prediction; defaults to 0 (use all trees).
Returns
-------
X_leaves : array_like, shape=[n_samples, n_trees]
"""
return self.booster().predict(X,
pred_leaf=True,
num_iteration=num_iteration)
def evals_result(self):
"""Return the evaluation results.
Returns
-------
evals_result : dictionary
"""
if self.evals_result_:
evals_result = self.evals_result_
else:
raise LightGBMError('No results.')
return evals_result
class LGBMRegressor(LGBMModel, LGBMRegressorBase):
__doc__ = """Implementation of the scikit-learn API for LightGBM regression.
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
class LGBMClassifier(LGBMModel, LGBMClassifierBase):
__doc__ = """Implementation of the scikit-learn API for LightGBM classification.
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def fit(self, X, y, eval_set=None, eval_metric=None,
early_stopping_rounds=None, verbose=True,
train_fields=None, valid_fields=None, other_params=None):
self.classes_ = np.unique(y)
self.n_classes_ = len(self.classes_)
if other_params is None:
other_params = {}
if self.n_classes_ > 2:
# Switch to using a multiclass objective in the underlying LGBM instance
self.objective = "multiclass"
other_params['num_class'] = self.n_classes_
else:
self.objective = "binary"
self._le = LGBMLabelEncoder().fit(y)
training_labels = self._le.transform(y)
if eval_set is not None:
eval_set = list( (x[0], self._le.transform(x[1])) for x in eval_set )
super(LGBMClassifier, self).fit(X, training_labels, eval_set,
eval_metric, early_stopping_rounds,
verbose, train_fields, valid_fields,
other_params)
return self
def predict(self, data, raw_score=False, num_iteration=0):
class_probs = self.booster().predict(data,
raw_score=raw_score,
num_iteration=num_iteration)
if len(class_probs.shape) > 1:
column_indexes = np.argmax(class_probs, axis=1)
else:
column_indexes = np.repeat(0, class_probs.shape[0])
column_indexes[class_probs > 0.5] = 1
return self._le.inverse_transform(column_indexes)
def predict_proba(self, data, raw_score=False, num_iteration=0):
class_probs = self.booster().predict(data,
raw_score=raw_score,
num_iteration=num_iteration)
if self.n_classes_ > 2:
return class_probs
else:
classone_probs = class_probs
classzero_probs = 1.0 - classone_probs
return np.vstack((classzero_probs, classone_probs)).transpose()
def _group_wise_objective(func):
"""Decorate an objective function
Parameters
----------
func: callable
Expects a callable with signature ``func(y_true, group, y_pred)``:
y_true: array_like of shape [n_samples]
The target values
group : array_like of shape
group size data of data
y_pred: array_like of shape [n_samples] or shape[n_samples* n_class] (for multi-class)
The predicted values
Returns
-------
new_func: callable
The new objective function as expected by ``lightgbm.engine.train``.
The signature is ``new_func(preds, dataset)``:
preds: array_like, shape [n_samples] or shape[n_samples* n_class]
The predicted values
dataset: ``dataset``
The training set from which the labels will be extracted using
``dataset.get_label()``
"""
def inner(preds, dataset):
"""internal function"""
labels = dataset.get_label()
group = dataset.get_group()
if group is None:
raise ValueError("group should not be None for ranking task")
grad, hess = func(labels, group, preds)
"""weighted for objective"""
weight = dataset.get_weight()
if weight is not None:
"""only one class"""
if len(weight) == len(grad):
grad = np.multiply(grad, weight)
hess = np.multiply(hess, weight)
else:
raise ValueError("lenght of grad and hess should equal with num_data")
return grad, hess
return inner
class LGBMRanker(LGBMModel):
__doc__ = """Implementation of the scikit-learn API for LightGBM ranking application.
""" + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
def fit(self, X, y, eval_set=None, eval_metric=None,
early_stopping_rounds=None, verbose=True,
train_fields=None, valid_fields=None, other_params=None):
"""check group data"""
if "group" not in train_fields:
raise ValueError("should set group in train_fields for ranking task")
if eval_set is not None:
if valid_fields is None:
raise ValueError("valid_fields cannot be None when eval_set is not None")
elif len(valid_fields) != len(eval_set):
raise ValueError("lenght of valid_fields should equal with eval_set")
else:
for inner in valid_fields:
if "group" not in inner:
raise ValueError("should set group in valid_fields for ranking task")
if callable(self.objective):
self.fobj = _group_wise_objective(self.objective)
else:
self.objective = "lambdarank"
self.fobj = None
super(LGBMRanker, self).fit(X, y, eval_set, eval_metric,
early_stopping_rounds, verbose,
train_fields, valid_fields,
other_params)
return self
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment