16d1853d
Commit
16d1853d
authored
Dec 01, 2016
by
Guolin Ke
Committed by
GitHub
Dec 01, 2016
Browse files
Merge pull request #94 from Microsoft/python-package
Python package (#11)
parents
65e711a2
29cf97e9
Showing 20 changed files with 2730 additions and 129 deletions (+2730 -129)
Changed files:

.travis.yml (+5 -1)
examples/python-guide/simple_example.py (+10 -0)
include/LightGBM/bin.h (+12 -0)
include/LightGBM/boosting.h (+43 -7)
include/LightGBM/c_api.h (+183 -99)
include/LightGBM/config.h (+16 -4)
include/LightGBM/dataset.h (+65 -9)
include/LightGBM/dataset_loader.h (+1 -1)
include/LightGBM/feature.h (+10 -0)
include/LightGBM/meta.h (+0 -1)
include/LightGBM/metric.h (+1 -2)
include/LightGBM/tree.h (+0 -4)
include/LightGBM/utils/log.h (+5 -1)
python-package/README.rst (+19 -0)
python-package/lightgbm/__init__.py (+23 -0)
python-package/lightgbm/basic.py (+1262 -0)
python-package/lightgbm/callback.py (+194 -0)
python-package/lightgbm/engine.py (+416 -0)
python-package/lightgbm/libpath.py (+27 -0)
python-package/lightgbm/sklearn.py (+438 -0)
.travis.yml

@@ -21,9 +21,13 @@ script:
  - cd $TRAVIS_BUILD_DIR
  - mkdir build && cd build && cmake .. && make -j
  - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
  - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
  - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py
  - cd $TRAVIS_BUILD_DIR
  - rm -rf build && mkdir build && cd build && cmake -DUSE_MPI=ON .. && make -j
  - cd $TRAVIS_BUILD_DIR/tests/c_api_test && python test.py
  - cd $TRAVIS_BUILD_DIR/python-package && python setup.py install
  - cd $TRAVIS_BUILD_DIR/tests/python_package_test && python test_basic.py && python test_sklearn.py
notifications:
  email: false
examples/python-guide/simple_example.py (new file, 0 → 100644)

import numpy as np
import random
import lightgbm as lgb
from sklearn import datasets, metrics, model_selection

rng = np.random.RandomState(2016)

X, y = datasets.make_classification(n_samples=10000, n_features=100)
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.1, random_state=1)

lgb_model = lgb.LGBMClassifier(n_estimators=100).fit(x_train, y_train, [(x_test, y_test)], eval_metric="auc")
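The example imports sklearn's metrics module but never uses it; a natural follow-up (a sketch, not part of this commit) is scoring the held-out split. It assumes the sklearn wrapper exposes predict_proba like other scikit-learn classifiers; the wrapper's body (sklearn.py, below) is truncated in this page, so treat that as an assumption.

# Sketch: score the held-out split with the already-imported metrics module.
# predict_proba is assumed to exist on LGBMClassifier.
pred = lgb_model.predict_proba(x_test)[:, 1]   # probability of the positive class
print("AUC:", metrics.roc_auc_score(y_test, pred))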
include/LightGBM/bin.h

@@ -51,6 +51,18 @@ public:
  explicit BinMapper(const void* memory);
  ~BinMapper();

+  bool CheckAlign(const BinMapper& other) const {
+    if (num_bin_ != other.num_bin_) {
+      return false;
+    }
+    for (int i = 0; i < num_bin_; ++i) {
+      if (bin_upper_bound_[i] != other.bin_upper_bound_[i]) {
+        return false;
+      }
+    }
+    return true;
+  }

  /*! \brief Get number of bins */
  inline int num_bin() const { return num_bin_; }
  /*! \brief True if bin is trivial (contains only one bin) */
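BinMapper::CheckAlign exists so a validation dataset can be verified to share the training set's bin boundaries before evaluation. On the Python side, alignment is requested by constructing the validation set from the training set; a sketch using the create_valid call shape that appears in _construct_dataset in engine.py below (names x_test/y_test are illustrative):

# Sketch: a validation Dataset built from the training Dataset reuses its
# bin mappers, which is what CheckAlign later verifies on the C++ side.
valid_set = train_set.create_valid(x_test, label=y_test)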
include/LightGBM/boosting.h

@@ -35,12 +35,34 @@ public:
                      const ObjectiveFunction* object_function,
                      const std::vector<const Metric*>& training_metrics) = 0;

+  /*!
+  * \brief Merge model from other boosting object
+    Will insert to the front of current boosting object
+  * \param other
+  */
+  virtual void MergeFrom(const Boosting* other) = 0;
+
+  /*!
+  * \brief Reset training data for current boosting
+  * \param config Configs for boosting
+  * \param train_data Training data
+  * \param object_function Training objective function
+  * \param training_metrics Training metrics
+  */
+  virtual void ResetTrainingData(const BoostingConfig* config, const Dataset* train_data,
+                                 const ObjectiveFunction* object_function,
+                                 const std::vector<const Metric*>& training_metrics) = 0;
+
+  /*!
+  * \brief Reset shrinkage_rate data for current boosting
+  * \param shrinkage_rate Configs for boosting
+  */
+  virtual void ResetShrinkageRate(double shrinkage_rate) = 0;

  /*!
  * \brief Add a validation data
  * \param valid_data Validation data
  * \param valid_metrics Metrics for validation data
  */
-  virtual void AddDataset(const Dataset* valid_data,
+  virtual void AddValidDataset(const Dataset* valid_data,
                           const std::vector<const Metric*>& valid_metrics) = 0;

@@ -52,6 +74,19 @@ public:
  */
  virtual bool TrainOneIter(const score_t* gradient, const score_t* hessian, bool is_eval) = 0;

+  /*!
+  * \brief Rollback one iteration
+  */
+  virtual void RollbackOneIter() = 0;
+
+  /*!
+  * \brief return current iteration
+  */
+  virtual int GetCurrentIteration() const = 0;
+
+  /*!
+  * \brief Eval metrics and check whether early stopping is met
+  */
+  virtual bool EvalAndCheckEarlyStopping() = 0;

  /*!
  * \brief Get evaluation result at data_idx data

@@ -73,7 +108,7 @@ public:
  * \param result used to store prediction result, should allocate memory before calling this function
  * \param out_len length of returned score
  */
-  virtual void GetPredictAt(int data_idx, score_t* result, data_size_t* out_len) const = 0;
+  virtual void GetPredictAt(int data_idx, score_t* result, data_size_t* out_len) = 0;

/*!
* \brief Prediction for one record, not sigmoid transform

@@ -99,11 +134,10 @@ public:
  /*!
  * \brief save model to file
-  * \param num_used_model number of models that want to save, -1 means save all
-  * \param is_finish is training finished or not
+  * \param num_iterations Iterations that want to save, -1 means save all
  * \param filename filename that want to save to
  */
-  virtual void SaveModelToFile(int num_used_model, bool is_finish, const char* filename) = 0;
+  virtual void SaveModelToFile(int num_iterations, const char* filename) const = 0;

  /*!
  * \brief Restore from a serialized string

@@ -127,7 +161,7 @@ public:
  * \brief Get number of weak sub-models
  * \return Number of weak sub-models
  */
-  virtual int NumberOfSubModels() const = 0;
+  virtual int NumberOfTotalModel() const = 0;

  /*!
  * \brief Get number of classes

@@ -138,7 +172,7 @@ public:
  /*!
  * \brief Set number of used model for prediction
  */
-  virtual void SetNumUsedModel(int num_used_model) = 0;
+  virtual void SetNumIterationForPred(int num_iteration) = 0;

  /*!
  * \brief Get Type name of this boosting object

@@ -151,6 +185,8 @@ public:
  /*! \brief Disable copy */
  Boosting(const Boosting&) = delete;

+  static void LoadFileToBoosting(Boosting* boosting, const char* filename);

  /*!
  * \brief Create boosting object
  * \param type Type of boosting
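The SaveModelToFile signature change surfaces through the Python Booster wrapper in basic.py, which is collapsed further down this page; a sketch of the assumed call shape, with save_model treated as a hypothetical wrapper name over LGBM_BoosterSaveModel:

# Hypothetical Python-side call over SaveModelToFile / LGBM_BoosterSaveModel;
# after this commit, num_iteration <= 0 means "save all iterations".
booster.save_model('LightGBM_model.txt')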
include/LightGBM/c_api.h

@@ -3,7 +3,9 @@
#include <cstdint>
#include <exception>
#include <stdexcept>
+#include <cstring>
+#include <string>

/*!
* To avoid type conversion on large data, most of our exposed interfaces support both float_32 and float_64.
* Except following:

@@ -38,7 +40,7 @@ typedef void* BoosterHandle;

/*!
* \brief get string message of the last error
- * all function in this file will return 0 when success
+ * all function in this file will return 0 when succeed
* and -1 when an error occurred,
* \return const char* error information
*/

@@ -53,38 +55,29 @@ DllExport const char* LGBM_GetLastError();

* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means not used
* \param out a loaded dataset
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
-DllExport int LGBM_CreateDatasetFromFile(const char* filename,
+DllExport int LGBM_DatasetCreateFromFile(const char* filename,
                                         const char* parameters,
                                         const DatesetHandle* reference,
                                         DatesetHandle* out);

-/*!
-* \brief load data set from binary file like the command_line LightGBM do
-* \param filename the name of the file
-* \param out a loaded dataset
-* \return 0 when success, -1 when failure happens
-*/
-DllExport int LGBM_CreateDatasetFromBinaryFile(const char* filename,
-                                               DatesetHandle* out);

/*!
* \brief create a dataset from CSR format
* \param indptr pointer to row headers
- * \param indptr_type
+ * \param indptr_type type of indptr, can be C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices findex
* \param data fvalue
- * \param data_type
+ * \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nindptr number of rows in the matrix + 1
* \param nelem number of nonzero elements in the matrix
- * \param num_col number of columns; when it's set to 0, then guess from data
+ * \param num_col number of columns
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means not used
* \param out created dataset
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
-DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,
+DllExport int LGBM_DatasetCreateFromCSR(const void* indptr,
                                        int indptr_type,
                                        const int32_t* indices,
                                        const void* data,

@@ -99,19 +92,19 @@ DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,

/*!
* \brief create a dataset from CSC format
* \param col_ptr pointer to col headers
- * \param col_ptr_type
+ * \param col_ptr_type type of col_ptr, can be C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices findex
* \param data fvalue
- * \param data_type
- * \param ncol_ptr number of rows in the matrix + 1
+ * \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
+ * \param ncol_ptr number of cols in the matrix + 1
* \param nelem number of nonzero elements in the matrix
- * \param num_row number of rows; when it's set to 0, then guess from data
+ * \param num_row number of rows
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means not used
* \param out created dataset
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
-DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,
+DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr,
                                        int col_ptr_type,
                                        const int32_t* indices,
                                        const void* data,

@@ -126,16 +119,16 @@ DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,

/*!
* \brief create dataset from dense matrix
* \param data pointer to the data space
- * \param data_type 0
+ * \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nrow number of rows
* \param ncol number of columns
* \param is_row_major 1 for row major, 0 for column major
* \param parameters additional parameters
* \param reference used to align bin mapper with other dataset, nullptr means not used
* \param out created dataset
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
-DllExport int LGBM_CreateDatasetFromMat(const void* data,
+DllExport int LGBM_DatasetCreateFromMat(const void* data,
                                        int data_type,
                                        int32_t nrow,
                                        int32_t ncol,

@@ -144,9 +137,25 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
                                        const DatesetHandle* reference,
                                        DatesetHandle* out);

+/*!
+* \brief Create subset of a data
+* \param handle handle of full dataset
+* \param used_row_indices Indices used in subset
+* \param num_used_row_indices len of used_row_indices
+* \param parameters additional parameters
+* \param out subset of data
+* \return 0 when succeed, -1 when failure happens
+*/
+DllExport int LGBM_DatasetGetSubset(const DatesetHandle* handle,
+                                    const int32_t* used_row_indices,
+                                    int32_t num_used_row_indices,
+                                    const char* parameters,
+                                    DatesetHandle* out);

/*!
* \brief free space for dataset
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetFree(DatesetHandle handle);

@@ -154,19 +163,21 @@ DllExport int LGBM_DatasetFree(DatesetHandle handle);
* \brief save dataset to binary file
* \param handle an instance of dataset
* \param filename file name
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetSaveBinary(DatesetHandle handle,
                                     const char* filename);

/*!
* \brief set vector to a content in info
* Note: group and group_id only work for C_API_DTYPE_INT32
*       label and weight only work for C_API_DTYPE_FLOAT32
* \param handle an instance of dataset
- * \param field_name field name, can be label, weight, group
+ * \param field_name field name, can be label, weight, group, group_id
* \param field_data pointer to vector
* \param num_element number of elements in field_data
- * \param type float_32:0, int32_t:1
- * \return 0 when success, -1 when failure happens
+ * \param type C_API_DTYPE_FLOAT32 or C_API_DTYPE_INT32
+ * \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetSetField(DatesetHandle handle,
                                   const char* field_name,

@@ -180,8 +191,8 @@ DllExport int LGBM_DatasetSetField(DatesetHandle handle,
* \param field_name field name
* \param out_len used to set result length
* \param out_ptr pointer to the result
- * \param out_type float_32:0, int32_t:1
- * \return 0 when success, -1 when failure happens
+ * \param out_type C_API_DTYPE_FLOAT32 or C_API_DTYPE_INT32
+ * \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetGetField(DatesetHandle handle,
                                   const char* field_name,

@@ -193,7 +204,7 @@ DllExport int LGBM_DatasetGetField(DatesetHandle handle,
* \brief get number of data.
* \param handle the handle to the dataset
* \param out The address to hold number of data
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetGetNumData(DatesetHandle handle,
                                     int64_t* out);

@@ -202,7 +213,7 @@ DllExport int LGBM_DatasetGetNumData(DatesetHandle handle,
* \brief get number of features
* \param handle the handle to the dataset
* \param out The output of number of features
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_DatasetGetNumFeature(DatesetHandle handle,
                                        int64_t* out);

@@ -212,42 +223,82 @@ DllExport int LGBM_DatasetGetNumFeature(DatesetHandle handle,

/*!
* \brief create a new boosting learner
* \param train_data training data set
-* \param valid_datas validation data sets
-* \param valid_names names of validation data sets
-* \param n_valid_datas number of validation sets
* \param parameters format: 'key1=value1 key2=value2'
* \param out handle of created Booster
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterCreate(const DatesetHandle train_data,
-                                 const DatesetHandle valid_datas[],
-                                 const char* valid_names[],
-                                 int n_valid_datas,
                                  const char* parameters,
                                  BoosterHandle* out);

/*!
* \brief load an existing boosting from model file
* \param filename filename of model
+* \param out_num_iterations number of iterations of this booster
* \param out handle of created Booster
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
-DllExport int LGBM_BoosterLoadFromModelfile(
+DllExport int LGBM_BoosterCreateFromModelfile(
    const char* filename,
+   int64_t* out_num_iterations,
    BoosterHandle* out);

/*!
* \brief free obj in handle
* \param handle handle to be freed
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterFree(BoosterHandle handle);

+/*!
+* \brief Merge model in two boosters to first handle
+* \param handle handle, will merge other handle to this
+* \param other_handle
+* \return 0 when succeed, -1 when failure happens
+*/
+DllExport int LGBM_BoosterMerge(BoosterHandle handle,
+                                BoosterHandle other_handle);
+
+/*!
+* \brief Add new validation data to booster
+* \param handle handle
+* \param valid_data validation data set
+* \return 0 when succeed, -1 when failure happens
+*/
+DllExport int LGBM_BoosterAddValidData(BoosterHandle handle,
+                                       const DatesetHandle valid_data);
+
+/*!
+* \brief Reset training data for booster
+* \param handle handle
+* \param train_data training data set
+* \return 0 when succeed, -1 when failure happens
+*/
+DllExport int LGBM_BoosterResetTrainingData(BoosterHandle handle,
+                                            const DatesetHandle train_data);
+
+/*!
+* \brief Reset config for current booster
+* \param handle handle
+* \param parameters format: 'key1=value1 key2=value2'
+* \return 0 when succeed, -1 when failure happens
+*/
+DllExport int LGBM_BoosterResetParameter(BoosterHandle handle,
+                                         const char* parameters);
+
+/*!
+* \brief Get number of classes
+* \param handle handle
+* \param out_len number of classes
+* \return 0 when succeed, -1 when failure happens
+*/
+DllExport int LGBM_BoosterGetNumClasses(BoosterHandle handle,
+                                        int64_t* out_len);

/*!
* \brief update the model in one round
* \param handle handle
* \param is_finished 1 means finished (cannot split any more)
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterUpdateOneIter(BoosterHandle handle,
                                        int* is_finished);

@@ -258,7 +309,7 @@ DllExport int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished);
* \param grad gradient statistics
* \param hess second order gradient statistics
* \param is_finished 1 means finished (cannot split any more)
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle,
                                              const float* grad,

@@ -266,81 +317,106 @@ DllExport int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle,
                                              int* is_finished);

/*!
-* \brief get evaluation for training data and validation data
+* \brief Rollback one iteration
* \param handle handle
-* \param data 0:training data, 1: 1st valid data, 2:2nd valid data ...
-* \param out_len len of output result
-* \param out_result the string containing evaluation statistics, should allocate memory before calling this function
-* \return 0 when success, -1 when failure happens
+* \return 0 when succeed, -1 when failure happens
*/
-DllExport int LGBM_BoosterEval(BoosterHandle handle,
-                               int data,
-                               int64_t* out_len,
-                               float* out_results);
+DllExport int LGBM_BoosterRollbackOneIter(BoosterHandle handle);

/*!
-* \brief get raw score for training data, used to calculate gradients outside
+* \brief Get iteration of current boosting rounds
+* \param out_iteration iteration of boosting rounds
+* \return 0 when succeed, -1 when failure happens
+*/
+DllExport int LGBM_BoosterGetCurrentIteration(BoosterHandle handle,
+                                              int64_t* out_iteration);
+
+/*!
+* \brief Get number of evals
+* \param out_len total number of eval results
+* \return 0 when succeed, -1 when failure happens
+*/
+DllExport int LGBM_BoosterGetEvalCounts(BoosterHandle handle,
+                                        int64_t* out_len);
+
+/*!
+* \brief Get names of evals
+* \param out_len total number of eval results
+* \param out_strs names of eval results
+* \return 0 when succeed, -1 when failure happens
+*/
+DllExport int LGBM_BoosterGetEvalNames(BoosterHandle handle,
+                                       int64_t* out_len,
+                                       char** out_strs);
+
+/*!
+* \brief get evaluation for training data and validation data
+  Note: 1. you should call LGBM_BoosterGetEvalNames first to get the names of the evaluation results
+        2. should pre-allocate memory for out_results, you can get its length by LGBM_BoosterGetEvalCounts
* \param handle handle
+* \param data_idx 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param out_len len of output result
-* \param out_result used to set a pointer to array
-* \return 0 when success, -1 when failure happens
+* \param out_result float array containing the result
+* \return 0 when succeed, -1 when failure happens
*/
-DllExport int LGBM_BoosterGetScore(BoosterHandle handle,
+DllExport int LGBM_BoosterGetEval(BoosterHandle handle,
                                   int data_idx,
                                   int64_t* out_len,
-                                  const float** out_result);
+                                  float* out_results);

/*!
* \brief Get prediction for training data and validation data
  this can be used to support customized eval function
+ Note: should pre-allocate memory for out_result, its length is equal to num_class * num_data
* \param handle handle
-* \param data 0:training data, 1: 1st valid data, 2:2nd valid data ...
+* \param data_idx 0:training data, 1: 1st valid data, 2:2nd valid data ...
* \param out_len len of output result
* \param out_result used to set a pointer to array, should allocate memory before calling this function
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterGetPredict(BoosterHandle handle,
-                                     int data,
+                                     int data_idx,
                                      int64_t* out_len,
                                      float* out_result);

/*!
* \brief make prediction for file
* \param handle handle
-* \param predict_type
-*        0:normal, with transform (if needed)
-*        1:raw score
-*        2:leaf index
-* \param n_used_trees number of used trees, < 0 means no limit
-* \param data_has_header data file has header or not
* \param data_filename filename of data file
+* \param data_has_header data file has header or not
+* \param predict_type
+*        C_API_PREDICT_NORMAL: normal prediction, with transform (if needed)
+*        C_API_PREDICT_RAW_SCORE: raw score
+*        C_API_PREDICT_LEAF_INDEX: leaf index
+* \param num_iteration number of iterations for prediction, <= 0 means no limit
* \param result_filename filename of result file
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForFile(BoosterHandle handle,
-                                         int predict_type,
-                                         int64_t n_used_trees,
-                                         int data_has_header,
                                          const char* data_filename,
+                                         int data_has_header,
+                                         int predict_type,
+                                         int64_t num_iteration,
                                          const char* result_filename);

/*!
* \brief make prediction for a new data set
+*        Note: should pre-allocate memory for out_result,
+*        for normal and raw score: its length is equal to num_class * num_data
+*        for leaf index, its length is equal to num_class * num_data * num_iteration
* \param handle handle
* \param indptr pointer to row headers
-* \param indptr_type
+* \param indptr_type type of indptr, can be C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices findex
* \param data fvalue
-* \param data_type
+* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nindptr number of rows in the matrix + 1
* \param nelem number of nonzero elements in the matrix
* \param num_col number of columns; when it's set to 0, then guess from data
* \param predict_type
-*        0:normal, with transform (if needed)
-*        1:raw score
-*        2:leaf index
-* \param n_used_trees number of used trees, < 0 means no limit
+*        C_API_PREDICT_NORMAL: normal prediction, with transform (if needed)
+*        C_API_PREDICT_RAW_SCORE: raw score
+*        C_API_PREDICT_LEAF_INDEX: leaf index
+* \param num_iteration number of iterations for prediction, <= 0 means no limit
+* \param out_len len of output result
* \param out_result used to set a pointer to array, should allocate memory before calling this function
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
                                        const void* indptr,

@@ -352,24 +428,29 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
                                        int64_t nelem,
                                        int64_t num_col,
                                        int predict_type,
-                                       int64_t n_used_trees,
-                                       double* out_result);
+                                       int64_t num_iteration,
+                                       int64_t* out_len,
+                                       float* out_result);

/*!
* \brief make prediction for a new data set
+*        Note: should pre-allocate memory for out_result,
+*        for normal and raw score: its length is equal to num_class * num_data
+*        for leaf index, its length is equal to num_class * num_data * num_iteration
* \param handle handle
* \param data pointer to the data space
-* \param data_type
+* \param data_type type of data pointer, can be C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nrow number of rows
* \param ncol number of columns
* \param is_row_major 1 for row major, 0 for column major
* \param predict_type
-*        0:normal, with transform (if needed)
-*        1:raw score
-*        2:leaf index
-* \param n_used_trees number of used trees, < 0 means no limit
+*        C_API_PREDICT_NORMAL: normal prediction, with transform (if needed)
+*        C_API_PREDICT_RAW_SCORE: raw score
+*        C_API_PREDICT_LEAF_INDEX: leaf index
+* \param num_iteration number of iterations for prediction, <= 0 means no limit
+* \param out_len len of output result
* \param out_result used to set a pointer to array, should allocate memory before calling this function
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
                                        const void* data,

@@ -378,18 +459,19 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
                                        int32_t ncol,
                                        int is_row_major,
                                        int predict_type,
-                                       int64_t n_used_trees,
-                                       double* out_result);
+                                       int64_t num_iteration,
+                                       int64_t* out_len,
+                                       float* out_result);

/*!
* \brief save model into file
* \param handle handle
-* \param num_used_model, < 0 means no limit
+* \param num_iteration, <= 0 means save all
* \param filename file name
- * \return 0 when success, -1 when failure happens
+ * \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
-                                    int num_used_model,
+                                    int num_iteration,
                                    const char* filename);

@@ -413,13 +495,15 @@ ColumnFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* indi

std::vector<double> SampleFromOneColumn(const std::vector<std::pair<int, double>>& data,
                                        const std::vector<int>& indices);

+#if defined(_MSC_VER)
// exception handle and error msg
-static std::string& LastErrorMsg() { static std::string err_msg("Everything is fine"); return err_msg; }
+static char* LastErrorMsg() { static __declspec(thread) char err_msg[512] = "Everything is fine"; return err_msg; }
+#else
+static char* LastErrorMsg() { static thread_local char err_msg[512] = "Everything is fine"; return err_msg; }
+#endif

inline void LGBM_SetLastError(const char* msg) {
-  LastErrorMsg() = msg;
+  std::strcpy(LastErrorMsg(), msg);
}

inline int LGBM_APIHandleException(const std::exception& ex) {
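The renamed dataset constructors are plain C entry points, so they can be exercised directly with ctypes, which is essentially what the new basic.py does. A minimal sketch; the library filename and the enum value C_API_DTYPE_FLOAT32 = 0 are assumptions for illustration, and the argument order follows the LGBM_DatasetCreateFromMat declaration above.

import ctypes
import numpy as np

# Load the shared library built earlier (path is an assumption here; the
# package resolves it properly via libpath.find_lib_path()).
lib = ctypes.cdll.LoadLibrary("./lib_lightgbm.so")
lib.LGBM_GetLastError.restype = ctypes.c_char_p

C_API_DTYPE_FLOAT32 = 0  # assumed constant value, for illustration only

mat = np.random.rand(100, 10).astype(np.float32)
handle = ctypes.c_void_p()
ret = lib.LGBM_DatasetCreateFromMat(
    mat.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),  # const void* data
    ctypes.c_int(C_API_DTYPE_FLOAT32),                   # int data_type
    ctypes.c_int32(mat.shape[0]),                        # int32_t nrow
    ctypes.c_int32(mat.shape[1]),                        # int32_t ncol
    ctypes.c_int(1),                                     # is_row_major
    ctypes.c_char_p(b"max_bin=255"),                     # const char* parameters
    None,                                                # reference (nullptr)
    ctypes.byref(handle))                                # DatesetHandle* out
if ret != 0:
    raise RuntimeError(lib.LGBM_GetLastError())
lib.LGBM_DatasetFree(handle)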
include/LightGBM/config.h

@@ -72,6 +72,8 @@ public:
  inline bool GetBool(const std::unordered_map<std::string, std::string>& params,
                      const std::string& name, bool* out);

+  static std::unordered_map<std::string, std::string> Str2Map(const char* parameters);
};

/*! \brief Types of boosting */

@@ -97,7 +99,7 @@ public:
  std::string output_result = "LightGBM_predict_result.txt";
  std::string input_model = "";
  int verbosity = 1;
-  int num_model_predict = NO_LIMIT;
+  int num_iteration_predict = -1;
  bool is_pre_partition = false;
  bool is_enable_sparse = true;
  bool use_two_round_loading = false;

@@ -136,6 +138,8 @@ public:
  bool is_unbalance = false;
  // for multiclass
  int num_class = 1;
+  // Balancing of positive and negative weights
+  double scale_pos_weight = 1.0f;
  void Set(const std::unordered_map<std::string, std::string>& params) override;
};

@@ -164,12 +168,12 @@ public:
  int feature_fraction_seed = 2;
  double feature_fraction = 1.0f;
  // max cache size (unit: MB) for historical histogram. < 0 means no limit
-  double histogram_pool_size = NO_LIMIT;
+  double histogram_pool_size = -1.0f;
  // max depth of tree model.
  // Still grow tree by leaf-wise, but limit the max depth to avoid over-fitting
  // And the max leaves will be min(num_leaves, pow(2, max_depth - 1))
  // max_depth < 0 means no limit
-  int max_depth = NO_LIMIT;
+  int max_depth = -1;
  void Set(const std::unordered_map<std::string, std::string>& params) override;
};

@@ -231,7 +235,7 @@ public:
  MetricConfig metric_config;
  void Set(const std::unordered_map<std::string, std::string>& params) override;
  void LoadFromString(const char* str);
private:
  void GetBoostingType(const std::unordered_map<std::string, std::string>& params);

@@ -328,17 +332,22 @@ struct ParameterAlias {
  { "ndcg_at", "ndcg_eval_at" },
  { "min_data_per_leaf", "min_data_in_leaf" },
  { "min_data", "min_data_in_leaf" },
  { "min_child_samples", "min_data_in_leaf" },
  { "min_sum_hessian_per_leaf", "min_sum_hessian_in_leaf" },
  { "min_sum_hessian", "min_sum_hessian_in_leaf" },
  { "min_hessian", "min_sum_hessian_in_leaf" },
  { "min_child_weight", "min_sum_hessian_in_leaf" },
  { "num_leaf", "num_leaves" },
  { "sub_feature", "feature_fraction" },
  { "colsample_bytree", "feature_fraction" },
  { "num_iteration", "num_iterations" },
  { "num_tree", "num_iterations" },
  { "num_round", "num_iterations" },
  { "num_trees", "num_iterations" },
  { "num_rounds", "num_iterations" },
  { "sub_row", "bagging_fraction" },
  { "subsample", "bagging_fraction" },
  { "subsample_freq", "bagging_freq" },
  { "shrinkage_rate", "learning_rate" },
  { "tree", "tree_learner" },
  { "num_machine", "num_machines" },

@@ -361,6 +370,9 @@ struct ParameterAlias {
  { "blacklist", "ignore_column" },
  { "predict_raw_score", "is_predict_raw_score" },
  { "predict_leaf_index", "is_predict_leaf_index" },
+  { "min_split_gain", "min_gain_to_split" },
+  { "reg_alpha", "lambda_l1" },
+  { "reg_lambda", "lambda_l2" },
  { "num_classes", "num_class" }
  });
  std::unordered_map<std::string, std::string> tmp_map;
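The expanded alias table is what lets XGBoost-style and scikit-learn-style parameter names flow into the native configuration. For instance, under the mappings above these two parameter dicts should be interpreted identically (illustrative only):

params_alias  = {'num_round': 100, 'reg_alpha': 0.1, 'colsample_bytree': 0.8}
params_native = {'num_iterations': 100, 'lambda_l1': 0.1, 'feature_fraction': 0.8}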
include/LightGBM/dataset.h

@@ -13,6 +13,7 @@
#include <functional>
#include <string>
#include <unordered_set>
+#include <mutex>

namespace LightGBM {

@@ -46,6 +47,13 @@ public:
  */
  void Init(const char* data_filename, const int num_class);

+  /*!
+  * \brief init as subset
+  * \param metadata Metadata of the full data
+  * \param used_indices
+  * \param num_used_indices
+  */
+  void Init(const Metadata& metadata, const data_size_t* used_indices, data_size_t num_used_indices);

  /*!
  * \brief Initial with binary memory
  * \param memory Pointer to memory
  */

@@ -76,13 +84,14 @@ public:
  void CheckOrPartition(data_size_t num_all_data,
                        const std::vector<data_size_t>& used_data_indices);

  void SetLabel(const float* label, data_size_t len);

  void SetWeights(const float* weights, data_size_t len);

  void SetQueryBoundaries(const data_size_t* query_boundaries, data_size_t len);

  void SetQueryId(const data_size_t* query_id, data_size_t len);

  /*!
  * \brief Set initial scores
  * \param init_score Initial scores, this class will manage memory for init_score.

@@ -141,8 +150,13 @@ public:
  * \brief Get weights, if not exists, will return nullptr
  * \return Pointer of weights
  */
-  inline const float* weights() const { return weights_.data(); }
+  inline const float* weights() const {
+    if (weights_.size() > 0) {
+      return weights_.data();
+    } else {
+      return nullptr;
+    }
+  }

/*!
* \brief Get data boundaries on queries, if not exists, will return nullptr

@@ -151,8 +165,13 @@ public:
  * is the data indices for query i.
  * \return Pointer of data boundaries on queries
  */
-  inline const data_size_t* query_boundaries() const { return query_boundaries_.data(); }
+  inline const data_size_t* query_boundaries() const {
+    if (query_boundaries_.size() > 0) {
+      return query_boundaries_.data();
+    } else {
+      return nullptr;
+    }
+  }

/*!
* \brief Get Number of queries

@@ -164,13 +183,25 @@ public:
  * \brief Get weights for queries, if not exists, will return nullptr
  * \return Pointer of weights for queries
  */
-  inline const float* query_weights() const { return query_weights_.data(); }
+  inline const float* query_weights() const {
+    if (query_weights_.size() > 0) {
+      return query_weights_.data();
+    } else {
+      return nullptr;
+    }
+  }

  /*!
  * \brief Get initial scores, if not exists, will return nullptr
  * \return Pointer of initial scores
  */
-  inline const float* init_score() const { return init_score_.data(); }
+  inline const float* init_score() const {
+    if (init_score_.size() > 0) {
+      return init_score_.data();
+    } else {
+      return nullptr;
+    }
+  }

  /*! \brief Disable copy */
  Metadata& operator=(const Metadata&) = delete;

@@ -210,6 +241,8 @@ private:
  std::vector<float> init_score_;
  /*! \brief Queries data */
  std::vector<data_size_t> queries_;
+  /*! \brief mutex for thread-safe calls */
+  std::mutex mutex_;
};

@@ -253,6 +286,27 @@ public:
  /*! \brief Destructor */
  ~Dataset();

+  bool CheckAlign(const Dataset& other) const {
+    if (num_features_ != other.num_features_) {
+      return false;
+    }
+    if (num_total_features_ != other.num_total_features_) {
+      return false;
+    }
+    if (num_class_ != other.num_class_) {
+      return false;
+    }
+    if (label_idx_ != other.label_idx_) {
+      return false;
+    }
+    for (int i = 0; i < num_features_; ++i) {
+      if (!features_[i]->CheckAlign(*(other.features_[i].get()))) {
+        return false;
+      }
+    }
+    return true;
+  }

  inline void PushOneRow(int tid, data_size_t row_idx, const std::vector<double>& feature_values) {
    for (size_t i = 0; i < feature_values.size() && i < static_cast<size_t>(num_total_features_); ++i) {
      int feature_idx = used_feature_map_[i];

@@ -282,6 +336,8 @@ public:
    }
  }

+  Dataset* Subset(const data_size_t* used_indices, data_size_t num_used_indices, bool is_enable_sparse) const;

  void FinishLoad();

  bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element);

@@ -348,12 +404,12 @@ private:
  int num_class_;
  /*! \brief Store some label level data */
  Metadata metadata_;
-  /*! \brief True if dataset is loaded from binary file */
-  bool is_loading_from_binfile_;
  /*! \brief index of label column */
  int label_idx_ = 0;
  /*! \brief store feature names */
  std::vector<std::string> feature_names_;
+  /*! \brief token of binary file */
+  static const char* binary_file_token;
};

}  // namespace LightGBM
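Dataset::Subset, together with Metadata's new subset Init, is the C++ backing for the row-subset workflow exposed by LGBM_DatasetGetSubset and used by the Python cv() code further down this page. A sketch of the Python-side call shape, as it appears in _make_n_folds in engine.py (the int32 dtype is an assumption about what the wrapper expects):

import numpy as np
# Keep only the listed rows; bin mappers are shared with the full dataset,
# which is what Dataset::CheckAlign can later verify.
fold_train = full_dataset.subset(np.array([0, 2, 4, 6], dtype=np.int32))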
include/LightGBM/dataset_loader.h

@@ -49,7 +49,7 @@ private:
  void ExtractFeaturesFromFile(const char* filename, const Parser* parser,
                               const std::vector<data_size_t>& used_data_indices, Dataset* dataset);

  /*! \brief Check can load from binary file */
-  bool CheckCanLoadFromBin(const char* filename);
+  std::string CheckCanLoadFromBin(const char* filename);

  const IOConfig& io_config_;
  /*! \brief Random generator*/
include/LightGBM/feature.h

@@ -63,6 +63,13 @@ public:
  ~Feature() {}

+  bool CheckAlign(const Feature& other) const {
+    if (feature_index_ != other.feature_index_) {
+      return false;
+    }
+    return bin_mapper_->CheckAlign(*(other.bin_mapper_.get()));
+  }

  /*!
  * \brief Push one record, will auto convert to bin and push to bin data
  * \param tid Thread id

@@ -73,6 +80,9 @@ public:
    unsigned int bin = bin_mapper_->ValueToBin(value);
    bin_data_->Push(tid, line_idx, bin);
  }

+  inline void PushBin(int tid, data_size_t line_idx, unsigned int bin) {
+    bin_data_->Push(tid, line_idx, bin);
+  }

  inline void FinishLoad() { bin_data_->FinishLoad(); }

  /*! \brief Index of this feature */
  inline int feature_index() const { return feature_index_; }
include/LightGBM/meta.h

@@ -24,7 +24,6 @@ using ReduceFunction = std::function<void(const char*, char*, int)>;

using PredictFunction =
  std::function<std::vector<double>(const std::vector<std::pair<int, double>>&)>;

-#define NO_LIMIT (-1)
#define NO_SPECIFIC (-1)

}  // namespace LightGBM
include/LightGBM/metric.h

@@ -24,8 +24,7 @@ public:
  * \param metadata Label data
  * \param num_data Number of data
  */
-  virtual void Init(const char* test_name, const Metadata& metadata, data_size_t num_data) = 0;
+  virtual void Init(const Metadata& metadata, data_size_t num_data) = 0;

  virtual const std::vector<std::string>& GetName() const = 0;
include/LightGBM/tree.h

@@ -101,10 +101,6 @@ public:
  /*! \brief Serialize this object by string*/
  std::string ToString();

  /*! \brief Disable copy */
  Tree& operator=(const Tree&) = delete;
  /*! \brief Disable copy */
  Tree(const Tree&) = delete;

private:
  /*!
  * \brief Find leaf index of which record belongs by data
include/LightGBM/utils/log.h

@@ -89,7 +89,11 @@ private:
  // a trick to use static variable in header file.
  // May be not good, but avoid to use an additional cpp file
-  static LogLevel& GetLevel() { static LogLevel level; return level; }
+#if defined(_MSC_VER)
+  static LogLevel& GetLevel() { static __declspec(thread) LogLevel level = LogLevel::Info; return level; }
+#else
+  static LogLevel& GetLevel() { static thread_local LogLevel level = LogLevel::Info; return level; }
+#endif
};
python-package/README.rst (new file, 0 → 100644)

LightGBM Python Package
=======================

Installation
------------

1. Follow the `Installation Guide <https://github.com/Microsoft/LightGBM/wiki/Installation-Guide>`__ to build first.
   For Windows users, please change the build config to ``DLL``.
2. Install with ``cd python-package; python setup.py install``

Note: Make sure you have `setuptools <https://pypi.python.org/pypi/setuptools>`__ installed.

Examples
--------

- Refer also to the walk-through examples in the `python-guide
  folder <https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide>`__
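After installing, a quick smoke test (a sketch; __version__ is set to 0.1 in the new __init__.py just below):

import lightgbm as lgb
print(lgb.__version__)  # 0.1 at this commit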
python-package/lightgbm/__init__.py (new file, 0 → 100644)

# coding: utf-8
"""LightGBM, Light Gradient Boosting Machine.

Contributors: https://github.com/Microsoft/LightGBM/graphs/contributors
"""
from __future__ import absolute_import

import os

from .basic import Predictor, Dataset, Booster
from .engine import train, cv

try:
    from .sklearn import LGBMModel, LGBMRegressor, LGBMClassifier, LGBMRanker
except ImportError:
    pass

__version__ = 0.1

__all__ = ['Dataset', 'Booster',
           'train', 'cv',
           'LGBMModel', 'LGBMRegressor', 'LGBMClassifier', 'LGBMRanker']
\ No newline at end of file
python-package/lightgbm/basic.py (new file, 0 → 100644)

This diff is collapsed (1262 additions).
python-package/lightgbm/callback.py (new file, 0 → 100644)

from __future__ import absolute_import

import collections


class EarlyStopException(Exception):
    """Exception of early stopping.

    Parameters
    ----------
    best_iteration : int
        The best iteration stopped.
    """
    def __init__(self, best_iteration):
        super(EarlyStopException, self).__init__()
        self.best_iteration = best_iteration


# Callback environment used by callbacks
CallbackEnv = collections.namedtuple(
    "LightGBMCallbackEnv",
    ["model",
     "cvfolds",
     "iteration",
     "begin_iteration",
     "end_iteration",
     "evaluation_result_list"])


def _format_eval_result(value, show_stdv=True):
    """format metric string"""
    if len(value) == 4:
        return '%s_%s:%g' % (value[0], value[1], value[2])
    elif len(value) == 5:
        if show_stdv:
            return '%s_%s:%g+%g' % (value[0], value[1], value[2], value[4])
        else:
            return '%s_%s:%g' % (value[0], value[1], value[2])
    else:
        raise ValueError("wrong metric value")


def print_evaluation(period=1, show_stdv=True):
    """Create a callback that prints the evaluation results.

    Parameters
    ----------
    period : int
        The period to log the evaluation results
    show_stdv : bool, optional
        Whether to show stdv if provided

    Returns
    -------
    callback : function
        A callback that prints evaluation every period iterations.
    """
    def callback(env):
        """internal function"""
        if len(env.evaluation_result_list) == 0 or period is False:
            return
        if env.iteration % period == 0 or env.iteration + 1 == env.begin_iteration:
            result = '\t'.join([_format_eval_result(x, show_stdv)
                                for x in env.evaluation_result_list])
            print('[%d]\t%s\n' % (env.iteration, result))
    return callback


def record_evaluation(eval_result):
    """Create a callback that records the evaluation history into eval_result.

    Parameters
    ----------
    eval_result : dict
        A dictionary to store the evaluation results.

    Returns
    -------
    callback : function
        The requested callback function.
    """
    if not isinstance(eval_result, dict):
        raise TypeError('eval_result has to be a dictionary')
    eval_result.clear()

    def init(env):
        """internal function"""
        for data_name, eval_name, _, _ in env.evaluation_result_list:
            if data_name not in eval_result:
                eval_result[data_name] = {}
            if eval_name not in eval_result[data_name]:
                eval_result[data_name][eval_name] = []

    def callback(env):
        """internal function"""
        if len(eval_result) == 0:
            init(env)
        for data_name, eval_name, result, _ in env.evaluation_result_list:
            eval_result[data_name][eval_name].append(result)
    return callback


def reset_learning_rate(learning_rates):
    """Reset learning rate after iteration 1
    NOTE: the initial learning rate will still take effect on the first iteration.

    Parameters
    ----------
    learning_rates: list or function
        List of learning rates for each boosting round
        or a customized function that calculates learning_rate in terms of
        the current round number and the total number of boosting rounds (e.g. yields
        learning rate decay)
        - list l: learning_rate = l[current_round]
        - function f: learning_rate = f(current_round, total_boost_round)

    Returns
    -------
    callback : function
        The requested callback function.
    """
    def callback(env):
        """internal function"""
        booster = env.model
        i = env.iteration
        if isinstance(learning_rates, list):
            if len(learning_rates) != env.end_iteration:
                raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.")
            booster.reset_parameter({'learning_rate': learning_rates[i]})
        else:
            booster.reset_parameter({'learning_rate': learning_rates(i, env.end_iteration)})
    callback.before_iteration = True
    return callback


def early_stop(stopping_rounds, verbose=True):
    """Create a callback that activates early stopping.
    Requires at least one validation data and one metric.
    If there's more than one, will check all of them.

    Parameters
    ----------
    stopping_rounds : int
        The number of rounds without improvement before stopping.
    verbose : optional, bool
        Whether to print a message about early stopping information.

    Returns
    -------
    callback : function
        The requested callback function.
    """
    factor_to_bigger_better = {}
    best_score = {}
    best_iter = {}
    best_msg = {}

    def init(env):
        """internal function"""
        bst = env.model
        if len(env.evaluation_result_list) == 0:
            raise ValueError('For early stopping you need at least one set in evals.')
        if verbose:
            msg = "Will train until hasn't improved in {} rounds.\n"
            print(msg.format(stopping_rounds))
        for i in range(len(env.evaluation_result_list)):
            best_score[i] = float('-inf')
            best_iter[i] = 0
            if verbose:
                best_msg[i] = ""
            factor_to_bigger_better[i] = -1.0
            if env.evaluation_result_list[i][3]:
                factor_to_bigger_better[i] = 1.0

    def callback(env):
        """internal function"""
        if len(best_score) == 0:
            init(env)
        for i in range(len(env.evaluation_result_list)):
            score = env.evaluation_result_list[i][2] * factor_to_bigger_better[i]
            if score > best_score[i]:
                best_score[i] = score
                best_iter[i] = env.iteration
                if verbose:
                    best_msg[i] = '[%d]\t%s' % (
                        env.iteration,
                        '\t'.join([_format_eval_result(x) for x in env.evaluation_result_list]))
            else:
                if env.iteration - best_iter[i] >= stopping_rounds:
                    if env.model is not None:
                        env.model.set_attr(best_iteration=str(best_iter[i]))
                    if verbose:
                        print('early stopping, best message is:\n{} '.format(best_msg[i]))
                    raise EarlyStopException(best_iter[i])
    return callback
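engine.train (next file) turns its legacy keyword arguments into exactly these callbacks. A sketch of driving them from the public entry point, reusing x_train/x_test from the earlier example; the parameter values ('binary', 'binary_logloss') are illustrative and assumed valid at this commit:

import lightgbm as lgb

history = {}
booster = lgb.train(
    {'objective': 'binary', 'metric': 'binary_logloss'},
    (x_train, y_train),                        # train_data is an (X, y) pair in this API
    num_boost_round=50,
    valid_datas=[(x_test, y_test)],
    valid_names=['eval'],
    early_stopping_rounds=5,                   # becomes callback.early_stop(...)
    evals_result=history,                      # becomes callback.record_evaluation(...)
    learning_rates=lambda i, n: 0.1 * (0.99 ** i))  # becomes callback.reset_learning_rate(...)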
python-package/lightgbm/engine.py
0 → 100644
View file @
16d1853d
"""Training Library containing training routines of LightGBM."""
from
__future__
import
absolute_import
import
numpy
as
np
from
.basic
import
LightGBMError
,
Predictor
,
Dataset
,
Booster
,
is_str
from
.
import
callback
def
_construct_dataset
(
X_y
,
reference
=
None
,
params
=
None
,
other_fields
=
None
,
predictor
=
None
):
if
'max_bin'
in
params
:
max_bin
=
int
(
params
[
'max_bin'
])
else
:
max_bin
=
255
weight
=
None
group
=
None
init_score
=
None
if
other_fields
is
not
None
:
if
not
isinstance
(
other_fields
,
dict
):
raise
TypeError
(
"other filed data should be dict type"
)
weight
=
None
if
'weight'
not
in
other_fields
else
other_fields
[
'weight'
]
group
=
None
if
'group'
not
in
other_fields
else
other_fields
[
'group'
]
init_score
=
None
if
'init_score'
not
in
other_fields
else
other_fields
[
'init_score'
]
if
is_str
(
X_y
):
data
=
X_y
label
=
None
else
:
if
len
(
X_y
)
!=
2
:
raise
TypeError
(
"should pass (data, label) pair"
)
data
=
X_y
[
0
]
label
=
X_y
[
1
]
if
reference
is
None
:
ret
=
Dataset
(
data
,
label
=
label
,
max_bin
=
max_bin
,
weight
=
weight
,
group
=
group
,
predictor
=
predictor
,
params
=
params
)
else
:
ret
=
reference
.
create_valid
(
data
,
label
=
label
,
weight
=
weight
,
group
=
group
,
params
=
params
)
if
init_score
is
not
None
:
ret
.
set_init_score
(
init_score
)
return
ret
def
train
(
params
,
train_data
,
num_boost_round
=
100
,
valid_datas
=
None
,
valid_names
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
train_fields
=
None
,
valid_fields
=
None
,
early_stopping_rounds
=
None
,
evals_result
=
None
,
verbose_eval
=
True
,
learning_rates
=
None
,
callbacks
=
None
):
"""Train with given parameters.
Parameters
----------
params : dict
params.
train_data : pair, (X, y) or filename of data
Data to be trained.
num_boost_round: int
Number of boosting iterations.
valid_datas: list of pairs (valid_X, valid_y) or filename of data
List of data to be evaluated during training
valid_names: list of string
names of valid_datas
fobj : function
Customized objective function.
feval : function
Customized evaluation function.
Note: should return (eval_name, eval_result, is_higher_better) of list of this
init_model : file name of lightgbm model or 'Booster' instance
model used for continued train
train_fields : dict
other data file in training data. e.g. train_fields['weight'] is weight data
support fields: weight, group, init_score
valid_fields : dict
other data file in training data. e.g. valid_fields[0]['weight'] is weight data for first valid data
support fields: weight, group, init_score
early_stopping_rounds: int
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will add 'best_iteration' field
evals_result: dict or None
This dictionary used to store all evaluation results of all the items in valid_datas.
Example: with a valid_datas containing [valid_set, train_set] and valid_names containing ['eval', 'train'] and
a paramater containing ('metric':'logloss')
Returns: {'train': {'logloss': ['0.48253', '0.35953', ...]},
'eval': {'logloss': ['0.480385', '0.357756', ...]}}
passed with None means no using this function
verbose_eval : bool or int
Requires at least one item in evals.
If `verbose_eval` is True then the evaluation metric on the validation set is
printed at each boosting stage.
If `verbose_eval` is an integer then the evaluation metric on the validation set
is printed at every given `verbose_eval` boosting stage. The last boosting stage
/ the boosting stage found by using `early_stopping_rounds` is also printed.
Example: with verbose_eval=4 and at least one item in evals, an evaluation metric
is printed every 4 boosting stages, instead of every boosting stage.
learning_rates: list or function
List of learning rate for each boosting round
or a customized function that calculates learning_rate in terms of
current number of round and the total number of boosting round (e.g. yields
learning rate decay)
- list l: learning_rate = l[current_round]
- function f: learning_rate = f(current_round, total_boost_round)
callbacks : list of callback functions
List of callback functions that are applied at end of each iteration.
Returns
-------
booster : a trained booster model
"""
"""create predictor first"""
if
is_str
(
init_model
):
predictor
=
Predictor
(
model_file
=
init_model
)
elif
isinstance
(
init_model
,
Booster
):
predictor
=
init_model
.
to_predictor
()
elif
isinstance
(
init_model
,
Predictor
):
predictor
=
init_model
else
:
predictor
=
None
"""create dataset"""
train_set
=
_construct_dataset
(
train_data
,
None
,
params
,
train_fields
,
predictor
)
is_valid_contain_train
=
False
train_data_name
=
"training"
valid_sets
=
[]
name_valid_sets
=
[]
if
valid_datas
is
not
None
:
for
i
in
range
(
len
(
valid_datas
)):
other_fields
=
None
if
valid_fields
is
None
else
valid_fields
[
i
]
"""reduce cost for prediction training data"""
if
valid_datas
[
i
]
is
train_data
:
is_valid_contain_train
=
True
if
valid_names
is
not
None
:
train_data_name
=
valid_names
[
i
]
continue
valid_set
=
_construct_dataset
(
valid_datas
[
i
],
train_set
,
params
,
other_fields
,
predictor
)
valid_sets
.
append
(
valid_set
)
if
valid_names
is
not
None
:
name_valid_sets
.
append
(
valid_names
[
i
])
else
:
name_valid_sets
.
append
(
'valid_'
+
str
(
i
))
"""process callbacks"""
callbacks
=
[]
if
callbacks
is
None
else
callbacks
# Most of legacy advanced options becomes callbacks
if
isinstance
(
verbose_eval
,
bool
)
and
verbose_eval
:
callbacks
.
append
(
callback
.
print_evaluation
())
else
:
if
isinstance
(
verbose_eval
,
int
):
callbacks
.
append
(
callback
.
print_evaluation
(
verbose_eval
))
if
early_stopping_rounds
is
not
None
:
callbacks
.
append
(
callback
.
early_stop
(
early_stopping_rounds
,
verbose
=
bool
(
verbose_eval
)))
if
learning_rates
is
not
None
:
callbacks
.
append
(
callback
.
reset_learning_rate
(
learning_rates
))
if
evals_result
is
not
None
:
callbacks
.
append
(
callback
.
record_evaluation
(
evals_result
))
callbacks_before_iter
=
[
cb
for
cb
in
callbacks
if
cb
.
__dict__
.
get
(
'before_iteration'
,
False
)]
callbacks_after_iter
=
[
cb
for
cb
in
callbacks
if
not
cb
.
__dict__
.
get
(
'before_iteration'
,
False
)]
"""construct booster"""
if
'metric'
in
params
:
if
is_str
(
params
[
'metric'
]):
params
[
'metric'
]
=
params
[
'metric'
].
split
(
','
)
else
:
params
[
'metric'
]
=
list
(
params
[
'metric'
])
booster
=
Booster
(
params
=
params
,
train_set
=
train_set
)
if
is_valid_contain_train
:
booster
.
set_train_data_name
(
train_data_name
)
for
i
in
range
(
len
(
valid_sets
)):
booster
.
add_valid
(
valid_sets
[
i
],
name_valid_sets
[
i
])
"""start training"""
for
i
in
range
(
num_boost_round
):
for
cb
in
callbacks_before_iter
:
cb
(
callback
.
CallbackEnv
(
model
=
booster
,
cvfolds
=
None
,
iteration
=
i
,
begin_iteration
=
0
,
end_iteration
=
num_boost_round
,
evaluation_result_list
=
None
))
booster
.
update
(
fobj
=
fobj
)
evaluation_result_list
=
[]
# check evaluation result.
if
len
(
valid_sets
)
!=
0
:
if
is_valid_contain_train
:
evaluation_result_list
.
extend
(
booster
.
eval_train
(
feval
))
evaluation_result_list
.
extend
(
booster
.
eval_valid
(
feval
))
try
:
for
cb
in
callbacks_after_iter
:
cb
(
callback
.
CallbackEnv
(
model
=
booster
,
cvfolds
=
None
,
iteration
=
i
,
begin_iteration
=
0
,
end_iteration
=
num_boost_round
,
evaluation_result_list
=
evaluation_result_list
))
except
callback
.
EarlyStopException
:
break
if
booster
.
attr
(
'best_iteration'
)
is
not
None
:
booster
.
best_iteration
=
int
(
booster
.
attr
(
'best_iteration'
))
else
:
booster
.
best_iteration
=
num_boost_round
-
1
return
booster
class
CVBooster
(
object
):
""""Auxiliary datastruct to hold one fold of CV."""
def
__init__
(
self
,
train_set
,
valid_test
,
params
):
""""Initialize the CVBooster"""
self
.
train_set
=
train_set
self
.
valid_test
=
valid_test
self
.
booster
=
Booster
(
params
=
params
,
train_set
=
train_set
)
self
.
booster
.
add_valid
(
valid_test
,
'valid'
)
def
update
(
self
,
fobj
):
""""Update the boosters for one iteration"""
self
.
booster
.
update
(
fobj
=
fobj
)
def
eval
(
self
,
feval
):
""""Evaluate the CVBooster for one iteration."""
return
self
.
booster
.
eval_valid
(
feval
)
try
:
try
:
from
sklearn.model_selection
import
KFold
,
StratifiedKFold
except
ImportError
:
from
sklearn.cross_validation
import
KFold
,
StratifiedKFold
SKLEARN_StratifiedKFold
=
True
except
ImportError
:
SKLEARN_StratifiedKFold
=
False
def
_make_n_folds
(
full_data
,
nfold
,
param
,
seed
,
fpreproc
=
None
,
stratified
=
False
):
"""
Make an n-fold list of CVBooster from random indices.
"""
np
.
random
.
seed
(
seed
)
if
stratified
:
if
SKLEARN_StratifiedKFold
:
sfk
=
StratifiedKFold
(
n_splits
=
nfold
,
shuffle
=
True
,
random_state
=
seed
)
idset
=
[
x
[
1
]
for
x
in
sfk
.
split
(
X
=
full_data
.
get_label
(),
y
=
full_data
.
get_label
())]
else
:
raise
LightGBMError
(
'sklearn needs to be installed in order to use stratified cv'
)
else
:
randidx
=
np
.
random
.
permutation
(
full_data
.
num_data
())
kstep
=
int
(
len
(
randidx
)
/
nfold
)
idset
=
[
randidx
[(
i
*
kstep
):
min
(
len
(
randidx
),
(
i
+
1
)
*
kstep
)]
for
i
in
range
(
nfold
)]
ret
=
[]
for
k
in
range
(
nfold
):
train_set
=
full_data
.
subset
(
np
.
concatenate
([
idset
[
i
]
for
i
in
range
(
nfold
)
if
k
!=
i
]))
valid_set
=
full_data
.
subset
(
idset
[
k
])
# run preprocessing on the data set if needed
if
fpreproc
is
not
None
:
train_set
,
valid_set
,
tparam
=
fpreproc
(
train_set
,
valid_set
,
param
.
copy
())
else
:
tparam
=
param
ret
.
append
(
CVBooster
(
train_set
,
valid_set
,
tparam
))
return
ret
def
_agg_cv_result
(
raw_results
):
# pylint: disable=invalid-name
"""
Aggregate cross-validation results.
"""
cvmap
=
{}
metric_type
=
{}
for
one_result
in
raw_results
:
for
one_line
in
one_result
:
key
=
one_line
[
1
]
metric_type
[
key
]
=
one_line
[
3
]
if
key
not
in
cvmap
:
cvmap
[
key
]
=
[]
cvmap
[
key
].
append
(
one_line
[
2
])
results
=
[]
for
k
,
v
in
cvmap
.
items
():
v
=
np
.
array
(
v
)
mean
,
std
=
np
.
mean
(
v
),
np
.
std
(
v
)
results
.
append
((
'cv_agg'
,
k
,
mean
,
metric_type
[
k
],
std
))
return
results
def cv(params, train_data, num_boost_round=10, nfold=5, stratified=False,
       metrics=(), fobj=None, feval=None, train_fields=None,
       early_stopping_rounds=None, fpreproc=None, verbose_eval=None,
       show_stdv=True, seed=0, callbacks=None):
    # pylint: disable = invalid-name
    """Cross-validation with the given parameters.

    Parameters
    ----------
    params : dict
        Booster params.
    train_data : (X, y) pair or filename of data
        Data to be trained on.
    num_boost_round : int
        Number of boosting iterations.
    nfold : int
        Number of folds in CV.
    stratified : bool
        Whether to perform stratified sampling.
    metrics : string or list of strings
        Evaluation metrics to be watched in CV.
    fobj : function
        Custom objective function.
    feval : function
        Custom evaluation function.
    train_fields : dict
        Other data fields in the training data, e.g. train_fields['weight'] is the weight data.
        Supported fields: weight, group, init_score.
    early_stopping_rounds : int
        Activates early stopping. CV error needs to decrease at least
        every <early_stopping_rounds> round(s) to continue.
        The last entry in the evaluation history is the one from the best iteration.
    fpreproc : function
        Preprocessing function that takes (dtrain, dtest, param) and returns
        transformed versions of those.
    verbose_eval : bool, int, or None, default None
        Whether to display the progress. If None, progress will be displayed
        when np.ndarray is returned. If True, progress will be displayed at every
        boosting stage. If an integer is given, progress will be displayed
        at every `verbose_eval` boosting stages.
    show_stdv : bool, default True
        Whether to display the standard deviation in progress.
        Results are not affected; the returned history always contains std.
    seed : int
        Seed used to generate the folds (passed to numpy.random.seed).
    callbacks : list of callback functions
        List of callback functions that are applied at the end of each iteration.

    Returns
    -------
    evaluation history : dict(string: list)
        Maps '<metric>-mean' and '<metric>-std' to per-iteration values.
    """
    if isinstance(metrics, str):
        metrics = [metrics]
    if isinstance(params, list):
        params = dict(params)
    if 'metric' not in params:
        params['metric'] = []
    elif is_str(params['metric']):
        params['metric'] = params['metric'].split(',')
    else:
        params['metric'] = list(params['metric'])
    if metrics is not None and len(metrics) > 0:
        params['metric'].extend(metrics)

    train_set = _construct_dataset(train_data, None, params, train_fields)

    results = {}
    cvfolds = _make_n_folds(train_set, nfold, params, seed, fpreproc, stratified)

    # setup callbacks
    callbacks = [] if callbacks is None else callbacks
    if early_stopping_rounds is not None:
        callbacks.append(callback.early_stop(early_stopping_rounds, verbose=False))
    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation(show_stdv=show_stdv))
    elif isinstance(verbose_eval, int):
        callbacks.append(callback.print_evaluation(verbose_eval, show_stdv=show_stdv))

    callbacks_before_iter = [cb for cb in callbacks if cb.__dict__.get('before_iteration', False)]
    callbacks_after_iter = [cb for cb in callbacks if not cb.__dict__.get('before_iteration', False)]

    for i in range(num_boost_round):
        for cb in callbacks_before_iter:
            cb(callback.CallbackEnv(model=None,
                                    cvfolds=cvfolds,
                                    iteration=i,
                                    begin_iteration=0,
                                    end_iteration=num_boost_round,
                                    evaluation_result_list=None))
        for fold in cvfolds:
            fold.update(fobj)
        res = _agg_cv_result([f.eval(feval) for f in cvfolds])
        for _, key, mean, _, std in res:
            if key + '-mean' not in results:
                results[key + '-mean'] = []
            if key + '-std' not in results:
                results[key + '-std'] = []
            results[key + '-mean'].append(mean)
            results[key + '-std'].append(std)
        try:
            for cb in callbacks_after_iter:
                cb(callback.CallbackEnv(model=None,
                                        cvfolds=cvfolds,
                                        iteration=i,
                                        begin_iteration=0,
                                        end_iteration=num_boost_round,
                                        evaluation_result_list=res))
        except callback.EarlyStopException as e:
            for k in results.keys():
                results[k] = results[k][:(e.best_iteration + 1)]
            break
    return results
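# A minimal usage sketch of cv() on synthetic data; parameter values are
# illustrative only, assuming X and y as a numpy feature matrix and label vector.
if __name__ == '__main__':
    X = np.random.rand(500, 10)
    y = np.random.rand(500)
    history = cv({'objective': 'regression', 'metric': 'l2'}, (X, y),
                 num_boost_round=20, nfold=5, seed=42)
    print(history['l2-mean'][-1], history['l2-std'][-1])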
python-package/lightgbm/libpath.py
0 → 100644
View file @
16d1853d
import os
import platform
import sys


def find_lib_path():
    """Find the path to the LightGBM library files.

    Returns
    -------
    lib_path: list(string)
        List of all found paths to the LightGBM library.
    """
    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    dll_path = [curr_path,
                os.path.join(curr_path, '../../lib/'),
                os.path.join(curr_path, '../../'),
                os.path.join(curr_path, './lib/'),
                os.path.join(sys.prefix, 'lightgbm')]
    if os.name == 'nt':
        dll_path.append(os.path.join(curr_path, '../../windows/x64/Dll/'))
        dll_path.append(os.path.join(curr_path, './windows/x64/Dll/'))
        dll_path = [os.path.join(p, 'lib_lightgbm.dll') for p in dll_path]
    else:
        dll_path = [os.path.join(p, 'lib_lightgbm.so') for p in dll_path]
    lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
    if not lib_path:
        raise Exception('Cannot find the lightgbm library')
    return lib_path
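# A minimal sketch of consuming find_lib_path() with ctypes, similar to what
# the basic module is expected to do (an assumption; that code lives in basic.py).
if __name__ == '__main__':
    import ctypes
    _LIB = ctypes.cdll.LoadLibrary(find_lib_path()[0])
    print('loaded', _LIB)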
python-package/lightgbm/sklearn.py
0 → 100644
View file @
16d1853d
"""Scikit-Learn Wrapper interface for LightGBM."""
from
__future__
import
absolute_import
import
numpy
as
np
from
.basic
import
LightGBMError
,
Predictor
,
Dataset
,
Booster
,
is_str
from
.engine
import
train
# sklearn
try
:
from
sklearn.base
import
BaseEstimator
from
sklearn.base
import
RegressorMixin
,
ClassifierMixin
from
sklearn.preprocessing
import
LabelEncoder
SKLEARN_INSTALLED
=
True
LGBMModelBase
=
BaseEstimator
LGBMRegressorBase
=
RegressorMixin
LGBMClassifierBase
=
ClassifierMixin
LGBMLabelEncoder
=
LabelEncoder
except
ImportError
:
SKLEARN_INSTALLED
=
False
LGBMModelBase
=
object
LGBMClassifierBase
=
object
LGBMRegressorBase
=
object
LGBMLabelEncoder
=
None
def _point_wise_objective(func):
    """Decorate a point-wise objective function.

    Note: for the multi-class task, y_pred is grouped by class_id first, then by row_id.
    If you want the i-th row's prediction for the j-th class, access
    y_pred[j * num_data + i], and grad and hess should be grouped in the same way.

    Parameters
    ----------
    func: callable
        Expects a callable with signature ``func(y_true, y_pred)``:

        y_true: array_like of shape [n_samples]
            The target values.
        y_pred: array_like of shape [n_samples] or shape [n_samples * n_class] (for multi-class)
            The predicted values.

    Returns
    -------
    new_func: callable
        The new objective function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

        preds: array_like of shape [n_samples] or shape [n_samples * n_class]
            The predicted values.
        dataset: ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``.
    """
    def inner(preds, dataset):
        """Internal function."""
        labels = dataset.get_label()
        grad, hess = func(labels, preds)
        # apply sample weights to the objective, if any
        weight = dataset.get_weight()
        if weight is not None:
            if len(weight) == len(grad):
                # only one class
                grad = np.multiply(grad, weight)
                hess = np.multiply(hess, weight)
            else:
                num_data = len(weight)
                num_class = len(grad) // num_data
                if num_class * num_data != len(grad):
                    raise ValueError("length of grad and hess should equal num_class * num_data")
                for k in range(num_class):
                    for i in range(num_data):
                        idx = k * num_data + i
                        grad[idx] *= weight[i]
                        hess[idx] *= weight[i]
        return grad, hess
    return inner
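# A minimal sketch of a custom point-wise objective for regression, assuming
# preds are the model's raw scores; a hypothetical example, not part of the API.
def _squared_loss(y_true, y_pred):
    grad = y_pred - y_true          # dL/dy_pred for L = 0.5 * (y_pred - y_true)^2
    hess = np.ones_like(y_pred)     # d2L/dy_pred^2 is constant
    return grad, hess

# Passing the callable makes LGBMModel wrap it with _point_wise_objective:
# model = LGBMRegressor(objective=_squared_loss)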
class LGBMModel(LGBMModelBase):
    """Implementation of the scikit-learn API for LightGBM.

    Parameters
    ----------
    num_leaves : int
        Maximum number of tree leaves for base learners.
    max_depth : int
        Maximum tree depth for base learners, -1 means no limit.
    learning_rate : float
        Boosting learning rate.
    n_estimators : int
        Number of boosted trees to fit.
    max_bin : int
        Maximum number of discrete bins for features.
    silent : boolean
        Whether to print messages while running boosting.
    objective : string or callable
        Specify the learning task and the corresponding learning objective or
        a custom objective function to be used (see note below).
    nthread : int
        Number of parallel threads.
    min_split_gain : float
        Minimum loss reduction required to make a further partition on a leaf node of the tree.
    min_child_weight : int
        Minimum sum of instance weight (hessian) needed in a child (leaf).
    min_child_samples : int
        Minimum number of data points needed in a child (leaf).
    subsample : float
        Subsample ratio of the training instances.
    subsample_freq : int
        Frequency of subsampling; <=0 means subsampling is disabled.
    colsample_bytree : float
        Subsample ratio of columns when constructing each tree.
    reg_alpha : float
        L1 regularization term on weights.
    reg_lambda : float
        L2 regularization term on weights.
    scale_pos_weight : float
        Balancing of positive and negative weights.
    is_unbalance : bool
        Whether the training data are unbalanced (binary classification only).
    seed : int
        Random number seed.

    Note
    ----
    A custom objective function can be provided for the ``objective``
    parameter. In this case, it should have the signature
    ``objective(y_true, y_pred) -> grad, hess``:

        y_true: array_like of shape [n_samples]
            The target values.
        y_pred: array_like of shape [n_samples] or shape [n_samples * n_class]
            The predicted values.
        grad: array_like of shape [n_samples] or shape [n_samples * n_class]
            The value of the gradient for each sample point.
        hess: array_like of shape [n_samples] or shape [n_samples * n_class]
            The value of the second derivative for each sample point.

    For the multi-class task, y_pred is grouped by class_id first, then by row_id.
    If you want the i-th row's prediction for the j-th class, access
    y_pred[j * num_data + i], and grad and hess should be grouped in the same way.
    """
    def __init__(self, num_leaves=31, max_depth=-1,
                 learning_rate=0.1, n_estimators=10, max_bin=255,
                 silent=True, objective="regression",
                 nthread=-1, min_split_gain=0, min_child_weight=5, min_child_samples=10,
                 subsample=1, subsample_freq=1, colsample_bytree=1,
                 reg_alpha=0, reg_lambda=0, scale_pos_weight=1,
                 is_unbalance=False, seed=0):
        if not SKLEARN_INSTALLED:
            raise LightGBMError('sklearn needs to be installed in order to use this module')
        self.num_leaves = num_leaves
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_bin = max_bin
        self.silent = silent
        self.objective = objective
        self.nthread = nthread
        self.min_split_gain = min_split_gain
        self.min_child_weight = min_child_weight
        self.min_child_samples = min_child_samples
        self.subsample = subsample
        self.subsample_freq = subsample_freq
        self.colsample_bytree = colsample_bytree
        self.reg_alpha = reg_alpha
        self.reg_lambda = reg_lambda
        self.scale_pos_weight = scale_pos_weight
        self.is_unbalance = is_unbalance
        self.seed = seed
        self._Booster = None
        if callable(self.objective):
            self.fobj = _point_wise_objective(self.objective)
        else:
            self.fobj = None
    def booster(self):
        """Get the underlying lightgbm Booster of this model.

        This will raise an exception if fit has not been called.

        Returns
        -------
        booster : the lightgbm Booster of the underlying model
        """
        if self._Booster is None:
            raise LightGBMError('need to call fit beforehand')
        return self._Booster

    def get_params(self, deep=False):
        """Get parameters."""
        params = super(LGBMModel, self).get_params(deep=deep)
        params['verbose'] = 0 if self.silent else 1
        if self.nthread <= 0:
            params.pop('nthread', None)
        return params
    def fit(self, X, y, eval_set=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            train_fields=None, valid_fields=None, other_params=None):
        """
        Fit the gradient boosting model.

        Parameters
        ----------
        X : array_like
            Feature matrix.
        y : array_like
            Labels.
        eval_set : list, optional
            A list of (X, y) tuple pairs to use as a validation set for early stopping.
        eval_metric : str, list of str, callable, optional
            If a str, should be a built-in evaluation metric to use.
            If callable, a custom evaluation metric. The call signature is
            func(y_predicted, dataset), where dataset is a Dataset object, so
            you may need to call its get_label method. It must return
            (eval_name->str, eval_result->float, is_bigger_better->bool).
        early_stopping_rounds : int
            Activates early stopping on the validation set.
        verbose : bool
            If `verbose` and an evaluation set is used, writes the evaluation progress.
        train_fields : dict
            Other data fields in the training data, e.g. train_fields['weight'] is the weight data.
            Supported fields: weight, group, init_score.
        valid_fields : list of dict
            Other data fields in the validation data, e.g. valid_fields[0]['weight'] is the
            weight data of the first validation set.
            Supported fields: weight, group, init_score.
        other_params : dict
            Other parameters.
        """
        evals_result = {}
        params = self.get_params()
        if other_params is not None:
            params.update(other_params)

        if self.fobj:
            params["objective"] = "None"
        else:
            params["objective"] = self.objective

        if callable(eval_metric):
            feval = eval_metric
        elif is_str(eval_metric) or isinstance(eval_metric, list):
            feval = None
            params.update({'metric': eval_metric})
        else:
            feval = None

        self._Booster = train(params, (X, y), self.n_estimators,
                              valid_datas=eval_set,
                              early_stopping_rounds=early_stopping_rounds,
                              evals_result=evals_result, fobj=self.fobj, feval=feval,
                              verbose_eval=verbose, train_fields=train_fields,
                              valid_fields=valid_fields)

        if evals_result:
            for val in evals_result.items():
                evals_result_key = list(val[1].keys())[0]
                evals_result[val[0]][evals_result_key] = val[1][evals_result_key]
            self.evals_result_ = evals_result

        if early_stopping_rounds is not None:
            self.best_iteration = self._Booster.best_iteration
        return self
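    # A minimal sketch of a custom eval_metric, following the signature documented
    # above; 'rmse' here is an illustrative name, not a built-in alias:
    #
    #     def rmse(y_predicted, dataset):
    #         y_true = dataset.get_label()
    #         return 'rmse', np.sqrt(np.mean((y_predicted - y_true) ** 2)), False
    #
    #     model.fit(X, y, eval_set=[(X_val, y_val)], eval_metric=rmse)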
    def predict(self, data, raw_score=False, num_iteration=0):
        return self.booster().predict(data,
                                      raw_score=raw_score,
                                      num_iteration=num_iteration)
    def apply(self, X, num_iteration=0):
        """Return the predicted leaf of every tree for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input feature matrix.
        num_iteration : int
            Limit the number of trees used in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]
        """
        return self.booster().predict(X,
                                      pred_leaf=True,
                                      num_iteration=num_iteration)
    def evals_result(self):
        """Return the evaluation results.

        Returns
        -------
        evals_result : dictionary
        """
        if self.evals_result_:
            evals_result = self.evals_result_
        else:
            raise LightGBMError('No results.')
        return evals_result
class LGBMRegressor(LGBMModel, LGBMRegressorBase):
    __doc__ = """Implementation of the scikit-learn API for LightGBM regression.
    """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])
class LGBMClassifier(LGBMModel, LGBMClassifierBase):
    __doc__ = """Implementation of the scikit-learn API for LightGBM classification.
    """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])

    def fit(self, X, y, eval_set=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            train_fields=None, valid_fields=None, other_params=None):
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        if other_params is None:
            other_params = {}
        if self.n_classes_ > 2:
            # Switch to using a multiclass objective in the underlying LGBM instance
            self.objective = "multiclass"
            other_params['num_class'] = self.n_classes_
        else:
            self.objective = "binary"
        self._le = LGBMLabelEncoder().fit(y)
        training_labels = self._le.transform(y)
        if eval_set is not None:
            eval_set = list((x[0], self._le.transform(x[1])) for x in eval_set)
        super(LGBMClassifier, self).fit(X, training_labels, eval_set, eval_metric,
                                        early_stopping_rounds, verbose,
                                        train_fields, valid_fields, other_params)
        return self
    def predict(self, data, raw_score=False, num_iteration=0):
        class_probs = self.booster().predict(data,
                                             raw_score=raw_score,
                                             num_iteration=num_iteration)
        if len(class_probs.shape) > 1:
            column_indexes = np.argmax(class_probs, axis=1)
        else:
            column_indexes = np.repeat(0, class_probs.shape[0])
            column_indexes[class_probs > 0.5] = 1
        return self._le.inverse_transform(column_indexes)
    def predict_proba(self, data, raw_score=False, num_iteration=0):
        class_probs = self.booster().predict(data,
                                             raw_score=raw_score,
                                             num_iteration=num_iteration)
        if self.n_classes_ > 2:
            return class_probs
        else:
            classone_probs = class_probs
            classzero_probs = 1.0 - classone_probs
            return np.vstack((classzero_probs, classone_probs)).transpose()
def _group_wise_objective(func):
    """Decorate a group-wise objective function for ranking tasks.

    Parameters
    ----------
    func: callable
        Expects a callable with signature ``func(y_true, group, y_pred)``:

        y_true: array_like of shape [n_samples]
            The target values.
        group : array_like
            Group size data of the dataset.
        y_pred: array_like of shape [n_samples] or shape [n_samples * n_class] (for multi-class)
            The predicted values.

    Returns
    -------
    new_func: callable
        The new objective function as expected by ``lightgbm.engine.train``.
        The signature is ``new_func(preds, dataset)``:

        preds: array_like of shape [n_samples] or shape [n_samples * n_class]
            The predicted values.
        dataset: ``dataset``
            The training set from which the labels will be extracted using
            ``dataset.get_label()``.
    """
    def inner(preds, dataset):
        """Internal function."""
        labels = dataset.get_label()
        group = dataset.get_group()
        if group is None:
            raise ValueError("group should not be None for ranking task")
        grad, hess = func(labels, group, preds)
        # apply sample weights to the objective, if any
        weight = dataset.get_weight()
        if weight is not None:
            # ranking has only one class
            if len(weight) == len(grad):
                grad = np.multiply(grad, weight)
                hess = np.multiply(hess, weight)
            else:
                raise ValueError("length of grad and hess should equal num_data")
        return grad, hess
    return inner
class LGBMRanker(LGBMModel):
    __doc__ = """Implementation of the scikit-learn API for the LightGBM ranking application.
    """ + '\n'.join(LGBMModel.__doc__.split('\n')[2:])

    def fit(self, X, y, eval_set=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            train_fields=None, valid_fields=None, other_params=None):
        # check group data
        if "group" not in train_fields:
            raise ValueError("should set group in train_fields for ranking task")
        if eval_set is not None:
            if valid_fields is None:
                raise ValueError("valid_fields cannot be None when eval_set is not None")
            elif len(valid_fields) != len(eval_set):
                raise ValueError("length of valid_fields should equal that of eval_set")
            else:
                for inner in valid_fields:
                    if "group" not in inner:
                        raise ValueError("should set group in valid_fields for ranking task")
        if callable(self.objective):
            self.fobj = _group_wise_objective(self.objective)
        else:
            self.objective = "lambdarank"
            self.fobj = None
        super(LGBMRanker, self).fit(X, y, eval_set, eval_metric,
                                    early_stopping_rounds, verbose,
                                    train_fields, valid_fields, other_params)
        return self
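# A minimal usage sketch of LGBMRanker with synthetic data: two query groups of
# sizes 10 and 15 covering all 25 rows, with random integer relevance labels.
# This illustrates the train_fields group convention above; values are made up.
if __name__ == '__main__':
    X_rank = np.random.rand(25, 5)
    y_rank = np.random.randint(0, 4, size=25)
    ranker = LGBMRanker(n_estimators=5)
    ranker.fit(X_rank, y_rank, train_fields={'group': [10, 15]})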