Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
422c0ef7
Commit
422c0ef7
authored
Nov 23, 2016
by
Guolin Ke
Browse files
almost finish, need some tests
parent
fc383361
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
564 additions
and
190 deletions
+564
-190
include/LightGBM/boosting.h
include/LightGBM/boosting.h
+3
-3
include/LightGBM/c_api.h
include/LightGBM/c_api.h
+27
-26
include/LightGBM/config.h
include/LightGBM/config.h
+1
-1
python-package/lightgbm/basic.py
python-package/lightgbm/basic.py
+418
-81
src/application/application.cpp
src/application/application.cpp
+2
-2
src/boosting/dart.hpp
src/boosting/dart.hpp
+11
-4
src/boosting/gbdt.cpp
src/boosting/gbdt.cpp
+16
-16
src/boosting/gbdt.h
src/boosting/gbdt.h
+12
-9
src/c_api.cpp
src/c_api.cpp
+67
-44
src/io/config.cpp
src/io/config.cpp
+1
-1
tests/c_api_test/test.py
tests/c_api_test/test.py
+6
-3
No files found.
include/LightGBM/boosting.h
View file @
422c0ef7
...
...
@@ -73,7 +73,7 @@ public:
* \param result used to store prediction result, should allocate memory before call this function
* \param out_len length of returned score
*/
virtual
void
GetPredictAt
(
int
data_idx
,
score_t
*
result
,
data_size_t
*
out_len
)
const
=
0
;
virtual
void
GetPredictAt
(
int
data_idx
,
score_t
*
result
,
data_size_t
*
out_len
)
=
0
;
/*!
* \brief Prediction for one record, not sigmoid transform
...
...
@@ -127,7 +127,7 @@ public:
* \brief Get number of weak sub-models
* \return Number of weak sub-models
*/
virtual
int
NumberOf
Sub
Model
s
()
const
=
0
;
virtual
int
NumberOf
Total
Model
()
const
=
0
;
/*!
* \brief Get number of classes
...
...
@@ -138,7 +138,7 @@ public:
/*!
* \brief Set number of used model for prediction
*/
virtual
void
SetNum
UsedModel
(
int
num_used_model
)
=
0
;
virtual
void
SetNum
IterationForPred
(
int
num_iteration
)
=
0
;
/*!
* \brief Get Type name of this boosting object
...
...
include/LightGBM/c_api.h
View file @
422c0ef7
...
...
@@ -230,11 +230,13 @@ DllExport int LGBM_BoosterCreate(const DatesetHandle train_data,
/*!
* \brief load an existing boosting from model file
* \param filename filename of model
* \param out_num_total_model number of total models
* \param out handle of created Booster
* \return 0 when success, -1 when failure happens
*/
DllExport
int
LGBM_BoosterCreateFromModelfile
(
const
char
*
filename
,
int64_t
*
out_num_total_model
,
BoosterHandle
*
out
);
/*!
...
...
@@ -244,6 +246,12 @@ DllExport int LGBM_BoosterCreateFromModelfile(
*/
DllExport
int
LGBM_BoosterFree
(
BoosterHandle
handle
);
/*!
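* \brief Get number of classes
* \param handle handle
* \param out_len used to store the number of classes
* \return 0 when success, -1 when failure happens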
*/
DllExport
int
LGBM_BoosterGetNumClasses
(
BoosterHandle
handle
,
int64_t
*
out_len
);
/*!
* \brief update the model in one round
* \param handle handle
...
...
@@ -276,7 +284,7 @@ DllExport int LGBM_BoosterGetEvalCounts(BoosterHandle handle, int64_t* out_len);
* \brief Get number of eval
* \return total number of eval result
*/
DllExport
int
LGBM_BoosterGetEvalNames
(
BoosterHandle
handle
,
int64_t
*
out_len
,
const
char
**
*
out_strs
);
DllExport
int
LGBM_BoosterGetEvalNames
(
BoosterHandle
handle
,
int64_t
*
out_len
,
char
**
out_strs
);
/*!
* \brief get evaluation for training data and validation data
...
...
@@ -291,17 +299,6 @@ DllExport int LGBM_BoosterGetEval(BoosterHandle handle,
int64_t
*
out_len
,
float
*
out_results
);
/*!
* \brief get raw score for training data, used to calculate gradients outside
* \param handle handle
* \param out_len len of output result
* \param out_result used to set a pointer to array
* \return 0 when success, -1 when failure happens
*/
DllExport
int
LGBM_BoosterGetTrainingScore
(
BoosterHandle
handle
,
int64_t
*
out_len
,
const
float
**
out_result
);
/*!
* \brief Get prediction for training data and validation data
this can be used to support customized eval function
...
...
@@ -319,21 +316,21 @@ DllExport int LGBM_BoosterGetPredict(BoosterHandle handle,
/*!
* \brief make prediction for file
* \param handle handle
* \param data_filename filename of data file
* \param data_has_header data file has header or not
* \param predict_type
* 0:raw score
* 1:with transform(if needed)
* 2:leaf index
* \param n_used_trees number of used tree
* \param data_has_header data file has header or not
* \param data_filename filename of data file
* \param num_iteration number of iteration for prediction
* \param result_filename filename of result file
* \return 0 when success, -1 when failure happens
*/
DllExport
int
LGBM_BoosterPredictForFile
(
BoosterHandle
handle
,
int
predict_type
,
int64_t
n_used_trees
,
int
data_has_header
,
const
char
*
data_filename
,
int
data_has_header
,
int
predict_type
,
int64_t
num_iteration
,
const
char
*
result_filename
);
/*!
...
...
@@ -351,7 +348,8 @@ DllExport int LGBM_BoosterPredictForFile(BoosterHandle handle,
* 0:raw score
* 1:with transform(if needed)
* 2:leaf index
* \param n_used_trees number of used tree
* \param num_iteration number of iteration for prediction
* \param out_len len of output result
* \param out_result used to set a pointer to array, should allocate memory before call this function
* \return 0 when success, -1 when failure happens
*/
...
...
@@ -365,8 +363,9 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
int64_t
nelem
,
int64_t
num_col
,
int
predict_type
,
int64_t
n_used_trees
,
double
*
out_result
);
int64_t
num_iteration
,
int64_t
*
out_len
,
float
*
out_result
);
/*!
* \brief make prediction for an new data set
...
...
@@ -380,7 +379,8 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
* 0:raw score
* 1:with transform(if needed)
* 2:leaf index
* \param n_used_trees number of used tree
* \param num_iteration number of iteration for prediction
* \param out_len len of output result
* \param out_result used to set a pointer to array, should allocate memory before call this function
* \return 0 when success, -1 when failure happens
*/
...
...
@@ -391,18 +391,19 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
int32_t
ncol
,
int
is_row_major
,
int
predict_type
,
int64_t
n_used_trees
,
double
*
out_result
);
int64_t
num_iteration
,
int64_t
*
out_len
,
float
*
out_result
);
/*!
* \brief save model into file
* \param handle handle
* \param num_
used_model
* \param num_
iteration
* \param filename file name
* \return 0 when success, -1 when failure happens
*/
DllExport
int
LGBM_BoosterSaveModel
(
BoosterHandle
handle
,
int
num_
used_model
,
int
num_
iteration
,
const
char
*
filename
);
...
...
include/LightGBM/config.h
View file @
422c0ef7
...
...
@@ -97,7 +97,7 @@ public:
std
::
string
output_result
=
"LightGBM_predict_result.txt"
;
std
::
string
input_model
=
""
;
int
verbosity
=
1
;
int
num_
model
_predict
=
NO_LIMIT
;
int
num_
iteration
_predict
=
NO_LIMIT
;
bool
is_pre_partition
=
false
;
bool
is_enable_sparse
=
true
;
bool
use_two_round_loading
=
false
;
...
...
python-package/lightgbm/basic.py
View file @
422c0ef7
...
...
@@ -6,6 +6,7 @@ import os
import
ctypes
import
collections
import
re
import
tempfile
import
numpy
as
np
import
scipy.sparse
...
...
@@ -111,7 +112,7 @@ def c_array(ctype, values):
return
(
ctype
*
len
(
values
))(
*
values
)
def
dict_to_str
(
data
):
if
len
(
data
)
==
0
:
if
data
is
None
or
len
(
data
)
==
0
:
return
""
pairs
=
[]
for
key
in
data
:
...
...
@@ -131,10 +132,10 @@ def c_float_array(data):
data
=
np
.
array
(
data
,
copy
=
False
)
if
is_numpy_1d_array
(
data
):
if
data
.
dtype
==
np
.
float32
:
ptr_data
=
c_array
(
ctypes
.
c_float
,
data
)
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
c_float
)
type_data
=
C_API_DTYPE_FLOAT32
elif
data
.
dtype
==
np
.
float64
:
ptr_data
=
c_array
(
ctypes
.
c_double
,
data
)
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
c_double
)
type_data
=
C_API_DTYPE_FLOAT64
else
:
raise
TypeError
(
"expected np.float32 or np.float64, met type({})"
.
format
(
data
.
dtype
))
...
...
@@ -148,10 +149,10 @@ def c_int_array(data):
data
=
np
.
array
(
data
,
copy
=
False
)
if
is_numpy_1d_array
(
data
):
if
data
.
dtype
==
np
.
int32
:
ptr_data
=
c_array
(
ctypes
.
c_int32
,
data
)
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
c_int32
)
type_data
=
C_API_DTYPE_INT32
elif
data
.
dtype
==
np
.
int64
:
ptr_data
=
c_array
(
ctypes
.
c_int64
,
data
)
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
c_int64
)
type_data
=
C_API_DTYPE_INT64
else
:
raise
TypeError
(
"expected np.int32 or np.int64, met type({})"
.
format
(
data
.
dtype
))
...
...
@@ -206,6 +207,7 @@ class Dataset(object):
self
.
raw_data
=
data
else
:
self
.
raw_data
=
None
self
.
data_has_header
=
False
"""process for args"""
params
=
{}
params
[
"max_bin"
]
=
max_bin
...
...
@@ -223,6 +225,10 @@ class Dataset(object):
raise
TypeError
(
'Reference dataset should be None or dataset instance'
)
"""start construct data"""
if
is_str
(
data
):
"""check data has header or not"""
if
"has_header"
in
params
or
"header"
in
params
:
if
params
[
"has_header"
].
lower
()
==
"true"
or
params
[
"header"
].
lower
()
==
"true"
:
data_has_header
=
True
self
.
handle
=
ctypes
.
c_void_p
()
_safe_call
(
_LIB
.
LGBM_CreateDatasetFromFile
(
c_str
(
data
),
...
...
@@ -230,17 +236,21 @@ class Dataset(object):
ref_dataset
,
ctypes
.
byref
(
self
.
handle
)))
elif
isinstance
(
data
,
scipy
.
sparse
.
csr_matrix
):
self
.
_init_from_csr
(
data
,
params_str
,
ref_dataset
)
elif
isinstance
(
data
,
scipy
.
sparse
.
csc_matrix
):
self
.
_init_from_csc
(
data
,
params_str
,
ref_dataset
)
self
.
__init_from_csr
(
data
,
params_str
,
ref_dataset
)
elif
isinstance
(
data
,
np
.
ndarray
):
self
.
_init_from_np
y
2d
(
data
,
params_str
,
ref_dataset
)
self
.
_
_
init_from_np2d
(
data
,
params_str
,
ref_dataset
)
else
:
try
:
csr
=
scipy
.
sparse
.
csr_matrix
(
data
)
self
.
_init_from_csr
(
csr
)
if
self
.
raw_data
is
not
None
:
self
.
raw_data
=
csr
self
.
__init_from_csr
(
csr
)
except
:
raise
TypeError
(
'can not initialize Dataset from {}'
.
format
(
type
(
data
).
__name__
))
self
.
__label
=
None
self
.
__weight
=
None
self
.
__init_score
=
None
self
.
__group
=
None
if
label
is
not
None
:
self
.
set_label
(
label
)
if
weight
is
not
None
:
...
...
@@ -252,55 +262,7 @@ class Dataset(object):
def
free_raw_data
(
self
):
self
.
raw_data
=
None
def
_init_from_csr
(
self
,
csr
,
params_str
,
ref_dataset
):
"""
Initialize data from a CSR matrix.
"""
if
len
(
csr
.
indices
)
!=
len
(
csr
.
data
):
raise
ValueError
(
'length mismatch: {} vs {}'
.
format
(
len
(
csr
.
indices
),
len
(
csr
.
data
)))
self
.
handle
=
ctypes
.
c_void_p
()
ptr_indptr
,
type_ptr_indptr
=
c_int_array
(
csr
.
indptr
)
ptr_data
,
type_ptr_data
=
c_float_array
(
csr
.
data
)
_safe_call
(
_LIB
.
LGBM_CreateDatasetFromCSR
(
ptr_indptr
,
type_ptr_indptr
,
c_array
(
ctypes
.
c_int32
,
csr
.
indices
),
ptr_data
,
type_ptr_data
,
len
(
csr
.
indptr
),
len
(
csr
.
data
),
csr
.
shape
[
1
],
c_str
(
params_str
),
ref_dataset
,
ctypes
.
byref
(
self
.
handle
)))
def
_init_from_csc
(
self
,
csr
,
params_str
,
ref_dataset
):
"""
Initialize data from a CSC matrix.
"""
if
len
(
csc
.
indices
)
!=
len
(
csc
.
data
):
raise
ValueError
(
'length mismatch: {} vs {}'
.
format
(
len
(
csc
.
indices
),
len
(
csc
.
data
)))
self
.
handle
=
ctypes
.
c_void_p
()
ptr_indptr
,
type_ptr_indptr
=
c_int_array
(
csc
.
indptr
)
ptr_data
,
type_ptr_data
=
c_float_array
(
csc
.
data
)
_safe_call
(
_LIB
.
LGBM_CreateDatasetFromCSC
(
ptr_indptr
,
type_ptr_indptr
,
c_array
(
ctypes
.
c_int32
,
csc
.
indices
),
ptr_data
,
type_ptr_data
,
len
(
csc
.
indptr
),
len
(
csc
.
data
),
csc
.
shape
[
0
],
c_str
(
params_str
),
ref_dataset
,
ctypes
.
byref
(
self
.
handle
)))
def
_init_from_npy2d
(
self
,
mat
,
params_str
,
ref_dataset
):
def
__init_from_np2d
(
self
,
mat
,
params_str
,
ref_dataset
):
"""
Initialize data from a 2-D numpy matrix.
"""
...
...
@@ -325,6 +287,30 @@ class Dataset(object):
ref_dataset
,
ctypes
.
byref
(
self
.
handle
)))
def __init_from_csr(self, csr, params_str, ref_dataset):
    """
    Initialize data from a CSR matrix.

    Parameters
    ----------
    csr : scipy.sparse.csr_matrix
        Source matrix; indptr, indices and data are handed to the C API.
    params_str : string
        Dataset parameters, already serialized to a string.
    ref_dataset : handle or None
        Reference dataset handle forwarded to the C API.

    Raises
    ------
    ValueError
        If csr.indices and csr.data differ in length.
    """
    if len(csr.indices) != len(csr.data):
        raise ValueError('length mismatch: {} vs {}'.format(len(csr.indices), len(csr.data)))
    self.handle = ctypes.c_void_p()

    ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
    ptr_data, type_ptr_data = c_float_array(csr.data)
    # The C API reads the column indices as int32. Coerce explicitly (no copy
    # when they already are int32) and keep the array in a local so the buffer
    # outlives the native call.
    indices = csr.indices.astype(np.int32, copy=False)

    _safe_call(_LIB.LGBM_CreateDatasetFromCSR(
        ptr_indptr,
        type_ptr_indptr,
        # data_as() needs a ctypes *pointer* type; a plain c_int32 is rejected.
        indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
        ptr_data,
        type_ptr_data,
        len(csr.indptr),
        len(csr.data),
        csr.shape[1],
        c_str(params_str),
        ref_dataset,
        ctypes.byref(self.handle)))
def
__del__
(
self
):
_safe_call
(
_LIB
.
LGBM_DatasetFree
(
self
.
handle
))
...
...
@@ -371,10 +357,10 @@ class Dataset(object):
if
not
is_numpy_1d_array
(
data
):
raise
TypeError
(
"Unknow type({})"
.
format
(
type
(
data
).
__name__
))
if
data
.
dtype
==
np
.
float32
:
ptr_data
=
c_array
(
ctypes
.
c_float
,
data
)
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
c_float
)
type_data
=
C_API_DTYPE_FLOAT32
elif
data
.
dtype
==
np
.
int32
:
ptr_data
=
c_array
(
ctypes
.
c_int32
,
data
)
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
c_int32
)
type_data
=
C_API_DTYPE_INT32
else
:
raise
TypeError
(
"excepted np.float32 or np.int32, met type({})"
.
format
(
data
.
dtype
))
...
...
@@ -409,6 +395,7 @@ class Dataset(object):
label
=
list_to_1d_numpy
(
label
,
np
.
float32
)
if
label
.
dtype
!=
np
.
float32
:
label
=
label
.
astype
(
np
.
float32
,
copy
=
False
)
self
.
__label
=
label
self
.
set_field
(
'label'
,
label
)
def
set_weight
(
self
,
weight
):
...
...
@@ -422,6 +409,7 @@ class Dataset(object):
weight
=
list_to_1d_numpy
(
weight
,
np
.
float32
)
if
weight
.
dtype
!=
np
.
float32
:
weight
=
weight
.
astype
(
np
.
float32
,
copy
=
False
)
self
.
__weight
=
weight
self
.
set_field
(
'weight'
,
weight
)
def
set_init_score
(
self
,
score
):
...
...
@@ -434,6 +422,7 @@ class Dataset(object):
score
=
list_to_1d_numpy
(
score
,
np
.
float32
)
if
score
.
dtype
!=
np
.
float32
:
score
=
score
.
astype
(
np
.
float32
,
copy
=
False
)
self
.
__init_score
=
init_score
self
.
set_field
(
'init_score'
,
score
)
def
set_group
(
self
,
group
):
...
...
@@ -447,6 +436,7 @@ class Dataset(object):
group
=
list_to_1d_numpy
(
group
,
np
.
int32
)
if
group
.
dtype
!=
np
.
int32
:
group
=
group
.
astype
(
np
.
int32
,
copy
=
False
)
self
.
__group
=
group
self
.
set_field
(
'group'
,
group
)
def
set_group_id
(
self
,
group_id
):
...
...
@@ -470,7 +460,9 @@ class Dataset(object):
-------
label : array
"""
return
self
.
get_field
(
'label'
)
if
self
.
__label
is
None
:
self
.
__label
=
self
.
get_field
(
'label'
)
return
self
.
__label
def
get_weight
(
self
):
"""Get the weight of the Dataset.
...
...
@@ -479,7 +471,9 @@ class Dataset(object):
-------
weight : array
"""
return
self
.
get_field
(
'weight'
)
if
self
.
__weight
is
None
:
self
.
__weight
=
self
.
get_field
(
'weight'
)
return
self
.
__weight
def
get_init_score
(
self
):
"""Get the initial score of the Dataset.
...
...
@@ -488,7 +482,20 @@ class Dataset(object):
-------
init_score : array
"""
return
self
.
get_field
(
'init_score'
)
if
self
.
__init_score
is
None
:
self
.
__init_score
=
self
.
get_field
(
'init_score'
)
return
self
.
__init_score
def get_group(self):
    """Get the group of the Dataset.

    The value is read with get_field('group') on first use and cached on
    the instance; later calls return the cached value.

    Returns
    -------
    group : array
    """
    if self.__group is None:
        self.__group = self.get_field('group')
    return self.__group
def
num_data
(
self
):
"""Get the number of rows in the Dataset.
...
...
@@ -553,6 +560,9 @@ class Dataset(object):
else
:
self
.
_feature_names
=
None
# Prediction-type codes passed as `predict_type` to the LGBM_BoosterPredictFor* calls.
# NOTE(review): the doc comments in c_api.h list 0 as "raw score" and 1 as
# "with transform(if needed)", which is the opposite of NORMAL/RAW_SCORE here —
# confirm the mapping against the C implementation.
C_API_PREDICT_NORMAL = 0
C_API_PREDICT_RAW_SCORE = 1
C_API_PREDICT_LEAF_INDEX = 2
class
Booster
(
object
):
"""A Booster of LightGBM.
...
...
@@ -560,12 +570,9 @@ class Booster(object):
feature_names
=
None
def
__init__
(
self
,
params
=
None
,
train_set
=
None
,
valid_sets
=
None
,
name_valid_sets
=
None
,
model_file
=
None
,
fobj
=
None
):
def
__init__
(
self
,
params
=
None
,
train_set
=
None
,
valid_sets
=
None
,
name_valid_sets
=
None
,
model_file
=
None
):
# pylint: disable=invalid-name
"""Initialize the Booster.
...
...
@@ -580,15 +587,17 @@ class Booster(object):
name_valid_sets : List of string
name of validation datasets
model_file : string
Path to the model file.
Path to the model file.
If train_set is not None, it is used to continue training;
otherwise it is loaded for the prediction task.
"""
self
.
handle
=
ctypes
.
c_void_p
()
if
train_set
is
not
None
:
"""Training task"""
if
not
isinstance
(
train_set
,
Dataset
):
raise
TypeError
(
'training data should be Dataset instance, met{}'
.
format
(
type
(
train_set
).
__name__
))
valid_handles
=
None
valid_cnames
=
None
n_valid
=
0
if
valid_sets
is
not
None
:
for
valid
in
valid_sets
:
...
...
@@ -596,36 +605,364 @@ class Booster(object):
raise
TypeError
(
'valid data should be Dataset instance, met{}'
.
format
(
type
(
valid
).
__name__
))
valid_handles
=
c_array
(
ctypes
.
c_void_p
,
[
valid
.
handle
for
valid
in
valid_sets
])
if
name_valid_sets
is
None
:
name_valid_sets
=
[
"valid_{}"
.
format
(
x
)
for
x
in
range
(
len
(
valid_sets
))
]
name_valid_sets
=
[
"valid_{}"
.
format
(
x
+
1
)
for
x
in
range
(
len
(
valid_sets
))
]
if
len
(
valid_sets
)
!=
len
(
name_valid_sets
):
raise
Exception
(
'len of valid_sets should be equal with len of name_valid_sets'
)
valid_cnames
=
c_array
(
ctypes
.
c_char_p
,
[
c_str
(
x
)
for
x
in
name_valid_sets
])
n_valid
=
len
(
valid_sets
)
ref_input_model
=
None
params_str
=
dict_to_str
(
params
)
if
model_file
is
not
None
:
ref_input_model
=
c_str
(
model_file
)
"""construct booster object"""
_safe_call
(
LIB
.
LGBM_BoosterCreate
(
_safe_call
(
_
LIB
.
LGBM_BoosterCreate
(
train_set
.
handle
,
valid_handles
,
valid_cnames
,
n_valid
,
params_str
,
c_str
(
params_str
)
,
ref_input_model
,
ctypes
.
byref
(
self
.
handle
)))
"""if need to continue train"""
if
model_file
is
not
None
:
self
.
init_continue_train
(
train_set
)
self
.
__
init_continue_train
(
train_set
)
if
valid_sets
is
not
None
:
for
valid
in
valid_sets
:
self
.
init_continue_train
(
valid
)
self
.
__init_continue_train
(
valid
)
"""save reference to data"""
self
.
train_set
=
train_set
self
.
valid_sets
=
valid_sets
self
.
name_valid_sets
=
name_valid_sets
self
.
__num_dataset
=
1
+
n_valid
self
.
__training_score
=
None
out_len
=
ctypes
.
c_int64
(
0
)
_safe_call
(
_LIB
.
LGBM_BoosterGetNumClasses
(
self
.
handle
,
ctypes
.
byref
(
out_len
)))
self
.
__num_class
=
out_len
.
value
"""buffer for inner predict"""
self
.
__inner_predict_buffer
=
[
None
for
_
in
range
(
self
.
__num_dataset
)]
"""Get num of inner evals"""
_safe_call
(
_LIB
.
LGBM_BoosterGetEvalCounts
(
self
.
handle
,
ctypes
.
byref
(
out_len
)))
self
.
__num_inner_eval
=
out_len
.
value
if
self
.
__num_inner_eval
>
0
:
"""Get name of evals"""
string_buffers
=
[
ctypes
.
create_string_buffer
(
255
)
for
i
in
range
(
self
.
__num_inner_eval
)]
ptr_string_buffers
=
(
ctypes
.
c_char_p
*
self
.
__num_inner_eval
)(
*
map
(
ctypes
.
addressof
,
string_buffers
))
_safe_call
(
_LIB
.
LGBM_BoosterGetEvalNames
(
self
.
handle
,
ctypes
.
byref
(
out_len
),
ptr_string_buffers
))
if
self
.
__num_inner_eval
!=
out_len
.
value
:
raise
ValueError
(
"size of eval names doesn't equal with num_evals"
)
self
.
__name_inner_eval
=
[]
for
i
in
range
(
self
.
__num_inner_eval
):
self
.
__name_inner_eval
.
append
(
string_buffers
[
i
].
value
.
decode
())
elif
model_file
is
not
None
:
_safe_call
(
_LIB
.
LGBM_BoosterCreateFromModelfile
(
c_str
(
model_file
),
ctypes
.
byref
(
self
.
handle
)))
"""Prediction task"""
out_num_total_model
=
ctypes
.
c_int64
(
0
)
_safe_call
(
_LIB
.
LGBM_BoosterCreateFromModelfile
(
c_str
(
model_file
),
ctypes
.
byref
(
out_num_total_model
),
ctypes
.
byref
(
self
.
handle
)))
self
.
__num_total_model
=
out_num_total_model
.
value
out_len
=
ctypes
.
c_int64
(
0
)
_safe_call
(
_LIB
.
LGBM_BoosterGetNumClasses
(
self
.
handle
,
ctypes
.
byref
(
out_len
)))
self
.
__num_class
=
out_len
.
value
else
:
raise
TypeError
(
'At least need training dataset or model file to create booster instance'
)
def
__del__
(
self
):
_LIB
.
LGBM_BoosterFree
(
self
.
handle
)
_safe_call
(
_LIB
.
LGBM_BoosterFree
(
self
.
handle
))
def update(self, fobj=None):
    """
    Run one boosting iteration.

    Note: for multi-class task, the score is grouped by class_id first, then by row_id.
          To read the score of the i-th row in the j-th class use score[j*num_data+i];
          grad and hess returned by fobj must be laid out the same way.

    Parameters
    ----------
    fobj : function
        Customized objective function, called as fobj(preds, train_set) and
        expected to return (grad, hess). When None, the built-in objective is used.

    Returns
    -------
    is_finished, bool
    """
    if fobj is not None:
        # Custom objective: compute gradients from the current training
        # scores and delegate the actual step to boost().
        gradients, hessians = fobj(self.__inner_predict(0), self.train_set)
        return self.boost(gradients, hessians)

    finished_flag = ctypes.c_int(0)
    _safe_call(_LIB.LGBM_BoosterUpdateOneIter(
        self.handle,
        ctypes.byref(finished_flag)))
    return finished_flag.value == 1
def boost(self, grad, hess):
    """
    Boost the booster for one iteration, with customized gradient statistics.

    Note: for multi-class task, the score is grouped by class_id first, then by row_id.
          To read the score of the i-th row in the j-th class use score[j*num_data+i];
          grad and hess must be laid out the same way.

    Parameters
    ----------
    grad : 1d numpy with dtype=float32
        The first order of gradient.
    hess : 1d numpy with dtype=float32
        The second order of gradient.

    Returns
    -------
    is_finished, bool

    Raises
    ------
    TypeError
        If either array is not a 1d numpy array of dtype float32.
    ValueError
        If grad and hess differ in length.
    """
    # Both arrays are handed to native code as raw float pointers, so EITHER
    # one being malformed must reject the call (hence `or`, not `and`).
    if not is_numpy_1d_array(grad) or not is_numpy_1d_array(hess):
        raise TypeError('type of grad / hess should be 1d numpy object')
    if grad.dtype != np.float32 or hess.dtype != np.float32:
        raise TypeError('type of grad / hess should be np.float32')
    if len(grad) != len(hess):
        raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess)))

    # data_as() requires a ctypes pointer type.
    float_ptr = ctypes.POINTER(ctypes.c_float)
    is_finished = ctypes.c_int(0)
    _safe_call(_LIB.LGBM_BoosterUpdateOneIterCustom(
        self.handle,
        grad.ctypes.data_as(float_ptr),
        hess.ctypes.data_as(float_ptr),
        ctypes.byref(is_finished)))
    return is_finished.value == 1
def eval_train(self, feval=None):
    """Evaluate the booster on the training data.

    Parameters
    ----------
    feval : function
        Custom evaluation function.

    Returns
    -------
    result: str
        Evaluation result string.
    """
    # The training set is always dataset index 0.
    training_idx = 0
    summary = self.__inner_eval("training", training_idx, feval)
    return summary
def eval_valid(self, feval=None):
    """Evaluate the booster on every validation dataset.

    Parameters
    ----------
    feval : function
        Custom evaluation function.

    Returns
    -------
    result: str
        Evaluation result string, one line per validation dataset.
    """
    # Index 0 is the training set; validation sets occupy 1..num_dataset-1
    # and their names are stored zero-based.
    per_dataset = (
        self.__inner_eval(self.name_valid_sets[idx - 1], idx, feval)
        for idx in range(1, self.__num_dataset)
    )
    return '\n'.join(per_dataset)
def save_model(self, filename, num_iteration=-1):
    """Save the booster to a model file.

    Parameters
    ----------
    filename : string
        Path of the file to write.
    num_iteration : int
        Forwarded unchanged to LGBM_BoosterSaveModel.
        NOTE(review): the default -1 presumably means "save all iterations" — confirm.
    """
    status = _LIB.LGBM_BoosterSaveModel(
        self.handle,
        num_iteration,
        c_str(filename))
    _safe_call(status)
def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
    """Predict with this booster.

    Parameters
    ----------
    data : string / numpy.ndarray / scipy.sparse.csr_matrix / other
        A string is treated as the path of a data file; any other object is
        converted with scipy.sparse.csr_matrix(data). Dataset instances are rejected.
    num_iteration : int
        Number of iterations used for prediction; forwarded to the C API.
    raw_score : bool
        Request raw scores.
    pred_leaf : bool
        Request leaf indices; takes precedence over raw_score.
    data_has_header : bool
        Whether the data file has a header (only used for file input).
    is_reshape : bool
        Reshape the result to (nrow, ncol) when there is more than one value per row.

    Returns
    -------
    preds : numpy array

    Raises
    ------
    TypeError
        If data is a Dataset or cannot be converted to a CSR matrix.
    ValueError
        If the number of predictions is not a multiple of the number of rows.
    """
    if isinstance(data, Dataset):
        raise TypeError("cannot use Dataset instance for prediction, please use raw data instead")

    predict_type = C_API_PREDICT_NORMAL
    if raw_score:
        predict_type = C_API_PREDICT_RAW_SCORE
    if pred_leaf:
        predict_type = C_API_PREDICT_LEAF_INDEX
    int_data_has_header = 1 if data_has_header else 0

    if is_str(data):
        # mkstemp creates a file that stays on disk until we remove it, so the
        # native writer and the reader below share a stable path.
        fd, tmp_pred_fname = tempfile.mkstemp(prefix="lightgbm_tmp_pred_")
        os.close(fd)
        try:
            _safe_call(_LIB.LGBM_BoosterPredictForFile(
                self.handle,
                c_str(data),
                int_data_has_header,
                predict_type,
                num_iteration,
                c_str(tmp_pred_fname)))
            with open(tmp_pred_fname, "r") as pred_file:
                lines = pred_file.readlines()
        finally:
            # Always clean up, even when the native call or the read fails.
            os.remove(tmp_pred_fname)
        nrow = len(lines)
        # One line per row, tab-separated values within a row.
        preds = np.array(
            [float(token) for line in lines for token in line.split('\t')],
            dtype=np.float64)
    elif isinstance(data, scipy.sparse.csr_matrix):
        preds, nrow = self.__pred_for_csr(data, num_iteration, predict_type)
    elif isinstance(data, np.ndarray):
        preds, nrow = self.__pred_for_np2d(data, num_iteration, predict_type)
    else:
        # Only the conversion is guarded, so genuine prediction errors are not
        # misreported as an unsupported input type.
        try:
            csr = scipy.sparse.csr_matrix(data)
        except Exception:
            raise TypeError('can not predict data for type {}'.format(type(data).__name__))
        preds, nrow = self.__pred_for_csr(csr, num_iteration, predict_type)

    if pred_leaf:
        preds = preds.astype(np.int32)
    if preds.size != nrow and is_reshape:
        if preds.size % nrow == 0:
            ncol = preds.size // nrow
            preds = preds.reshape(nrow, ncol)
        else:
            raise ValueError('len of predict result(%d) cannot be divide nrow(%d)' % (preds.size, nrow))
    return preds
def __pred_for_np2d(self, mat, num_iteration, predict_type):
    """
    Predict for a 2-D numpy matrix.

    Parameters
    ----------
    mat : numpy.ndarray
        2-D input; float32/float64 data is passed as-is, anything else is
        converted to float32.
    num_iteration : int
        Number of iterations used for prediction; forwarded to the C API.
    predict_type : int
        One of the C_API_PREDICT_* codes.

    Returns
    -------
    (preds, nrow) : flat float32 numpy array of predictions and the number of input rows.

    Raises
    ------
    ValueError
        If mat is not 2-D, or the C API reports an unexpected number of results.
    """
    if len(mat.shape) != 2:
        raise ValueError('Input numpy.ndarray must be 2 dimensional')

    if mat.dtype == np.float32 or mat.dtype == np.float64:
        data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
    else:
        # change non-float data to float data, need to copy
        data = np.array(mat.reshape(mat.size), dtype=np.float32)
    ptr_data, type_ptr_data = c_float_array(data)

    n_preds = self.__num_class * mat.shape[0]
    if predict_type == C_API_PREDICT_LEAF_INDEX:
        # Leaf-index prediction yields one value per row, class and iteration.
        if num_iteration > 0:
            n_preds *= num_iteration
        else:
            # Integer division keeps n_preds an int; np.zeros rejects float sizes.
            used_iteration = self.__num_total_model // self.__num_class
            n_preds *= used_iteration

    preds = np.zeros(n_preds, dtype=np.float32)
    out_num_preds = ctypes.c_int64(0)
    _safe_call(_LIB.LGBM_BoosterPredictForMat(
        self.handle,
        ptr_data,
        type_ptr_data,
        mat.shape[0],
        mat.shape[1],
        C_API_IS_ROW_MAJOR,
        predict_type,
        num_iteration,
        ctypes.byref(out_num_preds),
        preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
    if n_preds != out_num_preds.value:
        raise ValueError("incorrect number for predict result")
    return preds, mat.shape[0]
def __pred_for_csr(self, csr, num_iteration, predict_type):
    """
    Predict for a csr data

    Parameters
    ----------
    csr : scipy.sparse.csr_matrix
        Input matrix; indptr, indices and data are handed to the C API.
    num_iteration : int
        Number of iterations used for prediction; forwarded to the C API.
    predict_type : int
        One of the C_API_PREDICT_* codes.

    Returns
    -------
    (preds, nrow) : flat float32 numpy array of predictions and the number of input rows.

    Raises
    ------
    ValueError
        If the C API reports an unexpected number of results.
    """
    nrow = len(csr.indptr) - 1
    n_preds = self.__num_class * nrow
    if predict_type == C_API_PREDICT_LEAF_INDEX:
        # Leaf-index prediction yields one value per row, class and iteration.
        if num_iteration > 0:
            n_preds *= num_iteration
        else:
            # Integer division keeps n_preds an int; np.zeros rejects float sizes.
            used_iteration = self.__num_total_model // self.__num_class
            n_preds *= used_iteration

    preds = np.zeros(n_preds, dtype=np.float32)
    out_num_preds = ctypes.c_int64(0)

    ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
    ptr_data, type_ptr_data = c_float_array(csr.data)
    # The C API reads the column indices as int32. Coerce explicitly (no copy
    # when they already are int32) and keep the array in a local so the buffer
    # outlives the native call.
    indices = csr.indices.astype(np.int32, copy=False)

    _safe_call(_LIB.LGBM_BoosterPredictForCSR(
        self.handle,
        ptr_indptr,
        type_ptr_indptr,
        # data_as() needs a ctypes *pointer* type; a plain c_int32 is rejected.
        indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
        ptr_data,
        type_ptr_data,
        len(csr.indptr),
        len(csr.data),
        csr.shape[1],
        predict_type,
        num_iteration,
        ctypes.byref(out_num_preds),
        preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
    if n_preds != out_num_preds.value:
        raise ValueError("incorrect number for predict result")
    return preds, nrow
def __inner_eval(self, data_name, data_idx, feval=None):
    """Evaluate one dataset and format the results as a single line.

    Parameters
    ----------
    data_name : string
        Label put in front of every metric in the output.
    data_idx : int
        0 for the training set, i for the i-th validation set.
    feval : function
        Optional custom evaluation, called as feval(preds, dataset); returns a
        (name, value) pair or a list of such pairs.

    Returns
    -------
    result : str
        Tab-separated '<data_name> <metric> : <value>' entries.
    """
    if data_idx >= self.__num_dataset:
        raise ValueError("data_idx should be smaller than number of dataset")

    entries = []
    n_builtin = self.__num_inner_eval
    if n_builtin > 0:
        # Built-in metrics are computed natively into a preallocated buffer.
        scores = np.zeros(n_builtin, dtype=np.float32)
        n_written = ctypes.c_int64(0)
        _safe_call(_LIB.LGBM_BoosterGetEval(
            self.handle,
            data_idx,
            ctypes.byref(n_written),
            scores.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
        if n_written.value != n_builtin:
            raise ValueError("incorrect number of eval results")
        for k in range(n_builtin):
            entries.append('%s %s : %f' % (data_name, self.__name_inner_eval[k], scores[k]))

    if feval is not None:
        target = self.train_set if data_idx == 0 else self.valid_sets[data_idx - 1]
        custom = feval(self.__inner_predict(data_idx), target)
        # A single (name, value) pair is handled like a one-element list.
        pairs = custom if isinstance(custom, list) else [custom]
        for metric_name, metric_val in pairs:
            entries.append('%s %s : %f' % (data_name, metric_name, metric_val))

    return '\t'.join(entries)
def __inner_predict(self, data_idx):
    """Fetch the current scores of the training or a validation dataset.

    Parameters
    ----------
    data_idx : int
        0 for the training set, i for the i-th validation set.

    Returns
    -------
    preds : 1d numpy array with dtype=float32
        The per-dataset buffer owned by this booster; it is reused and
        overwritten by later calls with the same data_idx.

    Raises
    ------
    ValueError
        If data_idx is out of range or the C API reports an unexpected length.
    """
    if data_idx >= self.__num_dataset:
        raise ValueError("data_idx should be smaller than number of dataset")

    if self.__inner_predict_buffer[data_idx] is None:
        # Allocate the buffer once per dataset: one value per row and class.
        if data_idx == 0:
            num_data = self.train_set.num_data() * self.__num_class
        else:
            num_data = self.valid_sets[data_idx - 1].num_data() * self.__num_class
        # np.zeros instead of np.array(list, copy=False): building an array
        # from a list always needs a copy, which NumPy 2 turns into an error.
        self.__inner_predict_buffer[data_idx] = np.zeros(num_data, dtype=np.float32)

    # Scores are re-fetched on every call so the buffer reflects the current model.
    buffer = self.__inner_predict_buffer[data_idx]
    out_len = ctypes.c_int64(0)
    data_ptr = buffer.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    _safe_call(_LIB.LGBM_BoosterGetPredict(
        self.handle,
        data_idx,
        ctypes.byref(out_len),
        data_ptr))
    if out_len.value != len(buffer):
        raise ValueError("incorrect number of predict results for data %d" % (data_idx))
    return buffer
def
__init_continue_train
(
self
,
dataset
):
if
dataset
.
raw_data
is
None
:
raise
ValueError
(
"should set is_continue_train=True in dataset while need to continue train"
)
init_score
=
self
.
predict
(
dataset
.
raw_data
,
raw_score
=
True
,
data_has_header
=
dataset
.
data_has_header
,
is_reshape
=
False
)
dataset
.
set_init_score
(
init_score
)
dataset
.
free_raw_data
()
# Temporary smoke test. Guarded so that importing this module does not load
# the example files and train a model as a side effect; it only runs when the
# file is executed directly.
if __name__ == "__main__":
    train_data = Dataset('../../examples/binary_classification/binary.train')
    test_data = Dataset('../../examples/binary_classification/binary.test', reference=train_data)
    param = {"metric": "l2,l1"}
    lgb = Booster(train_set=train_data, valid_sets=[test_data], params=param)
    for i in range(100):
        lgb.update()
        print(lgb.eval_valid())
        print(lgb.eval_train())
    print(lgb.predict('../../examples/binary_classification/binary.train'))
\ No newline at end of file
src/application/application.cpp
View file @
422c0ef7
...
...
@@ -108,7 +108,7 @@ void Application::LoadData() {
// prediction is needed if using input initial model(continued train)
PredictFunction
predict_fun
=
nullptr
;
// need to continue training
if
(
boosting_
->
NumberOf
Sub
Model
s
()
>
0
)
{
if
(
boosting_
->
NumberOf
Total
Model
()
>
0
)
{
Predictor
predictor
(
boosting_
.
get
(),
true
,
false
);
predict_fun
=
predictor
.
GetPredictFunction
();
}
...
...
@@ -235,7 +235,7 @@ void Application::Train() {
void
Application
::
Predict
()
{
boosting_
->
SetNum
UsedModel
(
config_
.
io_config
.
num_
model
_predict
);
boosting_
->
SetNum
IterationForPred
(
config_
.
io_config
.
num_
iteration
_predict
);
// create predictor
Predictor
predictor
(
boosting_
.
get
(),
config_
.
io_config
.
is_predict_raw_score
,
config_
.
io_config
.
is_predict_leaf_index
);
...
...
src/boosting/dart.hpp
View file @
422c0ef7
...
...
@@ -43,6 +43,7 @@ public:
* \brief one training iteration
*/
bool
TrainOneIter
(
const
score_t
*
gradient
,
const
score_t
*
hessian
,
bool
is_eval
)
override
{
is_update_score_cur_iter_
=
false
;
GBDT
::
TrainOneIter
(
gradient
,
hessian
,
false
);
// normalize
Normalize
();
...
...
@@ -58,20 +59,24 @@ public:
* \return training score
*/
const
score_t
*
GetTrainingScore
(
data_size_t
*
out_len
)
override
{
DroppingTrees
();
if
(
!
is_update_score_cur_iter_
)
{
// only drop one time in one iteration
DroppingTrees
();
is_update_score_cur_iter_
=
true
;
}
*
out_len
=
train_score_updater_
->
num_data
()
*
num_class_
;
return
train_score_updater_
->
score
();
}
/*!
* \brief save model to file
* \param num_
used_model number of model that want to save,
-1 means save all
* \param num_
iteration
-1 means save all
* \param is_finish is training finished or not
* \param filename filename that want to save to
*/
void
SaveModelToFile
(
int
num_
used_model
,
bool
is_finish
,
const
char
*
filename
)
override
{
void
SaveModelToFile
(
int
num_
iteration
,
bool
is_finish
,
const
char
*
filename
)
override
{
// only save model once when is_finish = true
if
(
is_finish
&&
saved_model_size_
<
0
)
{
GBDT
::
SaveModelToFile
(
num_
used_model
,
is_finish
,
filename
);
GBDT
::
SaveModelToFile
(
num_
iteration
,
is_finish
,
filename
);
}
}
/*!
...
...
@@ -133,6 +138,8 @@ private:
double
drop_rate_
;
/*! \brief Random generator, used to select dropping trees */
Random
random_for_drop_
;
/*! \brief Flag that the score is update on current iter or not*/
bool
is_update_score_cur_iter_
;
};
}
// namespace LightGBM
...
...
src/boosting/gbdt.cpp
View file @
422c0ef7
...
...
@@ -16,7 +16,7 @@
namespace
LightGBM
{
GBDT
::
GBDT
()
:
saved_model_size_
(
-
1
),
num_
used_model
_
(
0
)
{
GBDT
::
GBDT
()
:
saved_model_size_
(
-
1
),
num_
iteration_for_pred
_
(
0
)
{
}
...
...
@@ -29,7 +29,7 @@ void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const O
gbdt_config_
=
config
;
iter_
=
0
;
saved_model_size_
=
-
1
;
num_
used_model
_
=
0
;
num_
iteration_for_pred
_
=
0
;
max_feature_idx_
=
0
;
early_stopping_round_
=
gbdt_config_
->
early_stopping_round
;
shrinkage_rate_
=
gbdt_config_
->
learning_rate
;
...
...
@@ -296,24 +296,23 @@ const score_t* GBDT::GetTrainingScore(data_size_t* out_len) {
return
train_score_updater_
->
score
();
}
void
GBDT
::
GetPredictAt
(
int
data_idx
,
score_t
*
out_result
,
data_size_t
*
out_len
)
const
{
void
GBDT
::
GetPredictAt
(
int
data_idx
,
score_t
*
out_result
,
data_size_t
*
out_len
)
{
CHECK
(
data_idx
>=
0
&&
data_idx
<=
static_cast
<
int
>
(
valid_metrics_
.
size
()));
std
::
vector
<
double
>
ret
;
const
score_t
*
raw_scores
=
nullptr
;
data_size_t
num_data
=
0
;
if
(
data_idx
==
0
)
{
raw_scores
=
train_score_updater_
->
score
(
);
raw_scores
=
GetTrainingScore
(
out_len
);
num_data
=
train_score_updater_
->
num_data
();
}
else
{
auto
used_idx
=
data_idx
-
1
;
raw_scores
=
valid_score_updater_
[
used_idx
]
->
score
();
num_data
=
valid_score_updater_
[
used_idx
]
->
num_data
();
*
out_len
=
num_data
*
num_class_
;
}
*
out_len
=
num_data
*
num_class_
;
if
(
num_class_
>
1
)
{
#pragma omp parallel for schedule(
guided
)
#pragma omp parallel for schedule(
static
)
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
std
::
vector
<
double
>
tmp_result
;
for
(
int
j
=
0
;
j
<
num_class_
;
++
j
)
{
...
...
@@ -325,12 +324,12 @@ void GBDT::GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len)
}
}
}
else
if
(
sigmoid_
>
0.0
f
){
#pragma omp parallel for schedule(
guided
)
#pragma omp parallel for schedule(
static
)
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
out_result
[
i
]
=
static_cast
<
score_t
>
(
1.0
f
/
(
1.0
f
+
std
::
exp
(
-
2.0
f
*
sigmoid_
*
raw_scores
[
i
])));
}
}
else
{
#pragma omp parallel for schedule(
guided
)
#pragma omp parallel for schedule(
static
)
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
out_result
[
i
]
=
raw_scores
[
i
];
}
...
...
@@ -348,7 +347,7 @@ void GBDT::Boosting() {
GetGradients
(
GetTrainingScore
(
&
num_score
),
gradients_
.
data
(),
hessians_
.
data
());
}
void
GBDT
::
SaveModelToFile
(
int
num_
used_model
,
bool
is_finish
,
const
char
*
filename
)
{
void
GBDT
::
SaveModelToFile
(
int
num_
iteration
,
bool
is_finish
,
const
char
*
filename
)
{
// first time to this function, open file
if
(
saved_model_size_
<
0
)
{
model_output_file_
.
open
(
filename
);
...
...
@@ -373,10 +372,11 @@ void GBDT::SaveModelToFile(int num_used_model, bool is_finish, const char* filen
if
(
!
model_output_file_
.
is_open
())
{
return
;
}
if
(
num_used_model
==
NO_LIMIT
)
{
int
num_used_model
=
0
;
if
(
num_iteration
==
NO_LIMIT
)
{
num_used_model
=
static_cast
<
int
>
(
models_
.
size
());
}
else
{
num_used_model
=
num_
used_model
*
num_class_
;
num_used_model
=
num_
iteration
*
num_class_
;
}
int
rest
=
num_used_model
-
early_stopping_round_
*
num_class_
;
// output tree models
...
...
@@ -452,7 +452,7 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
}
}
Log
::
Info
(
"Finished loading %d models"
,
models_
.
size
());
num_
used_model
_
=
static_cast
<
int
>
(
models_
.
size
())
/
num_class_
;
num_
iteration_for_pred
_
=
static_cast
<
int
>
(
models_
.
size
())
/
num_class_
;
}
std
::
string
GBDT
::
FeatureImportance
()
const
{
...
...
@@ -486,7 +486,7 @@ std::string GBDT::FeatureImportance() const {
std
::
vector
<
double
>
GBDT
::
PredictRaw
(
const
double
*
value
)
const
{
std
::
vector
<
double
>
ret
(
num_class_
,
0.0
f
);
for
(
int
i
=
0
;
i
<
num_
used_model
_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_
iteration_for_pred
_
;
++
i
)
{
for
(
int
j
=
0
;
j
<
num_class_
;
++
j
)
{
ret
[
j
]
+=
models_
[
i
*
num_class_
+
j
]
->
Predict
(
value
);
}
...
...
@@ -496,7 +496,7 @@ std::vector<double> GBDT::PredictRaw(const double* value) const {
std
::
vector
<
double
>
GBDT
::
Predict
(
const
double
*
value
)
const
{
std
::
vector
<
double
>
ret
(
num_class_
,
0.0
f
);
for
(
int
i
=
0
;
i
<
num_
used_model
_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_
iteration_for_pred
_
;
++
i
)
{
for
(
int
j
=
0
;
j
<
num_class_
;
++
j
)
{
ret
[
j
]
+=
models_
[
i
*
num_class_
+
j
]
->
Predict
(
value
);
}
...
...
@@ -512,7 +512,7 @@ std::vector<double> GBDT::Predict(const double* value) const {
std
::
vector
<
int
>
GBDT
::
PredictLeafIndex
(
const
double
*
value
)
const
{
std
::
vector
<
int
>
ret
;
for
(
int
i
=
0
;
i
<
num_
used_model
_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_
iteration_for_pred
_
;
++
i
)
{
for
(
int
j
=
0
;
j
<
num_class_
;
++
j
)
{
ret
.
push_back
(
models_
[
i
*
num_class_
+
j
]
->
PredictLeafIndex
(
value
));
}
...
...
src/boosting/gbdt.h
View file @
422c0ef7
...
...
@@ -73,7 +73,7 @@ public:
* \param result used to store prediction result, should allocate memory before call this function
* \param out_len lenght of returned score
*/
void
GetPredictAt
(
int
data_idx
,
score_t
*
out_result
,
data_size_t
*
out_len
)
const
override
;
void
GetPredictAt
(
int
data_idx
,
score_t
*
out_result
,
data_size_t
*
out_len
)
override
;
/*!
* \brief Predtion for one record without sigmoid transformation
...
...
@@ -98,11 +98,11 @@ public:
/*!
* \brief save model to file
* \param num_
used_model number of model that want to save,
-1 means save all
* \param num_
iteration
-1 means save all
* \param is_finish is training finished or not
* \param filename filename that want to save to
*/
virtual
void
SaveModelToFile
(
int
num_
used_model
,
bool
is_finish
,
const
char
*
filename
)
override
;
virtual
void
SaveModelToFile
(
int
num_
iteration
,
bool
is_finish
,
const
char
*
filename
)
override
;
/*!
* \brief Restore from a serialized string
*/
...
...
@@ -119,11 +119,12 @@ public:
*/
inline
int
LabelIdx
()
const
override
{
return
label_idx_
;
}
/*!
* \brief Get number of weak sub-models
* \return Number of weak sub-models
*/
inline
int
NumberOf
Sub
Model
s
()
const
override
{
return
static_cast
<
int
>
(
models_
.
size
());
}
inline
int
NumberOf
Total
Model
()
const
override
{
return
static_cast
<
int
>
(
models_
.
size
());
}
/*!
* \brief Get number of classes
...
...
@@ -132,11 +133,13 @@ public:
inline
int
NumberOfClasses
()
const
override
{
return
num_class_
;
}
/*!
* \brief Set number of
used model
for prediction
* \brief Set number of
iterations
for prediction
*/
inline
void
SetNumUsedModel
(
int
num_used_model
)
{
if
(
num_used_model
>=
0
)
{
num_used_model_
=
static_cast
<
int
>
(
num_used_model
/
num_class_
);
inline
void
SetNumIterationForPred
(
int
num_iteration
)
override
{
if
(
num_iteration
>
0
)
{
num_iteration_for_pred_
=
num_iteration
;
}
else
{
num_iteration_for_pred_
=
static_cast
<
int
>
(
models_
.
size
())
/
num_class_
;
}
}
...
...
@@ -236,7 +239,7 @@ protected:
/*! \brief File to write models */
std
::
ofstream
model_output_file_
;
/*! \brief number of used model */
int
num_
used_model
_
;
int
num_
iteration_for_pred
_
;
/*! \brief Shrinkage rate for one iteration */
double
shrinkage_rate_
;
};
...
...
src/c_api.cpp
View file @
422c0ef7
...
...
@@ -95,8 +95,8 @@ public:
return
boosting_
->
TrainOneIter
(
gradients
,
hessians
,
false
);
}
void
PrepareForPrediction
(
int
num_
used_model
,
int
predict_type
)
{
boosting_
->
SetNum
UsedModel
(
num_used_model
);
void
PrepareForPrediction
(
int
num_
iteration
,
int
predict_type
)
{
boosting_
->
SetNum
IterationForPred
(
num_iteration
);
bool
is_predict_leaf
=
false
;
bool
is_raw_score
=
false
;
if
(
predict_type
==
C_API_PREDICT_LEAF_INDEX
)
{
...
...
@@ -109,6 +109,10 @@ public:
predictor_
.
reset
(
new
Predictor
(
boosting_
.
get
(),
is_raw_score
,
is_predict_leaf
));
}
void
GetPredictAt
(
int
data_idx
,
score_t
*
out_result
,
data_size_t
*
out_len
)
{
boosting_
->
GetPredictAt
(
data_idx
,
out_result
,
out_len
);
}
std
::
vector
<
double
>
Predict
(
const
std
::
vector
<
std
::
pair
<
int
,
double
>>&
features
)
{
return
predictor_
->
GetPredictFunction
()(
features
);
}
...
...
@@ -117,8 +121,8 @@ public:
predictor_
->
Predict
(
data_filename
,
result_filename
,
data_has_header
);
}
void
SaveModelToFile
(
int
num_
used_model
,
const
char
*
filename
)
{
boosting_
->
SaveModelToFile
(
num_
used_model
,
true
,
filename
);
void
SaveModelToFile
(
int
num_
iteration
,
const
char
*
filename
)
{
boosting_
->
SaveModelToFile
(
num_
iteration
,
true
,
filename
);
}
int
GetEvalCounts
()
const
{
...
...
@@ -129,22 +133,25 @@ public:
return
ret
;
}
int
GetEvalNames
(
const
char
**
*
out_strs
)
const
{
int
GetEvalNames
(
char
**
out_strs
)
const
{
int
idx
=
0
;
for
(
const
auto
&
metric
:
train_metric_
)
{
for
(
const
auto
&
name
:
metric
->
GetName
())
{
*
(
out_strs
[
idx
++
])
=
name
.
c_str
();
int
j
=
0
;
auto
name_cstr
=
name
.
c_str
();
while
(
name_cstr
[
j
]
!=
'\0'
)
{
out_strs
[
idx
][
j
]
=
name_cstr
[
j
];
++
j
;
}
out_strs
[
idx
][
j
]
=
'\0'
;
++
idx
;
}
}
return
idx
;
}
const
Boosting
*
GetBoosting
()
const
{
return
boosting_
.
get
();
}
const
float
*
GetTrainingScore
(
int
*
out_len
)
const
{
return
boosting_
->
GetTrainingScore
(
out_len
);
}
const
inline
int
NumberOfClasses
()
const
{
return
boosting_
->
NumberOfClasses
();
}
private:
std
::
unique_ptr
<
Boosting
>
boosting_
;
...
...
@@ -449,9 +456,12 @@ DllExport int LGBM_BoosterCreate(const DatesetHandle train_data,
DllExport
int
LGBM_BoosterCreateFromModelfile
(
const
char
*
filename
,
int64_t
*
num_total_model
,
BoosterHandle
*
out
)
{
API_BEGIN
();
*
out
=
new
Booster
(
filename
);
auto
ret
=
std
::
unique_ptr
<
Booster
>
(
new
Booster
(
filename
));
*
num_total_model
=
static_cast
<
int64_t
>
(
ret
->
GetBoosting
()
->
NumberOfTotalModel
());
*
out
=
ret
.
release
();
API_END
();
}
...
...
@@ -461,6 +471,13 @@ DllExport int LGBM_BoosterFree(BoosterHandle handle) {
API_END
();
}
DllExport
int
LGBM_BoosterGetNumClasses
(
BoosterHandle
handle
,
int64_t
*
out_len
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
*
out_len
=
ref_booster
->
GetBoosting
()
->
NumberOfClasses
();
API_END
();
}
DllExport
int
LGBM_BoosterUpdateOneIter
(
BoosterHandle
handle
,
int
*
is_finished
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
...
...
@@ -501,7 +518,7 @@ DllExport int LGBM_BoosterGetEvalCounts(BoosterHandle handle, int64_t* out_len)
* \brief Get number of eval
* \return total number of eval result
*/
DllExport
int
LGBM_BoosterGetEvalNames
(
BoosterHandle
handle
,
int64_t
*
out_len
,
const
char
**
*
out_strs
)
{
DllExport
int
LGBM_BoosterGetEvalNames
(
BoosterHandle
handle
,
int64_t
*
out_len
,
char
**
out_strs
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
*
out_len
=
ref_booster
->
GetEvalNames
(
out_strs
);
...
...
@@ -524,39 +541,27 @@ DllExport int LGBM_BoosterGetEval(BoosterHandle handle,
API_END
();
}
DllExport
int
LGBM_BoosterGetTrainingScore
(
BoosterHandle
handle
,
int64_t
*
out_len
,
const
float
**
out_result
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
int
len
=
0
;
*
out_result
=
ref_booster
->
GetTrainingScore
(
&
len
);
*
out_len
=
static_cast
<
int64_t
>
(
len
);
API_END
();
}
DllExport
int
LGBM_BoosterGetPredict
(
BoosterHandle
handle
,
int
data
,
int64_t
*
out_len
,
float
*
out_result
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
auto
boosting
=
ref_booster
->
GetBoosting
();
int
len
=
0
;
boost
ing
->
GetPredictAt
(
data
,
out_result
,
&
len
);
ref_
boost
er
->
GetPredictAt
(
data
,
out_result
,
&
len
);
*
out_len
=
static_cast
<
int64_t
>
(
len
);
API_END
();
}
DllExport
int
LGBM_BoosterPredictForFile
(
BoosterHandle
handle
,
int
predict_type
,
int64_t
n_used_trees
,
int
data_has_header
,
const
char
*
data_filename
,
int
data_has_header
,
int
predict_type
,
int64_t
num_iteration
,
const
char
*
result_filename
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
ref_booster
->
PrepareForPrediction
(
static_cast
<
int
>
(
n
_used_trees
),
predict_type
);
ref_booster
->
PrepareForPrediction
(
static_cast
<
int
>
(
n
um_iteration
),
predict_type
);
bool
bool_data_has_header
=
data_has_header
>
0
?
true
:
false
;
ref_booster
->
PredictForFile
(
data_filename
,
result_filename
,
bool_data_has_header
);
API_END
();
...
...
@@ -572,23 +577,32 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
int64_t
nelem
,
int64_t
,
int
predict_type
,
int64_t
n_used_trees
,
double
*
out_result
)
{
int64_t
num_iteration
,
int64_t
*
out_len
,
float
*
out_result
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
ref_booster
->
PrepareForPrediction
(
static_cast
<
int
>
(
n
_used_trees
),
predict_type
);
ref_booster
->
PrepareForPrediction
(
static_cast
<
int
>
(
n
um_iteration
),
predict_type
);
auto
get_row_fun
=
RowFunctionFromCSR
(
indptr
,
indptr_type
,
indices
,
data
,
data_type
,
nindptr
,
nelem
);
int
num_class
=
ref_booster
->
NumberOfClasses
();
int
num_preb_in_one_row
=
ref_booster
->
GetBoosting
()
->
NumberOfClasses
();
if
(
predict_type
==
C_API_PREDICT_LEAF_INDEX
)
{
if
(
num_iteration
>
0
)
{
num_preb_in_one_row
*=
static_cast
<
int
>
(
num_iteration
);
}
else
{
num_preb_in_one_row
*=
ref_booster
->
GetBoosting
()
->
NumberOfTotalModel
()
/
num_preb_in_one_row
;
}
}
int
nrow
=
static_cast
<
int
>
(
nindptr
-
1
);
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
nrow
;
++
i
)
{
auto
one_row
=
get_row_fun
(
i
);
auto
predicton_result
=
ref_booster
->
Predict
(
one_row
);
for
(
int
j
=
0
;
j
<
num
_c
l
as
s
;
++
j
)
{
out_result
[
i
*
num_
class
+
j
]
=
predicton_result
[
j
];
for
(
int
j
=
0
;
j
<
static
_cas
t
<
int
>
(
predicton_result
.
size
())
;
++
j
)
{
out_result
[
i
*
num_
preb_in_one_row
+
j
]
=
static_cast
<
float
>
(
predicton_result
[
j
]
)
;
}
}
*
out_len
=
nrow
*
num_preb_in_one_row
;
API_END
();
}
...
...
@@ -599,31 +613,40 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
int32_t
ncol
,
int
is_row_major
,
int
predict_type
,
int64_t
n_used_trees
,
double
*
out_result
)
{
int64_t
num_iteration
,
int64_t
*
out_len
,
float
*
out_result
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
ref_booster
->
PrepareForPrediction
(
static_cast
<
int
>
(
n
_used_trees
),
predict_type
);
ref_booster
->
PrepareForPrediction
(
static_cast
<
int
>
(
n
um_iteration
),
predict_type
);
auto
get_row_fun
=
RowPairFunctionFromDenseMatric
(
data
,
nrow
,
ncol
,
data_type
,
is_row_major
);
int
num_class
=
ref_booster
->
NumberOfClasses
();
int
num_preb_in_one_row
=
ref_booster
->
GetBoosting
()
->
NumberOfClasses
();
if
(
predict_type
==
C_API_PREDICT_LEAF_INDEX
)
{
if
(
num_iteration
>
0
)
{
num_preb_in_one_row
*=
static_cast
<
int
>
(
num_iteration
);
}
else
{
num_preb_in_one_row
*=
ref_booster
->
GetBoosting
()
->
NumberOfTotalModel
()
/
num_preb_in_one_row
;
}
}
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
nrow
;
++
i
)
{
auto
one_row
=
get_row_fun
(
i
);
auto
predicton_result
=
ref_booster
->
Predict
(
one_row
);
for
(
int
j
=
0
;
j
<
num
_c
l
as
s
;
++
j
)
{
out_result
[
i
*
num_
class
+
j
]
=
predicton_result
[
j
];
for
(
int
j
=
0
;
j
<
static
_cas
t
<
int
>
(
predicton_result
.
size
())
;
++
j
)
{
out_result
[
i
*
num_
preb_in_one_row
+
j
]
=
static_cast
<
float
>
(
predicton_result
[
j
]
)
;
}
}
*
out_len
=
nrow
*
num_preb_in_one_row
;
API_END
();
}
DllExport
int
LGBM_BoosterSaveModel
(
BoosterHandle
handle
,
int
num_
used_model
,
int
num_
iteration
,
const
char
*
filename
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
ref_booster
->
SaveModelToFile
(
num_
used_model
,
filename
);
ref_booster
->
SaveModelToFile
(
num_
iteration
,
filename
);
API_END
();
}
...
...
src/io/config.cpp
View file @
422c0ef7
...
...
@@ -183,7 +183,7 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetInt
(
params
,
"data_random_seed"
,
&
data_random_seed
);
GetString
(
params
,
"data"
,
&
data_filename
);
GetInt
(
params
,
"verbose"
,
&
verbosity
);
GetInt
(
params
,
"num_
model
_predict"
,
&
num_
model
_predict
);
GetInt
(
params
,
"num_
iteration
_predict"
,
&
num_
iteration
_predict
);
GetInt
(
params
,
"bin_construct_sample_cnt"
,
&
bin_construct_sample_cnt
);
GetBool
(
params
,
"is_pre_partition"
,
&
is_pre_partition
);
GetBool
(
params
,
"is_enable_sparse"
,
&
is_enable_sparse
);
...
...
tests/c_api_test/test.py
View file @
422c0ef7
...
...
@@ -190,14 +190,16 @@ def test_booster():
test_free_dataset
(
train
)
test_free_dataset
(
test
[
0
])
booster2
=
ctypes
.
c_void_p
()
LIB
.
LGBM_BoosterCreateFromModelfile
(
c_str
(
'model.txt'
),
ctypes
.
byref
(
booster2
))
num_total_model
=
ctypes
.
c_long
()
LIB
.
LGBM_BoosterCreateFromModelfile
(
c_str
(
'model.txt'
),
ctypes
.
byref
(
num_total_model
),
ctypes
.
byref
(
booster2
))
data
=
[]
inp
=
open
(
'../../examples/binary_classification/binary.test'
,
'r'
)
for
line
in
inp
.
readlines
():
data
.
append
(
[
float
(
x
)
for
x
in
line
.
split
(
'
\t
'
)[
1
:]]
)
inp
.
close
()
mat
=
np
.
array
(
data
)
preb
=
np
.
zeros
((
mat
.
shape
[
0
],
1
),
dtype
=
np
.
float64
)
preb
=
np
.
zeros
(
mat
.
shape
[
0
],
dtype
=
np
.
float32
)
num_preb
=
ctypes
.
c_long
()
data
=
np
.
array
(
mat
.
reshape
(
mat
.
size
),
copy
=
False
)
LIB
.
LGBM_BoosterPredictForMat
(
booster2
,
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_void_p
)),
...
...
@@ -207,8 +209,9 @@ def test_booster():
1
,
1
,
50
,
ctypes
.
byref
(
num_preb
),
preb
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_double
)))
LIB
.
LGBM_BoosterPredictForFile
(
booster2
,
1
,
50
,
0
,
c_str
(
'../../examples/binary_classification/binary.test'
),
c_str
(
'preb.txt'
))
LIB
.
LGBM_BoosterPredictForFile
(
booster2
,
c_str
(
'../../examples/binary_classification/binary.test'
),
0
,
0
,
50
,
c_str
(
'preb.txt'
))
LIB
.
LGBM_BoosterFree
(
booster2
)
test_dataset
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment