Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
629fc047
Commit
629fc047
authored
Nov 24, 2016
by
Guolin Ke
Browse files
more flexity python basic object
parent
b41e0f0a
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
483 additions
and
358 deletions
+483
-358
include/LightGBM/boosting.h
include/LightGBM/boosting.h
+1
-0
python-package/lightgbm/basic.py
python-package/lightgbm/basic.py
+424
-313
src/boosting/gbdt.cpp
src/boosting/gbdt.cpp
+35
-32
src/boosting/gbdt.h
src/boosting/gbdt.h
+17
-0
src/c_api.cpp
src/c_api.cpp
+6
-13
No files found.
include/LightGBM/boosting.h
View file @
629fc047
...
...
@@ -37,6 +37,7 @@ public:
/*!
* \brief Merge the model from another boosting object into this one.
*        The merged model is inserted at the front of the current boosting object.
* \param other The other boosting object to merge from
*/
virtual void MergeFrom(const Boosting* other) = 0;
...
...
python-package/lightgbm/basic.py
View file @
629fc047
...
...
@@ -126,16 +126,27 @@ C_API_DTYPE_INT64 =3
"""Matric is row major in python"""
C_API_IS_ROW_MAJOR = 1

# Prediction modes passed as `predict_type` to the native predict functions.
C_API_PREDICT_NORMAL = 0
C_API_PREDICT_RAW_SCORE = 1
C_API_PREDICT_LEAF_INDEX = 2

# Element type the C API uses for each Dataset field. get_field/set_field look
# field names up here, so every key must match the name callers pass in
# (e.g. set_weight uses 'weight').
FIELD_TYPE_MAPPER = {
    "label": C_API_DTYPE_FLOAT32,
    "weight": C_API_DTYPE_FLOAT32,
    "init_score": C_API_DTYPE_FLOAT32,
    "group_id": C_API_DTYPE_INT32,
    "group": C_API_DTYPE_INT32,
}
def
c_float_array
(
data
):
"""Convert numpy array / list to c float array."""
if
isinstance
(
data
,
list
):
data
=
np
.
array
(
data
,
copy
=
False
)
if
is_numpy_1d_array
(
data
):
if
data
.
dtype
==
np
.
float32
:
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
c_float
)
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_float
)
)
type_data
=
C_API_DTYPE_FLOAT32
elif
data
.
dtype
==
np
.
float64
:
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
c_double
)
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_double
)
)
type_data
=
C_API_DTYPE_FLOAT64
else
:
raise
TypeError
(
"expected np.float32 or np.float64, met type({})"
.
format
(
data
.
dtype
))
...
...
@@ -149,10 +160,10 @@ def c_int_array(data):
data
=
np
.
array
(
data
,
copy
=
False
)
if
is_numpy_1d_array
(
data
):
if
data
.
dtype
==
np
.
int32
:
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
c_int32
)
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_int32
)
)
type_data
=
C_API_DTYPE_INT32
elif
data
.
dtype
==
np
.
int64
:
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
c_int64
)
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_int64
)
)
type_data
=
C_API_DTYPE_INT64
else
:
raise
TypeError
(
"expected np.int32 or np.int64, met type({})"
.
format
(
data
.
dtype
))
...
...
@@ -160,19 +171,188 @@ def c_int_array(data):
raise
TypeError
(
"Unknow type({})"
.
format
(
type
(
data
).
__name__
))
return
(
ptr_data
,
type_data
)
class Predictor(object):
    """A Predictor of LightGBM.

    Wraps a native booster handle and runs prediction on raw data
    (text file path, numpy 2-D array, scipy CSR matrix, or anything
    convertible to a CSR matrix).
    """

    def __init__(self, model_file=None, params=None, booster_handle=None, is_manage_handle=True):
        # pylint: disable=invalid-name
        """Initialize the Predictor.

        Exactly one of `model_file` / `booster_handle` is used; `model_file`
        takes precedence when both are given.

        Parameters
        ----------
        model_file : string, optional
            Path to the model file to load.
        params : dict, optional
            Accepted for interface compatibility; not used by this class.
        booster_handle : ctypes handle, optional
            Handle of an already created native booster to predict with.
        is_manage_handle : bool
            Only used together with `booster_handle`: when True this object
            frees the handle on destruction.

        Raises
        ------
        TypeError
            If neither `model_file` nor `booster_handle` is provided.
        """
        self.handle = ctypes.c_void_p()
        # Stay non-owning until a native handle really exists, so __del__ never
        # frees a null handle when construction fails part-way.
        self.__is_manage_handle = False
        if model_file is not None:
            # Load the model from file; this object owns the new handle.
            out_num_total_model = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterCreateFromModelfile(
                c_str(model_file),
                ctypes.byref(out_num_total_model),
                ctypes.byref(self.handle)))
            self.__is_manage_handle = True
            self.__num_total_model = out_num_total_model.value
            tmp_out_len = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(tmp_out_len)))
            self.num_class = tmp_out_len.value
        elif booster_handle is not None:
            # Reuse an existing booster; ownership is decided by the caller.
            self.__is_manage_handle = is_manage_handle
            self.handle = booster_handle
            tmp_out_len = ctypes.c_int64(0)
            _safe_call(_LIB.LGBM_BoosterGetNumClasses(
                self.handle,
                ctypes.byref(tmp_out_len)))
            self.num_class = tmp_out_len.value
            _safe_call(_LIB.LGBM_BoosterGetCurrentIteration(
                self.handle,
                ctypes.byref(tmp_out_len)))
            # One model per class per iteration.
            self.__num_total_model = self.num_class * tmp_out_len.value
        else:
            raise TypeError('Need Model file to create a booster')

    def __del__(self):
        """Free the native booster handle if this object owns it."""
        # getattr: the flag may be missing if the instance was never initialized.
        if getattr(self, "_Predictor__is_manage_handle", False):
            _safe_call(_LIB.LGBM_BoosterFree(self.handle))

    def predict(self, data, num_iteration=-1, raw_score=False, pred_leaf=False, data_has_header=False, is_reshape=True):
        """Predict for raw data.

        Parameters
        ----------
        data : string / numpy 2-D array / scipy.sparse.csr_matrix / other
            A string is treated as the path of a data file. Any other type is
            converted with scipy.sparse.csr_matrix.
        num_iteration : int
            Forwarded to the native predict call; for leaf prediction a value
            <= 0 means all models held by the booster are used for sizing.
        raw_score : bool
            Request raw scores.
        pred_leaf : bool
            Request leaf indices (takes precedence over `raw_score`); the
            result is converted to int32.
        data_has_header : bool
            Whether the data file has a header line (file input only).
        is_reshape : bool
            Reshape the flat result to (nrow, ncol) when it holds more than
            one value per row.

        Returns
        -------
        preds : numpy array

        Raises
        ------
        TypeError
            If `data` is a Dataset or cannot be converted to a CSR matrix.
        ValueError
            If the result length is not a multiple of the number of rows.
        """
        if isinstance(data, Dataset):
            raise TypeError("cannot use Dataset instance for prediction, please use raw data instead")
        predict_type = C_API_PREDICT_NORMAL
        if raw_score:
            predict_type = C_API_PREDICT_RAW_SCORE
        if pred_leaf:
            predict_type = C_API_PREDICT_LEAF_INDEX
        int_data_has_header = 1 if data_has_header else 0
        if is_str(data):
            preds, nrow = self.__pred_for_file(data, int_data_has_header, num_iteration, predict_type)
        elif isinstance(data, scipy.sparse.csr_matrix):
            preds, nrow = self.__pred_for_csr(data, num_iteration, predict_type)
        elif isinstance(data, np.ndarray):
            preds, nrow = self.__pred_for_np2d(data, num_iteration, predict_type)
        else:
            # Only the conversion is guarded, so genuine prediction errors are
            # not masked as a type problem.
            try:
                csr = scipy.sparse.csr_matrix(data)
            except Exception:
                raise TypeError('can not predict data for type {}'.format(type(data).__name__))
            preds, nrow = self.__pred_for_csr(csr, num_iteration, predict_type)
        if pred_leaf:
            preds = preds.astype(np.int32)
        if preds.size != nrow and is_reshape:
            if preds.size % nrow == 0:
                ncol = preds.size // nrow
                preds = preds.reshape(nrow, ncol)
            else:
                raise ValueError('len of predict result(%d) cannot be divide nrow(%d)' % (preds.size, nrow))
        return preds

    def __pred_for_file(self, data, int_data_has_header, num_iteration, predict_type):
        """
        Predict for a data file: the native library writes tab separated
        results to a temporary file, which is parsed and then removed.
        Returns (flat predictions, number of result lines).
        """
        # delete=False + explicit close: the native library reopens the path
        # itself, and the file must survive until it has been read back.
        tmp_file = tempfile.NamedTemporaryFile(prefix="lightgbm_tmp_pred_", delete=False)
        tmp_pred_fname = tmp_file.name
        tmp_file.close()
        try:
            _safe_call(_LIB.LGBM_BoosterPredictForFile(
                self.handle,
                c_str(data),
                int_data_has_header,
                predict_type,
                num_iteration,
                c_str(tmp_pred_fname)))
            with open(tmp_pred_fname, "r") as pred_file:
                lines = pred_file.readlines()
        finally:
            # Always clean up, even when prediction or reading fails.
            os.remove(tmp_pred_fname)
        preds = []
        for line in lines:
            for token in line.split('\t'):
                preds.append(float(token))
        return np.array(preds, dtype=np.float64), len(lines)

    def __num_preds(self, nrow, num_iteration, predict_type):
        """Number of values the native predict call is expected to write for `nrow` rows."""
        n_preds = self.num_class * nrow
        if predict_type == C_API_PREDICT_LEAF_INDEX:
            # Leaf prediction yields one index per model rather than per class.
            if num_iteration > 0:
                n_preds *= num_iteration
            else:
                # Integer division keeps the buffer size an int.
                n_preds *= self.__num_total_model // self.num_class
        return n_preds

    def __pred_for_np2d(self, mat, num_iteration, predict_type):
        """
        Predict for a 2-D numpy matrix.
        Returns (flat float32 predictions, number of rows).
        """
        if len(mat.shape) != 2:
            raise ValueError('Input numpy.ndarray must be 2 dimensional')
        if mat.dtype == np.float32 or mat.dtype == np.float64:
            flat_dtype = mat.dtype
        else:
            # Non-float data is converted to float32 (this copies).
            flat_dtype = np.float32
        # The C side reads from the base pointer, so the flattened row-major
        # data must be contiguous in memory.
        data = np.ascontiguousarray(mat.reshape(mat.size), dtype=flat_dtype)
        ptr_data, type_ptr_data = c_float_array(data)
        n_preds = self.__num_preds(mat.shape[0], num_iteration, predict_type)
        preds = np.zeros(n_preds, dtype=np.float32)
        out_num_preds = ctypes.c_int64(0)
        _safe_call(_LIB.LGBM_BoosterPredictForMat(
            self.handle,
            ptr_data,
            type_ptr_data,
            mat.shape[0],
            mat.shape[1],
            C_API_IS_ROW_MAJOR,
            predict_type,
            num_iteration,
            ctypes.byref(out_num_preds),
            preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
        if n_preds != out_num_preds.value:
            raise ValueError("incorrect number for predict result")
        return preds, mat.shape[0]

    def __pred_for_csr(self, csr, num_iteration, predict_type):
        """
        Predict for a csr data.
        Returns (flat float32 predictions, number of rows).
        """
        nrow = len(csr.indptr) - 1
        n_preds = self.__num_preds(nrow, num_iteration, predict_type)
        preds = np.zeros(n_preds, dtype=np.float32)
        out_num_preds = ctypes.c_int64(0)

        ptr_indptr, type_ptr_indptr = c_int_array(csr.indptr)
        ptr_data, type_ptr_data = c_float_array(csr.data)

        _safe_call(_LIB.LGBM_BoosterPredictForCSR(
            self.handle,
            ptr_indptr,
            type_ptr_indptr,
            csr.indices.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ptr_data,
            type_ptr_data,
            len(csr.indptr),
            len(csr.data),
            csr.shape[1],
            predict_type,
            num_iteration,
            ctypes.byref(out_num_preds),
            preds.ctypes.data_as(ctypes.POINTER(ctypes.c_float))))
        if n_preds != out_num_preds.value:
            raise ValueError("incorrect number for predict result")
        return preds, nrow
class
Dataset
(
object
):
"""Dataset used in LightGBM.
Dataset is a internal data structure that used by LightGBM
You can construct Dataset from numpy.arrays
"""
_feature_names
=
None
def
__init__
(
self
,
data
,
max_bin
=
255
,
reference
=
None
,
label
=
None
,
weight
=
None
,
group_id
=
None
,
silent
=
False
,
feature_names
=
None
,
other_params
=
None
,
is_continue_train
=
False
):
def
__init__
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
weight
=
None
,
group_id
=
None
,
predictor
=
None
,
silent
=
False
,
params
=
None
):
"""
Dataset used in LightGBM.
...
...
@@ -181,41 +361,35 @@ class Dataset(object):
data : string/numpy array/scipy.sparse
Data source of Dataset.
When data is string type, it represents the path of txt file,
label : list or numpy 1-D array, optional
Label of the data
max_bin : int, required
max number of discrete bin for features
reference : Other Dataset, optional
If this dataset validation, need to use training data as reference
label : list or numpy 1-D array, optional
Label of the training data.
weight : list or numpy 1-D array , optional
Weight for each instance.
group_id : list or numpy 1-D array , optional
group/query id for each instance. Note: if having group/query id, data should group by this id
silent : boolean, optional
Whether print messages during construction
feature_names : list, optional
Set names for features.
other_params: dict, optional
params: dict, optional
other parameters
"""
if
data
is
None
:
self
.
handle
=
None
return
"""save raw data for continue train """
if
is_continue_train
:
self
.
raw_data
=
data
else
:
self
.
raw_data
=
None
self
.
data_has_header
=
False
"""process for args"""
params
=
{}
if
params
is
None
:
params
=
{}
self
.
max_bin
=
max_bin
self
.
predictor
=
predictor
params
[
"max_bin"
]
=
max_bin
if
silent
:
params
[
"verbose"
]
=
0
if
other_params
:
other_params
.
update
(
params
)
params
=
other_params
else
:
params
[
"verbose"
]
=
1
params_str
=
dict_to_str
(
params
)
"""process for reference dataset"""
ref_dataset
=
None
...
...
@@ -228,7 +402,7 @@ class Dataset(object):
"""check data has header or not"""
if
"has_header"
in
params
or
"header"
in
params
:
if
params
[
"has_header"
].
lower
()
==
"true"
or
params
[
"header"
].
lower
()
==
"true"
:
data_has_header
=
True
self
.
data_has_header
=
True
self
.
handle
=
ctypes
.
c_void_p
()
_safe_call
(
_LIB
.
LGBM_CreateDatasetFromFile
(
c_str
(
data
),
...
...
@@ -242,8 +416,6 @@ class Dataset(object):
else
:
try
:
csr
=
scipy
.
sparse
.
csr_matrix
(
data
)
if
self
.
raw_data
is
not
None
:
self
.
raw_data
=
csr
self
.
__init_from_csr
(
csr
)
except
:
raise
TypeError
(
'can not initialize Dataset from {}'
.
format
(
type
(
data
).
__name__
))
...
...
@@ -253,14 +425,52 @@ class Dataset(object):
self
.
__group
=
None
if
label
is
not
None
:
self
.
set_label
(
label
)
if
self
.
get_label
()
is
None
:
raise
ValueError
(
"label should not be None"
)
if
weight
is
not
None
:
self
.
set_weight
(
weight
)
if
group_id
is
not
None
:
self
.
set_group_id
(
group_id
)
self
.
feature_names
=
feature_names
# load init score
if
self
.
predictor
is
not
None
and
isinstance
(
self
.
predictor
,
Predictor
):
init_score
=
self
.
predictor
.
predict
(
data
,
raw_score
=
True
,
data_has_header
=
self
.
data_has_header
,
is_reshape
=
False
)
if
self
.
predictor
.
num_class
>
1
:
# need re group init score
new_init_score
=
np
.
zeros
(
init_score
.
size
(),
dtype
=
np
.
float32
)
num_data
=
self
.
num_data
()
for
i
in
range
(
num_data
):
for
j
in
range
(
self
.
predictor
.
num_class
):
new_init_score
[
j
*
num_data
+
i
]
=
init_score
[
i
*
self
.
predictor
.
num_class
+
j
]
init_score
=
new_init_score
self
.
set_init_score
(
init_score
)
def
new_valid_dataset
(
self
,
data
,
label
=
None
,
weight
=
None
,
group_id
=
None
,
silent
=
False
,
params
=
None
):
"""
Create validation data align with current dataset
def
free_raw_data
(
self
):
self
.
raw_data
=
None
Parameters
----------
data : string/numpy array/scipy.sparse
Data source of Dataset.
When data is string type, it represents the path of txt file,
label : list or numpy 1-D array, optional
Label of the training data.
weight : list or numpy 1-D array , optional
Weight for each instance.
group_id : list or numpy 1-D array , optional
group/query id for each instance. Note: if having group/query id, data should group by this id
silent : boolean, optional
Whether print messages during construction
other_params: dict, optional
other parameters
"""
return
Dataset
(
data
,
label
=
label
,
max_bin
=
self
.
max_bin
,
reference
=
self
,
weight
=
weight
,
group_id
=
group_id
,
predictor
=
self
.
predictor
,
silent
=
silent
,
params
=
params
)
def
__init_from_np2d
(
self
,
mat
,
params_str
,
ref_dataset
):
"""
...
...
@@ -301,7 +511,7 @@ class Dataset(object):
_safe_call
(
_LIB
.
LGBM_CreateDatasetFromCSR
(
ptr_indptr
,
type_ptr_indptr
,
csr
.
indices
.
ctypes
.
data_as
(
ctypes
.
c_int32
),
csr
.
indices
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_int32
)
)
,
ptr_data
,
type_ptr_data
,
len
(
csr
.
indptr
),
...
...
@@ -327,19 +537,23 @@ class Dataset(object):
info : array
a numpy array of information of the data
"""
out_len
=
ctypes
.
c_int
32
()
tmp_
out_len
=
ctypes
.
c_int
64
()
out_type
=
ctypes
.
c_int32
()
ret
=
ctypes
.
POINTER
(
ctypes
.
c_void_p
)()
_safe_call
(
_LIB
.
LGBM_DatasetGetField
(
self
.
handle
,
c_str
(
field_name
),
ctypes
.
byref
(
out_len
),
ctypes
.
byref
(
tmp_
out_len
),
ctypes
.
byref
(
ret
),
ctypes
.
byref
(
out_type
)))
if
out_type
.
value
!=
FIELD_TYPE_MAPPER
[
field_name
]:
raise
TypeError
(
"Return type error for get_field"
)
if
tmp_out_len
.
value
==
0
:
return
None
if
out_type
.
value
==
C_API_DTYPE_INT32
:
return
cint32_array_to_numpy
(
ctypes
.
cast
(
ret
,
ctypes
.
POINTER
(
c_int32
),
out_len
.
value
)
)
return
cint32_array_to_numpy
(
ctypes
.
cast
(
ret
,
ctypes
.
POINTER
(
ctypes
.
c_int32
)
)
,
tmp_
out_len
.
value
)
elif
out_type
.
value
==
C_API_DTYPE_FLOAT32
:
return
cfloat32_array_to_numpy
(
ctypes
.
cast
(
ret
,
ctypes
.
POINTER
(
c_float
),
out_len
.
value
)
)
return
cfloat32_array_to_numpy
(
ctypes
.
cast
(
ret
,
ctypes
.
POINTER
(
ctypes
.
c_float
)
)
,
tmp_
out_len
.
value
)
else
:
raise
TypeError
(
"unknow type"
)
...
...
@@ -351,19 +565,29 @@ class Dataset(object):
field_name: str
The field name of the information
data: numpy array or list
data: numpy array or list
or None
The array ofdata to be set
"""
if
data
is
None
:
_safe_call
(
_LIB
.
LGBM_DatasetSetField
(
self
.
handle
,
c_str
(
field_name
),
None
,
0
,
FIELD_TYPE_MAPPER
[
field_name
]))
return
if
not
is_numpy_1d_array
(
data
):
raise
TypeError
(
"Unknow type({})"
.
format
(
type
(
data
).
__name__
))
if
data
.
dtype
==
np
.
float32
:
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
c_float
)
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_float
)
)
type_data
=
C_API_DTYPE_FLOAT32
elif
data
.
dtype
==
np
.
int32
:
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
c_int32
)
ptr_data
=
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_int32
)
)
type_data
=
C_API_DTYPE_INT32
else
:
raise
TypeError
(
"excepted np.float32 or np.int32, met type({})"
.
format
(
data
.
dtype
))
if
type_data
!=
FIELD_TYPE_MAPPER
[
field_name
]:
raise
TypeError
(
"type error for set_field"
)
_safe_call
(
_LIB
.
LGBM_DatasetSetField
(
self
.
handle
,
c_str
(
field_name
),
...
...
@@ -406,9 +630,10 @@ class Dataset(object):
weight : array like
Weight for each data point
"""
weight
=
list_to_1d_numpy
(
weight
,
np
.
float32
)
if
weight
.
dtype
!=
np
.
float32
:
weight
=
weight
.
astype
(
np
.
float32
,
copy
=
False
)
if
weight
is
not
None
:
weight
=
list_to_1d_numpy
(
weight
,
np
.
float32
)
if
weight
.
dtype
!=
np
.
float32
:
weight
=
weight
.
astype
(
np
.
float32
,
copy
=
False
)
self
.
__weight
=
weight
self
.
set_field
(
'weight'
,
weight
)
...
...
@@ -419,10 +644,11 @@ class Dataset(object):
score: array like
"""
score
=
list_to_1d_numpy
(
score
,
np
.
float32
)
if
score
.
dtype
!=
np
.
float32
:
score
=
score
.
astype
(
np
.
float32
,
copy
=
False
)
self
.
__init_score
=
init_score
if
score
is
not
None
:
score
=
list_to_1d_numpy
(
score
,
np
.
float32
)
if
score
.
dtype
!=
np
.
float32
:
score
=
score
.
astype
(
np
.
float32
,
copy
=
False
)
self
.
__init_score
=
score
self
.
set_field
(
'init_score'
,
score
)
def
set_group
(
self
,
group
):
...
...
@@ -433,9 +659,10 @@ class Dataset(object):
group : array like
Group size of each group
"""
group
=
list_to_1d_numpy
(
group
,
np
.
int32
)
if
group
.
dtype
!=
np
.
int32
:
group
=
group
.
astype
(
np
.
int32
,
copy
=
False
)
if
group
is
not
None
:
group
=
list_to_1d_numpy
(
group
,
np
.
int32
)
if
group
.
dtype
!=
np
.
int32
:
group
=
group
.
astype
(
np
.
int32
,
copy
=
False
)
self
.
__group
=
group
self
.
set_field
(
'group'
,
group
)
...
...
@@ -448,9 +675,10 @@ class Dataset(object):
group : array like
group_id of Dataset (used for ranking).
"""
group_id
=
list_to_1d_numpy
(
group_id
,
np
.
int32
)
if
group_id
.
dtype
!=
np
.
int32
:
group_id
=
group_id
.
astype
(
np
.
int32
,
copy
=
False
)
if
group_id
is
not
None
:
group_id
=
list_to_1d_numpy
(
group_id
,
np
.
int32
)
if
group_id
.
dtype
!=
np
.
int32
:
group_id
=
group_id
.
astype
(
np
.
int32
,
copy
=
False
)
self
.
set_field
(
'group_id'
,
group_id
)
def
get_label
(
self
):
...
...
@@ -462,6 +690,8 @@ class Dataset(object):
"""
if
self
.
__label
is
None
:
self
.
__label
=
self
.
get_field
(
'label'
)
if
self
.
__label
is
None
:
raise
TypeError
(
"label should not be None"
)
return
self
.
__label
def
get_weight
(
self
):
...
...
@@ -521,58 +751,11 @@ class Dataset(object):
ctypes
.
byref
(
ret
)))
return
ret
.
value
@
property
def
feature_names
(
self
):
"""Get feature names (column labels).
Returns
-------
feature_names : list
"""
if
self
.
_feature_names
is
None
:
self
.
_feature_names
=
[
'Column_{0}'
.
format
(
i
)
for
i
in
range
(
self
.
num_col
())]
return
self
.
_feature_names
@
feature_names
.
setter
def
feature_names
(
self
,
feature_names
):
"""Set feature names (column labels).
Parameters
----------
feature_names : list
Labels for features
"""
if
feature_names
is
not
None
:
# validate feature name
if
not
isinstance
(
feature_names
,
list
):
feature_names
=
list
(
feature_names
)
if
len
(
feature_names
)
!=
len
(
set
(
feature_names
)):
raise
ValueError
(
'feature_names must be unique'
)
if
len
(
feature_names
)
!=
self
.
num_col
():
msg
=
'feature_names must have the same length as data'
raise
ValueError
(
msg
)
# prohibit to use symbols may affect to parse. e.g. []<
if
not
all
(
isinstance
(
f
,
STRING_TYPES
)
and
not
any
(
x
in
f
for
x
in
set
((
'['
,
']'
,
'<'
)))
for
f
in
feature_names
):
raise
ValueError
(
'feature_names may not contain [, ] or <'
)
self
.
_feature_names
=
feature_names
else
:
self
.
_feature_names
=
None
C_API_PREDICT_NORMAL
=
0
C_API_PREDICT_RAW_SCORE
=
1
C_API_PREDICT_LEAF_INDEX
=
2
class
Booster
(
object
):
""""A Booster of of LightGBM.
"""
feature_names
=
None
def
__init__
(
self
,
params
=
None
,
train_set
=
None
,
valid_sets
=
None
,
name_valid_sets
=
None
,
model_file
=
None
):
def
__init__
(
self
,
params
=
None
,
train_set
=
None
,
model_file
=
None
,
silent
=
False
):
# pylint: disable=invalid-name
"""Initialize the Booster.
...
...
@@ -582,83 +765,46 @@ class Booster(object):
Parameters for boosters.
train_set : Dataset
training dataset
valid_sets : List of Dataset or None
validation datasets
name_valid_sets : List of string
name of validation datasets
model_file : string
Path to the model file.
If tarin_set is not None, used for continued train.
else used for loading model prediction task
"""
self
.
handle
=
ctypes
.
c_void_p
()
self
.
__need_reload_eval_info
=
True
self
.
__is_manage_handle
=
True
if
params
is
None
:
params
=
{}
if
silent
:
params
[
"verbose"
]
=
0
else
:
params
[
"verbose"
]
=
1
if
train_set
is
not
None
:
"""Training task"""
if
not
isinstance
(
train_set
,
Dataset
):
raise
TypeError
(
'training data should be Dataset instance, met{}'
.
format
(
type
(
train_set
).
__name__
))
valid_handles
=
None
n_valid
=
0
if
valid_sets
is
not
None
:
for
valid
in
valid_sets
:
if
not
isinstance
(
valid
,
Dataset
):
raise
TypeError
(
'valid data should be Dataset instance, met{}'
.
format
(
type
(
valid
).
__name__
))
valid_handles
=
c_array
(
ctypes
.
c_void_p
,
[
valid
.
handle
for
valid
in
valid_sets
])
if
name_valid_sets
is
None
:
name_valid_sets
=
[
"valid_{}"
.
format
(
x
+
1
)
for
x
in
range
(
len
(
valid_sets
))
]
if
len
(
valid_sets
)
!=
len
(
name_valid_sets
):
raise
Exception
(
'len of valid_sets should be equal with len of name_valid_sets'
)
n_valid
=
len
(
valid_sets
)
ref_input_model
=
None
params_str
=
dict_to_str
(
params
)
if
model_file
is
not
None
:
ref_input_model
=
c_str
(
model_file
)
"""construct booster object"""
_safe_call
(
_LIB
.
LGBM_BoosterCreate
(
train_set
.
handle
,
valid_handles
,
n_valid
,
c_str
(
params_str
),
ref_input_model
,
ctypes
.
byref
(
self
.
handle
)))
"""if need to continue train"""
if
model_file
is
not
None
:
self
.
__init_continue_train
(
train_set
)
if
valid_sets
is
not
None
:
for
valid
in
valid_sets
:
self
.
__init_continue_train
(
valid
)
"""save reference to data"""
self
.
train_set
=
train_set
self
.
valid_sets
=
valid_sets
self
.
name_valid_sets
=
name_valid_sets
self
.
__num_dataset
=
1
+
n_valid
self
.
__training_score
=
None
out_len
=
ctypes
.
c_int64
(
0
)
self
.
valid_sets
=
[]
self
.
name_valid_sets
=
[]
self
.
__num_dataset
=
1
self
.
init_predictor
=
train_set
.
predictor
if
self
.
init_predictor
is
not
None
:
_safe_call
(
_LIB
.
LGBM_BoosterMerge
(
self
.
handle
,
self
.
init_predictor
.
handle
))
out_num_class
=
ctypes
.
c_int64
(
0
)
_safe_call
(
_LIB
.
LGBM_BoosterGetNumClasses
(
self
.
handle
,
ctypes
.
byref
(
out_
len
)))
self
.
__num_class
=
out_
len
.
value
ctypes
.
byref
(
out_
num_class
)))
self
.
__num_class
=
out_
num_class
.
value
"""buffer for inner predict"""
self
.
__inner_predict_buffer
=
[
None
for
_
in
range
(
self
.
__num_dataset
)]
"""Get num of inner evals"""
_safe_call
(
_LIB
.
LGBM_BoosterGetEvalCounts
(
self
.
handle
,
ctypes
.
byref
(
out_len
)))
self
.
__num_inner_eval
=
out_len
.
value
if
self
.
__num_inner_eval
>
0
:
"""Get name of evals"""
string_buffers
=
[
ctypes
.
create_string_buffer
(
255
)
for
i
in
range
(
self
.
__num_inner_eval
)]
ptr_string_buffers
=
(
ctypes
.
c_char_p
*
self
.
__num_inner_eval
)(
*
map
(
ctypes
.
addressof
,
string_buffers
))
_safe_call
(
_LIB
.
LGBM_BoosterGetEvalNames
(
self
.
handle
,
ctypes
.
byref
(
out_len
),
ptr_string_buffers
))
if
self
.
__num_inner_eval
!=
out_len
.
value
:
raise
ValueError
(
"size of eval names doesn't equal with num_evals"
)
self
.
__name_inner_eval
=
[]
for
i
in
range
(
self
.
__num_inner_eval
):
self
.
__name_inner_eval
.
append
(
string_buffers
[
i
].
value
.
decode
())
self
.
__inner_predict_buffer
=
[
None
]
self
.
__get_eval_info
()
elif
model_file
is
not
None
:
"""Prediction task"""
out_num_total_model
=
ctypes
.
c_int64
(
0
)
...
...
@@ -667,18 +813,40 @@ class Booster(object):
ctypes
.
byref
(
out_num_total_model
),
ctypes
.
byref
(
self
.
handle
)))
self
.
__num_total_model
=
out_num_total_model
.
value
out_
len
=
ctypes
.
c_int64
(
0
)
out_
num_class
=
ctypes
.
c_int64
(
0
)
_safe_call
(
_LIB
.
LGBM_BoosterGetNumClasses
(
self
.
handle
,
ctypes
.
byref
(
out_
len
)))
self
.
__num_class
=
out_
len
.
value
ctypes
.
byref
(
out_
num_class
)))
self
.
__num_class
=
out_
num_class
.
value
else
:
raise
TypeError
(
'At least need training dataset or model file to create booster instance'
)
def
__del__
(
self
):
_safe_call
(
_LIB
.
LGBM_BoosterFree
(
self
.
handle
))
if
self
.
handle
is
not
None
and
self
.
__is_manage_handle
:
_safe_call
(
_LIB
.
LGBM_BoosterFree
(
self
.
handle
))
def
add_valid_data
(
self
,
data
,
name
):
if
data
.
predictor
is
not
self
.
init_predictor
:
raise
Exception
(
"Add validation data failed, you should use same predictor for these data"
)
_safe_call
(
_LIB
.
LGBM_BoosterAddValidData
(
self
.
handle
,
data
.
handle
))
self
.
valid_sets
.
append
(
data
)
self
.
name_valid_sets
.
append
(
name
)
self
.
__num_dataset
+=
1
def
update
(
self
,
fobj
=
None
):
def
ResetParameter
(
self
,
params
,
silent
=
False
):
self
.
__need_reload_eval_info
=
True
if
silent
:
params
[
"verbose"
]
=
0
else
:
params
[
"verbose"
]
=
1
params_str
=
dict_to_str
(
params
)
_safe_call
(
_LIB
.
LGBM_BoosterResetParameter
(
self
.
handle
,
c_str
(
params_str
)))
def
update
(
self
,
train_set
=
None
,
fobj
=
None
):
"""
Update for one iteration
Note: for multi-class task, the score is group by class_id first, then group by row_id
...
...
@@ -686,6 +854,7 @@ class Booster(object):
and you should group grad and hess in this way as well
Parameters
----------
train_set : training data, None means use last training data
fobj : function
Customized objective function.
...
...
@@ -693,6 +862,15 @@ class Booster(object):
-------
is_finished, bool
"""
"""need reset training data"""
if
train_set
is
not
None
and
train_set
is
not
self
.
train_set
:
if
train_set
.
predictor
is
not
self
.
init_predictor
:
raise
Exception
(
"Replace training data failed, you should use same predictor for these data"
)
self
.
train_set
=
train_set
_safe_call
(
_LIB
.
LGBM_BoosterResetTrainingData
(
self
.
handle
,
self
.
train_set
.
handle
))
self
.
__inner_predict_buffer
[
0
]
=
None
is_finished
=
ctypes
.
c_int
(
0
)
if
fobj
is
None
:
_safe_call
(
_LIB
.
LGBM_BoosterUpdateOneIter
(
...
...
@@ -701,9 +879,9 @@ class Booster(object):
return
is_finished
.
value
==
1
else
:
grad
,
hess
=
fobj
(
self
.
__inner_predict
(
0
),
self
.
train_set
)
return
self
.
boost
(
grad
,
hess
)
return
self
.
__
boost
(
grad
,
hess
)
def
boost
(
self
,
grad
,
hess
):
def
__
boost
(
self
,
grad
,
hess
):
"""
Boost the booster for one iteration, with customized gradient statistics.
Note: for multi-class task, the score is group by class_id first, then group by row_id
...
...
@@ -729,11 +907,53 @@ class Booster(object):
is_finished
=
ctypes
.
c_int
(
0
)
_safe_call
(
_LIB
.
LGBM_BoosterUpdateOneIterCustom
(
self
.
handle
,
grad
.
ctypes
.
data_as
(
ctypes
.
c_float
),
hess
.
ctypes
.
data_as
(
ctypes
.
c_float
),
grad
.
ctypes
.
data_as
(
ctypes
.
ctypes
.
POINTER
(
ctypes
.
c_float
)
)
,
hess
.
ctypes
.
data_as
(
ctypes
.
ctypes
.
POINTER
(
ctypes
.
c_float
)
)
,
ctypes
.
byref
(
is_finished
)))
return
is_finished
.
value
==
1
def
rollback_one_iter
(
self
):
_safe_call
(
_LIB
.
LGBM_BoosterRollbackOneIter
(
self
.
handle
))
def
current_iteration
(
self
):
out_cur_iter
=
ctypes
.
c_int64
(
0
)
_safe_call
(
_LIB
.
LGBM_BoosterGetCurrentIteration
(
self
.
handle
,
ctypes
.
byref
(
out_cur_iter
)))
return
out_cur_iter
.
value
def
eval
(
self
,
data
,
name
,
feval
=
None
):
"""Evaluate for data
Parameters
----------
data : Dataset object
name : name of data
feval : function
Custom evaluation function.
Returns
-------
result: str
Evaluation result string.
"""
if
not
isinstance
(
data
,
Dataset
):
raise
TypeError
(
"Can only eval for Dataset instance"
)
data_idx
=
-
1
if
data
is
self
.
train_set
:
data_idx
=
0
else
:
for
i
in
range
(
len
(
self
.
valid_sets
)):
if
data
is
self
.
valid_sets
[
i
]:
data_idx
=
i
+
1
break
"""need push new valid data"""
if
data_idx
==
-
1
:
self
.
add_valid_data
(
data
,
name
)
data_idx
=
self
.
__num_dataset
-
1
return
self
.
__inner_eval
(
name
,
data_idx
,
feval
)
def
eval_train
(
self
,
feval
=
None
):
"""Evaluate for training data
...
...
@@ -774,141 +994,28 @@ class Booster(object):
c_str
(
filename
)))
def
predict
(
self
,
data
,
num_iteration
=-
1
,
raw_score
=
False
,
pred_leaf
=
False
,
data_has_header
=
False
,
is_reshape
=
True
):
if
isinstance
(
data
,
Dataset
):
raise
TypeError
(
"cannot use Dataset instance for prediction, please use raw data instead"
)
predict_type
=
C_API_PREDICT_NORMAL
if
raw_score
:
predict_type
=
cC_API_PREDICT_RAW_SCORE
if
pred_leaf
:
predict_type
=
C_API_PREDICT_LEAF_INDEX
int_data_has_header
=
0
if
data_has_header
:
int_data_has_header
=
1
if
is_str
(
data
):
tmp_pred_fname
=
tempfile
.
NamedTemporaryFile
(
prefix
=
"lightgbm_tmp_pred_"
).
name
_safe_call
(
_LIB
.
LGBM_BoosterPredictForFile
(
self
.
handle
,
c_str
(
data
),
int_data_has_header
,
predict_type
,
num_iteration
,
c_str
(
tmp_pred_fname
)))
lines
=
open
(
tmp_pred_fname
,
"r"
).
readlines
()
nrow
=
len
(
lines
)
preds
=
[]
for
line
in
lines
:
for
token
in
line
.
split
(
'
\t
'
):
preds
.
append
(
float
(
token
))
preds
=
np
.
array
(
preds
,
copy
=
False
)
os
.
remove
(
tmp_pred_fname
)
elif
isinstance
(
data
,
scipy
.
sparse
.
csr_matrix
):
preds
,
nrow
=
self
.
__pred_for_csr
(
data
,
num_iteration
,
predict_type
)
elif
isinstance
(
data
,
np
.
ndarray
):
preds
,
nrow
=
self
.
__pred_for_np2d
(
data
,
num_iteration
,
predict_type
)
else
:
try
:
csr
=
scipy
.
sparse
.
csr_matrix
(
data
)
res
=
self
.
__pred_for_csr
(
csr
,
num_iteration
,
predict_type
)
except
:
raise
TypeError
(
'can not predict data for type {}'
.
format
(
type
(
data
).
__name__
))
if
pred_leaf
:
preds
=
preds
.
astype
(
np
.
int32
)
if
preds
.
size
!=
nrow
and
is_reshape
:
if
preds
.
size
%
nrow
==
0
:
ncol
=
int
(
preds
.
size
/
nrow
)
preds
=
preds
.
reshape
(
nrow
,
ncol
)
else
:
raise
ValueError
(
'len of predict result(%d) cannot be divide nrow(%d)'
%
(
preds
.
size
,
nrow
)
)
return
preds
predictor
=
Predictor
(
booster_handle
=
self
.
handle
,
is_manage_handle
=
False
)
return
predictor
.
predict
(
data
,
num_iteration
,
raw_score
,
pred_leaf
,
data_has_header
,
is_reshape
)
def
__pred_for_np2d
(
self
,
mat
,
num_iteration
,
predict_type
):
"""
Predict for a 2-D numpy matrix.
"""
if
len
(
mat
.
shape
)
!=
2
:
raise
ValueError
(
'Input numpy.ndarray must be 2 dimensional'
)
if
mat
.
dtype
==
np
.
float32
or
mat
.
dtype
==
np
.
float64
:
data
=
np
.
array
(
mat
.
reshape
(
mat
.
size
),
dtype
=
mat
.
dtype
,
copy
=
False
)
else
:
"""change non-float data to float data, need to copy"""
data
=
np
.
array
(
mat
.
reshape
(
mat
.
size
),
dtype
=
np
.
float32
)
ptr_data
,
type_ptr_data
=
c_float_array
(
data
)
n_preds
=
self
.
__num_class
*
mat
.
shape
[
0
]
if
predict_type
==
C_API_PREDICT_LEAF_INDEX
:
if
num_iteration
>
0
:
n_preds
*=
num_iteration
else
:
used_iteration
=
self
.
__num_total_model
/
self
.
__num_class
n_preds
*=
used_iteration
preds
=
np
.
zeros
(
n_preds
,
dtype
=
np
.
float32
)
out_num_preds
=
ctypes
.
c_int64
(
0
)
_safe_call
(
LIB
.
LGBM_BoosterPredictForMat
(
self
.
handle
,
ptr_data
,
type_ptr_data
,
mat
.
shape
[
0
],
mat
.
shape
[
1
],
C_API_IS_ROW_MAJOR
,
predict_type
,
num_iteration
,
ctypes
.
byref
(
out_num_preds
),
preds
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_float
))
))
if
n_preds
!=
out_num_preds
.
value
:
raise
ValueError
(
"incorrect number for predict result"
)
return
preds
,
mat
.
shape
[
0
]
def
__pred_for_csr
(
self
,
csr
,
num_iteration
,
predict_type
):
"""
Predict for a csr data
"""
nrow
=
len
(
csr
.
indptr
)
-
1
n_preds
=
self
.
__num_class
*
nrow
if
predict_type
==
C_API_PREDICT_LEAF_INDEX
:
if
num_iteration
>
0
:
n_preds
*=
num_iteration
else
:
used_iteration
=
self
.
__num_total_model
/
self
.
__num_class
n_preds
*=
used_iteration
preds
=
np
.
zeros
(
n_preds
,
dtype
=
np
.
float32
)
out_num_preds
=
ctypes
.
c_int64
(
0
)
ptr_indptr
,
type_ptr_indptr
=
c_int_array
(
csr
.
indptr
)
ptr_data
,
type_ptr_data
=
c_float_array
(
csr
.
data
)
_safe_call
(
LIB
.
LGBM_BoosterPredictForCSR
(
self
.
handle
,
ptr_indptr
,
type_ptr_indptr
,
csr
.
indices
.
ctypes
.
data_as
(
ctypes
.
c_int32
),
ptr_data
,
type_ptr_data
,
len
(
csr
.
indptr
),
len
(
csr
.
data
),
csr
.
shape
[
1
],
predict_type
,
num_iteration
,
ctypes
.
byref
(
out_num_preds
),
preds
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_float
))
))
if
n_preds
!=
out_num_preds
.
value
:
raise
ValueError
(
"incorrect number for predict result"
)
return
preds
,
nrow
def
to_predictor
(
self
):
predictor
=
Predictor
(
booster_handle
=
self
.
handle
,
is_manage_handle
=
True
)
self
.
__is_manage_handle
=
False
return
predictor
def
__inner_eval
(
self
,
data_name
,
data_idx
,
feval
=
None
):
if
data_idx
>=
self
.
__num_dataset
:
raise
ValueError
(
"data_idx should be smaller than number of dataset"
)
self
.
__get_eval_info
()
ret
=
[]
if
self
.
__num_inner_eval
>
0
:
result
=
np
.
array
([
0.0
for
_
in
range
(
self
.
__num_inner_eval
)],
dtype
=
np
.
float32
)
out_len
=
ctypes
.
c_int64
(
0
)
tmp_
out_len
=
ctypes
.
c_int64
(
0
)
_safe_call
(
_LIB
.
LGBM_BoosterGetEval
(
self
.
handle
,
data_idx
,
ctypes
.
byref
(
out_len
),
ctypes
.
byref
(
tmp_
out_len
),
result
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_float
))))
if
out_len
.
value
!=
self
.
__num_inner_eval
:
if
tmp_
out_len
.
value
!=
self
.
__num_inner_eval
:
raise
ValueError
(
"incorrect number of eval results"
)
for
i
in
range
(
self
.
__num_inner_eval
):
ret
.
append
(
'%s %s : %f'
%
(
data_name
,
self
.
__name_inner_eval
[
i
],
result
[
i
]))
...
...
@@ -936,33 +1043,37 @@ class Booster(object):
num_data
=
self
.
valid_sets
[
data_idx
-
1
].
num_data
()
*
self
.
__num_class
self
.
__inner_predict_buffer
[
data_idx
]
=
\
np
.
array
([
0.0
for
_
in
range
(
num_data
)],
dtype
=
np
.
float32
,
copy
=
False
)
out_len
=
ctypes
.
c_int64
(
0
)
tmp_
out_len
=
ctypes
.
c_int64
(
0
)
data_ptr
=
self
.
__inner_predict_buffer
[
data_idx
].
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_float
))
_safe_call
(
_LIB
.
LGBM_BoosterGetPredict
(
self
.
handle
,
data_idx
,
ctypes
.
byref
(
out_len
),
ctypes
.
byref
(
tmp_
out_len
),
data_ptr
))
if
out_len
.
value
!=
len
(
self
.
__inner_predict_buffer
[
data_idx
]):
if
tmp_
out_len
.
value
!=
len
(
self
.
__inner_predict_buffer
[
data_idx
]):
raise
ValueError
(
"incorrect number of predict results for data %d"
%
(
data_idx
)
)
return
self
.
__inner_predict_buffer
[
data_idx
]
def
__init_continue_train
(
self
,
dataset
):
if
dataset
.
raw_data
is
None
:
raise
ValueError
(
"should set is_continue_train=True in dataset while need to continue train"
)
init_score
=
self
.
predict
(
dataset
.
raw_data
,
raw_score
=
True
,
data_has_header
=
dataset
.
data_has_header
,
is_reshape
=
False
)
dataset
.
set_init_score
(
init_score
)
dataset
.
free_raw_data
()
#tmp test
train_data
=
Dataset
(
'../../examples/binary_classification/binary.train'
)
test_data
=
Dataset
(
'../../examples/binary_classification/binary.test'
,
reference
=
train_data
)
param
=
{
"metric"
:
"l2,l1"
}
lgb
=
Booster
(
train_set
=
train_data
,
valid_sets
=
[
test_data
],
params
=
param
)
for
i
in
range
(
100
):
lgb
.
update
()
print
(
lgb
.
eval_valid
())
print
(
lgb
.
eval_train
())
print
(
lgb
.
predict
(
'../../examples/binary_classification/binary.train'
))
\ No newline at end of file
def
__get_eval_info
(
self
):
if
self
.
__need_reload_eval_info
:
self
.
__need_reload_eval_info
=
False
out_num_eval
=
ctypes
.
c_int64
(
0
)
"""Get num of inner evals"""
_safe_call
(
_LIB
.
LGBM_BoosterGetEvalCounts
(
self
.
handle
,
ctypes
.
byref
(
out_num_eval
)))
self
.
__num_inner_eval
=
out_num_eval
.
value
if
self
.
__num_inner_eval
>
0
:
"""Get name of evals"""
tmp_out_len
=
ctypes
.
c_int64
(
0
)
string_buffers
=
[
ctypes
.
create_string_buffer
(
255
)
for
i
in
range
(
self
.
__num_inner_eval
)]
ptr_string_buffers
=
(
ctypes
.
c_char_p
*
self
.
__num_inner_eval
)(
*
map
(
ctypes
.
addressof
,
string_buffers
))
_safe_call
(
_LIB
.
LGBM_BoosterGetEvalNames
(
self
.
handle
,
ctypes
.
byref
(
tmp_out_len
),
ptr_string_buffers
))
if
self
.
__num_inner_eval
!=
tmp_out_len
.
value
:
raise
ValueError
(
"size of eval names doesn't equal with num_evals"
)
self
.
__name_inner_eval
=
[]
for
i
in
range
(
self
.
__num_inner_eval
):
self
.
__name_inner_eval
.
append
(
string_buffers
[
i
].
value
.
decode
())
src/boosting/gbdt.cpp
View file @
629fc047
...
...
@@ -46,12 +46,12 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
gbdt_config_
=
config
;
early_stopping_round_
=
gbdt_config_
->
early_stopping_round
;
shrinkage_rate_
=
gbdt_config_
->
learning_rate
;
t
ra
in_data_
=
train_data
;
ra
ndom_
=
Random
(
gbdt_config_
->
bagging_seed
)
;
// create tree learner
tree_learner_
.
clear
();
for
(
int
i
=
0
;
i
<
num_class_
;
++
i
)
{
auto
new_tree_learner
=
std
::
unique_ptr
<
TreeLearner
>
(
TreeLearner
::
CreateTreeLearner
(
gbdt_config_
->
tree_learner_type
,
gbdt_config_
->
tree_config
));
new_tree_learner
->
Init
(
train_data
_
);
new_tree_learner
->
Init
(
train_data
);
// init tree learner
tree_learner_
.
push_back
(
std
::
move
(
new_tree_learner
));
}
...
...
@@ -63,42 +63,45 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
training_metrics_
.
push_back
(
metric
);
}
training_metrics_
.
shrink_to_fit
();
// create score tracker
train_score_updater_
.
reset
(
new
ScoreUpdater
(
train_data_
,
num_class_
));
num_data_
=
train_data_
->
num_data
();
// create buffer for gradients and hessians
if
(
object_function_
!=
nullptr
)
{
gradients_
=
std
::
vector
<
score_t
>
(
num_data_
*
num_class_
);
hessians_
=
std
::
vector
<
score_t
>
(
num_data_
*
num_class_
);
}
sigmoid_
=
-
1.0
f
;
if
(
object_function_
!=
nullptr
&&
std
::
string
(
object_function_
->
GetName
())
==
std
::
string
(
"binary"
))
{
// only binary classification need sigmoid transform
sigmoid_
=
gbdt_config_
->
sigmoid
;
}
// get max feature index
max_feature_idx_
=
train_data_
->
num_total_features
()
-
1
;
// get label index
label_idx_
=
train_data_
->
label_idx
();
// if need bagging, create buffer
if
(
gbdt_config_
->
bagging_fraction
<
1.0
&&
gbdt_config_
->
bagging_freq
>
0
)
{
out_of_bag_data_indices_
=
std
::
vector
<
data_size_t
>
(
num_data_
);
bag_data_indices_
=
std
::
vector
<
data_size_t
>
(
num_data_
);
}
else
{
out_of_bag_data_cnt_
=
0
;
out_of_bag_data_indices_
.
clear
();
bag_data_cnt_
=
num_data_
;
bag_data_indices_
.
clear
();
}
random_
=
Random
(
gbdt_config_
->
bagging_seed
);
// update score
for
(
int
i
=
0
;
i
<
iter_
;
++
i
)
{
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
auto
curr_tree
=
i
*
num_class_
+
curr_class
;
train_score_updater_
->
AddScore
(
models_
[
curr_tree
].
get
(),
curr_class
);
if
(
train_data_
!=
train_data
)
{
// not same training data, need reset score and others
// create score tracker
train_score_updater_
.
reset
(
new
ScoreUpdater
(
train_data
,
num_class_
));
// update score
for
(
int
i
=
0
;
i
<
iter_
;
++
i
)
{
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
auto
curr_tree
=
(
i
+
num_init_iteration_
)
*
num_class_
+
curr_class
;
train_score_updater_
->
AddScore
(
models_
[
curr_tree
].
get
(),
curr_class
);
}
}
num_data_
=
train_data
->
num_data
();
// create buffer for gradients and hessians
if
(
object_function_
!=
nullptr
)
{
gradients_
=
std
::
vector
<
score_t
>
(
num_data_
*
num_class_
);
hessians_
=
std
::
vector
<
score_t
>
(
num_data_
*
num_class_
);
}
// get max feature index
max_feature_idx_
=
train_data
->
num_total_features
()
-
1
;
// get label index
label_idx_
=
train_data
->
label_idx
();
// if need bagging, create buffer
if
(
gbdt_config_
->
bagging_fraction
<
1.0
&&
gbdt_config_
->
bagging_freq
>
0
)
{
out_of_bag_data_indices_
=
std
::
vector
<
data_size_t
>
(
num_data_
);
bag_data_indices_
=
std
::
vector
<
data_size_t
>
(
num_data_
);
}
else
{
out_of_bag_data_cnt_
=
0
;
out_of_bag_data_indices_
.
clear
();
bag_data_cnt_
=
num_data_
;
bag_data_indices_
.
clear
();
}
}
train_data_
=
train_data
;
}
void
GBDT
::
AddValidDataset
(
const
Dataset
*
valid_data
,
...
...
@@ -111,7 +114,7 @@ void GBDT::AddValidDataset(const Dataset* valid_data,
// update score
for
(
int
i
=
0
;
i
<
iter_
;
++
i
)
{
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
auto
curr_tree
=
i
*
num_class_
+
curr_class
;
auto
curr_tree
=
(
i
+
num_init_iteration_
)
*
num_class_
+
curr_class
;
new_score_updater
->
AddScore
(
models_
[
curr_tree
].
get
(),
curr_class
);
}
}
...
...
@@ -232,7 +235,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
void
GBDT
::
RollbackOneIter
()
{
if
(
iter_
==
0
)
{
return
;
}
int
cur_iter
=
iter_
-
1
;
int
cur_iter
=
iter_
+
num_init_iteration_
-
1
;
// reset score
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
auto
curr_tree
=
cur_iter
*
num_class_
+
curr_class
;
...
...
src/boosting/gbdt.h
View file @
629fc047
...
...
@@ -36,12 +36,28 @@ public:
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
override
;
/*!
* \brief Merge model from other boosting object
Will insert to the front of current boosting object
* \param other
*/
void
MergeFrom
(
const
Boosting
*
other
)
override
{
auto
other_gbdt
=
reinterpret_cast
<
const
GBDT
*>
(
other
);
// tmp move to other vector
auto
original_models
=
std
::
move
(
models_
);
models_
=
std
::
vector
<
std
::
unique_ptr
<
Tree
>>
();
// push model from other first
for
(
const
auto
&
tree
:
other_gbdt
->
models_
)
{
auto
new_tree
=
std
::
unique_ptr
<
Tree
>
(
new
Tree
(
*
(
tree
.
get
())));
models_
.
push_back
(
std
::
move
(
new_tree
));
}
num_init_iteration_
=
static_cast
<
int
>
(
models_
.
size
())
/
num_class_
;
// push model in current object
for
(
const
auto
&
tree
:
original_models
)
{
auto
new_tree
=
std
::
unique_ptr
<
Tree
>
(
new
Tree
(
*
(
tree
.
get
())));
models_
.
push_back
(
std
::
move
(
new_tree
));
}
num_iteration_for_pred_
=
static_cast
<
int
>
(
models_
.
size
())
/
num_class_
;
}
/*!
...
...
@@ -266,6 +282,7 @@ protected:
int
num_iteration_for_pred_
;
/*! \brief Shrinkage rate for one iteration */
double
shrinkage_rate_
;
/*! \brief Number of loaded initial models */
int
num_init_iteration_
;
};
...
...
src/c_api.cpp
View file @
629fc047
...
...
@@ -36,7 +36,7 @@ public:
Log
::
Warning
(
"continued train from model is not support for c_api, \
please use continued train with input score"
);
}
boosting_
.
reset
(
Boosting
::
CreateBoosting
(
config_
.
boosting_type
,
""
));
boosting_
.
reset
(
Boosting
::
CreateBoosting
(
config_
.
boosting_type
,
nullptr
));
ConstructObjectAndTrainingMetrics
(
train_data
);
// initialize the boosting
boosting_
->
Init
(
&
config_
.
boosting_config
,
train_data
,
objective_fun_
.
get
(),
...
...
@@ -114,6 +114,10 @@ public:
return
boosting_
->
TrainOneIter
(
gradients
,
hessians
,
false
);
}
void
RollbackOneIter
()
{
boosting_
->
RollbackOneIter
();
}
void
PrepareForPrediction
(
int
num_iteration
,
int
predict_type
)
{
boosting_
->
SetNumIterationForPred
(
num_iteration
);
bool
is_predict_leaf
=
false
;
...
...
@@ -156,24 +160,13 @@ public:
int
idx
=
0
;
for
(
const
auto
&
metric
:
train_metric_
)
{
for
(
const
auto
&
name
:
metric
->
GetName
())
{
int
j
=
0
;
auto
name_cstr
=
name
.
c_str
();
while
(
name_cstr
[
j
]
!=
'\0'
)
{
out_strs
[
idx
][
j
]
=
name_cstr
[
j
];
++
j
;
}
out_strs
[
idx
][
j
]
=
'\0'
;
std
::
strcpy
(
out_strs
[
idx
],
name
.
c_str
());
++
idx
;
}
}
return
idx
;
}
void
RollbackOneIter
()
{
boosting_
->
RollbackOneIter
();
}
const
Boosting
*
GetBoosting
()
const
{
return
boosting_
.
get
();
}
private:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment