Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
19512d82
Commit
19512d82
authored
Nov 27, 2016
by
Guolin Ke
Browse files
remove set_group_id. fixed bug in set num_pred_iterations.
parent
83007b1c
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
45 additions
and
55 deletions
+45
-55
python-package/lightgbm/basic.py
python-package/lightgbm/basic.py
+43
-55
src/boosting/gbdt.h
src/boosting/gbdt.h
+2
-0
No files found.
python-package/lightgbm/basic.py
View file @
19512d82
...
...
@@ -123,12 +123,17 @@ def c_array(ctype, values):
"""Convert a python array to c array."""
return
(
ctype
*
len
(
values
))(
*
values
)
def
dict_to_str
(
data
):
def
param_
dict_to_str
(
data
):
if
data
is
None
or
len
(
data
)
==
0
:
return
""
pairs
=
[]
for
key
in
data
:
pairs
.
append
(
str
(
key
)
+
'='
+
str
(
data
[
key
]))
for
key
,
val
in
data
.
items
():
if
isinstance
(
val
,
list
):
pairs
.
append
(
str
(
key
)
+
'='
+
','
.
join
(
val
))
elif
isinstance
(
val
,
(
int
,
float
,
str
,
bool
)):
pairs
.
append
(
str
(
key
)
+
'='
+
str
(
val
))
else
:
raise
TypeError
(
'unknow type of parameter:%s , got:%s'
%
(
key
,
type
(
val
).
__name__
))
return
' '
.
join
(
pairs
)
"""marco definition of data type in c_api of LightGBM"""
C_API_DTYPE_FLOAT32
=
0
...
...
@@ -145,7 +150,6 @@ C_API_PREDICT_LEAF_INDEX =2
FIELD_TYPE_MAPPER
=
{
"label"
:
C_API_DTYPE_FLOAT32
,
"wegiht"
:
C_API_DTYPE_FLOAT32
,
"init_score"
:
C_API_DTYPE_FLOAT32
,
"group_id"
:
C_API_DTYPE_INT32
,
"group"
:
C_API_DTYPE_INT32
,
}
...
...
@@ -208,7 +212,7 @@ class Predictor(object):
self
.
handle
,
ctypes
.
byref
(
out_num_class
)))
self
.
num_class
=
out_num_class
.
value
self
.
__num_total_
model
=
out_num_iterations
.
value
*
self
.
num_class
self
.
__num_total_
iteration
=
out_num_iterations
.
value
elif
booster_handle
is
not
None
:
self
.
__is_manage_handle
=
is_manage_handle
self
.
handle
=
booster_handle
...
...
@@ -221,7 +225,7 @@ class Predictor(object):
_safe_call
(
_LIB
.
LGBM_BoosterGetCurrentIteration
(
self
.
handle
,
ctypes
.
byref
(
out_num_iterations
)))
self
.
__num_total_
model
=
out_num_iterations
.
value
*
self
.
num_class
self
.
__num_total_
iteration
=
out_num_iterations
.
value
else
:
raise
TypeError
(
'Need Model file to create a booster'
)
...
...
@@ -261,9 +265,9 @@ class Predictor(object):
predict_type
=
C_API_PREDICT_RAW_SCORE
if
pred_leaf
:
predict_type
=
C_API_PREDICT_LEAF_INDEX
int_data_has_header
=
0
if
data_has_header
:
int_data_has_header
=
1
int_data_has_header
=
1
if
data_has_header
else
0
if
num_iteration
>
self
.
__num_total_iteration
:
num_iteration
=
self
.
__num_total_iteration
if
is_str
(
data
):
tmp_pred_fname
=
tempfile
.
NamedTemporaryFile
(
prefix
=
"lightgbm_tmp_pred_"
).
name
_safe_call
(
_LIB
.
LGBM_BoosterPredictForFile
(
...
...
@@ -303,6 +307,15 @@ class Predictor(object):
raise
ValueError
(
'len of predict result(%d) cannot be divide nrow(%d)'
%
(
preds
.
size
,
nrow
)
)
return
preds
def
__get_num_preds
(
self
,
num_iteration
,
nrow
,
predict_type
):
n_preds
=
self
.
num_class
*
nrow
if
predict_type
==
C_API_PREDICT_LEAF_INDEX
:
if
num_iteration
>
0
:
n_preds
*=
min
(
num_iteration
,
self
.
__num_total_iteration
)
else
:
n_preds
*=
self
.
__num_total_iteration
return
n_preds
def
__pred_for_np2d
(
self
,
mat
,
num_iteration
,
predict_type
):
"""
Predict for a 2-D numpy matrix.
...
...
@@ -316,13 +329,7 @@ class Predictor(object):
"""change non-float data to float data, need to copy"""
data
=
np
.
array
(
mat
.
reshape
(
mat
.
size
),
dtype
=
np
.
float32
)
ptr_data
,
type_ptr_data
=
c_float_array
(
data
)
n_preds
=
self
.
num_class
*
mat
.
shape
[
0
]
if
predict_type
==
C_API_PREDICT_LEAF_INDEX
:
if
num_iteration
>
0
:
n_preds
*=
num_iteration
else
:
used_iteration
=
self
.
__num_total_model
/
self
.
num_class
n_preds
*=
used_iteration
n_preds
=
self
.
__get_num_preds
(
num_iteration
,
mat
.
shape
[
0
],
predict_type
)
preds
=
np
.
zeros
(
n_preds
,
dtype
=
np
.
float32
)
out_num_preds
=
ctypes
.
c_int64
(
0
)
_safe_call
(
_LIB
.
LGBM_BoosterPredictForMat
(
...
...
@@ -346,13 +353,7 @@ class Predictor(object):
Predict for a csr data
"""
nrow
=
len
(
csr
.
indptr
)
-
1
n_preds
=
self
.
num_class
*
nrow
if
predict_type
==
C_API_PREDICT_LEAF_INDEX
:
if
num_iteration
>
0
:
n_preds
*=
num_iteration
else
:
used_iteration
=
self
.
__num_total_model
/
self
.
num_class
n_preds
*=
used_iteration
n_preds
=
self
.
__get_num_preds
(
num_iteration
,
nrow
,
predict_type
)
preds
=
np
.
zeros
(
n_preds
,
dtype
=
np
.
float32
)
out_num_preds
=
ctypes
.
c_int64
(
0
)
...
...
@@ -386,7 +387,7 @@ class Dataset(object):
"""
def
__init__
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
weight
=
None
,
group
_id
=
None
,
predictor
=
None
,
weight
=
None
,
group
=
None
,
predictor
=
None
,
silent
=
False
,
params
=
None
):
"""
Dataset used in LightGBM.
...
...
@@ -404,8 +405,8 @@ class Dataset(object):
If this dataset validation, need to use training data as reference
weight : list or numpy 1-D array , optional
Weight for each instance.
group
_id
: list or numpy 1-D array , optional
group/query
id for each instance. Note: if having group/query id, data should group by this id
group : list or numpy 1-D array , optional
group/query
size for dataset
silent : boolean, optional
Whether print messages during construction
params: dict, optional
...
...
@@ -420,8 +421,7 @@ class Dataset(object):
return
self
.
data_has_header
=
False
"""process for args"""
if
params
is
None
:
params
=
{}
params
=
{}
if
params
is
None
else
params
self
.
max_bin
=
max_bin
self
.
predictor
=
predictor
params
[
"max_bin"
]
=
max_bin
...
...
@@ -429,7 +429,7 @@ class Dataset(object):
params
[
"verbose"
]
=
0
elif
"verbose"
not
in
params
:
params
[
"verbose"
]
=
1
params_str
=
dict_to_str
(
params
)
params_str
=
param_
dict_to_str
(
params
)
"""process for reference dataset"""
ref_dataset
=
None
if
isinstance
(
reference
,
Dataset
):
...
...
@@ -464,8 +464,8 @@ class Dataset(object):
raise
ValueError
(
"label should not be None"
)
if
weight
is
not
None
:
self
.
set_weight
(
weight
)
if
group
_id
is
not
None
:
self
.
set_group
_id
(
group
_id
)
if
group
is
not
None
:
self
.
set_group
(
group
)
# load init score
if
self
.
predictor
is
not
None
and
isinstance
(
self
.
predictor
,
Predictor
):
init_score
=
self
.
predictor
.
predict
(
data
,
...
...
@@ -482,7 +482,7 @@ class Dataset(object):
init_score
=
new_init_score
self
.
set_init_score
(
init_score
)
def
create_valid
(
self
,
data
,
label
=
None
,
weight
=
None
,
group
_id
=
None
,
def
create_valid
(
self
,
data
,
label
=
None
,
weight
=
None
,
group
=
None
,
silent
=
False
,
params
=
None
):
"""
Create validation data align with current dataset
...
...
@@ -496,15 +496,15 @@ class Dataset(object):
Label of the training data.
weight : list or numpy 1-D array , optional
Weight for each instance.
group
_id
: list or numpy 1-D array , optional
group/query
id for each instance. Note: if having group/query id, data should group by this id
group : list or numpy 1-D array , optional
group/query
size for dataset
silent : boolean, optional
Whether print messages during construction
params: dict, optional
other parameters
"""
return
Dataset
(
data
,
label
=
label
,
max_bin
=
self
.
max_bin
,
reference
=
self
,
weight
=
weight
,
group
_id
=
group
_id
,
predictor
=
self
.
predictor
,
weight
=
weight
,
group
=
group
,
predictor
=
self
.
predictor
,
silent
=
silent
,
params
=
params
)
def
subset
(
self
,
used_indices
,
params
=
None
):
...
...
@@ -514,7 +514,7 @@ class Dataset(object):
used_indices
=
list_to_1d_numpy
(
used_indices
,
np
.
int32
)
ret
=
Dataset
(
None
)
ret
.
handle
=
ctypes
.
c_void_p
()
params_str
=
dict_to_str
(
params
)
params_str
=
param_
dict_to_str
(
params
)
_safe_call
(
_LIB
.
LGBM_DatasetGetSubset
(
ctypes
.
byref
(
self
.
handle
),
used_indices
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_int32
)),
...
...
@@ -624,6 +624,7 @@ class Dataset(object):
The array ofdata to be set
"""
if
data
is
None
:
"""set to None"""
_safe_call
(
_LIB
.
LGBM_DatasetSetField
(
self
.
handle
,
c_str
(
field_name
),
...
...
@@ -713,18 +714,6 @@ class Dataset(object):
self
.
__group
=
group
self
.
set_field
(
'group'
,
group
)
def
set_group_id
(
self
,
group_id
):
"""Set group_id of Dataset (used for ranking).
Parameters
----------
group : array like
group_id of Dataset (used for ranking).
"""
if
group_id
is
not
None
:
group_id
=
list_to_1d_numpy
(
group_id
,
np
.
int32
)
self
.
set_field
(
'group_id'
,
group_id
)
def
get_label
(
self
):
"""Get the label of the Dataset.
...
...
@@ -817,8 +806,7 @@ class Booster(object):
self
.
handle
=
ctypes
.
c_void_p
()
self
.
__need_reload_eval_info
=
True
self
.
__is_manage_handle
=
True
if
params
is
None
:
params
=
{}
params
=
{}
if
params
is
None
else
params
if
silent
:
params
[
"verbose"
]
=
0
elif
"verbose"
not
in
params
:
...
...
@@ -827,7 +815,7 @@ class Booster(object):
"""Training task"""
if
not
isinstance
(
train_set
,
Dataset
):
raise
TypeError
(
'training data should be Dataset instance, met{}'
.
format
(
type
(
train_set
).
__name__
))
params_str
=
dict_to_str
(
params
)
params_str
=
param_
dict_to_str
(
params
)
"""construct booster object"""
_safe_call
(
_LIB
.
LGBM_BoosterCreate
(
train_set
.
handle
,
...
...
@@ -907,7 +895,7 @@ class Booster(object):
params
[
"verbose"
]
=
0
elif
"verbose"
not
in
params
:
params
[
"verbose"
]
=
1
params_str
=
dict_to_str
(
params
)
params_str
=
param_
dict_to_str
(
params
)
_safe_call
(
_LIB
.
LGBM_BoosterResetParameter
(
self
.
handle
,
c_str
(
params_str
)))
...
...
@@ -1162,11 +1150,11 @@ class Booster(object):
raise
ValueError
(
"data_idx should be smaller than number of dataset"
)
if
self
.
__inner_predict_buffer
[
data_idx
]
is
None
:
if
data_idx
==
0
:
n
um_data
=
self
.
train_set
.
num_data
()
*
self
.
__num_class
n
_preds
=
self
.
train_set
.
num_data
()
*
self
.
__num_class
else
:
n
um_data
=
self
.
valid_sets
[
data_idx
-
1
].
num_data
()
*
self
.
__num_class
n
_preds
=
self
.
valid_sets
[
data_idx
-
1
].
num_data
()
*
self
.
__num_class
self
.
__inner_predict_buffer
[
data_idx
]
=
\
np
.
array
([
0.0
for
_
in
range
(
n
um_data
)],
dtype
=
np
.
float32
,
copy
=
False
)
np
.
array
([
0.0
for
_
in
range
(
n
_preds
)],
dtype
=
np
.
float32
,
copy
=
False
)
"""avoid to predict many time in one iteration"""
if
not
self
.
__is_predicted_cur_iter
[
data_idx
]:
tmp_out_len
=
ctypes
.
c_int64
(
0
)
...
...
src/boosting/gbdt.h
View file @
19512d82
...
...
@@ -181,6 +181,8 @@ public:
}
else
{
num_iteration_for_pred_
=
static_cast
<
int
>
(
models_
.
size
())
/
num_class_
;
}
num_iteration_for_pred_
=
std
::
min
(
num_iteration_for_pred_
,
static_cast
<
int
>
(
models_
.
size
())
/
num_class_
);
}
/*!
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment