Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
ef778069
Commit
ef778069
authored
Mar 01, 2017
by
Guolin Ke
Browse files
Add categorical feature support back.
parent
d93eb338
Changes
41
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
807 additions
and
258 deletions
+807
-258
include/LightGBM/feature_group.h
include/LightGBM/feature_group.h
+1
-1
include/LightGBM/tree.h
include/LightGBM/tree.h
+25
-37
pmml/pmml.py
pmml/pmml.py
+3
-2
python-package/lightgbm/basic.py
python-package/lightgbm/basic.py
+114
-18
python-package/lightgbm/compat.py
python-package/lightgbm/compat.py
+9
-0
python-package/lightgbm/engine.py
python-package/lightgbm/engine.py
+14
-2
python-package/lightgbm/plotting.py
python-package/lightgbm/plotting.py
+6
-1
python-package/lightgbm/sklearn.py
python-package/lightgbm/sklearn.py
+13
-4
src/io/bin.cpp
src/io/bin.cpp
+171
-90
src/io/config.cpp
src/io/config.cpp
+1
-0
src/io/dataset.cpp
src/io/dataset.cpp
+2
-2
src/io/dataset_loader.cpp
src/io/dataset_loader.cpp
+38
-3
src/io/dense_bin.hpp
src/io/dense_bin.hpp
+32
-14
src/io/dense_nbits_bin.hpp
src/io/dense_nbits_bin.hpp
+32
-14
src/io/sparse_bin.hpp
src/io/sparse_bin.hpp
+32
-14
src/io/tree.cpp
src/io/tree.cpp
+201
-49
src/treelearner/feature_histogram.hpp
src/treelearner/feature_histogram.hpp
+108
-3
src/treelearner/serial_tree_learner.cpp
src/treelearner/serial_tree_learner.cpp
+2
-1
src/treelearner/voting_parallel_tree_learner.cpp
src/treelearner/voting_parallel_tree_learner.cpp
+2
-2
tests/python_package_test/test_basic.py
tests/python_package_test/test_basic.py
+1
-1
No files found.
include/LightGBM/feature_group.h
View file @
ef778069
...
...
@@ -131,7 +131,7 @@ public:
uint32_t
max_bin
=
bin_offsets_
[
sub_feature
+
1
]
-
1
;
uint32_t
default_bin
=
bin_mappers_
[
sub_feature
]
->
GetDefaultBin
();
return
bin_data_
->
Split
(
min_bin
,
max_bin
,
default_bin
,
threshold
,
data_indices
,
num_data
,
lte_indices
,
gt_indices
);
threshold
,
data_indices
,
num_data
,
lte_indices
,
gt_indices
,
bin_mappers_
[
sub_feature
]
->
bin_type
()
);
}
/*!
* \brief From bin to feature value
...
...
include/LightGBM/tree.h
View file @
ef778069
...
...
@@ -34,6 +34,7 @@ public:
* \brief Performing a split on tree leaves.
* \param leaf Index of leaf to be split
* \param feature Index of feature; the converted index after removing useless features
* \param bin_type type of this feature, numerical or categorical
* \param threshold Threshold(bin) of split
* \param real_feature Index of feature, the original index on data
* \param threshold_double Threshold on feature value
...
...
@@ -44,7 +45,7 @@ public:
* \param gain Split gain
* \return The index of new leaf.
*/
int
Split
(
int
leaf
,
int
feature
,
uint32_t
threshold
,
int
real_feature
,
int
Split
(
int
leaf
,
int
feature
,
BinType
bin_type
,
uint32_t
threshold
,
int
real_feature
,
double
threshold_double
,
double
left_value
,
double
right_value
,
data_size_t
left_cnt
,
data_size_t
right_cnt
,
double
gain
);
...
...
@@ -113,6 +114,15 @@ public:
/*! \brief Serialize this object to json*/
std
::
string
ToJSON
();
template
<
typename
T
>
static
bool
CategoricalDecision
(
T
fval
,
T
threshold
)
{
if
(
static_cast
<
int
>
(
fval
)
==
static_cast
<
int
>
(
threshold
))
{
return
true
;
}
else
{
return
false
;
}
}
template
<
typename
T
>
static
bool
NumericalDecision
(
T
fval
,
T
threshold
)
{
if
(
fval
<=
threshold
)
{
...
...
@@ -122,13 +132,18 @@ public:
}
}
private:
static
const
char
*
GetDecisionTypeName
(
int8_t
type
)
{
if
(
type
==
0
)
{
return
"no_greater"
;
}
else
{
return
"is"
;
}
}
inline
int
GetLeaf
(
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
data_size_t
data_idx
)
const
;
static
std
::
vector
<
bool
(
*
)(
uint32_t
,
uint32_t
)
>
inner_decision_funs
;
static
std
::
vector
<
bool
(
*
)(
double
,
double
)
>
decision_funs
;
inline
int
GetLeafRaw
(
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
data_size_t
data_idx
)
const
;
private:
/*!
* \brief Find leaf index of which record belongs by features
...
...
@@ -157,6 +172,8 @@ private:
std
::
vector
<
uint32_t
>
threshold_in_bin_
;
/*! \brief A non-leaf node's split threshold in feature value */
std
::
vector
<
double
>
threshold_
;
/*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */
std
::
vector
<
int8_t
>
decision_type_
;
/*! \brief A non-leaf node's split gain */
std
::
vector
<
double
>
split_gain_
;
// used for leaf node
...
...
@@ -173,6 +190,7 @@ private:
/*! \brief Depth for leaves */
std
::
vector
<
int
>
leaf_depth_
;
double
shrinkage_
;
bool
has_categorical_
;
};
...
...
@@ -186,40 +204,10 @@ inline int Tree::PredictLeafIndex(const double* feature_values) const {
return
leaf
;
}
inline
int
Tree
::
GetLeaf
(
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
data_size_t
data_idx
)
const
{
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
NumericalDecision
<
uint32_t
>
(
iterators
[
node
]
->
Get
(
data_idx
),
threshold_in_bin_
[
node
]))
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
return
~
node
;
}
inline
int
Tree
::
GetLeafRaw
(
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
data_size_t
data_idx
)
const
{
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
NumericalDecision
<
uint32_t
>
(
iterators
[
split_feature_inner
[
node
]]
->
Get
(
data_idx
),
threshold_in_bin_
[
node
]))
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
return
~
node
;
}
inline
int
Tree
::
GetLeaf
(
const
double
*
feature_values
)
const
{
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
NumericalDecision
<
double
>
(
if
(
decision_funs
[
decision_type_
[
node
]]
(
feature_values
[
split_feature_
[
node
]],
threshold_
[
node
]))
{
node
=
left_child_
[
node
];
...
...
pmml/pmml.py
View file @
ef778069
...
...
@@ -31,9 +31,9 @@ def get_threshold(node_id, prev_node_idx, is_child):
def
print_simple_predicate
(
tab_len
,
node_id
,
is_left_child
,
prev_node_idx
,
is_leaf
):
if
is_left_child
:
op
=
'lessOrEqual'
op
=
'equal'
if
decision_type
[
prev_node_idx
]
==
1
else
'lessOrEqual'
else
:
op
=
'greaterThan'
op
=
'notEqual'
if
decision_type
[
prev_node_idx
]
==
1
else
'greaterThan'
out_
(
'
\t
'
*
(
tab_len
+
1
)
+
(
"<SimplePredicate field=
\"
{0}
\"
"
+
" operator=
\"
{1}
\"
value=
\"
{2}
\"
/>"
).
format
(
get_field_name
(
node_id
,
prev_node_idx
,
is_leaf
),
op
,
get_threshold
(
node_id
,
prev_node_idx
,
is_leaf
)))
...
...
@@ -128,6 +128,7 @@ with open('LightGBM_pmml.xml', 'w') as pmml_out:
split_feature
=
get_array_ints
(
next
(
model_content
))
split_gain
=
next
(
model_content
)
# unused
threshold
=
get_array_strings
(
next
(
model_content
))
decision_type
=
get_array_ints
(
next
(
model_content
))
left_child
=
get_array_ints
(
next
(
model_content
))
right_child
=
get_array_ints
(
next
(
model_content
))
leaf_parent
=
get_array_ints
(
next
(
model_content
))
...
...
python-package/lightgbm/basic.py
View file @
ef778069
...
...
@@ -12,8 +12,9 @@ from tempfile import NamedTemporaryFile
import
numpy
as
np
import
scipy.sparse
from
.compat
import
(
DataFrame
,
Series
,
integer_types
,
json
,
numeric_types
,
range_
,
string_type
)
from
.compat
import
(
DataFrame
,
Series
,
integer_types
,
json
,
json_default_with_numpy
,
numeric_types
,
range_
,
string_type
)
from
.libpath
import
find_lib_path
...
...
@@ -220,22 +221,49 @@ PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'float32'
:
'float'
,
'float64'
:
'float'
,
'bool'
:
'int'
}
def
_data_from_pandas
(
data
,
feature_name
):
def
_data_from_pandas
(
data
,
feature_name
,
categorical_feature
,
pandas_categorical
):
if
isinstance
(
data
,
DataFrame
):
bad_fields
=
[
data
.
columns
[
i
]
for
i
,
dtype
in
enumerate
(
data
.
dtypes
)
if
dtype
.
name
not
in
PANDAS_DTYPE_MAPPER
]
if
bad_fields
:
msg
=
"""DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields: """
raise
ValueError
(
msg
+
', '
.
join
(
bad_fields
))
if
feature_name
==
'auto'
:
if
feature_name
==
'auto'
or
feature_name
is
None
:
if
all
([
isinstance
(
name
,
integer_types
+
(
np
.
integer
,
))
for
name
in
data
.
columns
]):
msg
=
"""Using Pandas (default) integer column names, not column indexes. You can use indexes with DataFrame.values."""
warnings
.
filterwarnings
(
'once'
)
warnings
.
warn
(
msg
,
stacklevel
=
5
)
feature_name
=
[
str
(
name
)
for
name
in
data
.
columns
]
data
=
data
.
rename
(
columns
=
str
)
cat_cols
=
data
.
select_dtypes
(
include
=
[
'category'
]).
columns
if
pandas_categorical
is
None
:
# train dataset
pandas_categorical
=
[
list
(
data
[
col
].
cat
.
categories
)
for
col
in
cat_cols
]
else
:
if
len
(
cat_cols
)
!=
len
(
pandas_categorical
):
raise
ValueError
(
'train and valid dataset categorical_feature do not match.'
)
for
col
,
category
in
zip
(
cat_cols
,
pandas_categorical
):
if
list
(
data
[
col
].
cat
.
categories
)
!=
list
(
category
):
data
[
col
]
=
data
[
col
].
cat
.
set_categories
(
category
)
if
len
(
cat_cols
):
# cat_cols is pandas Index object
data
=
data
.
copy
()
# not alter origin DataFrame
data
[
cat_cols
]
=
data
[
cat_cols
].
apply
(
lambda
x
:
x
.
cat
.
codes
)
if
categorical_feature
is
not
None
:
if
feature_name
is
None
:
feature_name
=
list
(
data
.
columns
)
if
categorical_feature
==
'auto'
:
categorical_feature
=
list
(
cat_cols
)
else
:
categorical_feature
=
list
(
categorical_feature
)
+
list
(
cat_cols
)
if
feature_name
==
'auto'
:
feature_name
=
list
(
data
.
columns
)
data_dtypes
=
data
.
dtypes
if
not
all
(
dtype
.
name
in
PANDAS_DTYPE_MAPPER
for
dtype
in
data_dtypes
):
bad_fields
=
[
data
.
columns
[
i
]
for
i
,
dtype
in
enumerate
(
data_dtypes
)
if
dtype
.
name
not
in
PANDAS_DTYPE_MAPPER
]
msg
=
"""DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields """
raise
ValueError
(
msg
+
', '
.
join
(
bad_fields
))
data
=
data
.
values
.
astype
(
'float'
)
elif
feature_name
==
'auto'
:
feature_name
=
None
return
data
,
feature_name
else
:
if
feature_name
==
'auto'
:
feature_name
=
None
if
categorical_feature
==
'auto'
:
categorical_feature
=
None
return
data
,
feature_name
,
categorical_feature
,
pandas_categorical
def
_label_from_pandas
(
label
):
...
...
@@ -249,6 +277,19 @@ def _label_from_pandas(label):
return
label
def
_save_pandas_categorical
(
file_name
,
pandas_categorical
):
with
open
(
file_name
,
'a'
)
as
f
:
f
.
write
(
'
\n
pandas_categorical:'
+
json
.
dumps
(
pandas_categorical
,
default
=
json_default_with_numpy
))
def
_load_pandas_categorical
(
file_name
):
with
open
(
file_name
,
'r'
)
as
f
:
last_line
=
f
.
readlines
()[
-
1
]
if
last_line
.
startswith
(
'pandas_categorical:'
):
return
json
.
loads
(
last_line
[
len
(
'pandas_categorical:'
):])
return
None
class
_InnerPredictor
(
object
):
"""
A _InnerPredictor of LightGBM.
...
...
@@ -280,6 +321,7 @@ class _InnerPredictor(object):
ctypes
.
byref
(
out_num_class
)))
self
.
num_class
=
out_num_class
.
value
self
.
num_total_iteration
=
out_num_iterations
.
value
self
.
pandas_categorical
=
_load_pandas_categorical
(
model_file
)
elif
booster_handle
is
not
None
:
self
.
__is_manage_handle
=
False
self
.
handle
=
booster_handle
...
...
@@ -293,6 +335,7 @@ class _InnerPredictor(object):
self
.
handle
,
ctypes
.
byref
(
out_num_iterations
)))
self
.
num_total_iteration
=
out_num_iterations
.
value
self
.
pandas_categorical
=
None
else
:
raise
TypeError
(
'Need Model file or Booster handle to create a predictor'
)
...
...
@@ -328,7 +371,7 @@ class _InnerPredictor(object):
"""
if
isinstance
(
data
,
Dataset
):
raise
TypeError
(
"Cannot use Dataset instance for prediction, please use raw data instead"
)
data
=
_data_from_pandas
(
data
,
None
)[
0
]
data
=
_data_from_pandas
(
data
,
None
,
None
,
self
.
pandas_categorical
)[
0
]
predict_type
=
C_API_PREDICT_NORMAL
if
raw_score
:
predict_type
=
C_API_PREDICT_RAW_SCORE
...
...
@@ -359,6 +402,9 @@ class _InnerPredictor(object):
elif
isinstance
(
data
,
np
.
ndarray
):
preds
,
nrow
=
self
.
__pred_for_np2d
(
data
,
num_iteration
,
predict_type
)
elif
isinstance
(
data
,
DataFrame
):
preds
,
nrow
=
self
.
__pred_for_np2d
(
data
.
values
,
num_iteration
,
predict_type
)
else
:
try
:
csr
=
scipy
.
sparse
.
csr_matrix
(
data
)
...
...
@@ -486,7 +532,7 @@ class Dataset(object):
"""Dataset in LightGBM."""
def
__init__
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
weight
=
None
,
group
=
None
,
silent
=
False
,
feature_name
=
'auto'
,
params
=
None
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
params
=
None
,
free_raw_data
=
True
):
"""
Parameters
...
...
@@ -509,6 +555,11 @@ class Dataset(object):
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
params: dict, optional
Other parameters
free_raw_data: Bool
...
...
@@ -523,10 +574,12 @@ class Dataset(object):
self
.
group
=
group
self
.
silent
=
silent
self
.
feature_name
=
feature_name
self
.
categorical_feature
=
categorical_feature
self
.
params
=
params
self
.
free_raw_data
=
free_raw_data
self
.
used_indices
=
None
self
.
_predictor
=
None
self
.
pandas_categorical
=
None
def
__del__
(
self
):
self
.
_free_handle
()
...
...
@@ -539,11 +592,11 @@ class Dataset(object):
def
_lazy_init
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
weight
=
None
,
group
=
None
,
predictor
=
None
,
silent
=
False
,
feature_name
=
'auto'
,
params
=
None
):
categorical_feature
=
'auto'
,
params
=
None
):
if
data
is
None
:
self
.
handle
=
None
return
data
,
feature_name
,
=
_data_from_pandas
(
data
,
feature_name
)
data
,
feature_name
,
categorical_feature
,
self
.
pandas_categorical
=
_data_from_pandas
(
data
,
feature_name
,
categorical_feature
,
self
.
pandas_categorical
)
label
=
_label_from_pandas
(
label
)
self
.
data_has_header
=
False
"""process for args"""
...
...
@@ -555,6 +608,23 @@ class Dataset(object):
params
[
"verbose"
]
=
0
elif
"verbose"
not
in
params
:
params
[
"verbose"
]
=
1
"""get categorical features"""
if
categorical_feature
is
not
None
:
categorical_indices
=
set
()
feature_dict
=
{}
if
feature_name
is
not
None
:
feature_dict
=
{
name
:
i
for
i
,
name
in
enumerate
(
feature_name
)}
for
name
in
categorical_feature
:
if
isinstance
(
name
,
string_type
)
and
name
in
feature_dict
:
categorical_indices
.
add
(
feature_dict
[
name
])
elif
isinstance
(
name
,
integer_types
):
categorical_indices
.
add
(
name
)
else
:
raise
TypeError
(
"Wrong type({}) or unknown name({}) in categorical_feature"
.
format
(
type
(
name
).
__name__
,
name
))
params
[
'categorical_column'
]
=
sorted
(
categorical_indices
)
params_str
=
param_dict_to_str
(
params
)
"""process for reference dataset"""
ref_dataset
=
None
...
...
@@ -714,7 +784,7 @@ class Dataset(object):
self
.
_lazy_init
(
self
.
data
,
label
=
self
.
label
,
max_bin
=
self
.
max_bin
,
weight
=
self
.
weight
,
group
=
self
.
group
,
predictor
=
self
.
_predictor
,
silent
=
self
.
silent
,
feature_name
=
self
.
feature_name
,
params
=
self
.
params
)
categorical_feature
=
self
.
categorical_feature
,
params
=
self
.
params
)
if
self
.
free_raw_data
:
self
.
data
=
None
return
self
...
...
@@ -744,6 +814,7 @@ class Dataset(object):
weight
=
weight
,
group
=
group
,
silent
=
silent
,
params
=
params
,
free_raw_data
=
self
.
free_raw_data
)
ret
.
_predictor
=
self
.
_predictor
ret
.
pandas_categorical
=
self
.
pandas_categorical
return
ret
def
subset
(
self
,
used_indices
,
params
=
None
):
...
...
@@ -758,8 +829,9 @@ class Dataset(object):
Other parameters
"""
ret
=
Dataset
(
None
,
reference
=
self
,
feature_name
=
self
.
feature_name
,
params
=
params
)
categorical_feature
=
self
.
categorical_feature
,
params
=
params
)
ret
.
_predictor
=
self
.
_predictor
ret
.
pandas_categorical
=
self
.
pandas_categorical
ret
.
used_indices
=
used_indices
return
ret
...
...
@@ -867,6 +939,24 @@ class Dataset(object):
else
:
raise
TypeError
(
"Unknown type"
)
def
set_categorical_feature
(
self
,
categorical_feature
):
"""
Set categorical features
Parameters
----------
categorical_feature : list of int or str
Name/index of categorical features
"""
if
self
.
categorical_feature
==
categorical_feature
:
return
if
self
.
data
is
not
None
:
self
.
categorical_feature
=
categorical_feature
self
.
_free_handle
()
else
:
raise
LightGBMError
(
"Cannot set categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this."
)
def
_set_predictor
(
self
,
predictor
):
"""
Set predictor for continued training, not recommand for user to call this function.
...
...
@@ -889,6 +979,7 @@ class Dataset(object):
reference : Dataset
Will use reference as template to consturct current dataset
"""
self
.
set_categorical_feature
(
reference
.
categorical_feature
)
self
.
set_feature_name
(
reference
.
feature_name
)
self
.
_set_predictor
(
reference
.
_predictor
)
if
self
.
reference
is
reference
:
...
...
@@ -1117,6 +1208,7 @@ class Booster(object):
self
.
__inner_predict_buffer
=
[
None
]
self
.
__is_predicted_cur_iter
=
[
False
]
self
.
__get_eval_info
()
self
.
pandas_categorical
=
train_set
.
pandas_categorical
elif
model_file
is
not
None
:
"""Prediction task"""
out_num_iterations
=
ctypes
.
c_int
(
0
)
...
...
@@ -1129,6 +1221,7 @@ class Booster(object):
self
.
handle
,
ctypes
.
byref
(
out_num_class
)))
self
.
__num_class
=
out_num_class
.
value
self
.
pandas_categorical
=
_load_pandas_categorical
(
model_file
)
elif
'model_str'
in
params
:
self
.
__load_model_from_string
(
params
[
'model_str'
])
else
:
...
...
@@ -1144,6 +1237,7 @@ class Booster(object):
def
__deepcopy__
(
self
,
_
):
model_str
=
self
.
__save_model_to_string
()
booster
=
Booster
({
'model_str'
:
model_str
})
booster
.
pandas_categorical
=
self
.
pandas_categorical
return
booster
def
__getstate__
(
self
):
...
...
@@ -1383,6 +1477,7 @@ class Booster(object):
self
.
handle
,
ctypes
.
c_int
(
num_iteration
),
c_str
(
filename
)))
_save_pandas_categorical
(
filename
,
self
.
pandas_categorical
)
def
__load_model_from_string
(
self
,
model_str
):
"""[Private] Load model from string"""
...
...
@@ -1494,6 +1589,7 @@ class Booster(object):
def
_to_predictor
(
self
):
"""Convert to predictor"""
predictor
=
_InnerPredictor
(
booster_handle
=
self
.
handle
)
predictor
.
pandas_categorical
=
self
.
pandas_categorical
return
predictor
def
feature_name
(
self
):
...
...
python-package/lightgbm/compat.py
View file @
ef778069
...
...
@@ -39,6 +39,15 @@ except (ImportError, SyntaxError):
import
json
def
json_default_with_numpy
(
obj
):
if
isinstance
(
obj
,
(
np
.
integer
,
np
.
floating
,
np
.
bool_
)):
return
obj
.
item
()
elif
isinstance
(
obj
,
np
.
ndarray
):
return
obj
.
tolist
()
else
:
return
obj
"""pandas"""
try
:
from
pandas
import
Series
,
DataFrame
...
...
python-package/lightgbm/engine.py
View file @
ef778069
...
...
@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types,
def
train
(
params
,
train_set
,
num_boost_round
=
100
,
valid_sets
=
None
,
valid_names
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
feature_name
=
'auto'
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
early_stopping_rounds
=
None
,
evals_result
=
None
,
verbose_eval
=
True
,
learning_rates
=
None
,
callbacks
=
None
):
"""
...
...
@@ -45,6 +45,11 @@ def train(params, train_set, num_boost_round=100,
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
Activates early stopping.
Requires at least one validation data and one metric
...
...
@@ -98,6 +103,7 @@ def train(params, train_set, num_boost_round=100,
train_set
.
_update_params
(
params
)
train_set
.
_set_predictor
(
predictor
)
train_set
.
set_feature_name
(
feature_name
)
train_set
.
set_categorical_feature
(
categorical_feature
)
is_valid_contain_train
=
False
train_data_name
=
"training"
...
...
@@ -271,7 +277,7 @@ def _agg_cv_result(raw_results):
def
cv
(
params
,
train_set
,
num_boost_round
=
10
,
data_splitter
=
None
,
nfold
=
5
,
stratified
=
False
,
shuffle
=
True
,
metrics
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
feature_name
=
'auto'
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
early_stopping_rounds
=
None
,
fpreproc
=
None
,
verbose_eval
=
None
,
show_stdv
=
True
,
seed
=
0
,
callbacks
=
None
):
...
...
@@ -305,6 +311,11 @@ def cv(params, train_set, num_boost_round=10,
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue.
...
...
@@ -343,6 +354,7 @@ def cv(params, train_set, num_boost_round=10,
train_set
.
_update_params
(
params
)
train_set
.
_set_predictor
(
predictor
)
train_set
.
set_feature_name
(
feature_name
)
train_set
.
set_categorical_feature
(
categorical_feature
)
if
metrics
:
params
.
setdefault
(
'metric'
,
[])
...
...
python-package/lightgbm/plotting.py
View file @
ef778069
...
...
@@ -257,7 +257,12 @@ def _to_graphviz(graph, tree_info, show_info, feature_names):
if
info
in
{
'split_gain'
,
'internal_value'
,
'internal_count'
}:
label
+=
'
\n
'
+
info
+
':'
+
str
(
root
[
info
])
graph
.
node
(
name
,
label
=
label
)
l_dec
,
r_dec
=
'<='
,
'>'
if
root
[
'decision_type'
]
==
'no_greater'
:
l_dec
,
r_dec
=
'<='
,
'>'
elif
root
[
'decision_type'
]
==
'is'
:
l_dec
,
r_dec
=
'is'
,
"isn't"
else
:
raise
ValueError
(
'Invalid decision type in tree model.'
)
add
(
root
[
'left_child'
],
name
,
l_dec
)
add
(
root
[
'right_child'
],
name
,
r_dec
)
else
:
# leaf
...
...
python-package/lightgbm/sklearn.py
View file @
ef778069
...
...
@@ -284,7 +284,7 @@ class LGBMModel(LGBMModelBase):
eval_init_score
=
None
,
eval_group
=
None
,
eval_metric
=
None
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
'auto'
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
callbacks
=
None
):
"""
Fit the gradient boosting model
...
...
@@ -318,6 +318,11 @@ class LGBMModel(LGBMModelBase):
feature_name : list of str, or 'auto'
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
callbacks : list of callback functions
List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information.
...
...
@@ -401,6 +406,7 @@ class LGBMModel(LGBMModelBase):
early_stopping_rounds
=
early_stopping_rounds
,
evals_result
=
evals_result
,
fobj
=
self
.
fobj
,
feval
=
feval
,
verbose_eval
=
verbose
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
callbacks
=
callbacks
)
if
evals_result
:
...
...
@@ -508,7 +514,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_init_score
=
None
,
eval_metric
=
"l2"
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
'auto'
,
callbacks
=
None
):
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
callbacks
=
None
):
super
(
LGBMRegressor
,
self
).
fit
(
X
,
y
,
sample_weight
=
sample_weight
,
init_score
=
init_score
,
eval_set
=
eval_set
,
...
...
@@ -517,6 +523,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_metric
=
eval_metric
,
early_stopping_rounds
=
early_stopping_rounds
,
verbose
=
verbose
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
callbacks
=
callbacks
)
return
self
...
...
@@ -553,7 +560,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_init_score
=
None
,
eval_metric
=
"binary_logloss"
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
'auto'
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
callbacks
=
None
):
self
.
_le
=
LGBMLabelEncoder
().
fit
(
y
)
y
=
self
.
_le
.
transform
(
y
)
...
...
@@ -576,6 +583,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_metric
=
eval_metric
,
early_stopping_rounds
=
early_stopping_rounds
,
verbose
=
verbose
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
callbacks
=
callbacks
)
return
self
...
...
@@ -653,7 +661,7 @@ class LGBMRanker(LGBMModel):
eval_init_score
=
None
,
eval_group
=
None
,
eval_metric
=
'ndcg'
,
eval_at
=
1
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
'auto'
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
callbacks
=
None
):
"""
Most arguments like common methods except following:
...
...
@@ -684,5 +692,6 @@ class LGBMRanker(LGBMModel):
eval_metric
=
eval_metric
,
early_stopping_rounds
=
early_stopping_rounds
,
verbose
=
verbose
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
callbacks
=
callbacks
)
return
self
src/io/bin.cpp
View file @
ef778069
...
...
@@ -24,7 +24,13 @@ BinMapper::BinMapper(const BinMapper& other) {
num_bin_
=
other
.
num_bin_
;
is_trival_
=
other
.
is_trival_
;
sparse_rate_
=
other
.
sparse_rate_
;
bin_upper_bound_
=
other
.
bin_upper_bound_
;
bin_type_
=
other
.
bin_type_
;
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
bin_upper_bound_
=
other
.
bin_upper_bound_
;
}
else
{
bin_2_categorical_
=
other
.
bin_2_categorical_
;
categorical_2_bin_
=
other
.
categorical_2_bin_
;
}
min_val_
=
other
.
min_val_
;
max_val_
=
other
.
max_val_
;
default_bin_
=
other
.
default_bin_
;
...
...
@@ -38,22 +44,34 @@ BinMapper::~BinMapper() {
}
bool
NeedFilter
(
std
::
vector
<
int
>&
cnt_in_bin
,
int
total_cnt
,
int
filter_cnt
)
{
int
sum_left
=
0
;
for
(
size_t
i
=
0
;
i
<
cnt_in_bin
.
size
()
-
1
;
++
i
)
{
sum_left
+=
cnt_in_bin
[
i
];
if
(
sum_left
>=
filter_cnt
)
{
return
false
;
}
else
if
(
total_cnt
-
sum_left
>=
filter_cnt
)
{
return
false
;
bool
NeedFilter
(
std
::
vector
<
int
>&
cnt_in_bin
,
int
total_cnt
,
int
filter_cnt
,
BinType
bin_type
)
{
if
(
bin_type
==
BinType
::
NumericalBin
)
{
int
sum_left
=
0
;
for
(
size_t
i
=
0
;
i
<
cnt_in_bin
.
size
()
-
1
;
++
i
)
{
sum_left
+=
cnt_in_bin
[
i
];
if
(
sum_left
>=
filter_cnt
)
{
return
false
;
}
else
if
(
total_cnt
-
sum_left
>=
filter_cnt
)
{
return
false
;
}
}
}
else
{
for
(
size_t
i
=
0
;
i
<
cnt_in_bin
.
size
()
-
1
;
++
i
)
{
int
sum_left
=
cnt_in_bin
[
i
];
if
(
sum_left
>=
filter_cnt
)
{
return
false
;
}
else
if
(
total_cnt
-
sum_left
>=
filter_cnt
)
{
return
false
;
}
}
}
return
true
;
}
void
BinMapper
::
FindBin
(
std
::
vector
<
double
>&
values
,
size_t
total_sample_cnt
,
int
max_bin
,
int
min_data_in_bin
,
int
min_split_data
)
{
// limit max_bin by min_data_in_bin
int
max_bin
,
int
min_data_in_bin
,
int
min_split_data
,
BinType
bin_type
)
{
bin_type_
=
bin_type
;
default_bin_
=
0
;
std
::
vector
<
double
>&
raw_values
=
values
;
int
zero_cnt
=
static_cast
<
int
>
(
total_sample_cnt
-
raw_values
.
size
());
// find distinct_values first
...
...
@@ -95,98 +113,134 @@ void BinMapper::FindBin(std::vector<double>& values, size_t total_sample_cnt,
max_val_
=
distinct_values
.
back
();
std
::
vector
<
int
>
cnt_in_bin
;
int
num_values
=
static_cast
<
int
>
(
distinct_values
.
size
());
if
(
num_values
<=
max_bin
)
{
// use distinct value is enough
bin_upper_bound_
.
clear
();
int
cur_cnt_inbin
=
0
;
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
cur_cnt_inbin
+=
counts
[
i
];
if
(
cur_cnt_inbin
>=
min_data_in_bin
)
{
bin_upper_bound_
.
push_back
((
distinct_values
[
i
]
+
distinct_values
[
i
+
1
])
/
2
);
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
cur_cnt_inbin
=
0
;
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
if
(
num_values
<=
max_bin
)
{
// use distinct value is enough
bin_upper_bound_
.
clear
();
int
cur_cnt_inbin
=
0
;
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
cur_cnt_inbin
+=
counts
[
i
];
if
(
cur_cnt_inbin
>=
min_data_in_bin
)
{
bin_upper_bound_
.
push_back
((
distinct_values
[
i
]
+
distinct_values
[
i
+
1
])
/
2
);
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
cur_cnt_inbin
=
0
;
}
}
}
cur_cnt_inbin
+=
counts
.
back
();
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
bin_upper_bound_
.
push_back
(
std
::
numeric_limits
<
double
>::
infinity
());
num_bin_
=
static_cast
<
int
>
(
bin_upper_bound_
.
size
());
}
else
{
if
(
min_data_in_bin
>
0
)
{
max_bin
=
std
::
min
(
max_bin
,
static_cast
<
int
>
(
total_sample_cnt
/
min_data_in_bin
));
max_bin
=
std
::
max
(
max_bin
,
1
);
}
double
mean_bin_size
=
static_cast
<
double
>
(
total_sample_cnt
)
/
max_bin
;
if
(
zero_cnt
>
mean_bin_size
)
{
int
non_zero_cnt
=
static_cast
<
int
>
(
raw_values
.
size
());
max_bin
=
std
::
min
(
max_bin
,
1
+
static_cast
<
int
>
(
non_zero_cnt
/
min_data_in_bin
));
}
// mean size for one bin
int
rest_bin_cnt
=
max_bin
;
int
rest_sample_cnt
=
static_cast
<
int
>
(
total_sample_cnt
);
std
::
vector
<
bool
>
is_big_count_value
(
num_values
,
false
);
for
(
int
i
=
0
;
i
<
num_values
;
++
i
)
{
if
(
counts
[
i
]
>=
mean_bin_size
)
{
is_big_count_value
[
i
]
=
true
;
--
rest_bin_cnt
;
rest_sample_cnt
-=
counts
[
i
];
cur_cnt_inbin
+=
counts
.
back
();
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
bin_upper_bound_
.
push_back
(
std
::
numeric_limits
<
double
>::
infinity
());
num_bin_
=
static_cast
<
int
>
(
bin_upper_bound_
.
size
());
}
else
{
if
(
min_data_in_bin
>
0
)
{
max_bin
=
std
::
min
(
max_bin
,
static_cast
<
int
>
(
total_sample_cnt
/
min_data_in_bin
));
max_bin
=
std
::
max
(
max_bin
,
1
);
}
}
mean_bin_size
=
static_cast
<
double
>
(
rest_sample_cnt
)
/
rest_bin_cnt
;
std
::
vector
<
double
>
upper_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
std
::
vector
<
double
>
lower_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
int
bin_cnt
=
0
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
0
];
int
cur_cnt_inbin
=
0
;
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
if
(
!
is_big_count_value
[
i
])
{
rest_sample_cnt
-=
counts
[
i
];
double
mean_bin_size
=
static_cast
<
double
>
(
total_sample_cnt
)
/
max_bin
;
if
(
zero_cnt
>
mean_bin_size
)
{
int
non_zero_cnt
=
static_cast
<
int
>
(
raw_values
.
size
());
max_bin
=
std
::
min
(
max_bin
,
1
+
static_cast
<
int
>
(
non_zero_cnt
/
min_data_in_bin
));
}
cur_cnt_inbin
+=
counts
[
i
];
// need a new bin
if
(
is_big_count_value
[
i
]
||
cur_cnt_inbin
>=
mean_bin_size
||
(
is_big_count_value
[
i
+
1
]
&&
cur_cnt_inbin
>=
std
::
max
(
1.0
,
mean_bin_size
*
0.5
f
)))
{
upper_bounds
[
bin_cnt
]
=
distinct_values
[
i
];
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
++
bin_cnt
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
i
+
1
];
if
(
bin_cnt
>=
max_bin
-
1
)
{
break
;
}
cur_cnt_inbin
=
0
;
if
(
!
is_big_count_value
[
i
])
{
// mean size for one bin
int
rest_bin_cnt
=
max_bin
;
int
rest_sample_cnt
=
static_cast
<
int
>
(
total_sample_cnt
);
std
::
vector
<
bool
>
is_big_count_value
(
num_values
,
false
);
for
(
int
i
=
0
;
i
<
num_values
;
++
i
)
{
if
(
counts
[
i
]
>=
mean_bin_size
)
{
is_big_count_value
[
i
]
=
true
;
--
rest_bin_cnt
;
mean_bin_size
=
rest_sample_cnt
/
static_cast
<
double
>
(
rest_bin_cnt
);
rest_sample_cnt
-=
counts
[
i
];
}
}
mean_bin_size
=
static_cast
<
double
>
(
rest_sample_cnt
)
/
rest_bin_cnt
;
std
::
vector
<
double
>
upper_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
std
::
vector
<
double
>
lower_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
int
bin_cnt
=
0
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
0
];
int
cur_cnt_inbin
=
0
;
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
if
(
!
is_big_count_value
[
i
])
{
rest_sample_cnt
-=
counts
[
i
];
}
cur_cnt_inbin
+=
counts
[
i
];
// need a new bin
if
(
is_big_count_value
[
i
]
||
cur_cnt_inbin
>=
mean_bin_size
||
(
is_big_count_value
[
i
+
1
]
&&
cur_cnt_inbin
>=
std
::
max
(
1.0
,
mean_bin_size
*
0.5
f
)))
{
upper_bounds
[
bin_cnt
]
=
distinct_values
[
i
];
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
++
bin_cnt
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
i
+
1
];
if
(
bin_cnt
>=
max_bin
-
1
)
{
break
;
}
cur_cnt_inbin
=
0
;
if
(
!
is_big_count_value
[
i
])
{
--
rest_bin_cnt
;
mean_bin_size
=
rest_sample_cnt
/
static_cast
<
double
>
(
rest_bin_cnt
);
}
}
}
cur_cnt_inbin
+=
counts
.
back
();
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
++
bin_cnt
;
// update bin upper bound
bin_upper_bound_
=
std
::
vector
<
double
>
(
bin_cnt
);
num_bin_
=
bin_cnt
;
for
(
int
i
=
0
;
i
<
bin_cnt
-
1
;
++
i
)
{
bin_upper_bound_
[
i
]
=
(
upper_bounds
[
i
]
+
lower_bounds
[
i
+
1
])
/
2.0
f
;
}
// last bin upper bound
bin_upper_bound_
[
bin_cnt
-
1
]
=
std
::
numeric_limits
<
double
>::
infinity
();
}
CHECK
(
num_bin_
<=
max_bin
);
}
else
{
// convert to int type first
std
::
vector
<
int
>
distinct_values_int
;
std
::
vector
<
int
>
counts_int
;
distinct_values_int
.
push_back
(
static_cast
<
int
>
(
distinct_values
[
0
]));
counts_int
.
push_back
(
counts
[
0
]);
for
(
size_t
i
=
1
;
i
<
distinct_values
.
size
();
++
i
)
{
if
(
static_cast
<
int
>
(
distinct_values
[
i
])
!=
distinct_values_int
.
back
())
{
distinct_values_int
.
push_back
(
static_cast
<
int
>
(
distinct_values
[
i
]));
counts_int
.
push_back
(
counts
[
i
]);
}
else
{
counts_int
.
back
()
+=
counts
[
i
];
}
}
cur_cnt_inbin
+=
counts
.
back
();
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
++
bin_cnt
;
// update bin upper bound
bin_upper_bound_
=
std
::
vector
<
double
>
(
bin_cnt
);
num_bin_
=
bin_cnt
;
for
(
int
i
=
0
;
i
<
bin_cnt
-
1
;
++
i
)
{
bin_upper_bound_
[
i
]
=
(
upper_bounds
[
i
]
+
lower_bounds
[
i
+
1
])
/
2.0
f
;
// sort by counts
Common
::
SortForPair
<
int
,
int
>
(
counts_int
,
distinct_values_int
,
0
,
true
);
// will ingore the categorical of small counts
const
int
cut_cnt
=
static_cast
<
int
>
(
total_sample_cnt
*
0.98
f
);
categorical_2_bin_
.
clear
();
bin_2_categorical_
.
clear
();
num_bin_
=
0
;
int
used_cnt
=
0
;
max_bin
=
std
::
min
(
static_cast
<
int
>
(
distinct_values_int
.
size
()),
max_bin
);
while
(
used_cnt
<
cut_cnt
||
num_bin_
<
max_bin
)
{
bin_2_categorical_
.
push_back
(
distinct_values_int
[
num_bin_
]);
categorical_2_bin_
[
distinct_values_int
[
num_bin_
]]
=
static_cast
<
unsigned
int
>
(
num_bin_
);
used_cnt
+=
counts_int
[
num_bin_
];
++
num_bin_
;
}
// last bin upper bound
bin_upper_bound_
[
bin_cnt
-
1
]
=
std
::
numeric_limits
<
double
>::
infinity
();
cnt_in_bin
=
counts_int
;
counts_int
.
resize
(
num_bin_
);
counts_int
.
back
()
+=
static_cast
<
int
>
(
total_sample_cnt
-
used_cnt
);
}
// check trival(num_bin_ == 1) feature
if
(
num_bin_
<=
1
)
{
is_trival_
=
true
;
default_bin_
=
0
;
}
else
{
is_trival_
=
false
;
default_bin_
=
ValueToBin
(
0
);
}
if
(
NeedFilter
(
cnt_in_bin
,
static_cast
<
int
>
(
total_sample_cnt
),
min_split_data
))
{
// check useless bin
if
(
!
is_trival_
&&
NeedFilter
(
cnt_in_bin
,
static_cast
<
int
>
(
total_sample_cnt
),
min_split_data
,
bin_type_
))
{
is_trival_
=
true
;
}
if
(
!
is_trival_
)
{
default_bin_
=
ValueToBin
(
0
);
}
// calculate sparse rate
CHECK
(
num_bin_
<=
max_bin
);
sparse_rate_
=
static_cast
<
double
>
(
cnt_in_bin
[
GetDefaultBin
()])
/
static_cast
<
double
>
(
total_sample_cnt
);
sparse_rate_
=
static_cast
<
double
>
(
cnt_in_bin
[
default_bin_
])
/
static_cast
<
double
>
(
total_sample_cnt
);
}
...
...
@@ -195,6 +249,7 @@ int BinMapper::SizeForSpecificBin(int bin) {
size
+=
sizeof
(
int
);
size
+=
sizeof
(
bool
);
size
+=
sizeof
(
double
);
size
+=
sizeof
(
BinType
);
size
+=
2
*
sizeof
(
double
);
size
+=
bin
*
sizeof
(
double
);
size
+=
sizeof
(
uint32_t
);
...
...
@@ -208,13 +263,19 @@ void BinMapper::CopyTo(char * buffer) {
buffer
+=
sizeof
(
is_trival_
);
std
::
memcpy
(
buffer
,
&
sparse_rate_
,
sizeof
(
sparse_rate_
));
buffer
+=
sizeof
(
sparse_rate_
);
std
::
memcpy
(
buffer
,
&
bin_type_
,
sizeof
(
bin_type_
));
buffer
+=
sizeof
(
bin_type_
);
std
::
memcpy
(
&
min_val_
,
buffer
,
sizeof
(
min_val_
));
buffer
+=
sizeof
(
min_val_
);
std
::
memcpy
(
&
max_val_
,
buffer
,
sizeof
(
max_val_
));
buffer
+=
sizeof
(
max_val_
);
std
::
memcpy
(
&
default_bin_
,
buffer
,
sizeof
(
default_bin_
));
buffer
+=
sizeof
(
default_bin_
);
std
::
memcpy
(
buffer
,
bin_upper_bound_
.
data
(),
num_bin_
*
sizeof
(
double
));
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
std
::
memcpy
(
buffer
,
bin_upper_bound_
.
data
(),
num_bin_
*
sizeof
(
double
));
}
else
{
std
::
memcpy
(
buffer
,
bin_2_categorical_
.
data
(),
num_bin_
*
sizeof
(
int
));
}
}
void
BinMapper
::
CopyFrom
(
const
char
*
buffer
)
{
...
...
@@ -224,30 +285,50 @@ void BinMapper::CopyFrom(const char * buffer) {
buffer
+=
sizeof
(
is_trival_
);
std
::
memcpy
(
&
sparse_rate_
,
buffer
,
sizeof
(
sparse_rate_
));
buffer
+=
sizeof
(
sparse_rate_
);
std
::
memcpy
(
&
bin_type_
,
buffer
,
sizeof
(
bin_type_
));
buffer
+=
sizeof
(
bin_type_
);
std
::
memcpy
(
&
min_val_
,
buffer
,
sizeof
(
min_val_
));
buffer
+=
sizeof
(
min_val_
);
std
::
memcpy
(
&
max_val_
,
buffer
,
sizeof
(
max_val_
));
buffer
+=
sizeof
(
max_val_
);
std
::
memcpy
(
&
default_bin_
,
buffer
,
sizeof
(
default_bin_
));
buffer
+=
sizeof
(
default_bin_
);
bin_upper_bound_
=
std
::
vector
<
double
>
(
num_bin_
);
std
::
memcpy
(
bin_upper_bound_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
double
));
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
bin_upper_bound_
=
std
::
vector
<
double
>
(
num_bin_
);
std
::
memcpy
(
bin_upper_bound_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
double
));
}
else
{
bin_2_categorical_
=
std
::
vector
<
int
>
(
num_bin_
);
std
::
memcpy
(
bin_2_categorical_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
int
));
categorical_2_bin_
.
clear
();
for
(
int
i
=
0
;
i
<
num_bin_
;
++
i
)
{
categorical_2_bin_
[
bin_2_categorical_
[
i
]]
=
static_cast
<
unsigned
int
>
(
i
);
}
}
}
void
BinMapper
::
SaveBinaryToFile
(
FILE
*
file
)
const
{
fwrite
(
&
num_bin_
,
sizeof
(
num_bin_
),
1
,
file
);
fwrite
(
&
is_trival_
,
sizeof
(
is_trival_
),
1
,
file
);
fwrite
(
&
sparse_rate_
,
sizeof
(
sparse_rate_
),
1
,
file
);
fwrite
(
&
bin_type_
,
sizeof
(
bin_type_
),
1
,
file
);
fwrite
(
&
min_val_
,
sizeof
(
min_val_
),
1
,
file
);
fwrite
(
&
max_val_
,
sizeof
(
max_val_
),
1
,
file
);
fwrite
(
&
default_bin_
,
sizeof
(
default_bin_
),
1
,
file
);
fwrite
(
bin_upper_bound_
.
data
(),
sizeof
(
double
),
num_bin_
,
file
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
fwrite
(
bin_upper_bound_
.
data
(),
sizeof
(
double
),
num_bin_
,
file
);
}
else
{
fwrite
(
bin_2_categorical_
.
data
(),
sizeof
(
int
),
num_bin_
,
file
);
}
}
size_t
BinMapper
::
SizesInByte
()
const
{
size_t
ret
=
sizeof
(
num_bin_
)
+
sizeof
(
is_trival_
)
+
sizeof
(
sparse_rate_
)
+
sizeof
(
min_val_
)
+
sizeof
(
max_val_
)
+
sizeof
(
default_bin_
);
ret
+=
sizeof
(
double
)
*
num_bin_
;
+
sizeof
(
bin_type_
)
+
sizeof
(
min_val_
)
+
sizeof
(
max_val_
)
+
sizeof
(
default_bin_
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
ret
+=
sizeof
(
double
)
*
num_bin_
;
}
else
{
ret
+=
sizeof
(
int
)
*
num_bin_
;
}
return
ret
;
}
...
...
src/io/config.cpp
View file @
ef778069
...
...
@@ -216,6 +216,7 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetString
(
params
,
"weight_column"
,
&
weight_column
);
GetString
(
params
,
"group_column"
,
&
group_column
);
GetString
(
params
,
"ignore_column"
,
&
ignore_column
);
GetString
(
params
,
"categorical_column"
,
&
categorical_column
);
GetInt
(
params
,
"min_data_in_leaf"
,
&
min_data_in_leaf
);
GetInt
(
params
,
"min_dato_in_bin"
,
&
min_data_in_bin
);
GetDouble
(
params
,
"max_conflict_rate"
,
&
max_conflict_rate
);
...
...
src/io/dataset.cpp
View file @
ef778069
...
...
@@ -43,8 +43,8 @@ std::vector<std::vector<int>> NoGroup(
void
Dataset
::
Construct
(
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>&
bin_mappers
,
const
std
::
vector
<
std
::
vector
<
int
>>&
sample_indices
,
size_t
total_sample_cnt
,
const
std
::
vector
<
std
::
vector
<
int
>>&
,
size_t
,
const
IOConfig
&
io_config
)
{
num_total_features_
=
static_cast
<
int
>
(
bin_mappers
.
size
());
// get num_features
...
...
src/io/dataset_loader.cpp
View file @
ef778069
...
...
@@ -131,6 +131,29 @@ void DatasetLoader::SetHeader(const char* filename) {
ignore_features_
.
emplace
(
group_idx_
);
}
}
if
(
io_config_
.
categorical_column
.
size
()
>
0
)
{
if
(
Common
::
StartsWith
(
io_config_
.
categorical_column
,
name_prefix
))
{
std
::
string
names
=
io_config_
.
categorical_column
.
substr
(
name_prefix
.
size
());
for
(
auto
name
:
Common
::
Split
(
names
.
c_str
(),
','
))
{
if
(
name2idx
.
count
(
name
)
>
0
)
{
int
tmp
=
name2idx
[
name
];
categorical_features_
.
emplace
(
tmp
);
}
else
{
Log
::
Fatal
(
"Could not find categorical_column %s in data file"
,
name
.
c_str
());
}
}
}
else
{
for
(
auto
token
:
Common
::
Split
(
io_config_
.
categorical_column
.
c_str
(),
','
))
{
int
tmp
=
0
;
if
(
!
Common
::
AtoiAndCheck
(
token
.
c_str
(),
&
tmp
))
{
Log
::
Fatal
(
"categorical_column is not a number, \
if you want to use a column name, \
please add the prefix
\"
name:
\"
to the column name"
);
}
categorical_features_
.
emplace
(
tmp
);
}
}
}
}
Dataset
*
DatasetLoader
::
LoadFromFile
(
const
char
*
filename
,
int
rank
,
int
num_machines
)
{
...
...
@@ -471,9 +494,13 @@ Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>&
bin_mappers
[
i
]
=
nullptr
;
continue
;
}
BinType
bin_type
=
BinType
::
NumericalBin
;
if
(
categorical_features_
.
count
(
i
))
{
bin_type
=
BinType
::
CategoricalBin
;
}
bin_mappers
[
i
].
reset
(
new
BinMapper
());
bin_mappers
[
i
]
->
FindBin
(
sample_values
[
i
],
total_sample_size
,
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
);
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
,
bin_type
);
}
auto
dataset
=
std
::
unique_ptr
<
Dataset
>
(
new
Dataset
(
num_data
));
dataset
->
feature_names_
=
feature_names_
;
...
...
@@ -684,9 +711,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
bin_mappers
[
i
]
=
nullptr
;
continue
;
}
BinType
bin_type
=
BinType
::
NumericalBin
;
if
(
categorical_features_
.
count
(
i
))
{
bin_type
=
BinType
::
CategoricalBin
;
}
bin_mappers
[
i
].
reset
(
new
BinMapper
());
bin_mappers
[
i
]
->
FindBin
(
sample_values
[
i
],
sample_data
.
size
(),
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
);
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
,
bin_type
);
}
}
else
{
// if have multi-machines, need find bin distributed
...
...
@@ -716,9 +747,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
// find local feature bins and copy to buffer
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
len
[
rank
];
++
i
)
{
BinType
bin_type
=
BinType
::
NumericalBin
;
if
(
categorical_features_
.
count
(
start
[
rank
]
+
i
))
{
bin_type
=
BinType
::
CategoricalBin
;
}
BinMapper
bin_mapper
;
bin_mapper
.
FindBin
(
sample_values
[
start
[
rank
]
+
i
],
sample_data
.
size
(),
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
);
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
,
bin_type
);
bin_mapper
.
CopyTo
(
input_buffer
.
data
()
+
i
*
type_size
);
}
// convert to binary size
...
...
src/io/dense_bin.hpp
View file @
ef778069
...
...
@@ -132,7 +132,7 @@ public:
virtual
data_size_t
Split
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
,
BinType
bin_type
)
const
override
{
if
(
num_data
<=
0
)
{
return
0
;
}
VAL_T
th
=
static_cast
<
VAL_T
>
(
threshold
+
min_bin
);
VAL_T
minb
=
static_cast
<
VAL_T
>
(
min_bin
);
...
...
@@ -144,19 +144,37 @@ public:
data_size_t
gt_count
=
0
;
data_size_t
*
default_indices
=
gt_indices
;
data_size_t
*
default_count
=
&
gt_count
;
if
(
default_bin
<=
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
VAL_T
bin
=
data_
[
idx
];
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
>
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
if
(
bin_type
==
BinType
::
NumericalBin
)
{
if
(
default_bin
<=
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
VAL_T
bin
=
data_
[
idx
];
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
>
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
else
{
if
(
default_bin
==
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
VAL_T
bin
=
data_
[
idx
];
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
!=
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
return
lte_count
;
...
...
src/io/dense_nbits_bin.hpp
View file @
ef778069
...
...
@@ -161,7 +161,7 @@ public:
virtual
data_size_t
Split
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
,
BinType
bin_type
)
const
override
{
if
(
num_data
<=
0
)
{
return
0
;
}
uint8_t
th
=
static_cast
<
uint8_t
>
(
threshold
+
min_bin
);
uint8_t
minb
=
static_cast
<
uint8_t
>
(
min_bin
);
...
...
@@ -173,19 +173,37 @@ public:
data_size_t
gt_count
=
0
;
data_size_t
*
default_indices
=
gt_indices
;
data_size_t
*
default_count
=
&
gt_count
;
if
(
default_bin
<=
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
const
auto
bin
=
(
data_
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
;
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
>
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
if
(
bin_type
==
BinType
::
NumericalBin
)
{
if
(
default_bin
<=
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
const
auto
bin
=
(
data_
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
;
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
>
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
else
{
if
(
default_bin
==
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
const
auto
bin
=
(
data_
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
;
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
!=
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
return
lte_count
;
...
...
src/io/sparse_bin.hpp
View file @
ef778069
...
...
@@ -125,7 +125,7 @@ public:
virtual
data_size_t
Split
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
,
BinType
bin_type
)
const
override
{
// not need to split
if
(
num_data
<=
0
)
{
return
0
;
}
VAL_T
th
=
static_cast
<
VAL_T
>
(
threshold
+
min_bin
);
...
...
@@ -139,19 +139,37 @@ public:
data_size_t
gt_count
=
0
;
data_size_t
*
default_indices
=
gt_indices
;
data_size_t
*
default_count
=
&
gt_count
;
if
(
default_bin
<=
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
VAL_T
bin
=
iterator
.
RawGet
(
idx
);
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
>
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
if
(
bin_type
==
BinType
::
NumericalBin
)
{
if
(
default_bin
<=
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
VAL_T
bin
=
iterator
.
RawGet
(
idx
);
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
>
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
else
{
if
(
default_bin
==
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
VAL_T
bin
=
iterator
.
RawGet
(
idx
);
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
!=
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
return
lte_count
;
...
...
src/io/tree.cpp
View file @
ef778069
...
...
@@ -15,6 +15,11 @@
namespace
LightGBM
{
std
::
vector
<
bool
(
*
)(
uint32_t
,
uint32_t
)
>
Tree
::
inner_decision_funs
=
{
Tree
::
NumericalDecision
<
uint32_t
>
,
Tree
::
CategoricalDecision
<
uint32_t
>
};
std
::
vector
<
bool
(
*
)(
double
,
double
)
>
Tree
::
decision_funs
=
{
Tree
::
NumericalDecision
<
double
>
,
Tree
::
CategoricalDecision
<
double
>
};
Tree
::
Tree
(
int
max_leaves
)
:
max_leaves_
(
max_leaves
)
{
...
...
@@ -25,6 +30,7 @@ Tree::Tree(int max_leaves)
split_feature_
=
std
::
vector
<
int
>
(
max_leaves_
-
1
);
threshold_in_bin_
=
std
::
vector
<
uint32_t
>
(
max_leaves_
-
1
);
threshold_
=
std
::
vector
<
double
>
(
max_leaves_
-
1
);
decision_type_
=
std
::
vector
<
int8_t
>
(
max_leaves_
-
1
);
split_gain_
=
std
::
vector
<
double
>
(
max_leaves_
-
1
);
leaf_parent_
=
std
::
vector
<
int
>
(
max_leaves_
);
leaf_value_
=
std
::
vector
<
double
>
(
max_leaves_
);
...
...
@@ -37,12 +43,13 @@ Tree::Tree(int max_leaves)
num_leaves_
=
1
;
leaf_parent_
[
0
]
=
-
1
;
shrinkage_
=
1.0
f
;
has_categorical_
=
false
;
}
Tree
::~
Tree
()
{
}
int
Tree
::
Split
(
int
leaf
,
int
feature
,
uint32_t
threshold_bin
,
int
real_feature
,
int
Tree
::
Split
(
int
leaf
,
int
feature
,
BinType
bin_type
,
uint32_t
threshold_bin
,
int
real_feature
,
double
threshold_double
,
double
left_value
,
double
right_value
,
data_size_t
left_cnt
,
data_size_t
right_cnt
,
double
gain
)
{
int
new_node_idx
=
num_leaves_
-
1
;
...
...
@@ -59,6 +66,12 @@ int Tree::Split(int leaf, int feature, uint32_t threshold_bin, int real_feature,
// add new node
split_feature_inner
[
new_node_idx
]
=
feature
;
split_feature_
[
new_node_idx
]
=
real_feature
;
if
(
bin_type
==
BinType
::
NumericalBin
)
{
decision_type_
[
new_node_idx
]
=
0
;
}
else
{
has_categorical_
=
true
;
decision_type_
[
new_node_idx
]
=
1
;
}
threshold_in_bin_
[
new_node_idx
]
=
threshold_bin
;
threshold_
[
new_node_idx
]
=
threshold_double
;
split_gain_
[
new_node_idx
]
=
gain
;
...
...
@@ -84,62 +97,196 @@ int Tree::Split(int leaf, int feature, uint32_t threshold_bin, int real_feature,
}
void
Tree
::
AddPredictionToScore
(
const
Dataset
*
data
,
data_size_t
num_data
,
double
*
score
)
const
{
if
(
data
->
num_features
()
>
num_leaves_
-
1
)
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
&
data
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
num_leaves_
-
1
);
for
(
int
i
=
0
;
i
<
num_leaves_
-
1
;
++
i
)
{
const
int
fidx
=
split_feature_inner
[
i
];
iter
[
i
].
reset
(
data
->
FeatureIterator
(
fidx
));
iter
[
i
]
->
Reset
(
start
);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
score
[
i
]
+=
static_cast
<
double
>
(
leaf_value_
[
GetLeaf
(
iter
,
i
)]);
}
});
if
(
has_categorical_
)
{
if
(
data
->
num_features
()
>
num_leaves_
-
1
)
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
&
data
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
num_leaves_
-
1
);
for
(
int
i
=
0
;
i
<
num_leaves_
-
1
;
++
i
)
{
const
int
fidx
=
split_feature_inner
[
i
];
iter
[
i
].
reset
(
data
->
FeatureIterator
(
fidx
));
iter
[
i
]
->
Reset
(
start
);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
inner_decision_funs
[
decision_type_
[
node
]](
iter
[
node
]
->
Get
(
i
),
threshold_in_bin_
[
node
]))
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
score
[
i
]
+=
static_cast
<
double
>
(
leaf_value_
[
~
node
]);
}
});
}
else
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
&
data
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
data
->
num_features
());
for
(
int
i
=
0
;
i
<
data
->
num_features
();
++
i
)
{
iter
[
i
].
reset
(
data
->
FeatureIterator
(
i
));
iter
[
i
]
->
Reset
(
start
);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
inner_decision_funs
[
decision_type_
[
node
]](
iter
[
split_feature_inner
[
node
]]
->
Get
(
i
),
threshold_in_bin_
[
node
]))
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
score
[
i
]
+=
static_cast
<
double
>
(
leaf_value_
[
~
node
]);
}
});
}
}
else
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
&
data
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
data
->
num_features
());
for
(
int
i
=
0
;
i
<
data
->
num_features
();
++
i
)
{
iter
[
i
].
reset
(
data
->
FeatureIterator
(
i
));
iter
[
i
]
->
Reset
(
start
);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
score
[
i
]
+=
static_cast
<
double
>
(
leaf_value_
[
GetLeafRaw
(
iter
,
i
)]);
}
});
if
(
data
->
num_features
()
>
num_leaves_
-
1
)
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
&
data
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
num_leaves_
-
1
);
for
(
int
i
=
0
;
i
<
num_leaves_
-
1
;
++
i
)
{
const
int
fidx
=
split_feature_inner
[
i
];
iter
[
i
].
reset
(
data
->
FeatureIterator
(
fidx
));
iter
[
i
]
->
Reset
(
start
);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
iter
[
node
]
->
Get
(
i
)
<=
threshold_in_bin_
[
node
])
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
score
[
i
]
+=
static_cast
<
double
>
(
leaf_value_
[
~
node
]);
}
});
}
else
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
&
data
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
data
->
num_features
());
for
(
int
i
=
0
;
i
<
data
->
num_features
();
++
i
)
{
iter
[
i
].
reset
(
data
->
FeatureIterator
(
i
));
iter
[
i
]
->
Reset
(
start
);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
iter
[
split_feature_inner
[
node
]]
->
Get
(
i
)
<=
threshold_in_bin_
[
node
])
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
score
[
i
]
+=
static_cast
<
double
>
(
leaf_value_
[
~
node
]);
}
});
}
}
}
void
Tree
::
AddPredictionToScore
(
const
Dataset
*
data
,
const
data_size_t
*
used_data_indices
,
data_size_t
num_data
,
double
*
score
)
const
{
if
(
data
->
num_features
()
>
num_leaves_
-
1
)
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
data
,
used_data_indices
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
num_leaves_
-
1
);
for
(
int
i
=
0
;
i
<
num_leaves_
-
1
;
++
i
)
{
const
int
fidx
=
split_feature_inner
[
i
];
iter
[
i
].
reset
(
data
->
FeatureIterator
(
fidx
));
iter
[
i
]
->
Reset
(
used_data_indices
[
start
]);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
score
[
used_data_indices
[
i
]]
+=
static_cast
<
double
>
(
leaf_value_
[
GetLeaf
(
iter
,
used_data_indices
[
i
])]);
}
});
if
(
has_categorical_
)
{
if
(
data
->
num_features
()
>
num_leaves_
-
1
)
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
data
,
used_data_indices
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
num_leaves_
-
1
);
for
(
int
i
=
0
;
i
<
num_leaves_
-
1
;
++
i
)
{
const
int
fidx
=
split_feature_inner
[
i
];
iter
[
i
].
reset
(
data
->
FeatureIterator
(
fidx
));
iter
[
i
]
->
Reset
(
used_data_indices
[
start
]);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
int
node
=
0
;
const
data_size_t
idx
=
used_data_indices
[
i
];
while
(
node
>=
0
)
{
if
(
inner_decision_funs
[
decision_type_
[
node
]](
iter
[
node
]
->
Get
(
idx
),
threshold_in_bin_
[
node
]))
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
score
[
idx
]
+=
static_cast
<
double
>
(
leaf_value_
[
~
node
]);
}
});
}
else
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
data
,
used_data_indices
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
data
->
num_features
());
for
(
int
i
=
0
;
i
<
data
->
num_features
();
++
i
)
{
iter
[
i
].
reset
(
data
->
FeatureIterator
(
i
));
iter
[
i
]
->
Reset
(
used_data_indices
[
start
]);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
const
data_size_t
idx
=
used_data_indices
[
i
];
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
inner_decision_funs
[
decision_type_
[
node
]](
iter
[
split_feature_inner
[
node
]]
->
Get
(
idx
),
threshold_in_bin_
[
node
]))
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
score
[
idx
]
+=
static_cast
<
double
>
(
leaf_value_
[
~
node
]);
}
});
}
}
else
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
data
,
used_data_indices
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
data
->
num_features
());
for
(
int
i
=
0
;
i
<
data
->
num_features
();
++
i
)
{
iter
[
i
].
reset
(
data
->
FeatureIterator
(
i
));
iter
[
i
]
->
Reset
(
used_data_indices
[
start
]);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
score
[
used_data_indices
[
i
]]
+=
static_cast
<
double
>
(
leaf_value_
[
GetLeafRaw
(
iter
,
used_data_indices
[
i
])]);
}
});
if
(
data
->
num_features
()
>
num_leaves_
-
1
)
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
data
,
used_data_indices
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
num_leaves_
-
1
);
for
(
int
i
=
0
;
i
<
num_leaves_
-
1
;
++
i
)
{
const
int
fidx
=
split_feature_inner
[
i
];
iter
[
i
].
reset
(
data
->
FeatureIterator
(
fidx
));
iter
[
i
]
->
Reset
(
used_data_indices
[
start
]);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
int
node
=
0
;
const
data_size_t
idx
=
used_data_indices
[
i
];
while
(
node
>=
0
)
{
if
(
iter
[
node
]
->
Get
(
idx
)
<=
threshold_in_bin_
[
node
])
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
score
[
idx
]
+=
static_cast
<
double
>
(
leaf_value_
[
~
node
]);
}
});
}
else
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
data
,
used_data_indices
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
data
->
num_features
());
for
(
int
i
=
0
;
i
<
data
->
num_features
();
++
i
)
{
iter
[
i
].
reset
(
data
->
FeatureIterator
(
i
));
iter
[
i
]
->
Reset
(
used_data_indices
[
start
]);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
const
data_size_t
idx
=
used_data_indices
[
i
];
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
iter
[
split_feature_inner
[
node
]]
->
Get
(
idx
)
<=
threshold_in_bin_
[
node
])
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
score
[
idx
]
+=
static_cast
<
double
>
(
leaf_value_
[
~
node
]);
}
});
}
}
}
...
...
@@ -152,6 +299,8 @@ std::string Tree::ToString() {
<<
Common
::
ArrayToString
<
double
>
(
split_gain_
,
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
str_buf
<<
"threshold="
<<
Common
::
ArrayToString
<
double
>
(
threshold_
,
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
str_buf
<<
"decision_type="
<<
Common
::
ArrayToString
<
int
>
(
Common
::
ArrayCast
<
int8_t
,
int
>
(
decision_type_
),
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
str_buf
<<
"left_child="
<<
Common
::
ArrayToString
<
int
>
(
left_child_
,
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
str_buf
<<
"right_child="
...
...
@@ -191,6 +340,7 @@ std::string Tree::NodeToJSON(int index) {
str_buf
<<
"
\"
split_feature
\"
:"
<<
split_feature_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
split_gain
\"
:"
<<
split_gain_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
threshold
\"
:"
<<
threshold_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
decision_type
\"
:
\"
"
<<
Tree
::
GetDecisionTypeName
(
decision_type_
[
index
])
<<
"
\"
,"
<<
std
::
endl
;
str_buf
<<
"
\"
internal_value
\"
:"
<<
internal_value_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
internal_count
\"
:"
<<
internal_count_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
left_child
\"
:"
<<
NodeToJSON
(
left_child_
[
index
])
<<
","
<<
std
::
endl
;
...
...
@@ -229,6 +379,7 @@ Tree::Tree(const std::string& str) {
||
key_vals
.
count
(
"leaf_parent"
)
<=
0
||
key_vals
.
count
(
"leaf_value"
)
<=
0
||
key_vals
.
count
(
"internal_value"
)
<=
0
||
key_vals
.
count
(
"internal_count"
)
<=
0
||
key_vals
.
count
(
"leaf_count"
)
<=
0
||
key_vals
.
count
(
"shrinkage"
)
<=
0
||
key_vals
.
count
(
"decision_type"
)
<=
0
)
{
Log
::
Fatal
(
"Tree model string format error"
);
}
...
...
@@ -239,6 +390,7 @@ Tree::Tree(const std::string& str) {
right_child_
=
Common
::
StringToArray
<
int
>
(
key_vals
[
"right_child"
],
' '
,
num_leaves_
-
1
);
split_feature_
=
Common
::
StringToArray
<
int
>
(
key_vals
[
"split_feature"
],
' '
,
num_leaves_
-
1
);
threshold_
=
Common
::
StringToArray
<
double
>
(
key_vals
[
"threshold"
],
' '
,
num_leaves_
-
1
);
decision_type_
=
Common
::
StringToArray
<
int8_t
>
(
key_vals
[
"decision_type"
],
' '
,
num_leaves_
-
1
);
split_gain_
=
Common
::
StringToArray
<
double
>
(
key_vals
[
"split_gain"
],
' '
,
num_leaves_
-
1
);
internal_count_
=
Common
::
StringToArray
<
data_size_t
>
(
key_vals
[
"internal_count"
],
' '
,
num_leaves_
-
1
);
internal_value_
=
Common
::
StringToArray
<
double
>
(
key_vals
[
"internal_value"
],
' '
,
num_leaves_
-
1
);
...
...
src/treelearner/feature_histogram.hpp
View file @
ef778069
...
...
@@ -41,9 +41,16 @@ public:
* \param feature the feature data for this histogram
* \param min_num_data_one_leaf minimal number of data in one leaf
*/
void
Init
(
HistogramBinEntry
*
data
,
const
FeatureMetainfo
*
meta
)
{
void
Init
(
HistogramBinEntry
*
data
,
const
FeatureMetainfo
*
meta
,
BinType
bin_type
)
{
meta_
=
meta
;
data_
=
data
;
if
(
bin_type
==
BinType
::
NumericalBin
)
{
find_best_threshold_fun_
=
std
::
bind
(
&
FeatureHistogram
::
FindBestThresholdNumerical
,
this
,
std
::
placeholders
::
_1
,
std
::
placeholders
::
_2
,
std
::
placeholders
::
_3
,
std
::
placeholders
::
_4
);
}
else
{
find_best_threshold_fun_
=
std
::
bind
(
&
FeatureHistogram
::
FindBestThresholdCategorical
,
this
,
std
::
placeholders
::
_1
,
std
::
placeholders
::
_2
,
std
::
placeholders
::
_3
,
std
::
placeholders
::
_4
);
}
}
HistogramBinEntry
*
RawData
()
{
...
...
@@ -60,9 +67,14 @@ public:
data_
[
i
].
sum_hessians
-=
other
.
data_
[
i
].
sum_hessians
;
}
}
/*!
* \brief Find the best split for this feature, dispatching to the
*        numerical or categorical routine chosen in Init().
* \param sum_gradient sum of gradients for all data on this leaf
* \param sum_hessian sum of hessians for all data on this leaf
* \param num_data number of data on this leaf
* \param output best split information (written on success)
*
* Fix: the scrape fused the removed statement `sum_hessian += 2 * kEpsilon;`
* with the new call that already passes `sum_hessian + 2 * kEpsilon`;
* keeping both would add the stabilizing epsilon term twice.
*/
void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, SplitInfo* output) {
  // 2 * kEpsilon keeps the hessian sum strictly positive for the gain formulas
  find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, output);
}
void
FindBestThresholdNumerical
(
double
sum_gradient
,
double
sum_hessian
,
data_size_t
num_data
,
SplitInfo
*
output
)
{
double
best_sum_left_gradient
=
NAN
;
double
best_sum_left_hessian
=
NAN
;
double
best_gain
=
kMinScore
;
...
...
@@ -131,6 +143,97 @@ public:
output
->
gain
=
kMinScore
;
}
}
/*!
* \brief Find the best one-vs-rest split for a categorical feature.
*
* Scans every category bin t (right to left) and evaluates the split
* "bin == t" vs "bin != t", subject to min_data_in_leaf,
* min_sum_hessian_in_leaf and min_gain_to_split constraints. When
* bias == 1, bin 0 is not materialized in data_, so its statistics are
* restored by subtracting every stored bin from the leaf totals and the
* bin-0 split is evaluated as well.
*
* \param sum_gradient sum of gradients for all data on this leaf
* \param sum_hessian sum of hessians for all data on this leaf
* \param num_data number of data on this leaf
* \param output best split information (gain set to kMinScore if none found)
*/
void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data, SplitInfo* output) {
  double best_gain = kMinScore;
  uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
  // gain of the leaf if it is not split; a split must beat this plus min_gain_to_split
  double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian);
  double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
  is_splittable_ = false;
  const int bias = meta_->bias;
  const int t_end = 0;
  // from right to left, and we don't need data in bin0
  for (int t = meta_->num_bin - 1 - bias; t >= t_end; --t) {
    const auto& entry = data_[t];
    // if data not enough, or sum hessian too small
    if (entry.cnt < meta_->tree_config->min_data_in_leaf
        || entry.sum_hessians < meta_->tree_config->min_sum_hessian_in_leaf) continue;
    // if data not enough on the complement side
    data_size_t other_count = num_data - entry.cnt;
    if (other_count < meta_->tree_config->min_data_in_leaf) continue;
    // if sum hessian too small on the complement side
    double sum_other_hessian = sum_hessian - entry.sum_hessians - kEpsilon;
    if (sum_other_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
    double sum_other_gradient = sum_gradient - entry.sum_gradients;
    // current split gain: "this category" leaf + "everything else" leaf
    double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
      + GetLeafSplitGain(entry.sum_gradients, entry.sum_hessians + kEpsilon);
    // gain with split is worse than without split
    if (current_gain <= min_gain_shift) continue;
    // mark to is splittable
    is_splittable_ = true;
    // better split point
    if (current_gain > best_gain) {
      best_threshold = static_cast<uint32_t>(t + bias);
      best_gain = current_gain;
    }
  }
  // need restore zero bin: bin 0 is implicit when bias == 1, so rebuild its
  // statistics as (leaf totals) minus (all stored bins)
  if (bias == 1) {
    double sum_bin0_gradient = sum_gradient;
    double sum_bin0_hessian = sum_hessian;
    data_size_t cnt_bin0 = num_data;
    for (int t = meta_->num_bin - 1 - bias; t >= 0; --t) {
      sum_bin0_gradient -= data_[t].sum_gradients;
      sum_bin0_hessian -= data_[t].sum_hessians;
      cnt_bin0 -= data_[t].cnt;
    }
    data_size_t other_count = num_data - cnt_bin0;
    double sum_other_hessian = sum_hessian - sum_bin0_hessian - kEpsilon;
    if (cnt_bin0 >= meta_->tree_config->min_data_in_leaf
        && sum_bin0_hessian >= meta_->tree_config->min_sum_hessian_in_leaf
        && other_count >= meta_->tree_config->min_data_in_leaf
        && sum_other_hessian >= meta_->tree_config->min_sum_hessian_in_leaf) {
      double sum_other_gradient = sum_gradient - sum_bin0_gradient;
      double current_gain = GetLeafSplitGain(sum_other_gradient, sum_other_hessian)
        + GetLeafSplitGain(sum_bin0_gradient, sum_bin0_hessian + kEpsilon);
      if (current_gain > min_gain_shift) {
        is_splittable_ = true;
        // better split point
        if (current_gain > best_gain) {
          best_threshold = static_cast<uint32_t>(0);
          best_gain = current_gain;
        }
      }
    }
  }
  if (is_splittable_) {
    // update split information; left child is the chosen category bin
    // NOTE(review): best_threshold stores t + bias, but data_ below is indexed
    // by best_threshold directly; for bias == 1 (and for the restored bin 0)
    // this reads the neighboring stored bin -- confirm intended upstream.
    output->feature = meta_->feature_idx;
    output->threshold = best_threshold;
    output->left_output = CalculateSplittedLeafOutput(data_[best_threshold].sum_gradients,
                                                      data_[best_threshold].sum_hessians + kEpsilon);
    output->left_count = data_[best_threshold].cnt;
    output->left_sum_gradient = data_[best_threshold].sum_gradients;
    output->left_sum_hessian = data_[best_threshold].sum_hessians + kEpsilon;
    output->right_output = CalculateSplittedLeafOutput(sum_gradient - data_[best_threshold].sum_gradients,
                                                       sum_hessian - data_[best_threshold].sum_hessians - kEpsilon);
    output->right_count = num_data - data_[best_threshold].cnt;
    output->right_sum_gradient = sum_gradient - data_[best_threshold].sum_gradients;
    output->right_sum_hessian = sum_hessian - data_[best_threshold].sum_hessians - kEpsilon;
    output->gain = best_gain - gain_shift;
  } else {
    // no admissible split for this feature
    output->feature = meta_->feature_idx;
    output->gain = kMinScore;
  }
}
/*!
* \brief Binary size of this histogram
*/
...
...
@@ -188,6 +291,8 @@ private:
//std::vector<HistogramBinEntry> data_;
/*! \brief False if this histogram cannot split */
bool
is_splittable_
=
true
;
std
::
function
<
void
(
double
,
double
,
data_size_t
,
SplitInfo
*
)
>
find_best_threshold_fun_
;
};
class
HistogramPool
{
public:
...
...
@@ -264,7 +369,7 @@ public:
uint64_t
offset
=
0
;
for
(
int
j
=
0
;
j
<
train_data
->
num_features
();
++
j
)
{
offset
+=
static_cast
<
uint64_t
>
(
train_data
->
SubFeatureBinOffset
(
j
));
pool_
[
i
][
j
].
Init
(
data_
[
i
].
data
()
+
offset
,
&
feature_metas_
[
j
]);
pool_
[
i
][
j
].
Init
(
data_
[
i
].
data
()
+
offset
,
&
feature_metas_
[
j
]
,
train_data
->
FeatureBinMapper
(
j
)
->
bin_type
()
);
auto
num_bin
=
train_data
->
FeatureNumBin
(
j
);
if
(
train_data
->
FeatureBinMapper
(
j
)
->
GetDefaultBin
()
==
0
)
{
num_bin
-=
1
;
...
...
src/treelearner/serial_tree_learner.cpp
View file @
ef778069
...
...
@@ -490,7 +490,8 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri
// left = parent
*
left_leaf
=
best_Leaf
;
// split tree, will return right leaf
*
right_leaf
=
tree
->
Split
(
best_Leaf
,
best_split_info
.
feature
,
*
right_leaf
=
tree
->
Split
(
best_Leaf
,
best_split_info
.
feature
,
train_data_
->
FeatureBinMapper
(
best_split_info
.
feature
)
->
bin_type
(),
best_split_info
.
threshold
,
train_data_
->
RealFeatureIndex
(
best_split_info
.
feature
),
train_data_
->
RealThreshold
(
best_split_info
.
feature
,
best_split_info
.
threshold
),
...
...
src/treelearner/voting_parallel_tree_learner.cpp
View file @
ef778069
...
...
@@ -79,8 +79,8 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) {
uint64_t
offset
=
0
;
for
(
int
j
=
0
;
j
<
train_data
->
num_features
();
++
j
)
{
offset
+=
static_cast
<
uint64_t
>
(
train_data
->
SubFeatureBinOffset
(
j
));
smaller_leaf_histogram_array_global_
[
j
].
Init
(
smaller_leaf_histogram_data_
.
data
()
+
offset
,
&
feature_metas_
[
j
]);
larger_leaf_histogram_array_global_
[
j
].
Init
(
larger_leaf_histogram_data_
.
data
()
+
offset
,
&
feature_metas_
[
j
]);
smaller_leaf_histogram_array_global_
[
j
].
Init
(
smaller_leaf_histogram_data_
.
data
()
+
offset
,
&
feature_metas_
[
j
]
,
train_data
->
FeatureBinMapper
(
j
)
->
bin_type
()
);
larger_leaf_histogram_array_global_
[
j
].
Init
(
larger_leaf_histogram_data_
.
data
()
+
offset
,
&
feature_metas_
[
j
]
,
train_data
->
FeatureBinMapper
(
j
)
->
bin_type
()
);
auto
num_bin
=
train_data
->
FeatureNumBin
(
j
);
if
(
train_data
->
FeatureBinMapper
(
j
)
->
GetDefaultBin
()
==
0
)
{
num_bin
-=
1
;
...
...
tests/python_package_test/test_basic.py
View file @
ef778069
...
...
@@ -49,7 +49,7 @@ class TestBasic(unittest.TestCase):
for
preds
in
zip
(
pred_from_matr
,
pred_from_model_file
):
self
.
assertEqual
(
*
preds
)
# check pmml
#
os.system('python ../../pmml/pmml.py model.txt')
os
.
system
(
'python ../../pmml/pmml.py model.txt'
)
print
(
"----------------------------------------------------------------------"
)
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment