Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
ef778069
Commit
ef778069
authored
Mar 01, 2017
by
Guolin Ke
Browse files
Add categorical feature support back.
parent
d93eb338
Changes
41
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
807 additions
and
258 deletions
+807
-258
include/LightGBM/feature_group.h
include/LightGBM/feature_group.h
+1
-1
include/LightGBM/tree.h
include/LightGBM/tree.h
+25
-37
pmml/pmml.py
pmml/pmml.py
+3
-2
python-package/lightgbm/basic.py
python-package/lightgbm/basic.py
+114
-18
python-package/lightgbm/compat.py
python-package/lightgbm/compat.py
+9
-0
python-package/lightgbm/engine.py
python-package/lightgbm/engine.py
+14
-2
python-package/lightgbm/plotting.py
python-package/lightgbm/plotting.py
+6
-1
python-package/lightgbm/sklearn.py
python-package/lightgbm/sklearn.py
+13
-4
src/io/bin.cpp
src/io/bin.cpp
+171
-90
src/io/config.cpp
src/io/config.cpp
+1
-0
src/io/dataset.cpp
src/io/dataset.cpp
+2
-2
src/io/dataset_loader.cpp
src/io/dataset_loader.cpp
+38
-3
src/io/dense_bin.hpp
src/io/dense_bin.hpp
+32
-14
src/io/dense_nbits_bin.hpp
src/io/dense_nbits_bin.hpp
+32
-14
src/io/sparse_bin.hpp
src/io/sparse_bin.hpp
+32
-14
src/io/tree.cpp
src/io/tree.cpp
+201
-49
src/treelearner/feature_histogram.hpp
src/treelearner/feature_histogram.hpp
+108
-3
src/treelearner/serial_tree_learner.cpp
src/treelearner/serial_tree_learner.cpp
+2
-1
src/treelearner/voting_parallel_tree_learner.cpp
src/treelearner/voting_parallel_tree_learner.cpp
+2
-2
tests/python_package_test/test_basic.py
tests/python_package_test/test_basic.py
+1
-1
No files found.
include/LightGBM/feature_group.h
View file @
ef778069
...
@@ -131,7 +131,7 @@ public:
...
@@ -131,7 +131,7 @@ public:
uint32_t
max_bin
=
bin_offsets_
[
sub_feature
+
1
]
-
1
;
uint32_t
max_bin
=
bin_offsets_
[
sub_feature
+
1
]
-
1
;
uint32_t
default_bin
=
bin_mappers_
[
sub_feature
]
->
GetDefaultBin
();
uint32_t
default_bin
=
bin_mappers_
[
sub_feature
]
->
GetDefaultBin
();
return
bin_data_
->
Split
(
min_bin
,
max_bin
,
default_bin
,
return
bin_data_
->
Split
(
min_bin
,
max_bin
,
default_bin
,
threshold
,
data_indices
,
num_data
,
lte_indices
,
gt_indices
);
threshold
,
data_indices
,
num_data
,
lte_indices
,
gt_indices
,
bin_mappers_
[
sub_feature
]
->
bin_type
()
);
}
}
/*!
/*!
* \brief From bin to feature value
* \brief From bin to feature value
...
...
include/LightGBM/tree.h
View file @
ef778069
...
@@ -34,6 +34,7 @@ public:
...
@@ -34,6 +34,7 @@ public:
* \brief Performing a split on tree leaves.
* \brief Performing a split on tree leaves.
* \param leaf Index of leaf to be split
* \param leaf Index of leaf to be split
* \param feature Index of feature; the converted index after removing useless features
* \param feature Index of feature; the converted index after removing useless features
* \param bin_type type of this feature, numerical or categorical
* \param threshold Threshold(bin) of split
* \param threshold Threshold(bin) of split
* \param real_feature Index of feature, the original index on data
* \param real_feature Index of feature, the original index on data
* \param threshold_double Threshold on feature value
* \param threshold_double Threshold on feature value
...
@@ -44,7 +45,7 @@ public:
...
@@ -44,7 +45,7 @@ public:
* \param gain Split gain
* \param gain Split gain
* \return The index of new leaf.
* \return The index of new leaf.
*/
*/
int
Split
(
int
leaf
,
int
feature
,
uint32_t
threshold
,
int
real_feature
,
int
Split
(
int
leaf
,
int
feature
,
BinType
bin_type
,
uint32_t
threshold
,
int
real_feature
,
double
threshold_double
,
double
left_value
,
double
threshold_double
,
double
left_value
,
double
right_value
,
data_size_t
left_cnt
,
data_size_t
right_cnt
,
double
gain
);
double
right_value
,
data_size_t
left_cnt
,
data_size_t
right_cnt
,
double
gain
);
...
@@ -113,6 +114,15 @@ public:
...
@@ -113,6 +114,15 @@ public:
/*! \brief Serialize this object to json*/
/*! \brief Serialize this object to json*/
std
::
string
ToJSON
();
std
::
string
ToJSON
();
template
<
typename
T
>
static
bool
CategoricalDecision
(
T
fval
,
T
threshold
)
{
if
(
static_cast
<
int
>
(
fval
)
==
static_cast
<
int
>
(
threshold
))
{
return
true
;
}
else
{
return
false
;
}
}
template
<
typename
T
>
template
<
typename
T
>
static
bool
NumericalDecision
(
T
fval
,
T
threshold
)
{
static
bool
NumericalDecision
(
T
fval
,
T
threshold
)
{
if
(
fval
<=
threshold
)
{
if
(
fval
<=
threshold
)
{
...
@@ -122,13 +132,18 @@ public:
...
@@ -122,13 +132,18 @@ public:
}
}
}
}
private:
static
const
char
*
GetDecisionTypeName
(
int8_t
type
)
{
if
(
type
==
0
)
{
return
"no_greater"
;
}
else
{
return
"is"
;
}
}
inline
int
GetLeaf
(
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
static
std
::
vector
<
bool
(
*
)(
uint32_t
,
uint32_t
)
>
inner_decision_funs
;
data_size_t
data_idx
)
const
;
static
std
::
vector
<
bool
(
*
)(
double
,
double
)
>
decision_funs
;
inline
int
GetLeafRaw
(
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
private:
data_size_t
data_idx
)
const
;
/*!
/*!
* \brief Find leaf index of which record belongs by features
* \brief Find leaf index of which record belongs by features
...
@@ -157,6 +172,8 @@ private:
...
@@ -157,6 +172,8 @@ private:
std
::
vector
<
uint32_t
>
threshold_in_bin_
;
std
::
vector
<
uint32_t
>
threshold_in_bin_
;
/*! \brief A non-leaf node's split threshold in feature value */
/*! \brief A non-leaf node's split threshold in feature value */
std
::
vector
<
double
>
threshold_
;
std
::
vector
<
double
>
threshold_
;
/*! \brief Decision type, 0 for '<='(numerical feature), 1 for 'is'(categorical feature) */
std
::
vector
<
int8_t
>
decision_type_
;
/*! \brief A non-leaf node's split gain */
/*! \brief A non-leaf node's split gain */
std
::
vector
<
double
>
split_gain_
;
std
::
vector
<
double
>
split_gain_
;
// used for leaf node
// used for leaf node
...
@@ -173,6 +190,7 @@ private:
...
@@ -173,6 +190,7 @@ private:
/*! \brief Depth for leaves */
/*! \brief Depth for leaves */
std
::
vector
<
int
>
leaf_depth_
;
std
::
vector
<
int
>
leaf_depth_
;
double
shrinkage_
;
double
shrinkage_
;
bool
has_categorical_
;
};
};
...
@@ -186,40 +204,10 @@ inline int Tree::PredictLeafIndex(const double* feature_values) const {
...
@@ -186,40 +204,10 @@ inline int Tree::PredictLeafIndex(const double* feature_values) const {
return
leaf
;
return
leaf
;
}
}
inline
int
Tree
::
GetLeaf
(
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
data_size_t
data_idx
)
const
{
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
NumericalDecision
<
uint32_t
>
(
iterators
[
node
]
->
Get
(
data_idx
),
threshold_in_bin_
[
node
]))
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
return
~
node
;
}
inline
int
Tree
::
GetLeafRaw
(
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>&
iterators
,
data_size_t
data_idx
)
const
{
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
NumericalDecision
<
uint32_t
>
(
iterators
[
split_feature_inner
[
node
]]
->
Get
(
data_idx
),
threshold_in_bin_
[
node
]))
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
return
~
node
;
}
inline
int
Tree
::
GetLeaf
(
const
double
*
feature_values
)
const
{
inline
int
Tree
::
GetLeaf
(
const
double
*
feature_values
)
const
{
int
node
=
0
;
int
node
=
0
;
while
(
node
>=
0
)
{
while
(
node
>=
0
)
{
if
(
NumericalDecision
<
double
>
(
if
(
decision_funs
[
decision_type_
[
node
]]
(
feature_values
[
split_feature_
[
node
]],
feature_values
[
split_feature_
[
node
]],
threshold_
[
node
]))
{
threshold_
[
node
]))
{
node
=
left_child_
[
node
];
node
=
left_child_
[
node
];
...
...
pmml/pmml.py
View file @
ef778069
...
@@ -31,9 +31,9 @@ def get_threshold(node_id, prev_node_idx, is_child):
...
@@ -31,9 +31,9 @@ def get_threshold(node_id, prev_node_idx, is_child):
def
print_simple_predicate
(
tab_len
,
node_id
,
is_left_child
,
prev_node_idx
,
is_leaf
):
def
print_simple_predicate
(
tab_len
,
node_id
,
is_left_child
,
prev_node_idx
,
is_leaf
):
if
is_left_child
:
if
is_left_child
:
op
=
'lessOrEqual'
op
=
'equal'
if
decision_type
[
prev_node_idx
]
==
1
else
'lessOrEqual'
else
:
else
:
op
=
'greaterThan'
op
=
'notEqual'
if
decision_type
[
prev_node_idx
]
==
1
else
'greaterThan'
out_
(
'
\t
'
*
(
tab_len
+
1
)
+
(
"<SimplePredicate field=
\"
{0}
\"
"
+
" operator=
\"
{1}
\"
value=
\"
{2}
\"
/>"
).
format
(
out_
(
'
\t
'
*
(
tab_len
+
1
)
+
(
"<SimplePredicate field=
\"
{0}
\"
"
+
" operator=
\"
{1}
\"
value=
\"
{2}
\"
/>"
).
format
(
get_field_name
(
node_id
,
prev_node_idx
,
is_leaf
),
op
,
get_threshold
(
node_id
,
prev_node_idx
,
is_leaf
)))
get_field_name
(
node_id
,
prev_node_idx
,
is_leaf
),
op
,
get_threshold
(
node_id
,
prev_node_idx
,
is_leaf
)))
...
@@ -128,6 +128,7 @@ with open('LightGBM_pmml.xml', 'w') as pmml_out:
...
@@ -128,6 +128,7 @@ with open('LightGBM_pmml.xml', 'w') as pmml_out:
split_feature
=
get_array_ints
(
next
(
model_content
))
split_feature
=
get_array_ints
(
next
(
model_content
))
split_gain
=
next
(
model_content
)
# unused
split_gain
=
next
(
model_content
)
# unused
threshold
=
get_array_strings
(
next
(
model_content
))
threshold
=
get_array_strings
(
next
(
model_content
))
decision_type
=
get_array_ints
(
next
(
model_content
))
left_child
=
get_array_ints
(
next
(
model_content
))
left_child
=
get_array_ints
(
next
(
model_content
))
right_child
=
get_array_ints
(
next
(
model_content
))
right_child
=
get_array_ints
(
next
(
model_content
))
leaf_parent
=
get_array_ints
(
next
(
model_content
))
leaf_parent
=
get_array_ints
(
next
(
model_content
))
...
...
python-package/lightgbm/basic.py
View file @
ef778069
...
@@ -12,8 +12,9 @@ from tempfile import NamedTemporaryFile
...
@@ -12,8 +12,9 @@ from tempfile import NamedTemporaryFile
import
numpy
as
np
import
numpy
as
np
import
scipy.sparse
import
scipy.sparse
from
.compat
import
(
DataFrame
,
Series
,
integer_types
,
json
,
numeric_types
,
from
.compat
import
(
DataFrame
,
Series
,
integer_types
,
json
,
range_
,
string_type
)
json_default_with_numpy
,
numeric_types
,
range_
,
string_type
)
from
.libpath
import
find_lib_path
from
.libpath
import
find_lib_path
...
@@ -220,22 +221,49 @@ PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
...
@@ -220,22 +221,49 @@ PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
'float32'
:
'float'
,
'float64'
:
'float'
,
'bool'
:
'int'
}
'float32'
:
'float'
,
'float64'
:
'float'
,
'bool'
:
'int'
}
def
_data_from_pandas
(
data
,
feature_name
):
def
_data_from_pandas
(
data
,
feature_name
,
categorical_feature
,
pandas_categorical
):
if
isinstance
(
data
,
DataFrame
):
if
isinstance
(
data
,
DataFrame
):
bad_fields
=
[
data
.
columns
[
i
]
for
i
,
dtype
in
enumerate
(
data
.
dtypes
)
if
dtype
.
name
not
in
PANDAS_DTYPE_MAPPER
]
if
feature_name
==
'auto'
or
feature_name
is
None
:
if
bad_fields
:
msg
=
"""DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields: """
raise
ValueError
(
msg
+
', '
.
join
(
bad_fields
))
if
feature_name
==
'auto'
:
if
all
([
isinstance
(
name
,
integer_types
+
(
np
.
integer
,
))
for
name
in
data
.
columns
]):
if
all
([
isinstance
(
name
,
integer_types
+
(
np
.
integer
,
))
for
name
in
data
.
columns
]):
msg
=
"""Using Pandas (default) integer column names, not column indexes. You can use indexes with DataFrame.values."""
msg
=
"""Using Pandas (default) integer column names, not column indexes. You can use indexes with DataFrame.values."""
warnings
.
filterwarnings
(
'once'
)
warnings
.
filterwarnings
(
'once'
)
warnings
.
warn
(
msg
,
stacklevel
=
5
)
warnings
.
warn
(
msg
,
stacklevel
=
5
)
feature_name
=
[
str
(
name
)
for
name
in
data
.
columns
]
data
=
data
.
rename
(
columns
=
str
)
cat_cols
=
data
.
select_dtypes
(
include
=
[
'category'
]).
columns
if
pandas_categorical
is
None
:
# train dataset
pandas_categorical
=
[
list
(
data
[
col
].
cat
.
categories
)
for
col
in
cat_cols
]
else
:
if
len
(
cat_cols
)
!=
len
(
pandas_categorical
):
raise
ValueError
(
'train and valid dataset categorical_feature do not match.'
)
for
col
,
category
in
zip
(
cat_cols
,
pandas_categorical
):
if
list
(
data
[
col
].
cat
.
categories
)
!=
list
(
category
):
data
[
col
]
=
data
[
col
].
cat
.
set_categories
(
category
)
if
len
(
cat_cols
):
# cat_cols is pandas Index object
data
=
data
.
copy
()
# not alter origin DataFrame
data
[
cat_cols
]
=
data
[
cat_cols
].
apply
(
lambda
x
:
x
.
cat
.
codes
)
if
categorical_feature
is
not
None
:
if
feature_name
is
None
:
feature_name
=
list
(
data
.
columns
)
if
categorical_feature
==
'auto'
:
categorical_feature
=
list
(
cat_cols
)
else
:
categorical_feature
=
list
(
categorical_feature
)
+
list
(
cat_cols
)
if
feature_name
==
'auto'
:
feature_name
=
list
(
data
.
columns
)
data_dtypes
=
data
.
dtypes
if
not
all
(
dtype
.
name
in
PANDAS_DTYPE_MAPPER
for
dtype
in
data_dtypes
):
bad_fields
=
[
data
.
columns
[
i
]
for
i
,
dtype
in
enumerate
(
data_dtypes
)
if
dtype
.
name
not
in
PANDAS_DTYPE_MAPPER
]
msg
=
"""DataFrame.dtypes for data must be int, float or bool. Did not expect the data types in fields """
raise
ValueError
(
msg
+
', '
.
join
(
bad_fields
))
data
=
data
.
values
.
astype
(
'float'
)
data
=
data
.
values
.
astype
(
'float'
)
elif
feature_name
==
'auto'
:
else
:
feature_name
=
None
if
feature_name
==
'auto'
:
return
data
,
feature_name
feature_name
=
None
if
categorical_feature
==
'auto'
:
categorical_feature
=
None
return
data
,
feature_name
,
categorical_feature
,
pandas_categorical
def
_label_from_pandas
(
label
):
def
_label_from_pandas
(
label
):
...
@@ -249,6 +277,19 @@ def _label_from_pandas(label):
...
@@ -249,6 +277,19 @@ def _label_from_pandas(label):
return
label
return
label
def
_save_pandas_categorical
(
file_name
,
pandas_categorical
):
with
open
(
file_name
,
'a'
)
as
f
:
f
.
write
(
'
\n
pandas_categorical:'
+
json
.
dumps
(
pandas_categorical
,
default
=
json_default_with_numpy
))
def
_load_pandas_categorical
(
file_name
):
with
open
(
file_name
,
'r'
)
as
f
:
last_line
=
f
.
readlines
()[
-
1
]
if
last_line
.
startswith
(
'pandas_categorical:'
):
return
json
.
loads
(
last_line
[
len
(
'pandas_categorical:'
):])
return
None
class
_InnerPredictor
(
object
):
class
_InnerPredictor
(
object
):
"""
"""
A _InnerPredictor of LightGBM.
A _InnerPredictor of LightGBM.
...
@@ -280,6 +321,7 @@ class _InnerPredictor(object):
...
@@ -280,6 +321,7 @@ class _InnerPredictor(object):
ctypes
.
byref
(
out_num_class
)))
ctypes
.
byref
(
out_num_class
)))
self
.
num_class
=
out_num_class
.
value
self
.
num_class
=
out_num_class
.
value
self
.
num_total_iteration
=
out_num_iterations
.
value
self
.
num_total_iteration
=
out_num_iterations
.
value
self
.
pandas_categorical
=
_load_pandas_categorical
(
model_file
)
elif
booster_handle
is
not
None
:
elif
booster_handle
is
not
None
:
self
.
__is_manage_handle
=
False
self
.
__is_manage_handle
=
False
self
.
handle
=
booster_handle
self
.
handle
=
booster_handle
...
@@ -293,6 +335,7 @@ class _InnerPredictor(object):
...
@@ -293,6 +335,7 @@ class _InnerPredictor(object):
self
.
handle
,
self
.
handle
,
ctypes
.
byref
(
out_num_iterations
)))
ctypes
.
byref
(
out_num_iterations
)))
self
.
num_total_iteration
=
out_num_iterations
.
value
self
.
num_total_iteration
=
out_num_iterations
.
value
self
.
pandas_categorical
=
None
else
:
else
:
raise
TypeError
(
'Need Model file or Booster handle to create a predictor'
)
raise
TypeError
(
'Need Model file or Booster handle to create a predictor'
)
...
@@ -328,7 +371,7 @@ class _InnerPredictor(object):
...
@@ -328,7 +371,7 @@ class _InnerPredictor(object):
"""
"""
if
isinstance
(
data
,
Dataset
):
if
isinstance
(
data
,
Dataset
):
raise
TypeError
(
"Cannot use Dataset instance for prediction, please use raw data instead"
)
raise
TypeError
(
"Cannot use Dataset instance for prediction, please use raw data instead"
)
data
=
_data_from_pandas
(
data
,
None
)[
0
]
data
=
_data_from_pandas
(
data
,
None
,
None
,
self
.
pandas_categorical
)[
0
]
predict_type
=
C_API_PREDICT_NORMAL
predict_type
=
C_API_PREDICT_NORMAL
if
raw_score
:
if
raw_score
:
predict_type
=
C_API_PREDICT_RAW_SCORE
predict_type
=
C_API_PREDICT_RAW_SCORE
...
@@ -359,6 +402,9 @@ class _InnerPredictor(object):
...
@@ -359,6 +402,9 @@ class _InnerPredictor(object):
elif
isinstance
(
data
,
np
.
ndarray
):
elif
isinstance
(
data
,
np
.
ndarray
):
preds
,
nrow
=
self
.
__pred_for_np2d
(
data
,
num_iteration
,
preds
,
nrow
=
self
.
__pred_for_np2d
(
data
,
num_iteration
,
predict_type
)
predict_type
)
elif
isinstance
(
data
,
DataFrame
):
preds
,
nrow
=
self
.
__pred_for_np2d
(
data
.
values
,
num_iteration
,
predict_type
)
else
:
else
:
try
:
try
:
csr
=
scipy
.
sparse
.
csr_matrix
(
data
)
csr
=
scipy
.
sparse
.
csr_matrix
(
data
)
...
@@ -486,7 +532,7 @@ class Dataset(object):
...
@@ -486,7 +532,7 @@ class Dataset(object):
"""Dataset in LightGBM."""
"""Dataset in LightGBM."""
def
__init__
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
def
__init__
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
weight
=
None
,
group
=
None
,
silent
=
False
,
weight
=
None
,
group
=
None
,
silent
=
False
,
feature_name
=
'auto'
,
params
=
None
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
params
=
None
,
free_raw_data
=
True
):
free_raw_data
=
True
):
"""
"""
Parameters
Parameters
...
@@ -509,6 +555,11 @@ class Dataset(object):
...
@@ -509,6 +555,11 @@ class Dataset(object):
feature_name : list of str, or 'auto'
feature_name : list of str, or 'auto'
Feature names
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
params: dict, optional
params: dict, optional
Other parameters
Other parameters
free_raw_data: Bool
free_raw_data: Bool
...
@@ -523,10 +574,12 @@ class Dataset(object):
...
@@ -523,10 +574,12 @@ class Dataset(object):
self
.
group
=
group
self
.
group
=
group
self
.
silent
=
silent
self
.
silent
=
silent
self
.
feature_name
=
feature_name
self
.
feature_name
=
feature_name
self
.
categorical_feature
=
categorical_feature
self
.
params
=
params
self
.
params
=
params
self
.
free_raw_data
=
free_raw_data
self
.
free_raw_data
=
free_raw_data
self
.
used_indices
=
None
self
.
used_indices
=
None
self
.
_predictor
=
None
self
.
_predictor
=
None
self
.
pandas_categorical
=
None
def
__del__
(
self
):
def
__del__
(
self
):
self
.
_free_handle
()
self
.
_free_handle
()
...
@@ -539,11 +592,11 @@ class Dataset(object):
...
@@ -539,11 +592,11 @@ class Dataset(object):
def
_lazy_init
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
def
_lazy_init
(
self
,
data
,
label
=
None
,
max_bin
=
255
,
reference
=
None
,
weight
=
None
,
group
=
None
,
predictor
=
None
,
weight
=
None
,
group
=
None
,
predictor
=
None
,
silent
=
False
,
feature_name
=
'auto'
,
silent
=
False
,
feature_name
=
'auto'
,
params
=
None
):
categorical_feature
=
'auto'
,
params
=
None
):
if
data
is
None
:
if
data
is
None
:
self
.
handle
=
None
self
.
handle
=
None
return
return
data
,
feature_name
,
=
_data_from_pandas
(
data
,
feature_name
)
data
,
feature_name
,
categorical_feature
,
self
.
pandas_categorical
=
_data_from_pandas
(
data
,
feature_name
,
categorical_feature
,
self
.
pandas_categorical
)
label
=
_label_from_pandas
(
label
)
label
=
_label_from_pandas
(
label
)
self
.
data_has_header
=
False
self
.
data_has_header
=
False
"""process for args"""
"""process for args"""
...
@@ -555,6 +608,23 @@ class Dataset(object):
...
@@ -555,6 +608,23 @@ class Dataset(object):
params
[
"verbose"
]
=
0
params
[
"verbose"
]
=
0
elif
"verbose"
not
in
params
:
elif
"verbose"
not
in
params
:
params
[
"verbose"
]
=
1
params
[
"verbose"
]
=
1
"""get categorical features"""
if
categorical_feature
is
not
None
:
categorical_indices
=
set
()
feature_dict
=
{}
if
feature_name
is
not
None
:
feature_dict
=
{
name
:
i
for
i
,
name
in
enumerate
(
feature_name
)}
for
name
in
categorical_feature
:
if
isinstance
(
name
,
string_type
)
and
name
in
feature_dict
:
categorical_indices
.
add
(
feature_dict
[
name
])
elif
isinstance
(
name
,
integer_types
):
categorical_indices
.
add
(
name
)
else
:
raise
TypeError
(
"Wrong type({}) or unknown name({}) in categorical_feature"
.
format
(
type
(
name
).
__name__
,
name
))
params
[
'categorical_column'
]
=
sorted
(
categorical_indices
)
params_str
=
param_dict_to_str
(
params
)
params_str
=
param_dict_to_str
(
params
)
"""process for reference dataset"""
"""process for reference dataset"""
ref_dataset
=
None
ref_dataset
=
None
...
@@ -714,7 +784,7 @@ class Dataset(object):
...
@@ -714,7 +784,7 @@ class Dataset(object):
self
.
_lazy_init
(
self
.
data
,
label
=
self
.
label
,
max_bin
=
self
.
max_bin
,
self
.
_lazy_init
(
self
.
data
,
label
=
self
.
label
,
max_bin
=
self
.
max_bin
,
weight
=
self
.
weight
,
group
=
self
.
group
,
predictor
=
self
.
_predictor
,
weight
=
self
.
weight
,
group
=
self
.
group
,
predictor
=
self
.
_predictor
,
silent
=
self
.
silent
,
feature_name
=
self
.
feature_name
,
silent
=
self
.
silent
,
feature_name
=
self
.
feature_name
,
params
=
self
.
params
)
categorical_feature
=
self
.
categorical_feature
,
params
=
self
.
params
)
if
self
.
free_raw_data
:
if
self
.
free_raw_data
:
self
.
data
=
None
self
.
data
=
None
return
self
return
self
...
@@ -744,6 +814,7 @@ class Dataset(object):
...
@@ -744,6 +814,7 @@ class Dataset(object):
weight
=
weight
,
group
=
group
,
silent
=
silent
,
params
=
params
,
weight
=
weight
,
group
=
group
,
silent
=
silent
,
params
=
params
,
free_raw_data
=
self
.
free_raw_data
)
free_raw_data
=
self
.
free_raw_data
)
ret
.
_predictor
=
self
.
_predictor
ret
.
_predictor
=
self
.
_predictor
ret
.
pandas_categorical
=
self
.
pandas_categorical
return
ret
return
ret
def
subset
(
self
,
used_indices
,
params
=
None
):
def
subset
(
self
,
used_indices
,
params
=
None
):
...
@@ -758,8 +829,9 @@ class Dataset(object):
...
@@ -758,8 +829,9 @@ class Dataset(object):
Other parameters
Other parameters
"""
"""
ret
=
Dataset
(
None
,
reference
=
self
,
feature_name
=
self
.
feature_name
,
ret
=
Dataset
(
None
,
reference
=
self
,
feature_name
=
self
.
feature_name
,
params
=
params
)
categorical_feature
=
self
.
categorical_feature
,
params
=
params
)
ret
.
_predictor
=
self
.
_predictor
ret
.
_predictor
=
self
.
_predictor
ret
.
pandas_categorical
=
self
.
pandas_categorical
ret
.
used_indices
=
used_indices
ret
.
used_indices
=
used_indices
return
ret
return
ret
...
@@ -867,6 +939,24 @@ class Dataset(object):
...
@@ -867,6 +939,24 @@ class Dataset(object):
else
:
else
:
raise
TypeError
(
"Unknown type"
)
raise
TypeError
(
"Unknown type"
)
def
set_categorical_feature
(
self
,
categorical_feature
):
"""
Set categorical features
Parameters
----------
categorical_feature : list of int or str
Name/index of categorical features
"""
if
self
.
categorical_feature
==
categorical_feature
:
return
if
self
.
data
is
not
None
:
self
.
categorical_feature
=
categorical_feature
self
.
_free_handle
()
else
:
raise
LightGBMError
(
"Cannot set categorical feature after freed raw data, set free_raw_data=False when construct Dataset to avoid this."
)
def
_set_predictor
(
self
,
predictor
):
def
_set_predictor
(
self
,
predictor
):
"""
"""
Set predictor for continued training, not recommand for user to call this function.
Set predictor for continued training, not recommand for user to call this function.
...
@@ -889,6 +979,7 @@ class Dataset(object):
...
@@ -889,6 +979,7 @@ class Dataset(object):
reference : Dataset
reference : Dataset
Will use reference as template to consturct current dataset
Will use reference as template to consturct current dataset
"""
"""
self
.
set_categorical_feature
(
reference
.
categorical_feature
)
self
.
set_feature_name
(
reference
.
feature_name
)
self
.
set_feature_name
(
reference
.
feature_name
)
self
.
_set_predictor
(
reference
.
_predictor
)
self
.
_set_predictor
(
reference
.
_predictor
)
if
self
.
reference
is
reference
:
if
self
.
reference
is
reference
:
...
@@ -1117,6 +1208,7 @@ class Booster(object):
...
@@ -1117,6 +1208,7 @@ class Booster(object):
self
.
__inner_predict_buffer
=
[
None
]
self
.
__inner_predict_buffer
=
[
None
]
self
.
__is_predicted_cur_iter
=
[
False
]
self
.
__is_predicted_cur_iter
=
[
False
]
self
.
__get_eval_info
()
self
.
__get_eval_info
()
self
.
pandas_categorical
=
train_set
.
pandas_categorical
elif
model_file
is
not
None
:
elif
model_file
is
not
None
:
"""Prediction task"""
"""Prediction task"""
out_num_iterations
=
ctypes
.
c_int
(
0
)
out_num_iterations
=
ctypes
.
c_int
(
0
)
...
@@ -1129,6 +1221,7 @@ class Booster(object):
...
@@ -1129,6 +1221,7 @@ class Booster(object):
self
.
handle
,
self
.
handle
,
ctypes
.
byref
(
out_num_class
)))
ctypes
.
byref
(
out_num_class
)))
self
.
__num_class
=
out_num_class
.
value
self
.
__num_class
=
out_num_class
.
value
self
.
pandas_categorical
=
_load_pandas_categorical
(
model_file
)
elif
'model_str'
in
params
:
elif
'model_str'
in
params
:
self
.
__load_model_from_string
(
params
[
'model_str'
])
self
.
__load_model_from_string
(
params
[
'model_str'
])
else
:
else
:
...
@@ -1144,6 +1237,7 @@ class Booster(object):
...
@@ -1144,6 +1237,7 @@ class Booster(object):
def
__deepcopy__
(
self
,
_
):
def
__deepcopy__
(
self
,
_
):
model_str
=
self
.
__save_model_to_string
()
model_str
=
self
.
__save_model_to_string
()
booster
=
Booster
({
'model_str'
:
model_str
})
booster
=
Booster
({
'model_str'
:
model_str
})
booster
.
pandas_categorical
=
self
.
pandas_categorical
return
booster
return
booster
def
__getstate__
(
self
):
def
__getstate__
(
self
):
...
@@ -1383,6 +1477,7 @@ class Booster(object):
...
@@ -1383,6 +1477,7 @@ class Booster(object):
self
.
handle
,
self
.
handle
,
ctypes
.
c_int
(
num_iteration
),
ctypes
.
c_int
(
num_iteration
),
c_str
(
filename
)))
c_str
(
filename
)))
_save_pandas_categorical
(
filename
,
self
.
pandas_categorical
)
def
__load_model_from_string
(
self
,
model_str
):
def
__load_model_from_string
(
self
,
model_str
):
"""[Private] Load model from string"""
"""[Private] Load model from string"""
...
@@ -1494,6 +1589,7 @@ class Booster(object):
...
@@ -1494,6 +1589,7 @@ class Booster(object):
def
_to_predictor
(
self
):
def
_to_predictor
(
self
):
"""Convert to predictor"""
"""Convert to predictor"""
predictor
=
_InnerPredictor
(
booster_handle
=
self
.
handle
)
predictor
=
_InnerPredictor
(
booster_handle
=
self
.
handle
)
predictor
.
pandas_categorical
=
self
.
pandas_categorical
return
predictor
return
predictor
def
feature_name
(
self
):
def
feature_name
(
self
):
...
...
python-package/lightgbm/compat.py
View file @
ef778069
...
@@ -39,6 +39,15 @@ except (ImportError, SyntaxError):
...
@@ -39,6 +39,15 @@ except (ImportError, SyntaxError):
import
json
import
json
def
json_default_with_numpy
(
obj
):
if
isinstance
(
obj
,
(
np
.
integer
,
np
.
floating
,
np
.
bool_
)):
return
obj
.
item
()
elif
isinstance
(
obj
,
np
.
ndarray
):
return
obj
.
tolist
()
else
:
return
obj
"""pandas"""
"""pandas"""
try
:
try
:
from
pandas
import
Series
,
DataFrame
from
pandas
import
Series
,
DataFrame
...
...
python-package/lightgbm/engine.py
View file @
ef778069
...
@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types,
...
@@ -17,7 +17,7 @@ from .compat import (SKLEARN_INSTALLED, LGBMStratifiedKFold, integer_types,
def
train
(
params
,
train_set
,
num_boost_round
=
100
,
def
train
(
params
,
train_set
,
num_boost_round
=
100
,
valid_sets
=
None
,
valid_names
=
None
,
valid_sets
=
None
,
valid_names
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
feature_name
=
'auto'
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
early_stopping_rounds
=
None
,
evals_result
=
None
,
early_stopping_rounds
=
None
,
evals_result
=
None
,
verbose_eval
=
True
,
learning_rates
=
None
,
callbacks
=
None
):
verbose_eval
=
True
,
learning_rates
=
None
,
callbacks
=
None
):
"""
"""
...
@@ -45,6 +45,11 @@ def train(params, train_set, num_boost_round=100,
...
@@ -45,6 +45,11 @@ def train(params, train_set, num_boost_round=100,
feature_name : list of str, or 'auto'
feature_name : list of str, or 'auto'
Feature names
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
early_stopping_rounds: int
Activates early stopping.
Activates early stopping.
Requires at least one validation data and one metric
Requires at least one validation data and one metric
...
@@ -98,6 +103,7 @@ def train(params, train_set, num_boost_round=100,
...
@@ -98,6 +103,7 @@ def train(params, train_set, num_boost_round=100,
train_set
.
_update_params
(
params
)
train_set
.
_update_params
(
params
)
train_set
.
_set_predictor
(
predictor
)
train_set
.
_set_predictor
(
predictor
)
train_set
.
set_feature_name
(
feature_name
)
train_set
.
set_feature_name
(
feature_name
)
train_set
.
set_categorical_feature
(
categorical_feature
)
is_valid_contain_train
=
False
is_valid_contain_train
=
False
train_data_name
=
"training"
train_data_name
=
"training"
...
@@ -271,7 +277,7 @@ def _agg_cv_result(raw_results):
...
@@ -271,7 +277,7 @@ def _agg_cv_result(raw_results):
def
cv
(
params
,
train_set
,
num_boost_round
=
10
,
def
cv
(
params
,
train_set
,
num_boost_round
=
10
,
data_splitter
=
None
,
nfold
=
5
,
stratified
=
False
,
shuffle
=
True
,
data_splitter
=
None
,
nfold
=
5
,
stratified
=
False
,
shuffle
=
True
,
metrics
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
metrics
=
None
,
fobj
=
None
,
feval
=
None
,
init_model
=
None
,
feature_name
=
'auto'
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
early_stopping_rounds
=
None
,
fpreproc
=
None
,
early_stopping_rounds
=
None
,
fpreproc
=
None
,
verbose_eval
=
None
,
show_stdv
=
True
,
seed
=
0
,
verbose_eval
=
None
,
show_stdv
=
True
,
seed
=
0
,
callbacks
=
None
):
callbacks
=
None
):
...
@@ -305,6 +311,11 @@ def cv(params, train_set, num_boost_round=10,
...
@@ -305,6 +311,11 @@ def cv(params, train_set, num_boost_round=10,
feature_name : list of str, or 'auto'
feature_name : list of str, or 'auto'
Feature names
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
early_stopping_rounds: int
early_stopping_rounds: int
Activates early stopping. CV error needs to decrease at least
Activates early stopping. CV error needs to decrease at least
every <early_stopping_rounds> round(s) to continue.
every <early_stopping_rounds> round(s) to continue.
...
@@ -343,6 +354,7 @@ def cv(params, train_set, num_boost_round=10,
...
@@ -343,6 +354,7 @@ def cv(params, train_set, num_boost_round=10,
train_set
.
_update_params
(
params
)
train_set
.
_update_params
(
params
)
train_set
.
_set_predictor
(
predictor
)
train_set
.
_set_predictor
(
predictor
)
train_set
.
set_feature_name
(
feature_name
)
train_set
.
set_feature_name
(
feature_name
)
train_set
.
set_categorical_feature
(
categorical_feature
)
if
metrics
:
if
metrics
:
params
.
setdefault
(
'metric'
,
[])
params
.
setdefault
(
'metric'
,
[])
...
...
python-package/lightgbm/plotting.py
View file @
ef778069
...
@@ -257,7 +257,12 @@ def _to_graphviz(graph, tree_info, show_info, feature_names):
...
@@ -257,7 +257,12 @@ def _to_graphviz(graph, tree_info, show_info, feature_names):
if
info
in
{
'split_gain'
,
'internal_value'
,
'internal_count'
}:
if
info
in
{
'split_gain'
,
'internal_value'
,
'internal_count'
}:
label
+=
'
\n
'
+
info
+
':'
+
str
(
root
[
info
])
label
+=
'
\n
'
+
info
+
':'
+
str
(
root
[
info
])
graph
.
node
(
name
,
label
=
label
)
graph
.
node
(
name
,
label
=
label
)
l_dec
,
r_dec
=
'<='
,
'>'
if
root
[
'decision_type'
]
==
'no_greater'
:
l_dec
,
r_dec
=
'<='
,
'>'
elif
root
[
'decision_type'
]
==
'is'
:
l_dec
,
r_dec
=
'is'
,
"isn't"
else
:
raise
ValueError
(
'Invalid decision type in tree model.'
)
add
(
root
[
'left_child'
],
name
,
l_dec
)
add
(
root
[
'left_child'
],
name
,
l_dec
)
add
(
root
[
'right_child'
],
name
,
r_dec
)
add
(
root
[
'right_child'
],
name
,
r_dec
)
else
:
# leaf
else
:
# leaf
...
...
python-package/lightgbm/sklearn.py
View file @
ef778069
...
@@ -284,7 +284,7 @@ class LGBMModel(LGBMModelBase):
...
@@ -284,7 +284,7 @@ class LGBMModel(LGBMModelBase):
eval_init_score
=
None
,
eval_group
=
None
,
eval_init_score
=
None
,
eval_group
=
None
,
eval_metric
=
None
,
eval_metric
=
None
,
early_stopping_rounds
=
None
,
verbose
=
True
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
'auto'
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
callbacks
=
None
):
callbacks
=
None
):
"""
"""
Fit the gradient boosting model
Fit the gradient boosting model
...
@@ -318,6 +318,11 @@ class LGBMModel(LGBMModelBase):
...
@@ -318,6 +318,11 @@ class LGBMModel(LGBMModelBase):
feature_name : list of str, or 'auto'
feature_name : list of str, or 'auto'
Feature names
Feature names
If 'auto' and data is pandas DataFrame, use data columns name
If 'auto' and data is pandas DataFrame, use data columns name
categorical_feature : list of str or int, or 'auto'
Categorical features,
type int represents index,
type str represents feature names (need to specify feature_name as well)
If 'auto' and data is pandas DataFrame, use pandas categorical columns
callbacks : list of callback functions
callbacks : list of callback functions
List of callback functions that are applied at each iteration.
List of callback functions that are applied at each iteration.
See Callbacks in Python-API.md for more information.
See Callbacks in Python-API.md for more information.
...
@@ -401,6 +406,7 @@ class LGBMModel(LGBMModelBase):
...
@@ -401,6 +406,7 @@ class LGBMModel(LGBMModelBase):
early_stopping_rounds
=
early_stopping_rounds
,
early_stopping_rounds
=
early_stopping_rounds
,
evals_result
=
evals_result
,
fobj
=
self
.
fobj
,
feval
=
feval
,
evals_result
=
evals_result
,
fobj
=
self
.
fobj
,
feval
=
feval
,
verbose_eval
=
verbose
,
feature_name
=
feature_name
,
verbose_eval
=
verbose
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
callbacks
=
callbacks
)
callbacks
=
callbacks
)
if
evals_result
:
if
evals_result
:
...
@@ -508,7 +514,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
...
@@ -508,7 +514,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_init_score
=
None
,
eval_init_score
=
None
,
eval_metric
=
"l2"
,
eval_metric
=
"l2"
,
early_stopping_rounds
=
None
,
verbose
=
True
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
'auto'
,
callbacks
=
None
):
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
callbacks
=
None
):
super
(
LGBMRegressor
,
self
).
fit
(
X
,
y
,
sample_weight
=
sample_weight
,
super
(
LGBMRegressor
,
self
).
fit
(
X
,
y
,
sample_weight
=
sample_weight
,
init_score
=
init_score
,
eval_set
=
eval_set
,
init_score
=
init_score
,
eval_set
=
eval_set
,
...
@@ -517,6 +523,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
...
@@ -517,6 +523,7 @@ class LGBMRegressor(LGBMModel, LGBMRegressorBase):
eval_metric
=
eval_metric
,
eval_metric
=
eval_metric
,
early_stopping_rounds
=
early_stopping_rounds
,
early_stopping_rounds
=
early_stopping_rounds
,
verbose
=
verbose
,
feature_name
=
feature_name
,
verbose
=
verbose
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
callbacks
=
callbacks
)
callbacks
=
callbacks
)
return
self
return
self
...
@@ -553,7 +560,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
...
@@ -553,7 +560,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_init_score
=
None
,
eval_init_score
=
None
,
eval_metric
=
"binary_logloss"
,
eval_metric
=
"binary_logloss"
,
early_stopping_rounds
=
None
,
verbose
=
True
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
'auto'
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
callbacks
=
None
):
callbacks
=
None
):
self
.
_le
=
LGBMLabelEncoder
().
fit
(
y
)
self
.
_le
=
LGBMLabelEncoder
().
fit
(
y
)
y
=
self
.
_le
.
transform
(
y
)
y
=
self
.
_le
.
transform
(
y
)
...
@@ -576,6 +583,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
...
@@ -576,6 +583,7 @@ class LGBMClassifier(LGBMModel, LGBMClassifierBase):
eval_metric
=
eval_metric
,
eval_metric
=
eval_metric
,
early_stopping_rounds
=
early_stopping_rounds
,
early_stopping_rounds
=
early_stopping_rounds
,
verbose
=
verbose
,
feature_name
=
feature_name
,
verbose
=
verbose
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
callbacks
=
callbacks
)
callbacks
=
callbacks
)
return
self
return
self
...
@@ -653,7 +661,7 @@ class LGBMRanker(LGBMModel):
...
@@ -653,7 +661,7 @@ class LGBMRanker(LGBMModel):
eval_init_score
=
None
,
eval_group
=
None
,
eval_init_score
=
None
,
eval_group
=
None
,
eval_metric
=
'ndcg'
,
eval_at
=
1
,
eval_metric
=
'ndcg'
,
eval_at
=
1
,
early_stopping_rounds
=
None
,
verbose
=
True
,
early_stopping_rounds
=
None
,
verbose
=
True
,
feature_name
=
'auto'
,
feature_name
=
'auto'
,
categorical_feature
=
'auto'
,
callbacks
=
None
):
callbacks
=
None
):
"""
"""
Most arguments like common methods except following:
Most arguments like common methods except following:
...
@@ -684,5 +692,6 @@ class LGBMRanker(LGBMModel):
...
@@ -684,5 +692,6 @@ class LGBMRanker(LGBMModel):
eval_metric
=
eval_metric
,
eval_metric
=
eval_metric
,
early_stopping_rounds
=
early_stopping_rounds
,
early_stopping_rounds
=
early_stopping_rounds
,
verbose
=
verbose
,
feature_name
=
feature_name
,
verbose
=
verbose
,
feature_name
=
feature_name
,
categorical_feature
=
categorical_feature
,
callbacks
=
callbacks
)
callbacks
=
callbacks
)
return
self
return
self
src/io/bin.cpp
View file @
ef778069
...
@@ -24,7 +24,13 @@ BinMapper::BinMapper(const BinMapper& other) {
...
@@ -24,7 +24,13 @@ BinMapper::BinMapper(const BinMapper& other) {
num_bin_
=
other
.
num_bin_
;
num_bin_
=
other
.
num_bin_
;
is_trival_
=
other
.
is_trival_
;
is_trival_
=
other
.
is_trival_
;
sparse_rate_
=
other
.
sparse_rate_
;
sparse_rate_
=
other
.
sparse_rate_
;
bin_upper_bound_
=
other
.
bin_upper_bound_
;
bin_type_
=
other
.
bin_type_
;
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
bin_upper_bound_
=
other
.
bin_upper_bound_
;
}
else
{
bin_2_categorical_
=
other
.
bin_2_categorical_
;
categorical_2_bin_
=
other
.
categorical_2_bin_
;
}
min_val_
=
other
.
min_val_
;
min_val_
=
other
.
min_val_
;
max_val_
=
other
.
max_val_
;
max_val_
=
other
.
max_val_
;
default_bin_
=
other
.
default_bin_
;
default_bin_
=
other
.
default_bin_
;
...
@@ -38,22 +44,34 @@ BinMapper::~BinMapper() {
...
@@ -38,22 +44,34 @@ BinMapper::~BinMapper() {
}
}
bool
NeedFilter
(
std
::
vector
<
int
>&
cnt_in_bin
,
int
total_cnt
,
int
filter_cnt
)
{
bool
NeedFilter
(
std
::
vector
<
int
>&
cnt_in_bin
,
int
total_cnt
,
int
filter_cnt
,
BinType
bin_type
)
{
int
sum_left
=
0
;
if
(
bin_type
==
BinType
::
NumericalBin
)
{
for
(
size_t
i
=
0
;
i
<
cnt_in_bin
.
size
()
-
1
;
++
i
)
{
int
sum_left
=
0
;
sum_left
+=
cnt_in_bin
[
i
];
for
(
size_t
i
=
0
;
i
<
cnt_in_bin
.
size
()
-
1
;
++
i
)
{
if
(
sum_left
>=
filter_cnt
)
{
sum_left
+=
cnt_in_bin
[
i
];
return
false
;
if
(
sum_left
>=
filter_cnt
)
{
}
else
if
(
total_cnt
-
sum_left
>=
filter_cnt
)
{
return
false
;
return
false
;
}
else
if
(
total_cnt
-
sum_left
>=
filter_cnt
)
{
return
false
;
}
}
}
else
{
for
(
size_t
i
=
0
;
i
<
cnt_in_bin
.
size
()
-
1
;
++
i
)
{
int
sum_left
=
cnt_in_bin
[
i
];
if
(
sum_left
>=
filter_cnt
)
{
return
false
;
}
else
if
(
total_cnt
-
sum_left
>=
filter_cnt
)
{
return
false
;
}
}
}
}
}
return
true
;
return
true
;
}
}
void
BinMapper
::
FindBin
(
std
::
vector
<
double
>&
values
,
size_t
total_sample_cnt
,
void
BinMapper
::
FindBin
(
std
::
vector
<
double
>&
values
,
size_t
total_sample_cnt
,
int
max_bin
,
int
min_data_in_bin
,
int
min_split_data
)
{
int
max_bin
,
int
min_data_in_bin
,
int
min_split_data
,
BinType
bin_type
)
{
// limit max_bin by min_data_in_bin
bin_type_
=
bin_type
;
default_bin_
=
0
;
std
::
vector
<
double
>&
raw_values
=
values
;
std
::
vector
<
double
>&
raw_values
=
values
;
int
zero_cnt
=
static_cast
<
int
>
(
total_sample_cnt
-
raw_values
.
size
());
int
zero_cnt
=
static_cast
<
int
>
(
total_sample_cnt
-
raw_values
.
size
());
// find distinct_values first
// find distinct_values first
...
@@ -95,98 +113,134 @@ void BinMapper::FindBin(std::vector<double>& values, size_t total_sample_cnt,
...
@@ -95,98 +113,134 @@ void BinMapper::FindBin(std::vector<double>& values, size_t total_sample_cnt,
max_val_
=
distinct_values
.
back
();
max_val_
=
distinct_values
.
back
();
std
::
vector
<
int
>
cnt_in_bin
;
std
::
vector
<
int
>
cnt_in_bin
;
int
num_values
=
static_cast
<
int
>
(
distinct_values
.
size
());
int
num_values
=
static_cast
<
int
>
(
distinct_values
.
size
());
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
if
(
num_values
<=
max_bin
)
{
if
(
num_values
<=
max_bin
)
{
// use distinct value is enough
// use distinct value is enough
bin_upper_bound_
.
clear
();
bin_upper_bound_
.
clear
();
int
cur_cnt_inbin
=
0
;
int
cur_cnt_inbin
=
0
;
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
cur_cnt_inbin
+=
counts
[
i
];
cur_cnt_inbin
+=
counts
[
i
];
if
(
cur_cnt_inbin
>=
min_data_in_bin
)
{
if
(
cur_cnt_inbin
>=
min_data_in_bin
)
{
bin_upper_bound_
.
push_back
((
distinct_values
[
i
]
+
distinct_values
[
i
+
1
])
/
2
);
bin_upper_bound_
.
push_back
((
distinct_values
[
i
]
+
distinct_values
[
i
+
1
])
/
2
);
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
cur_cnt_inbin
=
0
;
cur_cnt_inbin
=
0
;
}
}
}
}
cur_cnt_inbin
+=
counts
.
back
();
cur_cnt_inbin
+=
counts
.
back
();
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
bin_upper_bound_
.
push_back
(
std
::
numeric_limits
<
double
>::
infinity
());
bin_upper_bound_
.
push_back
(
std
::
numeric_limits
<
double
>::
infinity
());
num_bin_
=
static_cast
<
int
>
(
bin_upper_bound_
.
size
());
num_bin_
=
static_cast
<
int
>
(
bin_upper_bound_
.
size
());
}
else
{
}
else
{
if
(
min_data_in_bin
>
0
)
{
if
(
min_data_in_bin
>
0
)
{
max_bin
=
std
::
min
(
max_bin
,
static_cast
<
int
>
(
total_sample_cnt
/
min_data_in_bin
));
max_bin
=
std
::
min
(
max_bin
,
static_cast
<
int
>
(
total_sample_cnt
/
min_data_in_bin
));
max_bin
=
std
::
max
(
max_bin
,
1
);
max_bin
=
std
::
max
(
max_bin
,
1
);
}
double
mean_bin_size
=
static_cast
<
double
>
(
total_sample_cnt
)
/
max_bin
;
if
(
zero_cnt
>
mean_bin_size
)
{
int
non_zero_cnt
=
static_cast
<
int
>
(
raw_values
.
size
());
max_bin
=
std
::
min
(
max_bin
,
1
+
static_cast
<
int
>
(
non_zero_cnt
/
min_data_in_bin
));
}
// mean size for one bin
int
rest_bin_cnt
=
max_bin
;
int
rest_sample_cnt
=
static_cast
<
int
>
(
total_sample_cnt
);
std
::
vector
<
bool
>
is_big_count_value
(
num_values
,
false
);
for
(
int
i
=
0
;
i
<
num_values
;
++
i
)
{
if
(
counts
[
i
]
>=
mean_bin_size
)
{
is_big_count_value
[
i
]
=
true
;
--
rest_bin_cnt
;
rest_sample_cnt
-=
counts
[
i
];
}
}
}
double
mean_bin_size
=
static_cast
<
double
>
(
total_sample_cnt
)
/
max_bin
;
mean_bin_size
=
static_cast
<
double
>
(
rest_sample_cnt
)
/
rest_bin_cnt
;
if
(
zero_cnt
>
mean_bin_size
)
{
std
::
vector
<
double
>
upper_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
int
non_zero_cnt
=
static_cast
<
int
>
(
raw_values
.
size
());
std
::
vector
<
double
>
lower_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
max_bin
=
std
::
min
(
max_bin
,
1
+
static_cast
<
int
>
(
non_zero_cnt
/
min_data_in_bin
));
int
bin_cnt
=
0
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
0
];
int
cur_cnt_inbin
=
0
;
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
if
(
!
is_big_count_value
[
i
])
{
rest_sample_cnt
-=
counts
[
i
];
}
}
cur_cnt_inbin
+=
counts
[
i
];
// mean size for one bin
// need a new bin
int
rest_bin_cnt
=
max_bin
;
if
(
is_big_count_value
[
i
]
||
cur_cnt_inbin
>=
mean_bin_size
||
int
rest_sample_cnt
=
static_cast
<
int
>
(
total_sample_cnt
);
(
is_big_count_value
[
i
+
1
]
&&
cur_cnt_inbin
>=
std
::
max
(
1.0
,
mean_bin_size
*
0.5
f
)))
{
std
::
vector
<
bool
>
is_big_count_value
(
num_values
,
false
);
upper_bounds
[
bin_cnt
]
=
distinct_values
[
i
];
for
(
int
i
=
0
;
i
<
num_values
;
++
i
)
{
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
if
(
counts
[
i
]
>=
mean_bin_size
)
{
++
bin_cnt
;
is_big_count_value
[
i
]
=
true
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
i
+
1
];
if
(
bin_cnt
>=
max_bin
-
1
)
{
break
;
}
cur_cnt_inbin
=
0
;
if
(
!
is_big_count_value
[
i
])
{
--
rest_bin_cnt
;
--
rest_bin_cnt
;
mean_bin_size
=
rest_sample_cnt
/
static_cast
<
double
>
(
rest_bin_cnt
);
rest_sample_cnt
-=
counts
[
i
];
}
}
mean_bin_size
=
static_cast
<
double
>
(
rest_sample_cnt
)
/
rest_bin_cnt
;
std
::
vector
<
double
>
upper_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
std
::
vector
<
double
>
lower_bounds
(
max_bin
,
std
::
numeric_limits
<
double
>::
infinity
());
int
bin_cnt
=
0
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
0
];
int
cur_cnt_inbin
=
0
;
for
(
int
i
=
0
;
i
<
num_values
-
1
;
++
i
)
{
if
(
!
is_big_count_value
[
i
])
{
rest_sample_cnt
-=
counts
[
i
];
}
}
cur_cnt_inbin
+=
counts
[
i
];
// need a new bin
if
(
is_big_count_value
[
i
]
||
cur_cnt_inbin
>=
mean_bin_size
||
(
is_big_count_value
[
i
+
1
]
&&
cur_cnt_inbin
>=
std
::
max
(
1.0
,
mean_bin_size
*
0.5
f
)))
{
upper_bounds
[
bin_cnt
]
=
distinct_values
[
i
];
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
++
bin_cnt
;
lower_bounds
[
bin_cnt
]
=
distinct_values
[
i
+
1
];
if
(
bin_cnt
>=
max_bin
-
1
)
{
break
;
}
cur_cnt_inbin
=
0
;
if
(
!
is_big_count_value
[
i
])
{
--
rest_bin_cnt
;
mean_bin_size
=
rest_sample_cnt
/
static_cast
<
double
>
(
rest_bin_cnt
);
}
}
}
cur_cnt_inbin
+=
counts
.
back
();
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
++
bin_cnt
;
// update bin upper bound
bin_upper_bound_
=
std
::
vector
<
double
>
(
bin_cnt
);
num_bin_
=
bin_cnt
;
for
(
int
i
=
0
;
i
<
bin_cnt
-
1
;
++
i
)
{
bin_upper_bound_
[
i
]
=
(
upper_bounds
[
i
]
+
lower_bounds
[
i
+
1
])
/
2.0
f
;
}
// last bin upper bound
bin_upper_bound_
[
bin_cnt
-
1
]
=
std
::
numeric_limits
<
double
>::
infinity
();
}
CHECK
(
num_bin_
<=
max_bin
);
}
else
{
// convert to int type first
std
::
vector
<
int
>
distinct_values_int
;
std
::
vector
<
int
>
counts_int
;
distinct_values_int
.
push_back
(
static_cast
<
int
>
(
distinct_values
[
0
]));
counts_int
.
push_back
(
counts
[
0
]);
for
(
size_t
i
=
1
;
i
<
distinct_values
.
size
();
++
i
)
{
if
(
static_cast
<
int
>
(
distinct_values
[
i
])
!=
distinct_values_int
.
back
())
{
distinct_values_int
.
push_back
(
static_cast
<
int
>
(
distinct_values
[
i
]));
counts_int
.
push_back
(
counts
[
i
]);
}
else
{
counts_int
.
back
()
+=
counts
[
i
];
}
}
}
}
cur_cnt_inbin
+=
counts
.
back
();
// sort by counts
cnt_in_bin
.
push_back
(
cur_cnt_inbin
);
Common
::
SortForPair
<
int
,
int
>
(
counts_int
,
distinct_values_int
,
0
,
true
);
++
bin_cnt
;
// will ingore the categorical of small counts
// update bin upper bound
const
int
cut_cnt
=
static_cast
<
int
>
(
total_sample_cnt
*
0.98
f
);
bin_upper_bound_
=
std
::
vector
<
double
>
(
bin_cnt
);
categorical_2_bin_
.
clear
();
num_bin_
=
bin_cnt
;
bin_2_categorical_
.
clear
();
for
(
int
i
=
0
;
i
<
bin_cnt
-
1
;
++
i
)
{
num_bin_
=
0
;
bin_upper_bound_
[
i
]
=
(
upper_bounds
[
i
]
+
lower_bounds
[
i
+
1
])
/
2.0
f
;
int
used_cnt
=
0
;
max_bin
=
std
::
min
(
static_cast
<
int
>
(
distinct_values_int
.
size
()),
max_bin
);
while
(
used_cnt
<
cut_cnt
||
num_bin_
<
max_bin
)
{
bin_2_categorical_
.
push_back
(
distinct_values_int
[
num_bin_
]);
categorical_2_bin_
[
distinct_values_int
[
num_bin_
]]
=
static_cast
<
unsigned
int
>
(
num_bin_
);
used_cnt
+=
counts_int
[
num_bin_
];
++
num_bin_
;
}
}
// last bin upper bound
cnt_in_bin
=
counts_int
;
bin_upper_bound_
[
bin_cnt
-
1
]
=
std
::
numeric_limits
<
double
>::
infinity
();
counts_int
.
resize
(
num_bin_
);
counts_int
.
back
()
+=
static_cast
<
int
>
(
total_sample_cnt
-
used_cnt
);
}
}
// check trival(num_bin_ == 1) feature
// check trival(num_bin_ == 1) feature
if
(
num_bin_
<=
1
)
{
if
(
num_bin_
<=
1
)
{
is_trival_
=
true
;
is_trival_
=
true
;
default_bin_
=
0
;
}
else
{
}
else
{
is_trival_
=
false
;
is_trival_
=
false
;
default_bin_
=
ValueToBin
(
0
);
}
}
if
(
NeedFilter
(
cnt_in_bin
,
static_cast
<
int
>
(
total_sample_cnt
),
min_split_data
))
{
// check useless bin
if
(
!
is_trival_
&&
NeedFilter
(
cnt_in_bin
,
static_cast
<
int
>
(
total_sample_cnt
),
min_split_data
,
bin_type_
))
{
is_trival_
=
true
;
is_trival_
=
true
;
}
}
if
(
!
is_trival_
)
{
default_bin_
=
ValueToBin
(
0
);
}
// calculate sparse rate
// calculate sparse rate
CHECK
(
num_bin_
<=
max_bin
);
sparse_rate_
=
static_cast
<
double
>
(
cnt_in_bin
[
default_bin_
])
/
static_cast
<
double
>
(
total_sample_cnt
);
sparse_rate_
=
static_cast
<
double
>
(
cnt_in_bin
[
GetDefaultBin
()])
/
static_cast
<
double
>
(
total_sample_cnt
);
}
}
...
@@ -195,6 +249,7 @@ int BinMapper::SizeForSpecificBin(int bin) {
...
@@ -195,6 +249,7 @@ int BinMapper::SizeForSpecificBin(int bin) {
size
+=
sizeof
(
int
);
size
+=
sizeof
(
int
);
size
+=
sizeof
(
bool
);
size
+=
sizeof
(
bool
);
size
+=
sizeof
(
double
);
size
+=
sizeof
(
double
);
size
+=
sizeof
(
BinType
);
size
+=
2
*
sizeof
(
double
);
size
+=
2
*
sizeof
(
double
);
size
+=
bin
*
sizeof
(
double
);
size
+=
bin
*
sizeof
(
double
);
size
+=
sizeof
(
uint32_t
);
size
+=
sizeof
(
uint32_t
);
...
@@ -208,13 +263,19 @@ void BinMapper::CopyTo(char * buffer) {
...
@@ -208,13 +263,19 @@ void BinMapper::CopyTo(char * buffer) {
buffer
+=
sizeof
(
is_trival_
);
buffer
+=
sizeof
(
is_trival_
);
std
::
memcpy
(
buffer
,
&
sparse_rate_
,
sizeof
(
sparse_rate_
));
std
::
memcpy
(
buffer
,
&
sparse_rate_
,
sizeof
(
sparse_rate_
));
buffer
+=
sizeof
(
sparse_rate_
);
buffer
+=
sizeof
(
sparse_rate_
);
std
::
memcpy
(
buffer
,
&
bin_type_
,
sizeof
(
bin_type_
));
buffer
+=
sizeof
(
bin_type_
);
std
::
memcpy
(
&
min_val_
,
buffer
,
sizeof
(
min_val_
));
std
::
memcpy
(
&
min_val_
,
buffer
,
sizeof
(
min_val_
));
buffer
+=
sizeof
(
min_val_
);
buffer
+=
sizeof
(
min_val_
);
std
::
memcpy
(
&
max_val_
,
buffer
,
sizeof
(
max_val_
));
std
::
memcpy
(
&
max_val_
,
buffer
,
sizeof
(
max_val_
));
buffer
+=
sizeof
(
max_val_
);
buffer
+=
sizeof
(
max_val_
);
std
::
memcpy
(
&
default_bin_
,
buffer
,
sizeof
(
default_bin_
));
std
::
memcpy
(
&
default_bin_
,
buffer
,
sizeof
(
default_bin_
));
buffer
+=
sizeof
(
default_bin_
);
buffer
+=
sizeof
(
default_bin_
);
std
::
memcpy
(
buffer
,
bin_upper_bound_
.
data
(),
num_bin_
*
sizeof
(
double
));
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
std
::
memcpy
(
buffer
,
bin_upper_bound_
.
data
(),
num_bin_
*
sizeof
(
double
));
}
else
{
std
::
memcpy
(
buffer
,
bin_2_categorical_
.
data
(),
num_bin_
*
sizeof
(
int
));
}
}
}
void
BinMapper
::
CopyFrom
(
const
char
*
buffer
)
{
void
BinMapper
::
CopyFrom
(
const
char
*
buffer
)
{
...
@@ -224,30 +285,50 @@ void BinMapper::CopyFrom(const char * buffer) {
...
@@ -224,30 +285,50 @@ void BinMapper::CopyFrom(const char * buffer) {
buffer
+=
sizeof
(
is_trival_
);
buffer
+=
sizeof
(
is_trival_
);
std
::
memcpy
(
&
sparse_rate_
,
buffer
,
sizeof
(
sparse_rate_
));
std
::
memcpy
(
&
sparse_rate_
,
buffer
,
sizeof
(
sparse_rate_
));
buffer
+=
sizeof
(
sparse_rate_
);
buffer
+=
sizeof
(
sparse_rate_
);
std
::
memcpy
(
&
bin_type_
,
buffer
,
sizeof
(
bin_type_
));
buffer
+=
sizeof
(
bin_type_
);
std
::
memcpy
(
&
min_val_
,
buffer
,
sizeof
(
min_val_
));
std
::
memcpy
(
&
min_val_
,
buffer
,
sizeof
(
min_val_
));
buffer
+=
sizeof
(
min_val_
);
buffer
+=
sizeof
(
min_val_
);
std
::
memcpy
(
&
max_val_
,
buffer
,
sizeof
(
max_val_
));
std
::
memcpy
(
&
max_val_
,
buffer
,
sizeof
(
max_val_
));
buffer
+=
sizeof
(
max_val_
);
buffer
+=
sizeof
(
max_val_
);
std
::
memcpy
(
&
default_bin_
,
buffer
,
sizeof
(
default_bin_
));
std
::
memcpy
(
&
default_bin_
,
buffer
,
sizeof
(
default_bin_
));
buffer
+=
sizeof
(
default_bin_
);
buffer
+=
sizeof
(
default_bin_
);
bin_upper_bound_
=
std
::
vector
<
double
>
(
num_bin_
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
std
::
memcpy
(
bin_upper_bound_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
double
));
bin_upper_bound_
=
std
::
vector
<
double
>
(
num_bin_
);
std
::
memcpy
(
bin_upper_bound_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
double
));
}
else
{
bin_2_categorical_
=
std
::
vector
<
int
>
(
num_bin_
);
std
::
memcpy
(
bin_2_categorical_
.
data
(),
buffer
,
num_bin_
*
sizeof
(
int
));
categorical_2_bin_
.
clear
();
for
(
int
i
=
0
;
i
<
num_bin_
;
++
i
)
{
categorical_2_bin_
[
bin_2_categorical_
[
i
]]
=
static_cast
<
unsigned
int
>
(
i
);
}
}
}
}
void
BinMapper
::
SaveBinaryToFile
(
FILE
*
file
)
const
{
void
BinMapper
::
SaveBinaryToFile
(
FILE
*
file
)
const
{
fwrite
(
&
num_bin_
,
sizeof
(
num_bin_
),
1
,
file
);
fwrite
(
&
num_bin_
,
sizeof
(
num_bin_
),
1
,
file
);
fwrite
(
&
is_trival_
,
sizeof
(
is_trival_
),
1
,
file
);
fwrite
(
&
is_trival_
,
sizeof
(
is_trival_
),
1
,
file
);
fwrite
(
&
sparse_rate_
,
sizeof
(
sparse_rate_
),
1
,
file
);
fwrite
(
&
sparse_rate_
,
sizeof
(
sparse_rate_
),
1
,
file
);
fwrite
(
&
bin_type_
,
sizeof
(
bin_type_
),
1
,
file
);
fwrite
(
&
min_val_
,
sizeof
(
min_val_
),
1
,
file
);
fwrite
(
&
min_val_
,
sizeof
(
min_val_
),
1
,
file
);
fwrite
(
&
max_val_
,
sizeof
(
max_val_
),
1
,
file
);
fwrite
(
&
max_val_
,
sizeof
(
max_val_
),
1
,
file
);
fwrite
(
&
default_bin_
,
sizeof
(
default_bin_
),
1
,
file
);
fwrite
(
&
default_bin_
,
sizeof
(
default_bin_
),
1
,
file
);
fwrite
(
bin_upper_bound_
.
data
(),
sizeof
(
double
),
num_bin_
,
file
);
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
fwrite
(
bin_upper_bound_
.
data
(),
sizeof
(
double
),
num_bin_
,
file
);
}
else
{
fwrite
(
bin_2_categorical_
.
data
(),
sizeof
(
int
),
num_bin_
,
file
);
}
}
}
size_t
BinMapper
::
SizesInByte
()
const
{
size_t
BinMapper
::
SizesInByte
()
const
{
size_t
ret
=
sizeof
(
num_bin_
)
+
sizeof
(
is_trival_
)
+
sizeof
(
sparse_rate_
)
size_t
ret
=
sizeof
(
num_bin_
)
+
sizeof
(
is_trival_
)
+
sizeof
(
sparse_rate_
)
+
sizeof
(
min_val_
)
+
sizeof
(
max_val_
)
+
sizeof
(
default_bin_
);
+
sizeof
(
bin_type_
)
+
sizeof
(
min_val_
)
+
sizeof
(
max_val_
)
+
sizeof
(
default_bin_
);
ret
+=
sizeof
(
double
)
*
num_bin_
;
if
(
bin_type_
==
BinType
::
NumericalBin
)
{
ret
+=
sizeof
(
double
)
*
num_bin_
;
}
else
{
ret
+=
sizeof
(
int
)
*
num_bin_
;
}
return
ret
;
return
ret
;
}
}
...
...
src/io/config.cpp
View file @
ef778069
...
@@ -216,6 +216,7 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
...
@@ -216,6 +216,7 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetString
(
params
,
"weight_column"
,
&
weight_column
);
GetString
(
params
,
"weight_column"
,
&
weight_column
);
GetString
(
params
,
"group_column"
,
&
group_column
);
GetString
(
params
,
"group_column"
,
&
group_column
);
GetString
(
params
,
"ignore_column"
,
&
ignore_column
);
GetString
(
params
,
"ignore_column"
,
&
ignore_column
);
GetString
(
params
,
"categorical_column"
,
&
categorical_column
);
GetInt
(
params
,
"min_data_in_leaf"
,
&
min_data_in_leaf
);
GetInt
(
params
,
"min_data_in_leaf"
,
&
min_data_in_leaf
);
GetInt
(
params
,
"min_dato_in_bin"
,
&
min_data_in_bin
);
GetInt
(
params
,
"min_dato_in_bin"
,
&
min_data_in_bin
);
GetDouble
(
params
,
"max_conflict_rate"
,
&
max_conflict_rate
);
GetDouble
(
params
,
"max_conflict_rate"
,
&
max_conflict_rate
);
...
...
src/io/dataset.cpp
View file @
ef778069
...
@@ -43,8 +43,8 @@ std::vector<std::vector<int>> NoGroup(
...
@@ -43,8 +43,8 @@ std::vector<std::vector<int>> NoGroup(
void
Dataset
::
Construct
(
void
Dataset
::
Construct
(
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>&
bin_mappers
,
std
::
vector
<
std
::
unique_ptr
<
BinMapper
>>&
bin_mappers
,
const
std
::
vector
<
std
::
vector
<
int
>>&
sample_indices
,
const
std
::
vector
<
std
::
vector
<
int
>>&
,
size_t
total_sample_cnt
,
size_t
,
const
IOConfig
&
io_config
)
{
const
IOConfig
&
io_config
)
{
num_total_features_
=
static_cast
<
int
>
(
bin_mappers
.
size
());
num_total_features_
=
static_cast
<
int
>
(
bin_mappers
.
size
());
// get num_features
// get num_features
...
...
src/io/dataset_loader.cpp
View file @
ef778069
...
@@ -131,6 +131,29 @@ void DatasetLoader::SetHeader(const char* filename) {
...
@@ -131,6 +131,29 @@ void DatasetLoader::SetHeader(const char* filename) {
ignore_features_
.
emplace
(
group_idx_
);
ignore_features_
.
emplace
(
group_idx_
);
}
}
}
}
if
(
io_config_
.
categorical_column
.
size
()
>
0
)
{
if
(
Common
::
StartsWith
(
io_config_
.
categorical_column
,
name_prefix
))
{
std
::
string
names
=
io_config_
.
categorical_column
.
substr
(
name_prefix
.
size
());
for
(
auto
name
:
Common
::
Split
(
names
.
c_str
(),
','
))
{
if
(
name2idx
.
count
(
name
)
>
0
)
{
int
tmp
=
name2idx
[
name
];
categorical_features_
.
emplace
(
tmp
);
}
else
{
Log
::
Fatal
(
"Could not find categorical_column %s in data file"
,
name
.
c_str
());
}
}
}
else
{
for
(
auto
token
:
Common
::
Split
(
io_config_
.
categorical_column
.
c_str
(),
','
))
{
int
tmp
=
0
;
if
(
!
Common
::
AtoiAndCheck
(
token
.
c_str
(),
&
tmp
))
{
Log
::
Fatal
(
"categorical_column is not a number, \
if you want to use a column name, \
please add the prefix
\"
name:
\"
to the column name"
);
}
categorical_features_
.
emplace
(
tmp
);
}
}
}
}
}
Dataset
*
DatasetLoader
::
LoadFromFile
(
const
char
*
filename
,
int
rank
,
int
num_machines
)
{
Dataset
*
DatasetLoader
::
LoadFromFile
(
const
char
*
filename
,
int
rank
,
int
num_machines
)
{
...
@@ -471,9 +494,13 @@ Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>&
...
@@ -471,9 +494,13 @@ Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>&
bin_mappers
[
i
]
=
nullptr
;
bin_mappers
[
i
]
=
nullptr
;
continue
;
continue
;
}
}
BinType
bin_type
=
BinType
::
NumericalBin
;
if
(
categorical_features_
.
count
(
i
))
{
bin_type
=
BinType
::
CategoricalBin
;
}
bin_mappers
[
i
].
reset
(
new
BinMapper
());
bin_mappers
[
i
].
reset
(
new
BinMapper
());
bin_mappers
[
i
]
->
FindBin
(
sample_values
[
i
],
total_sample_size
,
bin_mappers
[
i
]
->
FindBin
(
sample_values
[
i
],
total_sample_size
,
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
);
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
,
bin_type
);
}
}
auto
dataset
=
std
::
unique_ptr
<
Dataset
>
(
new
Dataset
(
num_data
));
auto
dataset
=
std
::
unique_ptr
<
Dataset
>
(
new
Dataset
(
num_data
));
dataset
->
feature_names_
=
feature_names_
;
dataset
->
feature_names_
=
feature_names_
;
...
@@ -684,9 +711,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
...
@@ -684,9 +711,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
bin_mappers
[
i
]
=
nullptr
;
bin_mappers
[
i
]
=
nullptr
;
continue
;
continue
;
}
}
BinType
bin_type
=
BinType
::
NumericalBin
;
if
(
categorical_features_
.
count
(
i
))
{
bin_type
=
BinType
::
CategoricalBin
;
}
bin_mappers
[
i
].
reset
(
new
BinMapper
());
bin_mappers
[
i
].
reset
(
new
BinMapper
());
bin_mappers
[
i
]
->
FindBin
(
sample_values
[
i
],
sample_data
.
size
(),
bin_mappers
[
i
]
->
FindBin
(
sample_values
[
i
],
sample_data
.
size
(),
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
);
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
,
bin_type
);
}
}
}
else
{
}
else
{
// if have multi-machines, need find bin distributed
// if have multi-machines, need find bin distributed
...
@@ -716,9 +747,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
...
@@ -716,9 +747,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
// find local feature bins and copy to buffer
// find local feature bins and copy to buffer
#pragma omp parallel for schedule(guided)
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
len
[
rank
];
++
i
)
{
for
(
int
i
=
0
;
i
<
len
[
rank
];
++
i
)
{
BinType
bin_type
=
BinType
::
NumericalBin
;
if
(
categorical_features_
.
count
(
start
[
rank
]
+
i
))
{
bin_type
=
BinType
::
CategoricalBin
;
}
BinMapper
bin_mapper
;
BinMapper
bin_mapper
;
bin_mapper
.
FindBin
(
sample_values
[
start
[
rank
]
+
i
],
sample_data
.
size
(),
bin_mapper
.
FindBin
(
sample_values
[
start
[
rank
]
+
i
],
sample_data
.
size
(),
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
);
io_config_
.
max_bin
,
io_config_
.
min_data_in_bin
,
filter_cnt
,
bin_type
);
bin_mapper
.
CopyTo
(
input_buffer
.
data
()
+
i
*
type_size
);
bin_mapper
.
CopyTo
(
input_buffer
.
data
()
+
i
*
type_size
);
}
}
// convert to binary size
// convert to binary size
...
...
src/io/dense_bin.hpp
View file @
ef778069
...
@@ -132,7 +132,7 @@ public:
...
@@ -132,7 +132,7 @@ public:
virtual
data_size_t
Split
(
virtual
data_size_t
Split
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
,
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
,
BinType
bin_type
)
const
override
{
if
(
num_data
<=
0
)
{
return
0
;
}
if
(
num_data
<=
0
)
{
return
0
;
}
VAL_T
th
=
static_cast
<
VAL_T
>
(
threshold
+
min_bin
);
VAL_T
th
=
static_cast
<
VAL_T
>
(
threshold
+
min_bin
);
VAL_T
minb
=
static_cast
<
VAL_T
>
(
min_bin
);
VAL_T
minb
=
static_cast
<
VAL_T
>
(
min_bin
);
...
@@ -144,19 +144,37 @@ public:
...
@@ -144,19 +144,37 @@ public:
data_size_t
gt_count
=
0
;
data_size_t
gt_count
=
0
;
data_size_t
*
default_indices
=
gt_indices
;
data_size_t
*
default_indices
=
gt_indices
;
data_size_t
*
default_count
=
&
gt_count
;
data_size_t
*
default_count
=
&
gt_count
;
if
(
default_bin
<=
threshold
)
{
if
(
bin_type
==
BinType
::
NumericalBin
)
{
default_indices
=
lte_indices
;
if
(
default_bin
<=
threshold
)
{
default_count
=
&
lte_count
;
default_indices
=
lte_indices
;
}
default_count
=
&
lte_count
;
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
}
const
data_size_t
idx
=
data_indices
[
i
];
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
VAL_T
bin
=
data_
[
idx
];
const
data_size_t
idx
=
data_indices
[
i
];
if
(
bin
>
maxb
||
bin
<
minb
)
{
VAL_T
bin
=
data_
[
idx
];
default_indices
[(
*
default_count
)
++
]
=
idx
;
if
(
bin
>
maxb
||
bin
<
minb
)
{
}
else
if
(
bin
>
th
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
gt_indices
[
gt_count
++
]
=
idx
;
}
else
if
(
bin
>
th
)
{
}
else
{
gt_indices
[
gt_count
++
]
=
idx
;
lte_indices
[
lte_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
else
{
if
(
default_bin
==
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
VAL_T
bin
=
data_
[
idx
];
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
!=
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
}
}
return
lte_count
;
return
lte_count
;
...
...
src/io/dense_nbits_bin.hpp
View file @
ef778069
...
@@ -161,7 +161,7 @@ public:
...
@@ -161,7 +161,7 @@ public:
virtual
data_size_t
Split
(
virtual
data_size_t
Split
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
,
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
,
BinType
bin_type
)
const
override
{
if
(
num_data
<=
0
)
{
return
0
;
}
if
(
num_data
<=
0
)
{
return
0
;
}
uint8_t
th
=
static_cast
<
uint8_t
>
(
threshold
+
min_bin
);
uint8_t
th
=
static_cast
<
uint8_t
>
(
threshold
+
min_bin
);
uint8_t
minb
=
static_cast
<
uint8_t
>
(
min_bin
);
uint8_t
minb
=
static_cast
<
uint8_t
>
(
min_bin
);
...
@@ -173,19 +173,37 @@ public:
...
@@ -173,19 +173,37 @@ public:
data_size_t
gt_count
=
0
;
data_size_t
gt_count
=
0
;
data_size_t
*
default_indices
=
gt_indices
;
data_size_t
*
default_indices
=
gt_indices
;
data_size_t
*
default_count
=
&
gt_count
;
data_size_t
*
default_count
=
&
gt_count
;
if
(
default_bin
<=
threshold
)
{
if
(
bin_type
==
BinType
::
NumericalBin
)
{
default_indices
=
lte_indices
;
if
(
default_bin
<=
threshold
)
{
default_count
=
&
lte_count
;
default_indices
=
lte_indices
;
}
default_count
=
&
lte_count
;
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
}
const
data_size_t
idx
=
data_indices
[
i
];
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
auto
bin
=
(
data_
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
;
const
data_size_t
idx
=
data_indices
[
i
];
if
(
bin
>
maxb
||
bin
<
minb
)
{
const
auto
bin
=
(
data_
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
;
default_indices
[(
*
default_count
)
++
]
=
idx
;
if
(
bin
>
maxb
||
bin
<
minb
)
{
}
else
if
(
bin
>
th
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
gt_indices
[
gt_count
++
]
=
idx
;
}
else
if
(
bin
>
th
)
{
}
else
{
gt_indices
[
gt_count
++
]
=
idx
;
lte_indices
[
lte_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
else
{
if
(
default_bin
==
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
const
auto
bin
=
(
data_
[
idx
>>
1
]
>>
((
idx
&
1
)
<<
2
))
&
0xf
;
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
!=
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
}
}
return
lte_count
;
return
lte_count
;
...
...
src/io/sparse_bin.hpp
View file @
ef778069
...
@@ -125,7 +125,7 @@ public:
...
@@ -125,7 +125,7 @@ public:
virtual
data_size_t
Split
(
virtual
data_size_t
Split
(
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
,
uint32_t
min_bin
,
uint32_t
max_bin
,
uint32_t
default_bin
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
uint32_t
threshold
,
data_size_t
*
data_indices
,
data_size_t
num_data
,
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
)
const
override
{
data_size_t
*
lte_indices
,
data_size_t
*
gt_indices
,
BinType
bin_type
)
const
override
{
// not need to split
// not need to split
if
(
num_data
<=
0
)
{
return
0
;
}
if
(
num_data
<=
0
)
{
return
0
;
}
VAL_T
th
=
static_cast
<
VAL_T
>
(
threshold
+
min_bin
);
VAL_T
th
=
static_cast
<
VAL_T
>
(
threshold
+
min_bin
);
...
@@ -139,19 +139,37 @@ public:
...
@@ -139,19 +139,37 @@ public:
data_size_t
gt_count
=
0
;
data_size_t
gt_count
=
0
;
data_size_t
*
default_indices
=
gt_indices
;
data_size_t
*
default_indices
=
gt_indices
;
data_size_t
*
default_count
=
&
gt_count
;
data_size_t
*
default_count
=
&
gt_count
;
if
(
default_bin
<=
threshold
)
{
if
(
bin_type
==
BinType
::
NumericalBin
)
{
default_indices
=
lte_indices
;
if
(
default_bin
<=
threshold
)
{
default_count
=
&
lte_count
;
default_indices
=
lte_indices
;
}
default_count
=
&
lte_count
;
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
}
const
data_size_t
idx
=
data_indices
[
i
];
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
VAL_T
bin
=
iterator
.
RawGet
(
idx
);
const
data_size_t
idx
=
data_indices
[
i
];
if
(
bin
>
maxb
||
bin
<
minb
)
{
VAL_T
bin
=
iterator
.
RawGet
(
idx
);
default_indices
[(
*
default_count
)
++
]
=
idx
;
if
(
bin
>
maxb
||
bin
<
minb
)
{
}
else
if
(
bin
>
th
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
gt_indices
[
gt_count
++
]
=
idx
;
}
else
if
(
bin
>
th
)
{
}
else
{
gt_indices
[
gt_count
++
]
=
idx
;
lte_indices
[
lte_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
else
{
if
(
default_bin
==
threshold
)
{
default_indices
=
lte_indices
;
default_count
=
&
lte_count
;
}
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
const
data_size_t
idx
=
data_indices
[
i
];
VAL_T
bin
=
iterator
.
RawGet
(
idx
);
if
(
bin
>
maxb
||
bin
<
minb
)
{
default_indices
[(
*
default_count
)
++
]
=
idx
;
}
else
if
(
bin
!=
th
)
{
gt_indices
[
gt_count
++
]
=
idx
;
}
else
{
lte_indices
[
lte_count
++
]
=
idx
;
}
}
}
}
}
return
lte_count
;
return
lte_count
;
...
...
src/io/tree.cpp
View file @
ef778069
...
@@ -15,6 +15,11 @@
...
@@ -15,6 +15,11 @@
namespace
LightGBM
{
namespace
LightGBM
{
std
::
vector
<
bool
(
*
)(
uint32_t
,
uint32_t
)
>
Tree
::
inner_decision_funs
=
{
Tree
::
NumericalDecision
<
uint32_t
>
,
Tree
::
CategoricalDecision
<
uint32_t
>
};
std
::
vector
<
bool
(
*
)(
double
,
double
)
>
Tree
::
decision_funs
=
{
Tree
::
NumericalDecision
<
double
>
,
Tree
::
CategoricalDecision
<
double
>
};
Tree
::
Tree
(
int
max_leaves
)
Tree
::
Tree
(
int
max_leaves
)
:
max_leaves_
(
max_leaves
)
{
:
max_leaves_
(
max_leaves
)
{
...
@@ -25,6 +30,7 @@ Tree::Tree(int max_leaves)
...
@@ -25,6 +30,7 @@ Tree::Tree(int max_leaves)
split_feature_
=
std
::
vector
<
int
>
(
max_leaves_
-
1
);
split_feature_
=
std
::
vector
<
int
>
(
max_leaves_
-
1
);
threshold_in_bin_
=
std
::
vector
<
uint32_t
>
(
max_leaves_
-
1
);
threshold_in_bin_
=
std
::
vector
<
uint32_t
>
(
max_leaves_
-
1
);
threshold_
=
std
::
vector
<
double
>
(
max_leaves_
-
1
);
threshold_
=
std
::
vector
<
double
>
(
max_leaves_
-
1
);
decision_type_
=
std
::
vector
<
int8_t
>
(
max_leaves_
-
1
);
split_gain_
=
std
::
vector
<
double
>
(
max_leaves_
-
1
);
split_gain_
=
std
::
vector
<
double
>
(
max_leaves_
-
1
);
leaf_parent_
=
std
::
vector
<
int
>
(
max_leaves_
);
leaf_parent_
=
std
::
vector
<
int
>
(
max_leaves_
);
leaf_value_
=
std
::
vector
<
double
>
(
max_leaves_
);
leaf_value_
=
std
::
vector
<
double
>
(
max_leaves_
);
...
@@ -37,12 +43,13 @@ Tree::Tree(int max_leaves)
...
@@ -37,12 +43,13 @@ Tree::Tree(int max_leaves)
num_leaves_
=
1
;
num_leaves_
=
1
;
leaf_parent_
[
0
]
=
-
1
;
leaf_parent_
[
0
]
=
-
1
;
shrinkage_
=
1.0
f
;
shrinkage_
=
1.0
f
;
has_categorical_
=
false
;
}
}
Tree
::~
Tree
()
{
Tree
::~
Tree
()
{
}
}
int
Tree
::
Split
(
int
leaf
,
int
feature
,
uint32_t
threshold_bin
,
int
real_feature
,
int
Tree
::
Split
(
int
leaf
,
int
feature
,
BinType
bin_type
,
uint32_t
threshold_bin
,
int
real_feature
,
double
threshold_double
,
double
left_value
,
double
threshold_double
,
double
left_value
,
double
right_value
,
data_size_t
left_cnt
,
data_size_t
right_cnt
,
double
gain
)
{
double
right_value
,
data_size_t
left_cnt
,
data_size_t
right_cnt
,
double
gain
)
{
int
new_node_idx
=
num_leaves_
-
1
;
int
new_node_idx
=
num_leaves_
-
1
;
...
@@ -59,6 +66,12 @@ int Tree::Split(int leaf, int feature, uint32_t threshold_bin, int real_feature,
...
@@ -59,6 +66,12 @@ int Tree::Split(int leaf, int feature, uint32_t threshold_bin, int real_feature,
// add new node
// add new node
split_feature_inner
[
new_node_idx
]
=
feature
;
split_feature_inner
[
new_node_idx
]
=
feature
;
split_feature_
[
new_node_idx
]
=
real_feature
;
split_feature_
[
new_node_idx
]
=
real_feature
;
if
(
bin_type
==
BinType
::
NumericalBin
)
{
decision_type_
[
new_node_idx
]
=
0
;
}
else
{
has_categorical_
=
true
;
decision_type_
[
new_node_idx
]
=
1
;
}
threshold_in_bin_
[
new_node_idx
]
=
threshold_bin
;
threshold_in_bin_
[
new_node_idx
]
=
threshold_bin
;
threshold_
[
new_node_idx
]
=
threshold_double
;
threshold_
[
new_node_idx
]
=
threshold_double
;
split_gain_
[
new_node_idx
]
=
gain
;
split_gain_
[
new_node_idx
]
=
gain
;
...
@@ -84,62 +97,196 @@ int Tree::Split(int leaf, int feature, uint32_t threshold_bin, int real_feature,
...
@@ -84,62 +97,196 @@ int Tree::Split(int leaf, int feature, uint32_t threshold_bin, int real_feature,
}
}
void
Tree
::
AddPredictionToScore
(
const
Dataset
*
data
,
data_size_t
num_data
,
double
*
score
)
const
{
void
Tree
::
AddPredictionToScore
(
const
Dataset
*
data
,
data_size_t
num_data
,
double
*
score
)
const
{
if
(
data
->
num_features
()
>
num_leaves_
-
1
)
{
if
(
has_categorical_
)
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
if
(
data
->
num_features
()
>
num_leaves_
-
1
)
{
[
this
,
&
data
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
num_leaves_
-
1
);
[
this
,
&
data
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
for
(
int
i
=
0
;
i
<
num_leaves_
-
1
;
++
i
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
num_leaves_
-
1
);
const
int
fidx
=
split_feature_inner
[
i
];
for
(
int
i
=
0
;
i
<
num_leaves_
-
1
;
++
i
)
{
iter
[
i
].
reset
(
data
->
FeatureIterator
(
fidx
));
const
int
fidx
=
split_feature_inner
[
i
];
iter
[
i
]
->
Reset
(
start
);
iter
[
i
].
reset
(
data
->
FeatureIterator
(
fidx
));
}
iter
[
i
]
->
Reset
(
start
);
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
}
score
[
i
]
+=
static_cast
<
double
>
(
leaf_value_
[
GetLeaf
(
iter
,
i
)]);
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
}
int
node
=
0
;
});
while
(
node
>=
0
)
{
if
(
inner_decision_funs
[
decision_type_
[
node
]](
iter
[
node
]
->
Get
(
i
),
threshold_in_bin_
[
node
]))
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
score
[
i
]
+=
static_cast
<
double
>
(
leaf_value_
[
~
node
]);
}
});
}
else
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
&
data
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
data
->
num_features
());
for
(
int
i
=
0
;
i
<
data
->
num_features
();
++
i
)
{
iter
[
i
].
reset
(
data
->
FeatureIterator
(
i
));
iter
[
i
]
->
Reset
(
start
);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
inner_decision_funs
[
decision_type_
[
node
]](
iter
[
split_feature_inner
[
node
]]
->
Get
(
i
),
threshold_in_bin_
[
node
]))
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
score
[
i
]
+=
static_cast
<
double
>
(
leaf_value_
[
~
node
]);
}
});
}
}
else
{
}
else
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
if
(
data
->
num_features
()
>
num_leaves_
-
1
)
{
[
this
,
&
data
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
data
->
num_features
());
[
this
,
&
data
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
for
(
int
i
=
0
;
i
<
data
->
num_features
();
++
i
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
num_leaves_
-
1
);
iter
[
i
].
reset
(
data
->
FeatureIterator
(
i
));
for
(
int
i
=
0
;
i
<
num_leaves_
-
1
;
++
i
)
{
iter
[
i
]
->
Reset
(
start
);
const
int
fidx
=
split_feature_inner
[
i
];
}
iter
[
i
].
reset
(
data
->
FeatureIterator
(
fidx
));
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
iter
[
i
]
->
Reset
(
start
);
score
[
i
]
+=
static_cast
<
double
>
(
leaf_value_
[
GetLeafRaw
(
iter
,
i
)]);
}
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
});
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
iter
[
node
]
->
Get
(
i
)
<=
threshold_in_bin_
[
node
])
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
score
[
i
]
+=
static_cast
<
double
>
(
leaf_value_
[
~
node
]);
}
});
}
else
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
&
data
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
data
->
num_features
());
for
(
int
i
=
0
;
i
<
data
->
num_features
();
++
i
)
{
iter
[
i
].
reset
(
data
->
FeatureIterator
(
i
));
iter
[
i
]
->
Reset
(
start
);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
iter
[
split_feature_inner
[
node
]]
->
Get
(
i
)
<=
threshold_in_bin_
[
node
])
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
score
[
i
]
+=
static_cast
<
double
>
(
leaf_value_
[
~
node
]);
}
});
}
}
}
}
}
void
Tree
::
AddPredictionToScore
(
const
Dataset
*
data
,
void
Tree
::
AddPredictionToScore
(
const
Dataset
*
data
,
const
data_size_t
*
used_data_indices
,
const
data_size_t
*
used_data_indices
,
data_size_t
num_data
,
double
*
score
)
const
{
data_size_t
num_data
,
double
*
score
)
const
{
if
(
data
->
num_features
()
>
num_leaves_
-
1
)
{
if
(
has_categorical_
)
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
if
(
data
->
num_features
()
>
num_leaves_
-
1
)
{
[
this
,
data
,
used_data_indices
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
num_leaves_
-
1
);
[
this
,
data
,
used_data_indices
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
for
(
int
i
=
0
;
i
<
num_leaves_
-
1
;
++
i
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
num_leaves_
-
1
);
const
int
fidx
=
split_feature_inner
[
i
];
for
(
int
i
=
0
;
i
<
num_leaves_
-
1
;
++
i
)
{
iter
[
i
].
reset
(
data
->
FeatureIterator
(
fidx
));
const
int
fidx
=
split_feature_inner
[
i
];
iter
[
i
]
->
Reset
(
used_data_indices
[
start
]);
iter
[
i
].
reset
(
data
->
FeatureIterator
(
fidx
));
}
iter
[
i
]
->
Reset
(
used_data_indices
[
start
]);
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
}
score
[
used_data_indices
[
i
]]
+=
static_cast
<
double
>
(
leaf_value_
[
GetLeaf
(
iter
,
used_data_indices
[
i
])]);
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
}
int
node
=
0
;
});
const
data_size_t
idx
=
used_data_indices
[
i
];
while
(
node
>=
0
)
{
if
(
inner_decision_funs
[
decision_type_
[
node
]](
iter
[
node
]
->
Get
(
idx
),
threshold_in_bin_
[
node
]))
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
score
[
idx
]
+=
static_cast
<
double
>
(
leaf_value_
[
~
node
]);
}
});
}
else
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
data
,
used_data_indices
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
data
->
num_features
());
for
(
int
i
=
0
;
i
<
data
->
num_features
();
++
i
)
{
iter
[
i
].
reset
(
data
->
FeatureIterator
(
i
));
iter
[
i
]
->
Reset
(
used_data_indices
[
start
]);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
const
data_size_t
idx
=
used_data_indices
[
i
];
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
inner_decision_funs
[
decision_type_
[
node
]](
iter
[
split_feature_inner
[
node
]]
->
Get
(
idx
),
threshold_in_bin_
[
node
]))
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
score
[
idx
]
+=
static_cast
<
double
>
(
leaf_value_
[
~
node
]);
}
});
}
}
else
{
}
else
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
if
(
data
->
num_features
()
>
num_leaves_
-
1
)
{
[
this
,
data
,
used_data_indices
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
data
->
num_features
());
[
this
,
data
,
used_data_indices
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
for
(
int
i
=
0
;
i
<
data
->
num_features
();
++
i
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
num_leaves_
-
1
);
iter
[
i
].
reset
(
data
->
FeatureIterator
(
i
));
for
(
int
i
=
0
;
i
<
num_leaves_
-
1
;
++
i
)
{
iter
[
i
]
->
Reset
(
used_data_indices
[
start
]);
const
int
fidx
=
split_feature_inner
[
i
];
}
iter
[
i
].
reset
(
data
->
FeatureIterator
(
fidx
));
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
iter
[
i
]
->
Reset
(
used_data_indices
[
start
]);
score
[
used_data_indices
[
i
]]
+=
static_cast
<
double
>
(
leaf_value_
[
GetLeafRaw
(
iter
,
used_data_indices
[
i
])]);
}
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
});
int
node
=
0
;
const
data_size_t
idx
=
used_data_indices
[
i
];
while
(
node
>=
0
)
{
if
(
iter
[
node
]
->
Get
(
idx
)
<=
threshold_in_bin_
[
node
])
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
score
[
idx
]
+=
static_cast
<
double
>
(
leaf_value_
[
~
node
]);
}
});
}
else
{
Threading
::
For
<
data_size_t
>
(
0
,
num_data
,
[
this
,
data
,
used_data_indices
,
score
](
int
,
data_size_t
start
,
data_size_t
end
)
{
std
::
vector
<
std
::
unique_ptr
<
BinIterator
>>
iter
(
data
->
num_features
());
for
(
int
i
=
0
;
i
<
data
->
num_features
();
++
i
)
{
iter
[
i
].
reset
(
data
->
FeatureIterator
(
i
));
iter
[
i
]
->
Reset
(
used_data_indices
[
start
]);
}
for
(
data_size_t
i
=
start
;
i
<
end
;
++
i
)
{
const
data_size_t
idx
=
used_data_indices
[
i
];
int
node
=
0
;
while
(
node
>=
0
)
{
if
(
iter
[
split_feature_inner
[
node
]]
->
Get
(
idx
)
<=
threshold_in_bin_
[
node
])
{
node
=
left_child_
[
node
];
}
else
{
node
=
right_child_
[
node
];
}
}
score
[
idx
]
+=
static_cast
<
double
>
(
leaf_value_
[
~
node
]);
}
});
}
}
}
}
}
...
@@ -152,6 +299,8 @@ std::string Tree::ToString() {
...
@@ -152,6 +299,8 @@ std::string Tree::ToString() {
<<
Common
::
ArrayToString
<
double
>
(
split_gain_
,
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
<<
Common
::
ArrayToString
<
double
>
(
split_gain_
,
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
str_buf
<<
"threshold="
str_buf
<<
"threshold="
<<
Common
::
ArrayToString
<
double
>
(
threshold_
,
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
<<
Common
::
ArrayToString
<
double
>
(
threshold_
,
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
str_buf
<<
"decision_type="
<<
Common
::
ArrayToString
<
int
>
(
Common
::
ArrayCast
<
int8_t
,
int
>
(
decision_type_
),
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
str_buf
<<
"left_child="
str_buf
<<
"left_child="
<<
Common
::
ArrayToString
<
int
>
(
left_child_
,
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
<<
Common
::
ArrayToString
<
int
>
(
left_child_
,
num_leaves_
-
1
,
' '
)
<<
std
::
endl
;
str_buf
<<
"right_child="
str_buf
<<
"right_child="
...
@@ -191,6 +340,7 @@ std::string Tree::NodeToJSON(int index) {
...
@@ -191,6 +340,7 @@ std::string Tree::NodeToJSON(int index) {
str_buf
<<
"
\"
split_feature
\"
:"
<<
split_feature_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
split_feature
\"
:"
<<
split_feature_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
split_gain
\"
:"
<<
split_gain_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
split_gain
\"
:"
<<
split_gain_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
threshold
\"
:"
<<
threshold_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
threshold
\"
:"
<<
threshold_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
decision_type
\"
:
\"
"
<<
Tree
::
GetDecisionTypeName
(
decision_type_
[
index
])
<<
"
\"
,"
<<
std
::
endl
;
str_buf
<<
"
\"
internal_value
\"
:"
<<
internal_value_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
internal_value
\"
:"
<<
internal_value_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
internal_count
\"
:"
<<
internal_count_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
internal_count
\"
:"
<<
internal_count_
[
index
]
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
left_child
\"
:"
<<
NodeToJSON
(
left_child_
[
index
])
<<
","
<<
std
::
endl
;
str_buf
<<
"
\"
left_child
\"
:"
<<
NodeToJSON
(
left_child_
[
index
])
<<
","
<<
std
::
endl
;
...
@@ -229,6 +379,7 @@ Tree::Tree(const std::string& str) {
...
@@ -229,6 +379,7 @@ Tree::Tree(const std::string& str) {
||
key_vals
.
count
(
"leaf_parent"
)
<=
0
||
key_vals
.
count
(
"leaf_value"
)
<=
0
||
key_vals
.
count
(
"leaf_parent"
)
<=
0
||
key_vals
.
count
(
"leaf_value"
)
<=
0
||
key_vals
.
count
(
"internal_value"
)
<=
0
||
key_vals
.
count
(
"internal_count"
)
<=
0
||
key_vals
.
count
(
"internal_value"
)
<=
0
||
key_vals
.
count
(
"internal_count"
)
<=
0
||
key_vals
.
count
(
"leaf_count"
)
<=
0
||
key_vals
.
count
(
"shrinkage"
)
<=
0
||
key_vals
.
count
(
"leaf_count"
)
<=
0
||
key_vals
.
count
(
"shrinkage"
)
<=
0
||
key_vals
.
count
(
"decision_type"
)
<=
0
)
{
)
{
Log
::
Fatal
(
"Tree model string format error"
);
Log
::
Fatal
(
"Tree model string format error"
);
}
}
...
@@ -239,6 +390,7 @@ Tree::Tree(const std::string& str) {
...
@@ -239,6 +390,7 @@ Tree::Tree(const std::string& str) {
right_child_
=
Common
::
StringToArray
<
int
>
(
key_vals
[
"right_child"
],
' '
,
num_leaves_
-
1
);
right_child_
=
Common
::
StringToArray
<
int
>
(
key_vals
[
"right_child"
],
' '
,
num_leaves_
-
1
);
split_feature_
=
Common
::
StringToArray
<
int
>
(
key_vals
[
"split_feature"
],
' '
,
num_leaves_
-
1
);
split_feature_
=
Common
::
StringToArray
<
int
>
(
key_vals
[
"split_feature"
],
' '
,
num_leaves_
-
1
);
threshold_
=
Common
::
StringToArray
<
double
>
(
key_vals
[
"threshold"
],
' '
,
num_leaves_
-
1
);
threshold_
=
Common
::
StringToArray
<
double
>
(
key_vals
[
"threshold"
],
' '
,
num_leaves_
-
1
);
decision_type_
=
Common
::
StringToArray
<
int8_t
>
(
key_vals
[
"decision_type"
],
' '
,
num_leaves_
-
1
);
split_gain_
=
Common
::
StringToArray
<
double
>
(
key_vals
[
"split_gain"
],
' '
,
num_leaves_
-
1
);
split_gain_
=
Common
::
StringToArray
<
double
>
(
key_vals
[
"split_gain"
],
' '
,
num_leaves_
-
1
);
internal_count_
=
Common
::
StringToArray
<
data_size_t
>
(
key_vals
[
"internal_count"
],
' '
,
num_leaves_
-
1
);
internal_count_
=
Common
::
StringToArray
<
data_size_t
>
(
key_vals
[
"internal_count"
],
' '
,
num_leaves_
-
1
);
internal_value_
=
Common
::
StringToArray
<
double
>
(
key_vals
[
"internal_value"
],
' '
,
num_leaves_
-
1
);
internal_value_
=
Common
::
StringToArray
<
double
>
(
key_vals
[
"internal_value"
],
' '
,
num_leaves_
-
1
);
...
...
src/treelearner/feature_histogram.hpp
View file @
ef778069
...
@@ -41,9 +41,16 @@ public:
...
@@ -41,9 +41,16 @@ public:
* \param feature the feature data for this histogram
* \param feature the feature data for this histogram
* \param min_num_data_one_leaf minimal number of data in one leaf
* \param min_num_data_one_leaf minimal number of data in one leaf
*/
*/
void
Init
(
HistogramBinEntry
*
data
,
const
FeatureMetainfo
*
meta
)
{
void
Init
(
HistogramBinEntry
*
data
,
const
FeatureMetainfo
*
meta
,
BinType
bin_type
)
{
meta_
=
meta
;
meta_
=
meta
;
data_
=
data
;
data_
=
data
;
if
(
bin_type
==
BinType
::
NumericalBin
)
{
find_best_threshold_fun_
=
std
::
bind
(
&
FeatureHistogram
::
FindBestThresholdNumerical
,
this
,
std
::
placeholders
::
_1
,
std
::
placeholders
::
_2
,
std
::
placeholders
::
_3
,
std
::
placeholders
::
_4
);
}
else
{
find_best_threshold_fun_
=
std
::
bind
(
&
FeatureHistogram
::
FindBestThresholdCategorical
,
this
,
std
::
placeholders
::
_1
,
std
::
placeholders
::
_2
,
std
::
placeholders
::
_3
,
std
::
placeholders
::
_4
);
}
}
}
HistogramBinEntry
*
RawData
()
{
HistogramBinEntry
*
RawData
()
{
...
@@ -60,9 +67,14 @@ public:
...
@@ -60,9 +67,14 @@ public:
data_
[
i
].
sum_hessians
-=
other
.
data_
[
i
].
sum_hessians
;
data_
[
i
].
sum_hessians
-=
other
.
data_
[
i
].
sum_hessians
;
}
}
}
}
void
FindBestThreshold
(
double
sum_gradient
,
double
sum_hessian
,
data_size_t
num_data
,
void
FindBestThreshold
(
double
sum_gradient
,
double
sum_hessian
,
data_size_t
num_data
,
SplitInfo
*
output
)
{
SplitInfo
*
output
)
{
sum_hessian
+=
2
*
kEpsilon
;
find_best_threshold_fun_
(
sum_gradient
,
sum_hessian
+
2
*
kEpsilon
,
num_data
,
output
);
}
void
FindBestThresholdNumerical
(
double
sum_gradient
,
double
sum_hessian
,
data_size_t
num_data
,
SplitInfo
*
output
)
{
double
best_sum_left_gradient
=
NAN
;
double
best_sum_left_gradient
=
NAN
;
double
best_sum_left_hessian
=
NAN
;
double
best_sum_left_hessian
=
NAN
;
double
best_gain
=
kMinScore
;
double
best_gain
=
kMinScore
;
...
@@ -131,6 +143,97 @@ public:
...
@@ -131,6 +143,97 @@ public:
output
->
gain
=
kMinScore
;
output
->
gain
=
kMinScore
;
}
}
}
}
void
FindBestThresholdCategorical
(
double
sum_gradient
,
double
sum_hessian
,
data_size_t
num_data
,
SplitInfo
*
output
)
{
double
best_gain
=
kMinScore
;
uint32_t
best_threshold
=
static_cast
<
uint32_t
>
(
meta_
->
num_bin
);
double
gain_shift
=
GetLeafSplitGain
(
sum_gradient
,
sum_hessian
);
double
min_gain_shift
=
gain_shift
+
meta_
->
tree_config
->
min_gain_to_split
;
is_splittable_
=
false
;
const
int
bias
=
meta_
->
bias
;
int
t
=
meta_
->
num_bin
-
1
-
bias
;
const
int
t_end
=
0
;
// from right to left, and we don't need data in bin0
for
(;
t
>=
t_end
;
--
t
)
{
// if data not enough, or sum hessian too small
if
(
data_
[
t
].
cnt
<
meta_
->
tree_config
->
min_data_in_leaf
||
data_
[
t
].
sum_hessians
<
meta_
->
tree_config
->
min_sum_hessian_in_leaf
)
continue
;
data_size_t
other_count
=
num_data
-
data_
[
t
].
cnt
;
// if data not enough
if
(
other_count
<
meta_
->
tree_config
->
min_data_in_leaf
)
continue
;
double
sum_other_hessian
=
sum_hessian
-
data_
[
t
].
sum_hessians
-
kEpsilon
;
// if sum hessian too small
if
(
sum_other_hessian
<
meta_
->
tree_config
->
min_sum_hessian_in_leaf
)
continue
;
double
sum_other_gradient
=
sum_gradient
-
data_
[
t
].
sum_gradients
;
// current split gain
double
current_gain
=
GetLeafSplitGain
(
sum_other_gradient
,
sum_other_hessian
)
+
GetLeafSplitGain
(
data_
[
t
].
sum_gradients
,
data_
[
t
].
sum_hessians
+
kEpsilon
);
// gain with split is worse than without split
if
(
current_gain
<=
min_gain_shift
)
continue
;
// mark to is splittable
is_splittable_
=
true
;
// better split point
if
(
current_gain
>
best_gain
)
{
best_threshold
=
static_cast
<
uint32_t
>
(
t
+
bias
);
best_gain
=
current_gain
;
}
}
// need restore zero bin
if
(
bias
==
1
)
{
t
=
meta_
->
num_bin
-
1
-
bias
;
double
sum_bin0_gradient
=
sum_gradient
;
double
sum_bin0_hessian
=
sum_hessian
;
data_size_t
cnt_bin0
=
num_data
;
for
(;
t
>=
0
;
--
t
)
{
sum_bin0_gradient
-=
data_
[
t
].
sum_gradients
;
sum_bin0_hessian
-=
data_
[
t
].
sum_hessians
;
cnt_bin0
-=
data_
[
t
].
cnt
;
}
data_size_t
other_count
=
num_data
-
cnt_bin0
;
double
sum_other_hessian
=
sum_hessian
-
sum_bin0_hessian
-
kEpsilon
;
if
(
cnt_bin0
>=
meta_
->
tree_config
->
min_data_in_leaf
&&
sum_bin0_hessian
>=
meta_
->
tree_config
->
min_sum_hessian_in_leaf
&&
other_count
>=
meta_
->
tree_config
->
min_data_in_leaf
&&
sum_other_hessian
>=
meta_
->
tree_config
->
min_sum_hessian_in_leaf
)
{
double
sum_other_gradient
=
sum_gradient
-
sum_bin0_gradient
;
double
current_gain
=
GetLeafSplitGain
(
sum_other_gradient
,
sum_other_hessian
)
+
GetLeafSplitGain
(
sum_bin0_gradient
,
sum_bin0_hessian
+
kEpsilon
);
if
(
current_gain
>
min_gain_shift
)
{
is_splittable_
=
true
;
// better split point
if
(
current_gain
>
best_gain
)
{
best_threshold
=
static_cast
<
uint32_t
>
(
0
);
best_gain
=
current_gain
;
}
}
}
}
if
(
is_splittable_
)
{
// update split information
output
->
feature
=
meta_
->
feature_idx
;
output
->
threshold
=
best_threshold
;
output
->
left_output
=
CalculateSplittedLeafOutput
(
data_
[
best_threshold
].
sum_gradients
,
data_
[
best_threshold
].
sum_hessians
+
kEpsilon
);
output
->
left_count
=
data_
[
best_threshold
].
cnt
;
output
->
left_sum_gradient
=
data_
[
best_threshold
].
sum_gradients
;
output
->
left_sum_hessian
=
data_
[
best_threshold
].
sum_hessians
+
kEpsilon
;
output
->
right_output
=
CalculateSplittedLeafOutput
(
sum_gradient
-
data_
[
best_threshold
].
sum_gradients
,
sum_hessian
-
data_
[
best_threshold
].
sum_hessians
-
kEpsilon
);
output
->
right_count
=
num_data
-
data_
[
best_threshold
].
cnt
;
output
->
right_sum_gradient
=
sum_gradient
-
data_
[
best_threshold
].
sum_gradients
;
output
->
right_sum_hessian
=
sum_hessian
-
data_
[
best_threshold
].
sum_hessians
-
kEpsilon
;
output
->
gain
=
best_gain
-
gain_shift
;
}
else
{
output
->
feature
=
meta_
->
feature_idx
;
output
->
gain
=
kMinScore
;
}
}
/*!
/*!
* \brief Binary size of this histogram
* \brief Binary size of this histogram
*/
*/
...
@@ -188,6 +291,8 @@ private:
...
@@ -188,6 +291,8 @@ private:
//std::vector<HistogramBinEntry> data_;
//std::vector<HistogramBinEntry> data_;
/*! \brief False if this histogram cannot split */
/*! \brief False if this histogram cannot split */
bool
is_splittable_
=
true
;
bool
is_splittable_
=
true
;
std
::
function
<
void
(
double
,
double
,
data_size_t
,
SplitInfo
*
)
>
find_best_threshold_fun_
;
};
};
class
HistogramPool
{
class
HistogramPool
{
public:
public:
...
@@ -264,7 +369,7 @@ public:
...
@@ -264,7 +369,7 @@ public:
uint64_t
offset
=
0
;
uint64_t
offset
=
0
;
for
(
int
j
=
0
;
j
<
train_data
->
num_features
();
++
j
)
{
for
(
int
j
=
0
;
j
<
train_data
->
num_features
();
++
j
)
{
offset
+=
static_cast
<
uint64_t
>
(
train_data
->
SubFeatureBinOffset
(
j
));
offset
+=
static_cast
<
uint64_t
>
(
train_data
->
SubFeatureBinOffset
(
j
));
pool_
[
i
][
j
].
Init
(
data_
[
i
].
data
()
+
offset
,
&
feature_metas_
[
j
]);
pool_
[
i
][
j
].
Init
(
data_
[
i
].
data
()
+
offset
,
&
feature_metas_
[
j
]
,
train_data
->
FeatureBinMapper
(
j
)
->
bin_type
()
);
auto
num_bin
=
train_data
->
FeatureNumBin
(
j
);
auto
num_bin
=
train_data
->
FeatureNumBin
(
j
);
if
(
train_data
->
FeatureBinMapper
(
j
)
->
GetDefaultBin
()
==
0
)
{
if
(
train_data
->
FeatureBinMapper
(
j
)
->
GetDefaultBin
()
==
0
)
{
num_bin
-=
1
;
num_bin
-=
1
;
...
...
src/treelearner/serial_tree_learner.cpp
View file @
ef778069
...
@@ -490,7 +490,8 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri
...
@@ -490,7 +490,8 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri
// left = parent
// left = parent
*
left_leaf
=
best_Leaf
;
*
left_leaf
=
best_Leaf
;
// split tree, will return right leaf
// split tree, will return right leaf
*
right_leaf
=
tree
->
Split
(
best_Leaf
,
best_split_info
.
feature
,
*
right_leaf
=
tree
->
Split
(
best_Leaf
,
best_split_info
.
feature
,
train_data_
->
FeatureBinMapper
(
best_split_info
.
feature
)
->
bin_type
(),
best_split_info
.
threshold
,
best_split_info
.
threshold
,
train_data_
->
RealFeatureIndex
(
best_split_info
.
feature
),
train_data_
->
RealFeatureIndex
(
best_split_info
.
feature
),
train_data_
->
RealThreshold
(
best_split_info
.
feature
,
best_split_info
.
threshold
),
train_data_
->
RealThreshold
(
best_split_info
.
feature
,
best_split_info
.
threshold
),
...
...
src/treelearner/voting_parallel_tree_learner.cpp
View file @
ef778069
...
@@ -79,8 +79,8 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) {
...
@@ -79,8 +79,8 @@ void VotingParallelTreeLearner::Init(const Dataset* train_data) {
uint64_t
offset
=
0
;
uint64_t
offset
=
0
;
for
(
int
j
=
0
;
j
<
train_data
->
num_features
();
++
j
)
{
for
(
int
j
=
0
;
j
<
train_data
->
num_features
();
++
j
)
{
offset
+=
static_cast
<
uint64_t
>
(
train_data
->
SubFeatureBinOffset
(
j
));
offset
+=
static_cast
<
uint64_t
>
(
train_data
->
SubFeatureBinOffset
(
j
));
smaller_leaf_histogram_array_global_
[
j
].
Init
(
smaller_leaf_histogram_data_
.
data
()
+
offset
,
&
feature_metas_
[
j
]);
smaller_leaf_histogram_array_global_
[
j
].
Init
(
smaller_leaf_histogram_data_
.
data
()
+
offset
,
&
feature_metas_
[
j
]
,
train_data
->
FeatureBinMapper
(
j
)
->
bin_type
()
);
larger_leaf_histogram_array_global_
[
j
].
Init
(
larger_leaf_histogram_data_
.
data
()
+
offset
,
&
feature_metas_
[
j
]);
larger_leaf_histogram_array_global_
[
j
].
Init
(
larger_leaf_histogram_data_
.
data
()
+
offset
,
&
feature_metas_
[
j
]
,
train_data
->
FeatureBinMapper
(
j
)
->
bin_type
()
);
auto
num_bin
=
train_data
->
FeatureNumBin
(
j
);
auto
num_bin
=
train_data
->
FeatureNumBin
(
j
);
if
(
train_data
->
FeatureBinMapper
(
j
)
->
GetDefaultBin
()
==
0
)
{
if
(
train_data
->
FeatureBinMapper
(
j
)
->
GetDefaultBin
()
==
0
)
{
num_bin
-=
1
;
num_bin
-=
1
;
...
...
tests/python_package_test/test_basic.py
View file @
ef778069
...
@@ -49,7 +49,7 @@ class TestBasic(unittest.TestCase):
...
@@ -49,7 +49,7 @@ class TestBasic(unittest.TestCase):
for
preds
in
zip
(
pred_from_matr
,
pred_from_model_file
):
for
preds
in
zip
(
pred_from_matr
,
pred_from_model_file
):
self
.
assertEqual
(
*
preds
)
self
.
assertEqual
(
*
preds
)
# check pmml
# check pmml
#
os.system('python ../../pmml/pmml.py model.txt')
os
.
system
(
'python ../../pmml/pmml.py model.txt'
)
print
(
"----------------------------------------------------------------------"
)
print
(
"----------------------------------------------------------------------"
)
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment