Repository: tianlh/LightGBM-DCU

Commit e984b0d6, authored May 15, 2017 by Guolin Ke, committed by GitHub on May 15, 2017

Handle for missing values (#516)

parent e8cc6ab9

Showing 18 changed files with 620 additions and 216 deletions (+620, -216)
include/LightGBM/bin.h                              +2    -1
include/LightGBM/dataset.h                          +2    -2
include/LightGBM/feature_group.h                    +2    -1
include/LightGBM/meta.h                             +2    -0
include/LightGBM/tree.h                             +33   -6
include/LightGBM/utils/common.h                     +10   -0
src/boosting/gbdt.cpp                               +2    -2
src/io/bin.cpp                                      +122  -71
src/io/dataset.cpp                                  +195  -13
src/io/dense_bin.hpp                                +7    -5
src/io/dense_nbits_bin.hpp                          +7    -5
src/io/sparse_bin.hpp                               +7    -5
src/io/tree.cpp                                     +62   -32
src/treelearner/data_partition.hpp                  +3    -2
src/treelearner/feature_histogram.hpp               +151  -68
src/treelearner/serial_tree_learner.cpp             +9    -2
src/treelearner/split_info.hpp                      +3    -1
src/treelearner/voting_parallel_tree_learner.cpp    +1    -0
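Taken together, these changes give zero-valued (and thus missing) feature entries their own bin and let the split finder decide, per tree node, which side of a split that bin should fall on; the choice is recorded as `default_bin_for_zero` in bin space and as `default_value` in raw-value space. The snippet below is a standalone illustration of the routing rule, copying the `DefaultValueForZero` helper this commit adds to include/LightGBM/tree.h; the per-node default and threshold values are invented for the example.

```cpp
#include <iostream>

// Mirrors kMissingValueRange from include/LightGBM/meta.h.
const double kMissingValueRange = 1e-20;

// Mirrors the double overload of Tree::DefaultValueForZero added in this
// commit: values inside (-zero, zero] are replaced by the per-node default.
double DefaultValueForZero(double fval, double zero, double out) {
  return (fval > -zero && fval <= zero) ? out : fval;
}

int main() {
  const double node_default = 2.5;    // hypothetical per-node default_value_
  const double node_threshold = 1.0;  // hypothetical split threshold
  for (double fval : {0.0, 0.5, 3.0}) {
    double routed = DefaultValueForZero(fval, kMissingValueRange, node_default);
    std::cout << "fval=" << fval << " goes "
              << (routed <= node_threshold ? "left" : "right") << "\n";
  }
  return 0;
}
```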
include/LightGBM/bin.h

@@ -360,6 +360,7 @@ public:
   * \param min_bin min_bin of current used feature
   * \param max_bin max_bin of current used feature
   * \param default_bin default bin if bin not in [min_bin, max_bin]
+  * \param default_bin_for_zero default bin for the zero (missing) bin
   * \param threshold The split threshold.
   * \param data_indices Used data indices. After this function is called, the less-than-or-equal data indices are stored in this object.
   * \param num_data Number of used data
...
@@ -369,7 +370,7 @@ public:
   * \return The number of less-than-or-equal data.
   */
-  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t threshold,
+  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t default_bin_for_zero, uint32_t threshold,
     data_size_t* data_indices, data_size_t num_data,
     data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const = 0;
...
include/LightGBM/dataset.h

@@ -402,12 +402,12 @@ public:
     HistogramBinEntry* data) const;

-  inline data_size_t Split(int feature, uint32_t threshold,
+  inline data_size_t Split(int feature, uint32_t threshold, uint32_t default_bin_for_zero,
     data_size_t* data_indices, data_size_t num_data,
     data_size_t* lte_indices, data_size_t* gt_indices) const {
     const int group = feature2group_[feature];
     const int sub_feature = feature2subfeature_[feature];
-    return feature_groups_[group]->Split(sub_feature, threshold, data_indices, num_data, lte_indices, gt_indices);
+    return feature_groups_[group]->Split(sub_feature, threshold, default_bin_for_zero, data_indices, num_data, lte_indices, gt_indices);
   }

   inline int SubFeatureBinOffset(int i) const {
...
include/LightGBM/feature_group.h

@@ -161,13 +161,14 @@ public:
   inline data_size_t Split(int sub_feature, uint32_t threshold,
+    uint32_t default_bin_for_zero,
     data_size_t* data_indices, data_size_t num_data,
     data_size_t* lte_indices, data_size_t* gt_indices) const {
     uint32_t min_bin = bin_offsets_[sub_feature];
     uint32_t max_bin = bin_offsets_[sub_feature + 1] - 1;
     uint32_t default_bin = bin_mappers_[sub_feature]->GetDefaultBin();
-    return bin_data_->Split(min_bin, max_bin, default_bin,
+    return bin_data_->Split(min_bin, max_bin, default_bin, default_bin_for_zero,
       threshold, data_indices, num_data, lte_indices, gt_indices,
       bin_mappers_[sub_feature]->bin_type());
   }
   /*!
...
include/LightGBM/meta.h

@@ -19,6 +19,8 @@ const score_t kMinScore = -std::numeric_limits<score_t>::infinity();
 const score_t kEpsilon = 1e-15f;

+const double kMissingValueRange = 1e-20f;
+
 using ReduceFunction = std::function<void(const char*, char*, int)>;

 using PredictFunction =
...
include/LightGBM/tree.h

@@ -44,11 +44,15 @@ public:
   * \param left_cnt Count of left child
   * \param right_cnt Count of right child
   * \param gain Split gain
+  * \param zero_bin bin value for value==0 (the missing value)
+  * \param default_bin_for_zero default conversion for the missing value, in bin
+  * \param default_value default conversion for the missing value, in float value
   * \return The index of new leaf.
   */
-  int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature,
-            double threshold_double, double left_value, double right_value,
-            data_size_t left_cnt, data_size_t right_cnt, double gain);
+  int Split(int leaf, int feature, BinType bin_type, uint32_t threshold, int real_feature,
+            double threshold_double, double left_value, double right_value,
+            data_size_t left_cnt, data_size_t right_cnt, double gain,
+            uint32_t zero_bin, uint32_t default_bin_for_zero, double default_value);

   /*! \brief Get the output of one leaf */
   inline double LeafOutput(int leaf) const { return leaf_value_[leaf]; }
...
@@ -140,6 +144,23 @@ public:
     }
   }

+  static double DefaultValueForZero(double fval, double zero, double out) {
+    if (fval > -zero && fval <= zero) {
+      return out;
+    } else {
+      return fval;
+    }
+  }
+
+  static uint32_t DefaultValueForZero(uint32_t fval, uint32_t zero, uint32_t out) {
+    if (fval == zero) {
+      return out;
+    } else {
+      return fval;
+    }
+  }
+
   static const char* GetDecisionTypeName(int8_t type) {
     if (type == 0) {
       return "no_greater";
...
@@ -176,7 +197,7 @@ private:
   /*! \brief A non-leaf node's right child */
   std::vector<int> right_child_;
   /*! \brief A non-leaf node's split feature */
-  std::vector<int> split_feature_inner;
+  std::vector<int> split_feature_inner_;
   /*! \brief A non-leaf node's split feature, the original index */
   std::vector<int> split_feature_;
   /*! \brief A non-leaf node's split threshold in bin */
...
@@ -185,6 +206,10 @@ private:
   std::vector<double> threshold_;
   /*! \brief Decision type, 0 for '<=' (numerical feature), 1 for 'is' (categorical feature) */
   std::vector<int8_t> decision_type_;
+  /*! \brief Default values for the na/0 feature values */
+  std::vector<double> default_value_;
+  std::vector<uint32_t> zero_bin_;
+  std::vector<uint32_t> default_bin_for_zero_;
   /*! \brief A non-leaf node's split gain */
   std::vector<double> split_gain_;
   // used for leaf node
...
@@ -226,8 +251,9 @@ inline int Tree::GetLeaf(const double* feature_values) const {
   int node = 0;
   if (has_categorical_) {
     while (node >= 0) {
-      if (decision_funs[decision_type_[node]](feature_values[split_feature_[node]],
+      double fval = DefaultValueForZero(feature_values[split_feature_[node]], kMissingValueRange, default_value_[node]);
+      if (decision_funs[decision_type_[node]](fval,
           threshold_[node])) {
         node = left_child_[node];
       } else {
...
@@ -236,8 +262,9 @@ inline int Tree::GetLeaf(const double* feature_values) const {
     }
   } else {
     while (node >= 0) {
-      if (NumericalDecision<double>(feature_values[split_feature_[node]],
+      double fval = DefaultValueForZero(feature_values[split_feature_[node]], kMissingValueRange, default_value_[node]);
+      if (NumericalDecision<double>(fval,
           threshold_[node])) {
         node = left_child_[node];
       } else {
...
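On binned data the same substitution happens in index space: the `uint32_t` overload above replaces a bin equal to `zero_bin_` with `default_bin_for_zero_` before comparing against the in-bin threshold. A minimal self-contained sketch, with bin indices and the threshold invented for illustration:

```cpp
#include <cstdint>
#include <iostream>

// Mirrors the uint32_t overload of Tree::DefaultValueForZero.
uint32_t DefaultValueForZero(uint32_t fval, uint32_t zero, uint32_t out) {
  return (fval == zero) ? out : fval;
}

int main() {
  const uint32_t zero_bin = 0;              // hypothetical bin holding value==0
  const uint32_t default_bin_for_zero = 4;  // side chosen by the split finder
  const uint32_t threshold_in_bin = 2;
  for (uint32_t bin : {0u, 1u, 3u}) {
    uint32_t fval = DefaultValueForZero(bin, zero_bin, default_bin_for_zero);
    std::cout << "bin " << bin << " -> "
              << (fval <= threshold_in_bin ? "left" : "right") << "\n";
  }
  return 0;
}
```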
include/LightGBM/utils/common.h

@@ -462,6 +462,16 @@ inline static std::vector<int> VectorSize(const std::vector<std::vector<T>>& dat
   return ret;
 }

+inline static double AvoidInf(double x) {
+  if (x >= std::numeric_limits<double>::max()) {
+    return std::numeric_limits<double>::max();
+  } else if (x <= std::numeric_limits<double>::min()) {
+    return std::numeric_limits<double>::min();
+  } else {
+    return x;
+  }
+}
+
 }  // namespace Common

 }  // namespace LightGBM
...
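`Common::AvoidInf` clamps unbounded values so split gains and default values can be stored and serialized safely. One detail worth noticing: `std::numeric_limits<double>::min()` is the smallest positive normal double, not the most negative value, so the lower clamp as written triggers for anything at or below roughly 2.2e-308, including zero. The sketch below just exercises the function exactly as the diff defines it:

```cpp
#include <iostream>
#include <limits>

// Copy of Common::AvoidInf as added in include/LightGBM/utils/common.h.
inline double AvoidInf(double x) {
  if (x >= std::numeric_limits<double>::max()) {
    return std::numeric_limits<double>::max();
  } else if (x <= std::numeric_limits<double>::min()) {
    return std::numeric_limits<double>::min();
  } else {
    return x;
  }
}

int main() {
  std::cout << AvoidInf(std::numeric_limits<double>::infinity()) << "\n";  // clamped to max()
  std::cout << AvoidInf(1.5) << "\n";  // unchanged
  std::cout << AvoidInf(0.0) << "\n";  // 0 <= min(), so clamped up to min()
  return 0;
}
```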
src/boosting/gbdt.cpp

@@ -353,7 +353,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
     }
     init_score /= num_data_;
     std::unique_ptr<Tree> new_tree(new Tree(2));
-    new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0, init_score, init_score, 0, num_data_, -1);
+    new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0, init_score, init_score, 0, num_data_, -1, 0, 0, 0);
     train_score_updater_->AddScore(init_score, 0);
     for (auto& score_updater : valid_score_updater_) {
       score_updater->AddScore(init_score, 0);
...
@@ -432,7 +432,7 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
     if (!class_need_train_[cur_tree_id] && models_.size() < static_cast<size_t>(num_tree_per_iteration_)) {
       auto output = class_default_output_[cur_tree_id];
       new_tree->Split(0, 0, BinType::NumericalBin, 0, 0, 0,
-                      output, output, 0, num_data_, -1);
+                      output, output, 0, num_data_, -1, 0, 0, 0);
       train_score_updater_->AddScore(output, cur_tree_id);
       for (auto& score_updater : valid_score_updater_) {
         score_updater->AddScore(output, cur_tree_id);
...
src/io/bin.cpp

@@ -63,6 +63,76 @@ bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, Bin
   }
   return true;
 }

+std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
+  int num_distinct_values, int max_bin, int total_cnt, int min_data_in_bin) {
+  std::vector<double> bin_upper_bound;
+  if (num_distinct_values <= max_bin) {
+    bin_upper_bound.clear();
+    int cur_cnt_inbin = 0;
+    for (int i = 0; i < num_distinct_values - 1; ++i) {
+      cur_cnt_inbin += counts[i];
+      if (cur_cnt_inbin >= min_data_in_bin) {
+        bin_upper_bound.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
+        cur_cnt_inbin = 0;
+      }
+    }
+    cur_cnt_inbin += counts[num_distinct_values - 1];
+    bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
+  } else {
+    if (min_data_in_bin > 0) {
+      max_bin = std::min(max_bin, static_cast<int>(total_cnt / min_data_in_bin));
+      max_bin = std::max(max_bin, 1);
+    }
+    double mean_bin_size = static_cast<double>(total_cnt) / max_bin;
+    // mean size for one bin
+    int rest_bin_cnt = max_bin;
+    int rest_sample_cnt = static_cast<int>(total_cnt);
+    std::vector<bool> is_big_count_value(num_distinct_values, false);
+    for (int i = 0; i < num_distinct_values; ++i) {
+      if (counts[i] >= mean_bin_size) {
+        is_big_count_value[i] = true;
+        --rest_bin_cnt;
+        rest_sample_cnt -= counts[i];
+      }
+    }
+    mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
+    std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
+    std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
+    int bin_cnt = 0;
+    lower_bounds[bin_cnt] = distinct_values[0];
+    int cur_cnt_inbin = 0;
+    for (int i = 0; i < num_distinct_values - 1; ++i) {
+      if (!is_big_count_value[i]) {
+        rest_sample_cnt -= counts[i];
+      }
+      cur_cnt_inbin += counts[i];
+      // need a new bin
+      if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
+        (is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
+        upper_bounds[bin_cnt] = distinct_values[i];
+        ++bin_cnt;
+        lower_bounds[bin_cnt] = distinct_values[i + 1];
+        if (bin_cnt >= max_bin - 1) { break; }
+        cur_cnt_inbin = 0;
+        if (!is_big_count_value[i]) {
+          --rest_bin_cnt;
+          mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
+        }
+      }
+    }
+    ++bin_cnt;
+    // update bin upper bound
+    bin_upper_bound.resize(bin_cnt);
+    for (int i = 0; i < bin_cnt - 1; ++i) {
+      bin_upper_bound[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
+    }
+    // last bin upper bound
+    bin_upper_bound[bin_cnt - 1] = std::numeric_limits<double>::infinity();
+  }
+  return bin_upper_bound;
+}
+
 void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
   int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type) {
...
@@ -109,81 +179,62 @@ void BinMapper::FindBin(double* values, int num_sample_values, size_t total_samp
   std::vector<int> cnt_in_bin;
   int num_distinct_values = static_cast<int>(distinct_values.size());
   if (bin_type_ == BinType::NumericalBin) {
-    if (num_distinct_values <= max_bin) {
-      // use distinct value is enough
-      bin_upper_bound_.clear();
-      int cur_cnt_inbin = 0;
-      for (int i = 0; i < num_distinct_values - 1; ++i) {
-        cur_cnt_inbin += counts[i];
-        if (cur_cnt_inbin >= min_data_in_bin) {
-          bin_upper_bound_.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
-          cnt_in_bin.push_back(cur_cnt_inbin);
-          cur_cnt_inbin = 0;
-        }
-      }
-      cur_cnt_inbin += counts.back();
-      cnt_in_bin.push_back(cur_cnt_inbin);
-      bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
-      num_bin_ = static_cast<int>(bin_upper_bound_.size());
-    } else {
-      if (min_data_in_bin > 0) {
-        max_bin = std::min(max_bin, static_cast<int>(total_sample_cnt / min_data_in_bin));
-        max_bin = std::max(max_bin, 1);
-      }
-      double mean_bin_size = static_cast<double>(total_sample_cnt) / max_bin;
-      if (zero_cnt > mean_bin_size) {
-        int non_zero_cnt = num_sample_values;
-        max_bin = std::min(max_bin, 1 + static_cast<int>(non_zero_cnt / min_data_in_bin));
-      }
-      // mean size for one bin
-      int rest_bin_cnt = max_bin;
-      int rest_sample_cnt = static_cast<int>(total_sample_cnt);
-      std::vector<bool> is_big_count_value(num_distinct_values, false);
-      for (int i = 0; i < num_distinct_values; ++i) {
-        if (counts[i] >= mean_bin_size) {
-          is_big_count_value[i] = true;
-          --rest_bin_cnt;
-          rest_sample_cnt -= counts[i];
-        }
-      }
-      mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
-      std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
-      std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
-      int bin_cnt = 0;
-      lower_bounds[bin_cnt] = distinct_values[0];
-      int cur_cnt_inbin = 0;
-      for (int i = 0; i < num_distinct_values - 1; ++i) {
-        if (!is_big_count_value[i]) {
-          rest_sample_cnt -= counts[i];
-        }
-        cur_cnt_inbin += counts[i];
-        // need a new bin
-        if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
-          (is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
-          upper_bounds[bin_cnt] = distinct_values[i];
-          cnt_in_bin.push_back(cur_cnt_inbin);
-          ++bin_cnt;
-          lower_bounds[bin_cnt] = distinct_values[i + 1];
-          if (bin_cnt >= max_bin - 1) { break; }
-          cur_cnt_inbin = 0;
-          if (!is_big_count_value[i]) {
-            --rest_bin_cnt;
-            mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
-          }
-        }
-      }
-      cur_cnt_inbin += counts.back();
-      cnt_in_bin.push_back(cur_cnt_inbin);
-      ++bin_cnt;
-      // update bin upper bound
-      bin_upper_bound_ = std::vector<double>(bin_cnt);
-      num_bin_ = bin_cnt;
-      for (int i = 0; i < bin_cnt - 1; ++i) {
-        bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
-      }
-      // last bin upper bound
-      bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
-    }
+    bin_upper_bound_.clear();
+    int left_cnt_data = 0;
+    int missing_cnt_data = 0;
+    int right_cnt_data = 0;
+    for (int i = 0; i < num_distinct_values; ++i) {
+      if (distinct_values[i] <= -kMissingValueRange) {
+        left_cnt_data += counts[i];
+      } else if (distinct_values[i] > kMissingValueRange) {
+        right_cnt_data += counts[i];
+      } else {
+        missing_cnt_data += counts[i];
+      }
+    }
+    int left_cnt = 0;
+    for (int i = 0; i < num_distinct_values; ++i) {
+      if (distinct_values[i] > -kMissingValueRange) {
+        left_cnt = i;
+        break;
+      }
+    }
+    if (left_cnt > 0) {
+      int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - missing_cnt_data) * (max_bin - 1));
+      bin_upper_bound_ = GreedyFindBin(distinct_values.data(), counts.data(), left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
+      bin_upper_bound_.back() = -kMissingValueRange;
+    }
+    int right_start = -1;
+    for (int i = left_cnt; i < num_distinct_values; ++i) {
+      if (distinct_values[i] > kMissingValueRange) {
+        right_start = i;
+        break;
+      }
+    }
+    if (right_start >= 0) {
+      int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound_.size());
+      auto right_bounds = GreedyFindBin(distinct_values.data() + right_start, counts.data() + right_start,
+        num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
+      bin_upper_bound_.push_back(kMissingValueRange);
+      bin_upper_bound_.insert(bin_upper_bound_.end(), right_bounds.begin(), right_bounds.end());
+    } else {
+      bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
+    }
+    num_bin_ = static_cast<int>(bin_upper_bound_.size());
+    {
+      cnt_in_bin.resize(num_bin_, 0);
+      int i_bin = 0;
+      for (int i = 0; i < num_distinct_values; ++i) {
+        if (distinct_values[i] > bin_upper_bound_[i_bin]) {
+          ++i_bin;
+        }
+        cnt_in_bin[i_bin] += counts[i];
+      }
+    }
+    CHECK(num_bin_ <= max_bin);
   } else {
...
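The rewritten `FindBin` lays the bins out in three regions: strictly negative values (closed off at `-kMissingValueRange`), a dedicated zero/missing bin covering `(-kMissingValueRange, kMissingValueRange]`, and positive values ending at `+infinity`, calling `GreedyFindBin` on each side. The toy program below illustrates that layout; the sample bounds and values are invented, and the lookup lambda is a stand-in for the real bin search.

```cpp
#include <iostream>
#include <limits>
#include <vector>

int main() {
  const double kMissingValueRange = 1e-20;
  // Hypothetical upper bounds FindBin might produce: two bins on the
  // negative side, the zero bin, then two bins on the positive side.
  std::vector<double> bin_upper_bound = {
    -3.0,                 // bin 0: values in (-inf, -3.0]
    -kMissingValueRange,  // bin 1: values in (-3.0, -1e-20]
    kMissingValueRange,   // bin 2: the zero/missing bin (-1e-20, 1e-20]
    5.0,                  // bin 3: values in (1e-20, 5.0]
    std::numeric_limits<double>::infinity()  // bin 4: everything above
  };
  // A value's bin is the first upper bound it does not exceed.
  auto value_to_bin = [&](double v) {
    size_t b = 0;
    while (v > bin_upper_bound[b]) ++b;
    return b;
  };
  for (double v : {-4.0, -1.0, 0.0, 2.0, 7.0}) {
    std::cout << v << " -> bin " << value_to_bin(v) << "\n";
  }
  return 0;
}
```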
src/io/dataset.cpp

@@ -15,6 +15,10 @@
 namespace LightGBM {

+#ifdef USE_GPU
+const int kMaxBinPerGroup = 256;
+#endif  // USE_GPU
+
 const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______\n";

 Dataset::Dataset() {
...
@@ -43,12 +47,180 @@ std::vector<std::vector<int>> NoGroup(
   return features_in_group;
 }

+int GetConfilctCount(const std::vector<bool>& mark, const int* indices, int num_indices, int max_cnt) {
+  int ret = 0;
+  for (int i = 0; i < num_indices; ++i) {
+    if (mark[indices[i]]) {
+      ++ret;
+      if (ret > max_cnt) {
+        return -1;
+      }
+    }
+  }
+  return ret;
+}
+
+void MarkUsed(std::vector<bool>& mark, const int* indices, int num_indices) {
+  for (int i = 0; i < num_indices; ++i) {
+    mark[indices[i]] = true;
+  }
+}
+
+std::vector<std::vector<int>> FindGroups(const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
+  const std::vector<int>& find_order, int** sample_indices, const int* num_per_col,
+  size_t total_sample_cnt, data_size_t max_error_cnt, data_size_t filter_cnt, data_size_t num_data) {
+  const int max_search_group = 100;
+  Random rand(num_data);
+  std::vector<std::vector<int>> features_in_group;
+  std::vector<std::vector<bool>> conflict_marks;
+  std::vector<int> group_conflict_cnt;
+  std::vector<size_t> group_non_zero_cnt;
+#ifdef USE_GPU
+  std::vector<int> group_num_bin;
+#endif  // USE_GPU
+  for (auto fidx : find_order) {
+    const size_t cur_non_zero_cnt = num_per_col[fidx];
+    bool need_new_group = true;
+    std::vector<int> available_groups;
+    for (int gid = 0; gid < static_cast<int>(features_in_group.size()); ++gid) {
+      if (group_non_zero_cnt[gid] + cur_non_zero_cnt <= total_sample_cnt + max_error_cnt
+#ifdef USE_GPU
+          && group_num_bin[gid] + bin_mappers[fidx]->num_bin()
+             + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0) <= kMaxBinPerGroup
+#endif  // USE_GPU
+          ) {
+        available_groups.push_back(gid);
+      }
+    }
+    std::vector<int> search_groups;
+    if (!available_groups.empty()) {
+      int last = static_cast<int>(available_groups.size()) - 1;
+      auto indices = rand.Sample(last, std::min(last, max_search_group - 1));
+      search_groups.push_back(available_groups.back());
+      for (auto idx : indices) {
+        search_groups.push_back(available_groups[idx]);
+      }
+    }
+    for (auto gid : search_groups) {
+      const int rest_max_cnt = max_error_cnt - group_conflict_cnt[gid];
+      int cnt = GetConfilctCount(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx], rest_max_cnt);
+      if (cnt >= 0 && cnt <= rest_max_cnt) {
+        data_size_t rest_non_zero_data = static_cast<data_size_t>(
+          static_cast<double>(cur_non_zero_cnt - cnt) * num_data / total_sample_cnt);
+        if (rest_non_zero_data < filter_cnt) { continue; }
+        need_new_group = false;
+        features_in_group[gid].push_back(fidx);
+        group_conflict_cnt[gid] += cnt;
+        group_non_zero_cnt[gid] += cur_non_zero_cnt - cnt;
+        MarkUsed(conflict_marks[gid], sample_indices[fidx], num_per_col[fidx]);
+#ifdef USE_GPU
+        group_num_bin[gid] += bin_mappers[fidx]->num_bin()
+          + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0);
+#endif  // USE_GPU
+        break;
+      }
+    }
+    if (need_new_group) {
+      features_in_group.emplace_back();
+      features_in_group.back().push_back(fidx);
+      group_conflict_cnt.push_back(0);
+      conflict_marks.emplace_back(total_sample_cnt, false);
+      MarkUsed(conflict_marks.back(), sample_indices[fidx], num_per_col[fidx]);
+      group_non_zero_cnt.emplace_back(cur_non_zero_cnt);
+#ifdef USE_GPU
+      group_num_bin.push_back(1 + bin_mappers[fidx]->num_bin()
+        + (bin_mappers[fidx]->GetDefaultBin() == 0 ? -1 : 0));
+#endif  // USE_GPU
+    }
+  }
+  return features_in_group;
+}
+
+std::vector<std::vector<int>> FastFeatureBundling(std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
+  int** sample_indices, const int* num_per_col, size_t total_sample_cnt,
+  const std::vector<int>& used_features, double max_conflict_rate,
+  data_size_t num_data, data_size_t min_data, double sparse_threshold, bool is_enable_sparse) {
+  // filter is based on sampled data, so decrease its range
+  const data_size_t filter_cnt = static_cast<data_size_t>(static_cast<double>(0.95 * min_data) / num_data * total_sample_cnt);
+  const data_size_t max_error_cnt = static_cast<data_size_t>(total_sample_cnt * max_conflict_rate);
+  int cur_used_feature_cnt = 0;
+  std::vector<size_t> feature_non_zero_cnt;
+  // put dense features first
+  for (auto fidx : used_features) {
+    feature_non_zero_cnt.emplace_back(num_per_col[fidx]);
+    ++cur_used_feature_cnt;
+  }
+  std::vector<int> sorted_idx;
+  for (int i = 0; i < cur_used_feature_cnt; ++i) {
+    sorted_idx.emplace_back(i);
+  }
+  // sort by non-zero count, bigger first
+  std::sort(sorted_idx.begin(), sorted_idx.end(),
+    [&feature_non_zero_cnt](int a, int b) {
+      return feature_non_zero_cnt[a] > feature_non_zero_cnt[b];
+    });
+  std::vector<int> feature_order_by_cnt;
+  for (auto sidx : sorted_idx) {
+    feature_order_by_cnt.push_back(used_features[sidx]);
+  }
+  auto features_in_group = FindGroups(bin_mappers, used_features, sample_indices, num_per_col, total_sample_cnt, max_error_cnt, filter_cnt, num_data);
+  auto group2 = FindGroups(bin_mappers, feature_order_by_cnt, sample_indices, num_per_col, total_sample_cnt, max_error_cnt, filter_cnt, num_data);
+  if (features_in_group.size() > group2.size()) {
+    features_in_group = group2;
+  }
+  std::vector<std::vector<int>> ret;
+  for (size_t i = 0; i < features_in_group.size(); ++i) {
+    if (features_in_group[i].size() <= 1 || features_in_group[i].size() >= 5) {
+      ret.push_back(features_in_group[i]);
+    } else {
+      int cnt_non_zero = 0;
+      for (size_t j = 0; j < features_in_group[i].size(); ++j) {
+        const int fidx = features_in_group[i][j];
+        cnt_non_zero += static_cast<int>(num_data * (1.0f - bin_mappers[fidx]->sparse_rate()));
+      }
+      double sparse_rate = 1.0f - static_cast<double>(cnt_non_zero) / (num_data);
+      // take apart small sparse groups, since they will not gain on speed
+      if (sparse_rate >= sparse_threshold && is_enable_sparse) {
+        for (size_t j = 0; j < features_in_group[i].size(); ++j) {
+          const int fidx = features_in_group[i][j];
+          ret.emplace_back();
+          ret.back().push_back(fidx);
+        }
+      } else {
+        ret.push_back(features_in_group[i]);
+      }
+    }
+  }
+  // shuffle groups
+  int num_group = static_cast<int>(ret.size());
+  Random tmp_rand(12);
+  for (int i = 0; i < num_group - 1; ++i) {
+    int j = tmp_rand.NextShort(i + 1, num_group);
+    std::swap(ret[i], ret[j]);
+  }
+  return ret;
+}
+
 void Dataset::Construct(
   std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
-  int**, const int*, size_t,
+  int** sample_non_zero_indices,
+  const int* num_per_col,
+  size_t total_sample_cnt,
   const IOConfig& io_config) {
   num_total_features_ = static_cast<int>(bin_mappers.size());
+  sparse_threshold_ = io_config.sparse_threshold;
   // get num_features
...
@@ -61,6 +233,15 @@ void Dataset::Construct(
   auto features_in_group = NoGroup(used_features);

+  if (io_config.enable_bundle) {
+    std::chrono::duration<double, std::milli> bundling_time_;
+    features_in_group = FastFeatureBundling(bin_mappers,
+      sample_non_zero_indices, num_per_col, total_sample_cnt,
+      used_features, io_config.max_conflict_rate,
+      num_data_, io_config.min_data_in_leaf,
+      sparse_threshold_, io_config.is_enable_sparse);
+  }
   num_features_ = 0;
   for (const auto& fs : features_in_group) {
     num_features_ += static_cast<int>(fs.size());
...
@@ -86,7 +267,8 @@ void Dataset::Construct(
       ++cur_fidx;
     }
-    feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
-      new FeatureGroup(cur_cnt_features, cur_bin_mappers, num_data_, io_config.is_enable_sparse)));
+    feature_groups_.emplace_back(std::unique_ptr<FeatureGroup>(
+      new FeatureGroup(cur_cnt_features, cur_bin_mappers, num_data_, sparse_threshold_, io_config.is_enable_sparse)));
   }
   feature_groups_.shrink_to_fit();
   group_bin_boundaries_.clear();
...
@@ -116,7 +298,7 @@ void Dataset::Construct(
 void Dataset::FinishLoad() {
   if (is_finish_load_) { return; }
   OMP_INIT_EX();
 #pragma omp parallel for schedule(guided)
   for (int i = 0; i < num_groups_; ++i) {
     OMP_LOOP_EX_BEGIN();
     feature_groups_[i]->bin_data_->FinishLoad();
...
@@ -212,7 +394,7 @@ void Dataset::ReSize(data_size_t num_data) {
   if (num_data_ != num_data) {
     num_data_ = num_data;
     OMP_INIT_EX();
 #pragma omp parallel for schedule(static)
     for (int group = 0; group < num_groups_; ++group) {
       OMP_LOOP_EX_BEGIN();
       feature_groups_[group]->bin_data_->ReSize(num_data_);
...
@@ -314,7 +496,7 @@ bool Dataset::GetIntField(const char* field_name, data_size_t* out_len, const in
 void Dataset::SaveBinaryFile(const char* bin_filename) {
-  if (bin_filename != nullptr && std::string(bin_filename) == std::string(data_filename_)) {
+  if (bin_filename != nullptr
+      && std::string(bin_filename) == std::string(data_filename_)) {
     Log::Warning("Binary file %s already exists", bin_filename);
     return;
   }
...
@@ -326,11 +508,11 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
   }
   bool is_file_existed = false;
   FILE* file;
 #ifdef _MSC_VER
   fopen_s(&file, bin_filename, "rb");
 #else
   file = fopen(bin_filename, "rb");
 #endif
   if (file != NULL) {
     is_file_existed = true;
...
@@ -339,11 +521,11 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
   }
   if (!is_file_existed) {
 #ifdef _MSC_VER
     fopen_s(&file, bin_filename, "wb");
 #else
     file = fopen(bin_filename, "wb");
 #endif
     if (file == NULL) {
       Log::Fatal("Cannot write binary data to %s ", bin_filename);
     }
...
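`FindGroups` bundles mutually sparse features into one group when their non-zero rows rarely overlap; `GetConfilctCount` (the misspelling is the commit's own identifier) counts overlapping rows against a budget and bails out early with -1 once it is exceeded. A self-contained sketch of that conflict test, with invented row indices:

```cpp
#include <iostream>
#include <vector>

// Mirrors GetConfilctCount from src/io/dataset.cpp: returns -1 once the
// number of already-marked rows exceeds max_cnt.
int GetConfilctCount(const std::vector<bool>& mark, const int* indices,
                     int num_indices, int max_cnt) {
  int ret = 0;
  for (int i = 0; i < num_indices; ++i) {
    if (mark[indices[i]]) {
      ++ret;
      if (ret > max_cnt) return -1;
    }
  }
  return ret;
}

// Mirrors MarkUsed: records which rows a group already occupies.
void MarkUsed(std::vector<bool>& mark, const int* indices, int num_indices) {
  for (int i = 0; i < num_indices; ++i) mark[indices[i]] = true;
}

int main() {
  std::vector<bool> mark(10, false);
  std::vector<int> feat_a = {0, 2, 4};  // rows where feature A is non-zero
  std::vector<int> feat_b = {1, 2, 9};  // overlaps feature A only at row 2
  MarkUsed(mark, feat_a.data(), static_cast<int>(feat_a.size()));
  int cnt = GetConfilctCount(mark, feat_b.data(),
                             static_cast<int>(feat_b.size()), /*max_cnt=*/1);
  std::cout << "conflicts: " << cnt << "\n";  // 1, within budget, so bundle
  return 0;
}
```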
src/io/dense_bin.hpp

@@ -188,29 +188,31 @@ public:
   }

-  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
+  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t default_bin_for_zero,
     uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
     data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
     if (num_data <= 0) { return 0; }
     VAL_T th = static_cast<VAL_T>(threshold + min_bin);
     VAL_T minb = static_cast<VAL_T>(min_bin);
     VAL_T maxb = static_cast<VAL_T>(max_bin);
+    VAL_T t_default_bin = static_cast<VAL_T>(min_bin + default_bin);
+    if (default_bin == 0) { th -= 1; t_default_bin -= 1; }
     data_size_t lte_count = 0;
     data_size_t gt_count = 0;
     data_size_t* default_indices = gt_indices;
     data_size_t* default_count = &gt_count;
     if (bin_type == BinType::NumericalBin) {
-      if (default_bin <= threshold) {
+      if (default_bin_for_zero <= threshold) {
         default_indices = lte_indices;
         default_count = &lte_count;
       }
       for (data_size_t i = 0; i < num_data; ++i) {
         const data_size_t idx = data_indices[i];
         VAL_T bin = data_[idx];
-        if (bin > maxb || bin < minb) {
+        if (bin < minb || bin > maxb || t_default_bin == bin) {
           default_indices[(*default_count)++] = idx;
         } else if (bin > th) {
           gt_indices[gt_count++] = idx;
...
@@ -219,14 +221,14 @@ public:
         }
       }
     } else {
-      if (default_bin == threshold) {
+      if (default_bin_for_zero == threshold) {
        default_indices = lte_indices;
        default_count = &lte_count;
       }
       for (data_size_t i = 0; i < num_data; ++i) {
         const data_size_t idx = data_indices[i];
         VAL_T bin = data_[idx];
-        if (bin > maxb || bin < minb) {
+        if (bin < minb || bin > maxb || t_default_bin == bin) {
           default_indices[(*default_count)++] = idx;
         } else if (bin != th) {
           gt_indices[gt_count++] = idx;
...
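Inside each bin container, `Split` now routes three cases to the "default" side: bins below `min_bin`, bins above `max_bin`, and the bin equal to `t_default_bin` (the feature's zero bin offset into group coordinates); which side counts as default is decided by comparing `default_bin_for_zero` with the threshold. The same rule is mirrored in dense_nbits_bin.hpp and sparse_bin.hpp below. Here is a compact, simplified sketch of the numerical branch, classifying single bin values rather than index buffers; all concrete values are invented:

```cpp
#include <cstdint>
#include <iostream>

// Simplified sketch of DenseBin::Split's numerical branch; the real code
// partitions data_indices into lte/gt arrays instead of returning a label.
const char* Route(uint32_t bin, uint32_t min_bin, uint32_t max_bin,
                  uint32_t default_bin, uint32_t default_bin_for_zero,
                  uint32_t threshold) {
  uint32_t th = threshold + min_bin;
  uint32_t t_default_bin = min_bin + default_bin;
  if (default_bin == 0) { th -= 1; t_default_bin -= 1; }
  const char* default_side =
      (default_bin_for_zero <= threshold) ? "left (default)" : "right (default)";
  if (bin < min_bin || bin > max_bin || bin == t_default_bin) return default_side;
  return (bin <= th) ? "left" : "right";
}

int main() {
  // Hypothetical feature occupying bins [3, 8] of its group, zero bin at
  // offset 0, split threshold at offset 2, zeros routed to the left side.
  for (uint32_t bin : {0u, 3u, 4u, 7u}) {
    std::cout << "bin " << bin << " -> " << Route(bin, 3, 8, 0, 0, 2) << "\n";
  }
  return 0;
}
```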
src/io/dense_nbits_bin.hpp

@@ -227,29 +227,31 @@ public:
   }

-  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
+  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t default_bin_for_zero,
     uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
     data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
     if (num_data <= 0) { return 0; }
     uint8_t th = static_cast<uint8_t>(threshold + min_bin);
     uint8_t minb = static_cast<uint8_t>(min_bin);
     uint8_t maxb = static_cast<uint8_t>(max_bin);
+    uint8_t t_default_bin = static_cast<uint8_t>(min_bin + default_bin);
+    if (default_bin == 0) { th -= 1; t_default_bin -= 1; }
     data_size_t lte_count = 0;
     data_size_t gt_count = 0;
     data_size_t* default_indices = gt_indices;
     data_size_t* default_count = &gt_count;
     if (bin_type == BinType::NumericalBin) {
-      if (default_bin <= threshold) {
+      if (default_bin_for_zero <= threshold) {
         default_indices = lte_indices;
         default_count = &lte_count;
       }
       for (data_size_t i = 0; i < num_data; ++i) {
         const data_size_t idx = data_indices[i];
         const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
-        if (bin > maxb || bin < minb) {
+        if (bin < minb || bin > maxb || t_default_bin == bin) {
           default_indices[(*default_count)++] = idx;
         } else if (bin > th) {
           gt_indices[gt_count++] = idx;
...
@@ -258,14 +260,14 @@ public:
         }
       }
     } else {
-      if (default_bin == threshold) {
+      if (default_bin_for_zero == threshold) {
        default_indices = lte_indices;
        default_count = &lte_count;
       }
       for (data_size_t i = 0; i < num_data; ++i) {
         const data_size_t idx = data_indices[i];
         const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
-        if (bin > maxb || bin < minb) {
+        if (bin < minb || bin > maxb || t_default_bin == bin) {
           default_indices[(*default_count)++] = idx;
         } else if (bin != th) {
           gt_indices[gt_count++] = idx;
...
src/io/sparse_bin.hpp

@@ -142,7 +142,7 @@ public:
   }

-  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin,
+  virtual data_size_t Split(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t default_bin_for_zero,
     uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
     data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
     // no need to split
...
@@ -150,8 +150,10 @@ public:
     VAL_T th = static_cast<VAL_T>(threshold + min_bin);
     VAL_T minb = static_cast<VAL_T>(min_bin);
     VAL_T maxb = static_cast<VAL_T>(max_bin);
+    VAL_T t_default_bin = static_cast<VAL_T>(min_bin + default_bin);
+    if (default_bin == 0) { th -= 1; t_default_bin -= 1; }
     SparseBinIterator<VAL_T> iterator(this, data_indices[0]);
     data_size_t lte_count = 0;
...
@@ -159,14 +161,14 @@ public:
     data_size_t* default_indices = gt_indices;
     data_size_t* default_count = &gt_count;
     if (bin_type == BinType::NumericalBin) {
-      if (default_bin <= threshold) {
+      if (default_bin_for_zero <= threshold) {
         default_indices = lte_indices;
         default_count = &lte_count;
       }
       for (data_size_t i = 0; i < num_data; ++i) {
         const data_size_t idx = data_indices[i];
         VAL_T bin = iterator.InnerRawGet(idx);
-        if (bin > maxb || bin < minb) {
+        if (bin < minb || bin > maxb || t_default_bin == bin) {
           default_indices[(*default_count)++] = idx;
         } else if (bin > th) {
           gt_indices[gt_count++] = idx;
...
@@ -175,14 +177,14 @@ public:
         }
       }
     } else {
-      if (default_bin == threshold) {
+      if (default_bin_for_zero == threshold) {
        default_indices = lte_indices;
        default_count = &lte_count;
       }
       for (data_size_t i = 0; i < num_data; ++i) {
         const data_size_t idx = data_indices[i];
         VAL_T bin = iterator.InnerRawGet(idx);
-        if (bin > maxb || bin < minb) {
+        if (bin < minb || bin > maxb || t_default_bin == bin) {
           default_indices[(*default_count)++] = idx;
         } else if (bin != th) {
           gt_indices[gt_count++] = idx;
...
src/io/tree.cpp

@@ -24,20 +24,23 @@ Tree::Tree(int max_leaves)
   : max_leaves_(max_leaves) {
   num_leaves_ = 0;
-  left_child_ = std::vector<int>(max_leaves_ - 1);
-  right_child_ = std::vector<int>(max_leaves_ - 1);
-  split_feature_inner = std::vector<int>(max_leaves_ - 1);
-  split_feature_ = std::vector<int>(max_leaves_ - 1);
-  threshold_in_bin_ = std::vector<uint32_t>(max_leaves_ - 1);
-  threshold_ = std::vector<double>(max_leaves_ - 1);
-  decision_type_ = std::vector<int8_t>(max_leaves_ - 1);
-  split_gain_ = std::vector<double>(max_leaves_ - 1);
-  leaf_parent_ = std::vector<int>(max_leaves_);
-  leaf_value_ = std::vector<double>(max_leaves_);
-  leaf_count_ = std::vector<data_size_t>(max_leaves_);
-  internal_value_ = std::vector<double>(max_leaves_ - 1);
-  internal_count_ = std::vector<data_size_t>(max_leaves_ - 1);
-  leaf_depth_ = std::vector<int>(max_leaves_);
+  left_child_.resize(max_leaves_ - 1);
+  right_child_.resize(max_leaves_ - 1);
+  split_feature_inner_.resize(max_leaves_ - 1);
+  split_feature_.resize(max_leaves_ - 1);
+  threshold_in_bin_.resize(max_leaves_ - 1);
+  threshold_.resize(max_leaves_ - 1);
+  decision_type_.resize(max_leaves_ - 1);
+  default_value_.resize(max_leaves_ - 1);
+  zero_bin_.resize(max_leaves_ - 1);
+  default_bin_for_zero_.resize(max_leaves_ - 1);
+  split_gain_.resize(max_leaves_ - 1);
+  leaf_parent_.resize(max_leaves_);
+  leaf_value_.resize(max_leaves_);
+  leaf_count_.resize(max_leaves_);
+  internal_value_.resize(max_leaves_ - 1);
+  internal_count_.resize(max_leaves_ - 1);
+  leaf_depth_.resize(max_leaves_);
   // root is in the depth 0
   leaf_depth_[0] = 0;
   num_leaves_ = 1;
...
@@ -49,9 +52,9 @@ Tree::~Tree() {
 }

-int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin, int real_feature,
-                double threshold_double, double left_value, double right_value,
-                data_size_t left_cnt, data_size_t right_cnt, double gain) {
+int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin, int real_feature,
+                double threshold_double, double left_value, double right_value,
+                data_size_t left_cnt, data_size_t right_cnt, double gain,
+                uint32_t zero_bin, uint32_t default_bin_for_zero, double default_value) {
   int new_node_idx = num_leaves_ - 1;
   // update parent info
   int parent = leaf_parent_[leaf];
...
@@ -64,17 +67,23 @@ int Tree::Split(int leaf, int feature, BinType bin_type, uint32_t threshold_bin,
     }
   }
   // add new node
-  split_feature_inner[new_node_idx] = feature;
+  split_feature_inner_[new_node_idx] = feature;
   split_feature_[new_node_idx] = real_feature;
+  zero_bin_[new_node_idx] = zero_bin;
+  default_bin_for_zero_[new_node_idx] = default_bin_for_zero;
+  default_value_[new_node_idx] = Common::AvoidInf(default_value);
   if (bin_type == BinType::NumericalBin) {
     decision_type_[new_node_idx] = 0;
   } else {
     has_categorical_ = true;
     decision_type_[new_node_idx] = 1;
   }
   threshold_in_bin_[new_node_idx] = threshold_bin;
   threshold_[new_node_idx] = threshold_double;
-  split_gain_[new_node_idx] = gain == std::numeric_limits<double>::infinity() ? std::numeric_limits<double>::max() : gain;
+  split_gain_[new_node_idx] = Common::AvoidInf(gain);
   // add two new leaves
   left_child_[new_node_idx] = ~leaf;
   right_child_[new_node_idx] = ~num_leaves_;
...
@@ -104,15 +113,16 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
     [this, &data, score](int, data_size_t start, data_size_t end) {
     std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
     for (int i = 0; i < num_leaves_ - 1; ++i) {
-      const int fidx = split_feature_inner[i];
+      const int fidx = split_feature_inner_[i];
       iter[i].reset(data->FeatureIterator(fidx));
       iter[i]->Reset(start);
     }
     for (data_size_t i = start; i < end; ++i) {
       int node = 0;
       while (node >= 0) {
+        uint32_t fval = DefaultValueForZero(iter[node]->Get(i), zero_bin_[node], default_bin_for_zero_[node]);
-        if (inner_decision_funs[decision_type_[node]](iter[node]->Get(i), threshold_in_bin_[node])) {
+        if (inner_decision_funs[decision_type_[node]](fval, threshold_in_bin_[node])) {
           node = left_child_[node];
         } else {
...
@@ -133,8 +143,9 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
     for (data_size_t i = start; i < end; ++i) {
       int node = 0;
       while (node >= 0) {
+        uint32_t fval = DefaultValueForZero(iter[split_feature_inner_[node]]->Get(i), zero_bin_[node], default_bin_for_zero_[node]);
-        if (inner_decision_funs[decision_type_[node]](iter[split_feature_inner[node]]->Get(i), threshold_in_bin_[node])) {
+        if (inner_decision_funs[decision_type_[node]](fval, threshold_in_bin_[node])) {
           node = left_child_[node];
         } else {
...
@@ -151,14 +162,15 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
     [this, &data, score](int, data_size_t start, data_size_t end) {
     std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
     for (int i = 0; i < num_leaves_ - 1; ++i) {
-      const int fidx = split_feature_inner[i];
+      const int fidx = split_feature_inner_[i];
       iter[i].reset(data->FeatureIterator(fidx));
       iter[i]->Reset(start);
     }
     for (data_size_t i = start; i < end; ++i) {
       int node = 0;
       while (node >= 0) {
-        if (iter[node]->Get(i) <= threshold_in_bin_[node]) {
+        uint32_t fval = DefaultValueForZero(iter[node]->Get(i), zero_bin_[node], default_bin_for_zero_[node]);
+        if (fval <= threshold_in_bin_[node]) {
           node = left_child_[node];
         } else {
           node = right_child_[node];
...
@@ -178,7 +190,8 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, doubl
     for (data_size_t i = start; i < end; ++i) {
       int node = 0;
       while (node >= 0) {
-        if (iter[split_feature_inner[node]]->Get(i) <= threshold_in_bin_[node]) {
+        uint32_t fval = DefaultValueForZero(iter[split_feature_inner_[node]]->Get(i), zero_bin_[node], default_bin_for_zero_[node]);
+        if (fval <= threshold_in_bin_[node]) {
           node = left_child_[node];
         } else {
           node = right_child_[node];
...
@@ -201,7 +214,7 @@ void Tree::AddPredictionToScore(const Dataset* data,
     [this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
     std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
     for (int i = 0; i < num_leaves_ - 1; ++i) {
-      const int fidx = split_feature_inner[i];
+      const int fidx = split_feature_inner_[i];
       iter[i].reset(data->FeatureIterator(fidx));
       iter[i]->Reset(used_data_indices[start]);
     }
...
@@ -209,8 +222,9 @@ void Tree::AddPredictionToScore(const Dataset* data,
       int node = 0;
       const data_size_t idx = used_data_indices[i];
       while (node >= 0) {
+        uint32_t fval = DefaultValueForZero(iter[node]->Get(idx), zero_bin_[node], default_bin_for_zero_[node]);
-        if (inner_decision_funs[decision_type_[node]](iter[node]->Get(idx), threshold_in_bin_[node])) {
+        if (inner_decision_funs[decision_type_[node]](fval, threshold_in_bin_[node])) {
           node = left_child_[node];
         } else {
...
@@ -232,8 +246,9 @@ void Tree::AddPredictionToScore(const Dataset* data,
       const data_size_t idx = used_data_indices[i];
       int node = 0;
       while (node >= 0) {
+        uint32_t fval = DefaultValueForZero(iter[split_feature_inner_[node]]->Get(idx), zero_bin_[node], default_bin_for_zero_[node]);
-        if (inner_decision_funs[decision_type_[node]](iter[split_feature_inner[node]]->Get(idx), threshold_in_bin_[node])) {
+        if (inner_decision_funs[decision_type_[node]](fval, threshold_in_bin_[node])) {
           node = left_child_[node];
         } else {
...
@@ -250,7 +265,7 @@ void Tree::AddPredictionToScore(const Dataset* data,
     [this, data, used_data_indices, score](int, data_size_t start, data_size_t end) {
     std::vector<std::unique_ptr<BinIterator>> iter(num_leaves_ - 1);
     for (int i = 0; i < num_leaves_ - 1; ++i) {
-      const int fidx = split_feature_inner[i];
+      const int fidx = split_feature_inner_[i];
       iter[i].reset(data->FeatureIterator(fidx));
       iter[i]->Reset(used_data_indices[start]);
     }
...
@@ -258,7 +273,8 @@ void Tree::AddPredictionToScore(const Dataset* data,
       int node = 0;
       const data_size_t idx = used_data_indices[i];
       while (node >= 0) {
-        if (iter[node]->Get(idx) <= threshold_in_bin_[node]) {
+        uint32_t fval = DefaultValueForZero(iter[node]->Get(idx), zero_bin_[node], default_bin_for_zero_[node]);
+        if (fval <= threshold_in_bin_[node]) {
           node = left_child_[node];
         } else {
           node = right_child_[node];
...
@@ -279,7 +295,8 @@ void Tree::AddPredictionToScore(const Dataset* data,
       const data_size_t idx = used_data_indices[i];
       int node = 0;
       while (node >= 0) {
-        if (iter[split_feature_inner[node]]->Get(idx) <= threshold_in_bin_[node]) {
+        uint32_t fval = DefaultValueForZero(iter[split_feature_inner_[node]]->Get(idx), zero_bin_[node], default_bin_for_zero_[node]);
+        if (fval <= threshold_in_bin_[node]) {
           node = left_child_[node];
         } else {
           node = right_child_[node];
...
@@ -303,6 +320,8 @@ std::string Tree::ToString() {
     << Common::ArrayToString<double>(threshold_, num_leaves_ - 1, ' ') << std::endl;
   str_buf << "decision_type=" << Common::ArrayToString<int>(Common::ArrayCast<int8_t, int>(decision_type_), num_leaves_ - 1, ' ') << std::endl;
+  str_buf << "default_value=" << Common::ArrayToString<double>(default_value_, num_leaves_ - 1, ' ') << std::endl;
   str_buf << "left_child=" << Common::ArrayToString<int>(left_child_, num_leaves_ - 1, ' ') << std::endl;
   str_buf << "right_child="
...
@@ -349,6 +368,7 @@ std::string Tree::NodeToJSON(int index) {
   str_buf << "\"split_gain\":" << split_gain_[index] << "," << std::endl;
   str_buf << "\"threshold\":" << threshold_[index] << "," << std::endl;
   str_buf << "\"decision_type\":\"" << Tree::GetDecisionTypeName(decision_type_[index]) << "\"," << std::endl;
+  str_buf << "\"default_value\":" << default_value_[index] << "," << std::endl;
   str_buf << "\"internal_value\":" << internal_value_[index] << "," << std::endl;
   str_buf << "\"internal_count\":" << internal_count_[index] << "," << std::endl;
   str_buf << "\"left_child\":" << NodeToJSON(left_child_[index]) << "," << std::endl;
...
@@ -389,7 +409,11 @@ std::string Tree::NodeToIfElse(int index, bool is_predict_leaf_index) {
   str_buf << std::setprecision(std::numeric_limits<double>::digits10 + 2);
   if (index >= 0) {
     // non-leaf
-    str_buf << "if ( arr[" << split_feature_[index] << "] ";
+    std::stringstream tmp_str_buf;
+    tmp_str_buf << "arr[" << split_feature_[index] << "]";
+    std::string str_fval = tmp_str_buf.str();
+    str_buf << "if( ( " << str_fval << " <= " << kMissingValueRange << " && " << str_fval
+      << " > -" << kMissingValueRange << " ? " << default_value_[index] << " : " << str_fval << " ) ";
     if (decision_type_[index] == 0) {
       str_buf << "<";
     } else {
...
@@ -461,6 +485,12 @@ Tree::Tree(const std::string& str) {
     Log::Fatal("Tree model string format error, should contain threshold field");
   }

+  if (key_vals.count("default_value")) {
+    default_value_ = Common::StringToArray<double>(key_vals["default_value"], ' ', num_leaves_ - 1);
+  } else {
+    Log::Fatal("Tree model string format error, should contain default_value field");
+  }
+
   if (key_vals.count("leaf_value")) {
     leaf_value_ = Common::StringToArray<double>(key_vals["leaf_value"], ' ', num_leaves_);
   } else {
...
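`NodeToIfElse` now wraps each feature read in the zero-substitution expression, so the exported standalone code applies the same default-value rule at prediction time. The snippet below shows roughly what an emitted condition looks like, wrapped in a runnable harness; the feature index, default value, and threshold are invented for the example:

```cpp
#include <iostream>

int main() {
  // Sketch of the condition Tree::NodeToIfElse emits after this commit.
  double arr[4] = {0.0, 0.0, 0.0, 0.0};  // arr[3] holds a raw zero
  if ( ( arr[3] <= 1e-20 && arr[3] > -1e-20 ? 2.5 : arr[3] ) < 1.3 ) {
    std::cout << "left child\n";
  } else {
    std::cout << "right child\n";  // taken here: zero maps to 2.5, not < 1.3
  }
  return 0;
}
```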
src/treelearner/data_partition.hpp

@@ -91,7 +91,7 @@ public:
   * \param threshold threshold to split at
   * \param right_leaf index of right leaf
   */
-  void Split(int leaf, const Dataset* dataset, int feature, uint32_t threshold, int right_leaf) {
+  void Split(int leaf, const Dataset* dataset, int feature, uint32_t threshold, uint32_t default_bin_for_zero, int right_leaf, int expected_left_cnt) {
     const data_size_t min_inner_size = 512;
     // get leaf boundary
     const data_size_t begin = leaf_begin_[leaf];
...
@@ -111,7 +111,7 @@ public:
       data_size_t cur_cnt = inner_size;
       if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; }
       // split data inner, to reduce the number of function calls
-      data_size_t cur_left_count = dataset->Split(feature, threshold, indices_.data() + begin + cur_start, cur_cnt,
+      data_size_t cur_left_count = dataset->Split(feature, threshold, default_bin_for_zero, indices_.data() + begin + cur_start, cur_cnt,
         temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start);
       offsets_buf_[i] = cur_start;
       left_cnts_buf_[i] = cur_left_count;
...
@@ -141,6 +141,7 @@ public:
     }
     // update leaf boundary
     leaf_count_[leaf] = left_cnt;
+    CHECK(left_cnt == expected_left_cnt);
     leaf_begin_[right_leaf] = left_cnt + begin;
     leaf_count_[right_leaf] = cnt - left_cnt;
   }
...
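`DataPartition::Split` now threads `default_bin_for_zero` down to `Dataset::Split` and asserts that the number of rows actually routed left matches the count the histogram-based split finder predicted, a useful consistency check now that zeros can be re-routed. A toy stable partition with the same shape; the predicate and data are invented:

```cpp
#include <cassert>
#include <iostream>
#include <vector>

int main() {
  // Toy analogue of DataPartition::Split: stably partition row indices by a
  // predicate, then check the left count against an expected value.
  std::vector<int> indices = {0, 1, 2, 3, 4, 5};
  std::vector<int> lte, gt;
  auto goes_left = [](int row) { return row % 2 == 0; };  // invented predicate
  for (int row : indices) (goes_left(row) ? lte : gt).push_back(row);
  int expected_left_cnt = 3;  // what the split finder counted from histograms
  assert(static_cast<int>(lte.size()) == expected_left_cnt);
  std::cout << "left=" << lte.size() << " right=" << gt.size() << "\n";
  return 0;
}
```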
src/treelearner/feature_histogram.hpp

@@ -15,6 +15,7 @@ class FeatureMetainfo {
 public:
   int num_bin;
   int bias = 0;
+  uint32_t default_bin;
   /*! \brief pointer of tree config */
   const TreeConfig* tree_config;
 };
...
@@ -69,81 +70,28 @@ public:
   void FindBestThreshold(double sum_gradient, double sum_hessian, data_size_t num_data, SplitInfo* output) {
+    output->default_bin_for_zero = meta_->default_bin;
+    output->gain = kMinScore;
     find_best_threshold_fun_(sum_gradient, sum_hessian + 2 * kEpsilon, num_data, output);
   }

   void FindBestThresholdNumerical(double sum_gradient, double sum_hessian, data_size_t num_data, SplitInfo* output) {
-    double best_sum_left_gradient = NAN;
-    double best_sum_left_hessian = NAN;
-    double best_gain = kMinScore;
-    data_size_t best_left_count = 0;
-    uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
-    double sum_right_gradient = 0.0f;
-    double sum_right_hessian = kEpsilon;
-    data_size_t right_count = 0;
-    is_splittable_ = false;
     double gain_shift = GetLeafSplitGain(sum_gradient, sum_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
     double min_gain_shift = gain_shift + meta_->tree_config->min_gain_to_split;
-    const int bias = meta_->bias;
-    int t = meta_->num_bin - 1 - bias;
-    const int t_end = 1 - bias;
-    // from right to left, and we don't need data in bin0
-    for (; t >= t_end; --t) {
-      sum_right_gradient += data_[t].sum_gradients;
-      sum_right_hessian += data_[t].sum_hessians;
-      right_count += data_[t].cnt;
-      // if data not enough, or sum hessian too small
-      if (right_count < meta_->tree_config->min_data_in_leaf
-          || sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
-      data_size_t left_count = num_data - right_count;
-      // if data not enough
-      if (left_count < meta_->tree_config->min_data_in_leaf) break;
-      double sum_left_hessian = sum_hessian - sum_right_hessian;
-      // if sum hessian too small
-      if (sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;
-      double sum_left_gradient = sum_gradient - sum_right_gradient;
-      // current split gain
-      double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
-        + GetLeafSplitGain(sum_right_gradient, sum_right_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
-      // gain with split is worse than without split
-      if (current_gain <= min_gain_shift) continue;
-      // mark as splittable
-      is_splittable_ = true;
-      // better split point
-      if (current_gain > best_gain) {
-        best_left_count = left_count;
-        best_sum_left_gradient = sum_left_gradient;
-        best_sum_left_hessian = sum_left_hessian;
-        // left is <= threshold, right is > threshold, so this is t - 1
-        best_threshold = static_cast<uint32_t>(t - 1 + bias);
-        best_gain = current_gain;
-      }
-    }
-    if (is_splittable_) {
-      // update split information
-      output->threshold = best_threshold;
-      output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
-      output->left_count = best_left_count;
-      output->left_sum_gradient = best_sum_left_gradient;
-      output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
-      output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient, sum_hessian - best_sum_left_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
-      output->right_count = num_data - best_left_count;
-      output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
-      output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
-      output->gain = best_gain - gain_shift;
-    } else {
-      output->gain = kMinScore;
-    }
+    is_splittable_ = false;
+    FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, 0);
+    // Zero is not in leftmost or rightmost
+    if (static_cast<int>(meta_->default_bin) > 0 && static_cast<int>(meta_->default_bin) < meta_->num_bin - 1) {
+      FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, meta_->default_bin);
+    }
+    if (meta_->num_bin > 2) {
+      FindBestThresholdSequence(sum_gradient, sum_hessian, num_data, min_gain_shift, output, meta_->num_bin - 1);
+    }
+    output->gain -= min_gain_shift;
   }

   void FindBestThresholdCategorical(double sum_gradient, double sum_hessian, data_size_t num_data,
...
@@ -242,10 +190,8 @@ public:
       output->right_count = num_data - best_left_count;
       output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
       output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
-      output->gain = best_gain - gain_shift;
-    } else {
-      output->gain = kMinScore;
-    }
+      output->gain = best_gain - min_gain_shift;
+    }
   }
   /*!
...
@@ -301,6 +247,142 @@ public:
 private:
+  void FindBestThresholdSequence(double sum_gradient, double sum_hessian, data_size_t num_data,
+                                 double min_gain_shift, SplitInfo* output, uint32_t default_bin_for_zero) {
+    int dir = -1;
+    if (static_cast<int>(default_bin_for_zero) == meta_->num_bin - 1) {
+      dir = 1;
+    };
+    bool skip_default_bin = true;
+    if (static_cast<int>(default_bin_for_zero) > 0 && static_cast<int>(default_bin_for_zero) < meta_->num_bin - 1) {
+      skip_default_bin = false;
+    }
+    const int bias = meta_->bias;
+    double best_sum_left_gradient = NAN;
+    double best_sum_left_hessian = NAN;
+    double best_gain = kMinScore;
+    data_size_t best_left_count = 0;
+    uint32_t best_threshold = static_cast<uint32_t>(meta_->num_bin);
+    if (dir == -1) {
+      double sum_right_gradient = 0.0f;
+      double sum_right_hessian = kEpsilon;
+      data_size_t right_count = 0;
+      int t = meta_->num_bin - 1 - bias;
+      const int t_end = 1 - bias;
+      // from right to left, and we don't need data in bin0
+      for (; t >= t_end; --t) {
+        // need to skip the default bin
+        if (skip_default_bin && (t + bias) == static_cast<int>(meta_->default_bin)) { continue; }
+        sum_right_gradient += data_[t].sum_gradients;
+        sum_right_hessian += data_[t].sum_hessians;
+        right_count += data_[t].cnt;
+        // if data not enough, or sum hessian too small
+        if (right_count < meta_->tree_config->min_data_in_leaf
+            || sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
+        data_size_t left_count = num_data - right_count;
+        // if data not enough
+        if (left_count < meta_->tree_config->min_data_in_leaf) break;
+        double sum_left_hessian = sum_hessian - sum_right_hessian;
+        // if sum hessian too small
+        if (sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;
+        double sum_left_gradient = sum_gradient - sum_right_gradient;
+        // current split gain
+        double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
+          + GetLeafSplitGain(sum_right_gradient, sum_right_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
+        // gain with split is worse than without split
+        if (current_gain <= min_gain_shift) continue;
+        // mark as splittable
+        is_splittable_ = true;
+        // better split point
+        if (current_gain > best_gain) {
+          best_left_count = left_count;
+          best_sum_left_gradient = sum_left_gradient;
+          best_sum_left_hessian = sum_left_hessian;
+          // left is <= threshold, right is > threshold, so this is t - 1
+          best_threshold = static_cast<uint32_t>(t - 1 + bias);
+          best_gain = current_gain;
+        }
+      }
+    } else {
+      double sum_left_gradient = 0.0f;
+      double sum_left_hessian = kEpsilon;
+      data_size_t left_count = 0;
+      int t = 0;
+      const int t_end = meta_->num_bin - 2 - bias;
+      // from left to right
+      for (; t <= t_end; ++t) {
+        // need to skip the default bin
+        if (skip_default_bin && (t + bias) == static_cast<int>(meta_->default_bin)) { continue; }
+        sum_left_gradient += data_[t].sum_gradients;
+        sum_left_hessian += data_[t].sum_hessians;
+        left_count += data_[t].cnt;
+        // if data not enough, or sum hessian too small
+        if (left_count < meta_->tree_config->min_data_in_leaf
+            || sum_left_hessian < meta_->tree_config->min_sum_hessian_in_leaf) continue;
+        data_size_t right_count = num_data - left_count;
+        // if data not enough
+        if (right_count < meta_->tree_config->min_data_in_leaf) break;
+        double sum_right_hessian = sum_hessian - sum_left_hessian;
+        // if sum hessian too small
+        if (sum_right_hessian < meta_->tree_config->min_sum_hessian_in_leaf) break;
+        double sum_right_gradient = sum_gradient - sum_left_gradient;
+        // current split gain
+        double current_gain = GetLeafSplitGain(sum_left_gradient, sum_left_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2)
+          + GetLeafSplitGain(sum_right_gradient, sum_right_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
+        // gain with split is worse than without split
+        if (current_gain <= min_gain_shift) continue;
+        // mark as splittable
+        is_splittable_ = true;
+        // better split point
+        if (current_gain > best_gain) {
+          best_left_count = left_count;
+          best_sum_left_gradient = sum_left_gradient;
+          best_sum_left_hessian = sum_left_hessian;
+          best_threshold = static_cast<uint32_t>(t + bias);
+          best_gain = current_gain;
+        }
+      }
+    }
+    if (is_splittable_ && best_gain > output->gain) {
+      // update split information
+      output->threshold = best_threshold;
+      output->left_output = CalculateSplittedLeafOutput(best_sum_left_gradient, best_sum_left_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
+      output->left_count = best_left_count;
+      output->left_sum_gradient = best_sum_left_gradient;
+      output->left_sum_hessian = best_sum_left_hessian - kEpsilon;
+      output->right_output = CalculateSplittedLeafOutput(sum_gradient - best_sum_left_gradient, sum_hessian - best_sum_left_hessian, meta_->tree_config->lambda_l1, meta_->tree_config->lambda_l2);
+      output->right_count = num_data - best_left_count;
+      output->right_sum_gradient = sum_gradient - best_sum_left_gradient;
+      output->right_sum_hessian = sum_hessian - best_sum_left_hessian - kEpsilon;
+      output->gain = best_gain;
+      output->default_bin_for_zero = default_bin_for_zero;
+    }
+  }
+
   const FeatureMetainfo* meta_;
   /*! \brief sum of gradient of each bin */
   HistogramBinEntry* data_;
...
@@ -364,6 +446,7 @@ public:
 #pragma omp parallel for schedule(static, 512) if(num_feature >= 1024)
   for (int i = 0; i < num_feature; ++i) {
     feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
+    feature_metas_[i].default_bin = train_data->FeatureBinMapper(i)->GetDefaultBin();
     if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
       feature_metas_[i].bias = 1;
     } else {
...
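`FindBestThresholdNumerical` now runs `FindBestThresholdSequence` up to three times, once per candidate placement of the zero bin: on the far left (`default_bin_for_zero = 0`), at the feature's own default bin when that bin is interior, and on the far right (`num_bin - 1`); the best pass wins and records its placement in the `SplitInfo`. Below is a schematic driver of that control flow, not the library's code: the stand-in function and the pretend gains are invented to show how the best placement is kept.

```cpp
#include <cstdint>
#include <iostream>

struct SplitInfo { double gain; uint32_t threshold; uint32_t default_bin_for_zero; };

// Hypothetical stand-in for FeatureHistogram::FindBestThresholdSequence:
// pretends each zero-bin placement yields some gain and keeps the best.
void FindBestThresholdSequence(SplitInfo* out, uint32_t default_bin_for_zero,
                               double pretend_gain) {
  if (pretend_gain > out->gain) {
    out->gain = pretend_gain;
    out->default_bin_for_zero = default_bin_for_zero;
  }
}

int main() {
  const int num_bin = 16;
  const int default_bin = 5;  // interior zero bin for this toy feature
  SplitInfo out{-1e308, 0, 0};
  FindBestThresholdSequence(&out, 0, 0.7);              // zeros to far left
  if (default_bin > 0 && default_bin < num_bin - 1)
    FindBestThresholdSequence(&out, default_bin, 0.9);  // zeros stay put
  if (num_bin > 2)
    FindBestThresholdSequence(&out, num_bin - 1, 0.4);  // zeros to far right
  std::cout << "best placement: bin " << out.default_bin_for_zero
            << " (gain " << out.gain << ")\n";  // bin 5 wins here
  return 0;
}
```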
src/treelearner/serial_tree_learner.cpp

@@ -543,6 +543,10 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri
   const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
   // left = parent
   *left_leaf = best_Leaf;
+  double default_value = 0.0f;
+  if (train_data_->FeatureBinMapper(inner_feature_index)->GetDefaultBin() != best_split_info.default_bin_for_zero) {
+    default_value = train_data_->RealThreshold(inner_feature_index, best_split_info.default_bin_for_zero);
+  }
   // split tree, will return right leaf
   *right_leaf = tree->Split(best_Leaf,
                             inner_feature_index,
...
@@ -554,10 +558,13 @@ void SerialTreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* ri
                             static_cast<double>(best_split_info.right_output),
                             static_cast<data_size_t>(best_split_info.left_count),
                             static_cast<data_size_t>(best_split_info.right_count),
-                            static_cast<double>(best_split_info.gain));
+                            static_cast<double>(best_split_info.gain),
+                            train_data_->FeatureBinMapper(inner_feature_index)->GetDefaultBin(),
+                            best_split_info.default_bin_for_zero,
+                            default_value);
   // split data partition
-  data_partition_->Split(best_Leaf, train_data_, inner_feature_index, best_split_info.threshold, *right_leaf);
+  data_partition_->Split(best_Leaf, train_data_, inner_feature_index,
+                         best_split_info.threshold, best_split_info.default_bin_for_zero,
+                         *right_leaf, best_split_info.left_count);
   // init the leaves that used on next iteration
   if (best_split_info.left_count < best_split_info.right_count) {
...
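When the chosen `default_bin_for_zero` differs from the feature's actual zero bin, the learner converts it back into value space via `train_data_->RealThreshold` so the tree can store a `double` default_value for raw (unbinned) prediction; otherwise `default_value` stays 0.0 and zeros route naturally. A sketch of that decision; the `RealThreshold` stand-in here is invented and simply returns the bin's upper bound:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for Dataset::RealThreshold: maps a bin index back to a raw value.
// Invented for illustration; the real method consults the BinMapper.
double RealThreshold(const std::vector<double>& upper_bounds, uint32_t bin) {
  return upper_bounds[bin];
}

int main() {
  std::vector<double> upper_bounds = {-1e-20, 1e-20, 4.0, 1e308};
  uint32_t zero_bin = 1;              // where value==0 actually lives
  uint32_t default_bin_for_zero = 2;  // where the split finder sent zeros
  double default_value = 0.0;
  if (zero_bin != default_bin_for_zero) {
    default_value = RealThreshold(upper_bounds, default_bin_for_zero);
  }
  std::cout << "default_value = " << default_value << "\n";  // 4.0
  return 0;
}
```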
src/treelearner/split_info.hpp

@@ -19,7 +19,9 @@ public:
   /*! \brief Feature index */
   int feature;
   /*! \brief Split threshold */
-  unsigned int threshold;
+  uint32_t threshold;
+  uint32_t default_bin_for_zero;
   /*! \brief Left output after split */
   double left_output;
   /*! \brief Right output after split */
...
src/treelearner/voting_parallel_tree_learner.cpp

@@ -67,6 +67,7 @@ void VotingParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, b
 #pragma omp parallel for schedule(static)
   for (int i = 0; i < train_data->num_features(); ++i) {
     feature_metas_[i].num_bin = train_data->FeatureNumBin(i);
+    feature_metas_[i].default_bin = train_data->FeatureBinMapper(i)->GetDefaultBin();
     if (train_data->FeatureBinMapper(i)->GetDefaultBin() == 0) {
       feature_metas_[i].bias = 1;
     } else {
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment