Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
63d974bc
"vscode:/vscode.git/clone" did not exist on "b4535b8d843c9f0881d9f30a0869def4a1aec83f"
Commit
63d974bc
authored
Nov 08, 2016
by
Allard van Mossel
Committed by
Guolin Ke
Nov 15, 2016
Browse files
Added complexity regularization parameters (L1, L2, min_gain_to_split) (#69)
parent
9ca29e66
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
53 additions
and
15 deletions
+53
-15
include/LightGBM/config.h
include/LightGBM/config.h
+3
-0
src/io/config.cpp
src/io/config.cpp
+6
-0
src/treelearner/feature_histogram.hpp
src/treelearner/feature_histogram.hpp
+29
-12
src/treelearner/serial_tree_learner.cpp
src/treelearner/serial_tree_learner.cpp
+7
-1
src/treelearner/serial_tree_learner.h
src/treelearner/serial_tree_learner.h
+8
-2
No files found.
include/LightGBM/config.h
View file @
63d974bc
...
...
@@ -154,6 +154,9 @@ struct TreeConfig: public ConfigBase {
public:
int
min_data_in_leaf
=
100
;
double
min_sum_hessian_in_leaf
=
10.0
f
;
double
lambda_l1
=
0.0
f
;
double
lambda_l2
=
0.0
f
;
double
min_gain_to_split
=
0.0
f
;
// should be > 1; a tree with only one leaf has nothing to learn
int
num_leaves
=
127
;
int
feature_fraction_seed
=
2
;
...
...
src/io/config.cpp
View file @
63d974bc
...
...
@@ -265,6 +265,12 @@ void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params)
GetInt
(
params
,
"min_data_in_leaf"
,
&
min_data_in_leaf
);
GetDouble
(
params
,
"min_sum_hessian_in_leaf"
,
&
min_sum_hessian_in_leaf
);
CHECK
(
min_sum_hessian_in_leaf
>
1.0
f
||
min_data_in_leaf
>
0
);
GetDouble
(
params
,
"lambda_l1"
,
&
lambda_l1
);
CHECK
(
lambda_l1
>=
0.0
f
)
GetDouble
(
params
,
"lambda_l2"
,
&
lambda_l2
);
CHECK
(
lambda_l2
>=
0.0
f
)
GetDouble
(
params
,
"min_gain_to_split"
,
&
min_gain_to_split
);
CHECK
(
min_gain_to_split
>=
0.0
f
)
GetInt
(
params
,
"num_leaves"
,
&
num_leaves
);
CHECK
(
num_leaves
>
1
);
GetInt
(
params
,
"feature_fraction_seed"
,
&
feature_fraction_seed
);
...
...
src/treelearner/feature_histogram.hpp
View file @
63d974bc
...
...
@@ -26,10 +26,13 @@ public:
* \param min_num_data_one_leaf minimal number of data in one leaf
*/
void
Init
(
const
Feature
*
feature
,
int
feature_idx
,
data_size_t
min_num_data_one_leaf
,
double
min_sum_hessian_one_leaf
)
{
double
min_sum_hessian_one_leaf
,
double
lambda_l1
,
double
lambda_l2
,
double
min_gain_to_split
)
{
feature_idx_
=
feature_idx
;
min_num_data_one_leaf_
=
min_num_data_one_leaf
;
min_sum_hessian_one_leaf_
=
min_sum_hessian_one_leaf
;
lambda_l1_
=
lambda_l1
;
lambda_l2_
=
lambda_l2
;
min_gain_to_split_
=
min_gain_to_split
;
bin_data_
=
feature
->
bin_data
();
num_bins_
=
feature
->
num_bin
();
data_
=
new
HistogramBinEntry
[
num_bins_
];
...
...
@@ -113,6 +116,7 @@ public:
double
sum_right_hessian
=
kEpsilon
;
data_size_t
right_count
=
0
;
double
gain_shift
=
GetLeafSplitGain
(
sum_gradients_
,
sum_hessians_
);
double
min_gain_shift
=
gain_shift
+
min_gain_to_split_
;
is_splittable_
=
false
;
// from right to left, and we don't need data in bin0
for
(
unsigned
int
t
=
num_bins_
-
1
;
t
>
0
;
--
t
)
{
...
...
@@ -127,16 +131,14 @@ public:
double
sum_left_hessian
=
sum_hessians_
-
sum_right_hessian
;
// if sum hessian too small
if
(
sum_left_hessian
<
min_sum_hessian_one_leaf_
)
{
break
;
}
if
(
sum_left_hessian
<
min_sum_hessian_one_leaf_
)
break
;
double
sum_left_gradient
=
sum_gradients_
-
sum_right_gradient
;
// current split gain
double
current_gain
=
GetLeafSplitGain
(
sum_left_gradient
,
sum_left_hessian
)
+
GetLeafSplitGain
(
sum_right_gradient
,
sum_right_hessian
);
// gain is worse than performing no split
if
(
current_gain
<
gain_shift
)
{
continue
;
}
// gain with split is worse than without split
if
(
current_gain
<
min_gain_shift
)
continue
;
// mark to is splittable
is_splittable_
=
true
;
// better split point
...
...
@@ -211,23 +213,32 @@ public:
private:
/*!
* \brief Calculate the split gain based on regularized sum_gradients and sum_hessians
* \param sum_gradients  sum of gradients over the data in the (candidate) leaf
* \param sum_hessians   sum of hessians over the same data
* \return split gain (0.0 when the L1 penalty fully absorbs the gradient)
*/
double GetLeafSplitGain(double sum_gradients, double sum_hessians) const {
  // L1 regularization: soft-threshold the gradient sum by lambda_l1_.
  double abs_sum_gradients = std::fabs(sum_gradients);
  if (abs_sum_gradients > lambda_l1_) {
    double reg_abs_sum_gradients = abs_sum_gradients - lambda_l1_;
    // L2 regularization: lambda_l2_ is added to the hessian denominator,
    // shrinking the gain of leaves with small hessian mass.
    return (reg_abs_sum_gradients * reg_abs_sum_gradients)
           / (sum_hessians + lambda_l2_);
  }
  // |sum_gradients| <= lambda_l1_: the penalty zeroes this leaf's contribution.
  return 0.0f;
}
/*!
* \brief Calculate the output of a leaf based on regularized sum_gradients and sum_hessians
* \param sum_gradients  sum of gradients over the data in the leaf
* \param sum_hessians   sum of hessians over the same data
* \return leaf output value (0.0 when the L1 penalty fully absorbs the gradient)
*/
double CalculateSplittedLeafOutput(double sum_gradients, double sum_hessians) const {
  // L1 regularization: soft-threshold |sum_gradients| by lambda_l1_.
  double abs_sum_gradients = std::fabs(sum_gradients);
  if (abs_sum_gradients > lambda_l1_) {
    // copysign restores the original sign of the thresholded gradient;
    // lambda_l2_ shrinks the output through the denominator.
    return -std::copysign(abs_sum_gradients - lambda_l1_, sum_gradients)
           / (sum_hessians + lambda_l2_);
  }
  // |sum_gradients| <= lambda_l1_: the leaf output is clipped to zero.
  return 0.0f;
}
int
feature_idx_
;
...
...
@@ -235,6 +246,12 @@ private:
data_size_t
min_num_data_one_leaf_
;
/*! \brief minimal sum hessian of data in one leaf */
double
min_sum_hessian_one_leaf_
;
/*! \brief lambda of the L1 weights regularization */
double
lambda_l1_
;
/*! \brief lambda of the L2 weights regularization */
double
lambda_l2_
;
/*! \brief minimal gain (loss reduction) to split */
double
min_gain_to_split_
;
/*! \brief the bin data of current feature */
const
Bin
*
bin_data_
;
/*! \brief number of bin of histogram */
...
...
src/treelearner/serial_tree_learner.cpp
View file @
63d974bc
...
...
@@ -16,6 +16,9 @@ SerialTreeLearner::SerialTreeLearner(const TreeConfig& tree_config)
num_leaves_
=
tree_config
.
num_leaves
;
min_num_data_one_leaf_
=
static_cast
<
data_size_t
>
(
tree_config
.
min_data_in_leaf
);
min_sum_hessian_one_leaf_
=
static_cast
<
double
>
(
tree_config
.
min_sum_hessian_in_leaf
);
lambda_l1_
=
tree_config
.
lambda_l1
;
lambda_l2_
=
tree_config
.
lambda_l2
;
min_gain_to_split_
=
tree_config
.
min_gain_to_split
;
feature_fraction_
=
tree_config
.
feature_fraction
;
random_
=
Random
(
tree_config
.
feature_fraction_seed
);
histogram_pool_size_
=
tree_config
.
histogram_pool_size
;
...
...
@@ -68,7 +71,10 @@ void SerialTreeLearner::Init(const Dataset* train_data) {
for
(
int
j
=
0
;
j
<
train_data_
->
num_features
();
++
j
)
{
tmp_histogram_array
[
j
].
Init
(
train_data_
->
FeatureAt
(
j
),
j
,
min_num_data_one_leaf_
,
min_sum_hessian_one_leaf_
);
min_sum_hessian_one_leaf_
,
lambda_l1_
,
lambda_l2_
,
min_gain_to_split_
);
}
return
tmp_histogram_array
;
};
...
...
src/treelearner/serial_tree_learner.h
View file @
63d974bc
...
...
@@ -111,10 +111,16 @@ protected:
const
score_t
*
hessians_
;
/*! \brief number of total leaves */
int
num_leaves_
;
/*! \brief mini
n
al data on one leaf */
/*! \brief mini
m
al data on one leaf */
data_size_t
min_num_data_one_leaf_
;
/*! \brief mini
n
al sum hessian on one leaf */
/*! \brief mini
m
al sum hessian on one leaf */
double
min_sum_hessian_one_leaf_
;
/*! \brief lambda of the L1 weights regularization */
double
lambda_l1_
;
/*! \brief lambda of the L2 weights regularization */
double
lambda_l2_
;
/*! \brief minimal gain (loss reduction) to split */
double
min_gain_to_split_
;
/*! \brief sub-feature fraction rate */
double
feature_fraction_
;
/*! \brief training data partition on leaves */
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment