Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
091f41b6
Unverified
Commit
091f41b6
authored
Jul 25, 2020
by
Guolin Ke
Committed by
GitHub
Jul 25, 2020
Browse files
fix bug in CEGB when reset training data or config (#3246)
* fix * Apply suggestions from code review
parent
e2f11b05
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
75 additions
and
33 deletions
+75
-33
src/treelearner/cost_effective_gradient_boosting.hpp
src/treelearner/cost_effective_gradient_boosting.hpp
+72
-32
src/treelearner/serial_tree_learner.cpp
src/treelearner/serial_tree_learner.cpp
+3
-1
No files found.
src/treelearner/cost_effective_gradient_boosting.hpp
View file @
091f41b6
/*!
 * Copyright (c) 2019 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for
 * license information.
 */
#ifndef LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_
#define LIGHTGBM_TREELEARNER_COST_EFFECTIVE_GRADIENT_BOOSTING_HPP_
...
@@ -20,11 +21,12 @@ namespace LightGBM {
...
@@ -20,11 +21,12 @@ namespace LightGBM {
class
CostEfficientGradientBoosting
{
class
CostEfficientGradientBoosting
{
public:
public:
explicit
CostEfficientGradientBoosting
(
const
SerialTreeLearner
*
tree_learner
)
:
tree_learner_
(
tree_learner
)
{
explicit
CostEfficientGradientBoosting
(
const
SerialTreeLearner
*
tree_learner
)
}
:
init_
(
false
),
tree_learner_
(
tree_learner
)
{
}
static
bool
IsEnable
(
const
Config
*
config
)
{
static
bool
IsEnable
(
const
Config
*
config
)
{
if
(
config
->
cegb_tradeoff
>=
1.0
f
&&
config
->
cegb_penalty_split
<=
0.0
f
if
(
config
->
cegb_tradeoff
>=
1.0
f
&&
config
->
cegb_penalty_split
<=
0.0
f
&&
&&
config
->
cegb_penalty_feature_coupled
.
empty
()
&&
config
->
cegb_penalty_feature_lazy
.
empty
())
{
config
->
cegb_penalty_feature_coupled
.
empty
()
&&
config
->
cegb_penalty_feature_lazy
.
empty
())
{
return
false
;
return
false
;
}
else
{
}
else
{
return
true
;
return
true
;
...
@@ -32,44 +34,73 @@ class CostEfficientGradientBoosting {
...
@@ -32,44 +34,73 @@ class CostEfficientGradientBoosting {
}
}
void
Init
()
{
void
Init
()
{
auto
train_data
=
tree_learner_
->
train_data_
;
auto
train_data
=
tree_learner_
->
train_data_
;
splits_per_leaf_
.
resize
(
static_cast
<
size_t
>
(
tree_learner_
->
config_
->
num_leaves
)
*
train_data
->
num_features
());
if
(
!
init_
)
{
is_feature_used_in_split_
.
clear
();
splits_per_leaf_
.
resize
(
is_feature_used_in_split_
.
resize
(
train_data
->
num_features
());
static_cast
<
size_t
>
(
tree_learner_
->
config_
->
num_leaves
)
*
train_data
->
num_features
());
is_feature_used_in_split_
.
clear
();
is_feature_used_in_split_
.
resize
(
train_data
->
num_features
());
}
if
(
!
tree_learner_
->
config_
->
cegb_penalty_feature_coupled
.
empty
()
if
(
!
tree_learner_
->
config_
->
cegb_penalty_feature_coupled
.
empty
()
&&
&&
tree_learner_
->
config_
->
cegb_penalty_feature_coupled
.
size
()
!=
static_cast
<
size_t
>
(
train_data
->
num_total_features
()))
{
tree_learner_
->
config_
->
cegb_penalty_feature_coupled
.
size
()
!=
Log
::
Fatal
(
"cegb_penalty_feature_coupled should be the same size as feature number."
);
static_cast
<
size_t
>
(
train_data
->
num_total_features
()))
{
Log
::
Fatal
(
"cegb_penalty_feature_coupled should be the same size as feature "
"number."
);
}
}
if
(
!
tree_learner_
->
config_
->
cegb_penalty_feature_lazy
.
empty
())
{
if
(
!
tree_learner_
->
config_
->
cegb_penalty_feature_lazy
.
empty
())
{
if
(
tree_learner_
->
config_
->
cegb_penalty_feature_lazy
.
size
()
!=
static_cast
<
size_t
>
(
train_data
->
num_total_features
()))
{
if
(
tree_learner_
->
config_
->
cegb_penalty_feature_lazy
.
size
()
!=
Log
::
Fatal
(
"cegb_penalty_feature_lazy should be the same size as feature number."
);
static_cast
<
size_t
>
(
train_data
->
num_total_features
()))
{
Log
::
Fatal
(
"cegb_penalty_feature_lazy should be the same size as feature "
"number."
);
}
if
(
!
init_
)
{
feature_used_in_data_
=
Common
::
EmptyBitset
(
train_data
->
num_features
()
*
tree_learner_
->
num_data_
);
}
}
feature_used_in_data_
=
Common
::
EmptyBitset
(
train_data
->
num_features
()
*
tree_learner_
->
num_data_
);
}
}
init_
=
true
;
}
}
double
DetlaGain
(
int
feature_index
,
int
real_fidx
,
int
leaf_index
,
int
num_data_in_leaf
,
SplitInfo
split_info
)
{
double
DetlaGain
(
int
feature_index
,
int
real_fidx
,
int
leaf_index
,
int
num_data_in_leaf
,
SplitInfo
split_info
)
{
auto
config
=
tree_learner_
->
config_
;
auto
config
=
tree_learner_
->
config_
;
double
delta
=
config
->
cegb_tradeoff
*
config
->
cegb_penalty_split
*
num_data_in_leaf
;
double
delta
=
if
(
!
config
->
cegb_penalty_feature_coupled
.
empty
()
&&
!
is_feature_used_in_split_
[
feature_index
])
{
config
->
cegb_tradeoff
*
config
->
cegb_penalty_split
*
num_data_in_leaf
;
delta
+=
config
->
cegb_tradeoff
*
config
->
cegb_penalty_feature_coupled
[
real_fidx
];
if
(
!
config
->
cegb_penalty_feature_coupled
.
empty
()
&&
!
is_feature_used_in_split_
[
feature_index
])
{
delta
+=
config
->
cegb_tradeoff
*
config
->
cegb_penalty_feature_coupled
[
real_fidx
];
}
}
if
(
!
config
->
cegb_penalty_feature_lazy
.
empty
())
{
if
(
!
config
->
cegb_penalty_feature_lazy
.
empty
())
{
delta
+=
config
->
cegb_tradeoff
*
CalculateOndemandCosts
(
feature_index
,
real_fidx
,
leaf_index
);
delta
+=
config
->
cegb_tradeoff
*
CalculateOndemandCosts
(
feature_index
,
real_fidx
,
leaf_index
);
}
}
splits_per_leaf_
[
static_cast
<
size_t
>
(
leaf_index
)
*
tree_learner_
->
train_data_
->
num_features
()
+
feature_index
]
=
split_info
;
splits_per_leaf_
[
static_cast
<
size_t
>
(
leaf_index
)
*
tree_learner_
->
train_data_
->
num_features
()
+
feature_index
]
=
split_info
;
return
delta
;
return
delta
;
}
}
void
UpdateLeafBestSplits
(
Tree
*
tree
,
int
best_leaf
,
const
SplitInfo
*
best_split_info
,
std
::
vector
<
SplitInfo
>*
best_split_per_leaf
)
{
void
UpdateLeafBestSplits
(
Tree
*
tree
,
int
best_leaf
,
const
SplitInfo
*
best_split_info
,
std
::
vector
<
SplitInfo
>*
best_split_per_leaf
)
{
auto
config
=
tree_learner_
->
config_
;
auto
config
=
tree_learner_
->
config_
;
auto
train_data
=
tree_learner_
->
train_data_
;
auto
train_data
=
tree_learner_
->
train_data_
;
const
int
inner_feature_index
=
train_data
->
InnerFeatureIndex
(
best_split_info
->
feature
);
const
int
inner_feature_index
=
train_data
->
InnerFeatureIndex
(
best_split_info
->
feature
);
auto
&
ref_best_split_per_leaf
=
*
best_split_per_leaf
;
auto
&
ref_best_split_per_leaf
=
*
best_split_per_leaf
;
if
(
!
config
->
cegb_penalty_feature_coupled
.
empty
()
&&
!
is_feature_used_in_split_
[
inner_feature_index
])
{
if
(
!
config
->
cegb_penalty_feature_coupled
.
empty
()
&&
!
is_feature_used_in_split_
[
inner_feature_index
])
{
is_feature_used_in_split_
[
inner_feature_index
]
=
true
;
is_feature_used_in_split_
[
inner_feature_index
]
=
true
;
for
(
int
i
=
0
;
i
<
tree
->
num_leaves
();
++
i
)
{
for
(
int
i
=
0
;
i
<
tree
->
num_leaves
();
++
i
)
{
if
(
i
==
best_leaf
)
continue
;
if
(
i
==
best_leaf
)
continue
;
auto
split
=
&
splits_per_leaf_
[
static_cast
<
size_t
>
(
i
)
*
train_data
->
num_features
()
+
inner_feature_index
];
auto
split
=
&
splits_per_leaf_
[
static_cast
<
size_t
>
(
i
)
*
split
->
gain
+=
config
->
cegb_tradeoff
*
config
->
cegb_penalty_feature_coupled
[
best_split_info
->
feature
];
train_data
->
num_features
()
+
inner_feature_index
];
split
->
gain
+=
config
->
cegb_tradeoff
*
config
->
cegb_penalty_feature_coupled
[
best_split_info
->
feature
];
// Avoid to update the leaf that cannot split
// Avoid to update the leaf that cannot split
if
(
ref_best_split_per_leaf
[
i
].
gain
>
kMinScore
&&
if
(
ref_best_split_per_leaf
[
i
].
gain
>
kMinScore
&&
*
split
>
ref_best_split_per_leaf
[
i
])
{
*
split
>
ref_best_split_per_leaf
[
i
])
{
...
@@ -79,36 +110,45 @@ class CostEfficientGradientBoosting {
...
@@ -79,36 +110,45 @@ class CostEfficientGradientBoosting {
}
}
if
(
!
config
->
cegb_penalty_feature_lazy
.
empty
())
{
if
(
!
config
->
cegb_penalty_feature_lazy
.
empty
())
{
data_size_t
cnt_leaf_data
=
0
;
data_size_t
cnt_leaf_data
=
0
;
auto
tmp_idx
=
tree_learner_
->
data_partition_
->
GetIndexOnLeaf
(
best_leaf
,
&
cnt_leaf_data
);
auto
tmp_idx
=
tree_learner_
->
data_partition_
->
GetIndexOnLeaf
(
best_leaf
,
&
cnt_leaf_data
);
for
(
data_size_t
i_input
=
0
;
i_input
<
cnt_leaf_data
;
++
i_input
)
{
for
(
data_size_t
i_input
=
0
;
i_input
<
cnt_leaf_data
;
++
i_input
)
{
int
real_idx
=
tmp_idx
[
i_input
];
int
real_idx
=
tmp_idx
[
i_input
];
Common
::
InsertBitset
(
&
feature_used_in_data_
,
train_data
->
num_data
()
*
inner_feature_index
+
real_idx
);
Common
::
InsertBitset
(
&
feature_used_in_data_
,
train_data
->
num_data
()
*
inner_feature_index
+
real_idx
);
}
}
}
}
}
}
private:
private:
double
CalculateOndemandCosts
(
int
feature_index
,
int
real_fidx
,
int
leaf_index
)
const
{
double
CalculateOndemandCosts
(
int
feature_index
,
int
real_fidx
,
int
leaf_index
)
const
{
if
(
tree_learner_
->
config_
->
cegb_penalty_feature_lazy
.
empty
())
{
if
(
tree_learner_
->
config_
->
cegb_penalty_feature_lazy
.
empty
())
{
return
0.0
f
;
return
0.0
f
;
}
}
auto
train_data
=
tree_learner_
->
train_data_
;
auto
train_data
=
tree_learner_
->
train_data_
;
double
penalty
=
tree_learner_
->
config_
->
cegb_penalty_feature_lazy
[
real_fidx
];
double
penalty
=
tree_learner_
->
config_
->
cegb_penalty_feature_lazy
[
real_fidx
];
double
total
=
0.0
f
;
double
total
=
0.0
f
;
data_size_t
cnt_leaf_data
=
0
;
data_size_t
cnt_leaf_data
=
0
;
auto
tmp_idx
=
tree_learner_
->
data_partition_
->
GetIndexOnLeaf
(
leaf_index
,
&
cnt_leaf_data
);
auto
tmp_idx
=
tree_learner_
->
data_partition_
->
GetIndexOnLeaf
(
leaf_index
,
&
cnt_leaf_data
);
for
(
data_size_t
i_input
=
0
;
i_input
<
cnt_leaf_data
;
++
i_input
)
{
for
(
data_size_t
i_input
=
0
;
i_input
<
cnt_leaf_data
;
++
i_input
)
{
int
real_idx
=
tmp_idx
[
i_input
];
int
real_idx
=
tmp_idx
[
i_input
];
if
(
Common
::
FindInBitset
(
feature_used_in_data_
.
data
(),
train_data
->
num_data
()
*
train_data
->
num_features
(),
train_data
->
num_data
()
*
feature_index
+
real_idx
))
{
if
(
Common
::
FindInBitset
(
feature_used_in_data_
.
data
(),
train_data
->
num_data
()
*
train_data
->
num_features
(),
train_data
->
num_data
()
*
feature_index
+
real_idx
))
{
continue
;
continue
;
}
}
total
+=
penalty
;
total
+=
penalty
;
}
}
return
total
;
return
total
;
}
}
bool
init_
;
const
SerialTreeLearner
*
tree_learner_
;
const
SerialTreeLearner
*
tree_learner_
;
std
::
vector
<
SplitInfo
>
splits_per_leaf_
;
std
::
vector
<
SplitInfo
>
splits_per_leaf_
;
std
::
vector
<
bool
>
is_feature_used_in_split_
;
std
::
vector
<
bool
>
is_feature_used_in_split_
;
...
...
src/treelearner/serial_tree_learner.cpp
View file @
091f41b6
...
@@ -141,7 +141,9 @@ void SerialTreeLearner::ResetConfig(const Config* config) {
...
@@ -141,7 +141,9 @@ void SerialTreeLearner::ResetConfig(const Config* config) {
col_sampler_
.
SetConfig
(
config_
);
col_sampler_
.
SetConfig
(
config_
);
histogram_pool_
.
ResetConfig
(
train_data_
,
config_
);
histogram_pool_
.
ResetConfig
(
train_data_
,
config_
);
if
(
CostEfficientGradientBoosting
::
IsEnable
(
config_
))
{
if
(
CostEfficientGradientBoosting
::
IsEnable
(
config_
))
{
cegb_
.
reset
(
new
CostEfficientGradientBoosting
(
this
));
if
(
cegb_
==
nullptr
)
{
cegb_
.
reset
(
new
CostEfficientGradientBoosting
(
this
));
}
cegb_
->
Init
();
cegb_
->
Init
();
}
}
constraints_
.
reset
(
LeafConstraintsBase
::
Create
(
config_
,
config_
->
num_leaves
));
constraints_
.
reset
(
LeafConstraintsBase
::
Create
(
config_
,
config_
->
num_leaves
));
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment