Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
1e7ccbbb
Commit
1e7ccbbb
authored
Jul 04, 2017
by
Guolin Ke
Browse files
clean code for Boosting::ResetTrainingData.
parent
a98b23d2
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
204 additions
and
153 deletions
+204
-153
include/LightGBM/boosting.h
include/LightGBM/boosting.h
+4
-8
include/LightGBM/config.h
include/LightGBM/config.h
+1
-1
src/boosting/dart.hpp
src/boosting/dart.hpp
+0
-4
src/boosting/gbdt.cpp
src/boosting/gbdt.cpp
+160
-113
src/boosting/gbdt.h
src/boosting/gbdt.h
+4
-7
src/boosting/goss.hpp
src/boosting/goss.hpp
+17
-10
src/c_api.cpp
src/c_api.cpp
+18
-10
No files found.
include/LightGBM/boosting.h
View file @
1e7ccbbb
...
@@ -43,14 +43,10 @@ public:
...
@@ -43,14 +43,10 @@ public:
*/
*/
virtual
void
MergeFrom
(
const
Boosting
*
other
)
=
0
;
virtual
void
MergeFrom
(
const
Boosting
*
other
)
=
0
;
/*!
virtual
void
ResetTrainingData
(
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
objective_function
,
* \brief Reset training data for current boosting
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
=
0
;
* \param config Configs for boosting
* \param train_data Training data
virtual
void
ResetConfig
(
const
BoostingConfig
*
config
)
=
0
;
* \param objective_function Training objective function
* \param training_metrics Training metric
*/
virtual
void
ResetTrainingData
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
objective_function
,
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
=
0
;
/*!
/*!
* \brief Add a validation data
* \brief Add a validation data
...
...
include/LightGBM/config.h
View file @
1e7ccbbb
...
@@ -91,7 +91,7 @@ public:
...
@@ -91,7 +91,7 @@ public:
int
data_random_seed
=
1
;
int
data_random_seed
=
1
;
std
::
string
data_filename
=
""
;
std
::
string
data_filename
=
""
;
std
::
vector
<
std
::
string
>
valid_data_filenames
;
std
::
vector
<
std
::
string
>
valid_data_filenames
;
int
snapshot_freq
=
1
00
;
int
snapshot_freq
=
-
1
;
std
::
string
output_model
=
"LightGBM_model.txt"
;
std
::
string
output_model
=
"LightGBM_model.txt"
;
std
::
string
output_result
=
"LightGBM_predict_result.txt"
;
std
::
string
output_result
=
"LightGBM_predict_result.txt"
;
std
::
string
convert_model
=
"gbdt_prediction.cpp"
;
std
::
string
convert_model
=
"gbdt_prediction.cpp"
;
...
...
src/boosting/dart.hpp
View file @
1e7ccbbb
...
@@ -39,10 +39,6 @@ public:
...
@@ -39,10 +39,6 @@ public:
sum_weight_
=
0.0
f
;
sum_weight_
=
0.0
f
;
}
}
void
ResetTrainingData
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
objective_function
,
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
override
{
GBDT
::
ResetTrainingData
(
config
,
train_data
,
objective_function
,
training_metrics
);
}
/*!
/*!
* \brief one training iteration
* \brief one training iteration
*/
*/
...
...
src/boosting/gbdt.cpp
View file @
1e7ccbbb
...
@@ -44,9 +44,9 @@ GBDT::GBDT()
...
@@ -44,9 +44,9 @@ GBDT::GBDT()
boost_from_average_
(
false
)
{
boost_from_average_
(
false
)
{
#pragma omp parallel
#pragma omp parallel
#pragma omp master
#pragma omp master
{
{
num_threads_
=
omp_get_num_threads
();
num_threads_
=
omp_get_num_threads
();
}
}
}
}
GBDT
::~
GBDT
()
{
GBDT
::~
GBDT
()
{
...
@@ -64,24 +64,104 @@ GBDT::~GBDT() {
...
@@ -64,24 +64,104 @@ GBDT::~GBDT() {
void
GBDT
::
Init
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
objective_function
,
void
GBDT
::
Init
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
objective_function
,
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
{
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
{
train_data_
=
train_data
;
iter_
=
0
;
iter_
=
0
;
num_iteration_for_pred_
=
0
;
num_iteration_for_pred_
=
0
;
max_feature_idx_
=
0
;
max_feature_idx_
=
0
;
num_class_
=
config
->
num_class
;
num_class_
=
config
->
num_class
;
train_data_
=
nullptr
;
gbdt_config_
=
std
::
unique_ptr
<
BoostingConfig
>
(
new
BoostingConfig
(
*
config
));
gbdt_config_
=
nullptr
;
early_stopping_round_
=
gbdt_config_
->
early_stopping_round
;
tree_learner_
=
nullptr
;
shrinkage_rate_
=
gbdt_config_
->
learning_rate
;
ResetTrainingData
(
config
,
train_data
,
objective_function
,
training_metrics
);
objective_function_
=
objective_function
;
num_tree_per_iteration_
=
num_class_
;
if
(
objective_function_
!=
nullptr
)
{
is_constant_hessian_
=
objective_function_
->
IsConstantHessian
();
num_tree_per_iteration_
=
objective_function_
->
NumTreePerIteration
();
}
else
{
is_constant_hessian_
=
false
;
}
tree_learner_
=
std
::
unique_ptr
<
TreeLearner
>
(
TreeLearner
::
CreateTreeLearner
(
gbdt_config_
->
tree_learner_type
,
gbdt_config_
->
device_type
,
&
gbdt_config_
->
tree_config
));
// init tree learner
tree_learner_
->
Init
(
train_data_
,
is_constant_hessian_
);
// push training metrics
training_metrics_
.
clear
();
for
(
const
auto
&
metric
:
training_metrics
)
{
training_metrics_
.
push_back
(
metric
);
}
training_metrics_
.
shrink_to_fit
();
train_score_updater_
.
reset
(
new
ScoreUpdater
(
train_data_
,
num_tree_per_iteration_
));
num_data_
=
train_data_
->
num_data
();
// create buffer for gradients and hessians
if
(
objective_function_
!=
nullptr
)
{
size_t
total_size
=
static_cast
<
size_t
>
(
num_data_
)
*
num_tree_per_iteration_
;
gradients_
.
resize
(
total_size
);
hessians_
.
resize
(
total_size
);
}
// get max feature index
max_feature_idx_
=
train_data_
->
num_total_features
()
-
1
;
// get label index
label_idx_
=
train_data_
->
label_idx
();
// get feature names
feature_names_
=
train_data_
->
feature_names
();
feature_infos_
=
train_data_
->
feature_infos
();
// if need bagging, create buffer
ResetBaggingConfig
(
gbdt_config_
.
get
());
// reset config for tree learner
class_need_train_
=
std
::
vector
<
bool
>
(
num_tree_per_iteration_
,
true
);
if
(
objective_function_
!=
nullptr
&&
objective_function_
->
SkipEmptyClass
())
{
CHECK
(
num_tree_per_iteration_
==
num_class_
);
// + 1 here for the binary classification
class_default_output_
=
std
::
vector
<
double
>
(
num_tree_per_iteration_
,
0.0
f
);
auto
label
=
train_data_
->
metadata
().
label
();
if
(
num_tree_per_iteration_
>
1
)
{
// multi-class
std
::
vector
<
data_size_t
>
cnt_per_class
(
num_tree_per_iteration_
,
0
);
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
int
index
=
static_cast
<
int
>
(
label
[
i
]);
CHECK
(
index
<
num_tree_per_iteration_
);
++
cnt_per_class
[
index
];
}
for
(
int
i
=
0
;
i
<
num_tree_per_iteration_
;
++
i
)
{
if
(
cnt_per_class
[
i
]
==
num_data_
)
{
class_need_train_
[
i
]
=
false
;
class_default_output_
[
i
]
=
-
std
::
log
(
kEpsilon
);
}
else
if
(
cnt_per_class
[
i
]
==
0
)
{
class_need_train_
[
i
]
=
false
;
class_default_output_
[
i
]
=
-
std
::
log
(
1.0
f
/
kEpsilon
-
1.0
f
);
}
}
}
else
{
// binary class
data_size_t
cnt_pos
=
0
;
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
if
(
label
[
i
]
>
0
)
{
++
cnt_pos
;
}
}
if
(
cnt_pos
==
0
)
{
class_need_train_
[
0
]
=
false
;
class_default_output_
[
0
]
=
-
std
::
log
(
1.0
f
/
kEpsilon
-
1.0
f
);
}
else
if
(
cnt_pos
==
num_data_
)
{
class_need_train_
[
0
]
=
false
;
class_default_output_
[
0
]
=
-
std
::
log
(
kEpsilon
);
}
}
}
}
}
void
GBDT
::
ResetTrainingData
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
objective_function
,
void
GBDT
::
ResetTrainingData
(
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
objective_function
,
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
{
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
{
auto
new_config
=
std
::
unique_ptr
<
BoostingConfig
>
(
new
BoostingConfig
(
*
config
));
if
(
train_data
!=
train_data_
&&
!
train_data_
->
CheckAlign
(
*
train_data
))
{
if
(
train_data_
!=
nullptr
&&
!
train_data_
->
CheckAlign
(
*
train_data
))
{
Log
::
Fatal
(
"cannot reset training data, since new training data has different bin mappers"
);
Log
::
Fatal
(
"cannot reset training data, since new training data has different bin mappers"
);
}
}
early_stopping_round_
=
new_config
->
early_stopping_round
;
shrinkage_rate_
=
new_config
->
learning_rate
;
objective_function_
=
objective_function
;
objective_function_
=
objective_function
;
num_tree_per_iteration_
=
num_class_
;
num_tree_per_iteration_
=
num_class_
;
...
@@ -92,22 +172,18 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
...
@@ -92,22 +172,18 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
is_constant_hessian_
=
false
;
is_constant_hessian_
=
false
;
}
}
if
(
train_data_
!=
train_data
&&
train_data
!=
nullptr
)
{
// push training metrics
if
(
tree_learner_
==
nullptr
)
{
training_metrics_
.
clear
();
tree_learner_
=
std
::
unique_ptr
<
TreeLearner
>
(
TreeLearner
::
CreateTreeLearner
(
new_config
->
tree_learner_type
,
new_config
->
device_type
,
&
new_config
->
tree_config
));
for
(
const
auto
&
metric
:
training_metrics
)
{
}
training_metrics_
.
push_back
(
metric
);
// init tree learner
}
tr
ee_learner_
->
Init
(
train_data
,
is_constant_hessian_
);
tr
aining_metrics_
.
shrink_to_fit
(
);
// push training metrics
if
(
train_data
!=
train_data_
)
{
training_metrics_
.
clear
();
train_data_
=
train_data
;
for
(
const
auto
&
metric
:
training_metrics
)
{
training_metrics_
.
push_back
(
metric
);
}
training_metrics_
.
shrink_to_fit
();
// not same training data, need reset score and others
// not same training data, need reset score and others
// create score tracker
// create score tracker
train_score_updater_
.
reset
(
new
ScoreUpdater
(
train_data
,
num_tree_per_iteration_
));
train_score_updater_
.
reset
(
new
ScoreUpdater
(
train_data
_
,
num_tree_per_iteration_
));
// update score
// update score
for
(
int
i
=
0
;
i
<
iter_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
iter_
;
++
i
)
{
for
(
int
cur_tree_id
=
0
;
cur_tree_id
<
num_tree_per_iteration_
;
++
cur_tree_id
)
{
for
(
int
cur_tree_id
=
0
;
cur_tree_id
<
num_tree_per_iteration_
;
++
cur_tree_id
)
{
...
@@ -115,106 +191,77 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
...
@@ -115,106 +191,77 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
train_score_updater_
->
AddScore
(
models_
[
curr_tree
].
get
(),
cur_tree_id
);
train_score_updater_
->
AddScore
(
models_
[
curr_tree
].
get
(),
cur_tree_id
);
}
}
}
}
num_data_
=
train_data
->
num_data
();
num_data_
=
train_data_
->
num_data
();
// create buffer for gradients and hessians
// create buffer for gradients and hessians
if
(
objective_function_
!=
nullptr
)
{
if
(
objective_function_
!=
nullptr
)
{
size_t
total_size
=
static_cast
<
size_t
>
(
num_data_
)
*
num_tree_per_iteration_
;
size_t
total_size
=
static_cast
<
size_t
>
(
num_data_
)
*
num_tree_per_iteration_
;
gradients_
.
resize
(
total_size
);
gradients_
.
resize
(
total_size
);
hessians_
.
resize
(
total_size
);
hessians_
.
resize
(
total_size
);
}
}
// get max feature index
// get max feature index
max_feature_idx_
=
train_data
->
num_total_features
()
-
1
;
max_feature_idx_
=
train_data
_
->
num_total_features
()
-
1
;
// get label index
// get label index
label_idx_
=
train_data
->
label_idx
();
label_idx_
=
train_data
_
->
label_idx
();
// get feature names
// get feature names
feature_names_
=
train_data
->
feature_names
();
feature_names_
=
train_data_
->
feature_names
();
feature_infos_
=
train_data_
->
feature_infos
();
ResetBaggingConfig
(
gbdt_config_
.
get
());
feature_infos_
=
train_data
->
feature_infos
(
);
tree_learner_
->
ResetTrainingData
(
train_data
);
}
}
}
if
((
train_data_
!=
train_data
&&
train_data
!=
nullptr
)
void
GBDT
::
ResetConfig
(
const
BoostingConfig
*
config
)
{
||
(
gbdt_config_
!=
nullptr
&&
gbdt_config_
->
bagging_fraction
!=
new_config
->
bagging_fraction
))
{
auto
new_config
=
std
::
unique_ptr
<
BoostingConfig
>
(
new
BoostingConfig
(
*
config
));
// if need bagging, create buffer
if
(
new_config
->
bagging_fraction
<
1.0
&&
new_config
->
bagging_freq
>
0
)
{
early_stopping_round_
=
new_config
->
early_stopping_round
;
bag_data_cnt_
=
shrinkage_rate_
=
new_config
->
learning_rate
;
static_cast
<
data_size_t
>
(
new_config
->
bagging_fraction
*
num_data_
);
bag_data_indices_
.
resize
(
num_data_
);
ResetBaggingConfig
(
new_config
.
get
()
);
tmp_indices_
.
resize
(
num_data_
);
offsets_buf_
.
resize
(
num_threads_
);
tree_learner_
->
ResetConfig
(
&
new_config
->
tree_config
);
left_cnts_buf_
.
resize
(
num_threads_
);
gbdt_config_
.
reset
(
new_config
.
release
()
);
right_cnts_buf_
.
resize
(
num_threads_
);
}
left_write_pos_buf_
.
resize
(
num_threads_
);
right_write_pos_buf_
.
resize
(
num_threads_
);
void
GBDT
::
ResetBaggingConfig
(
const
BoostingConfig
*
config
)
{
double
average_bag_rate
=
new_config
->
bagging_fraction
/
new_config
->
bagging_freq
;
// if need bagging, create buffer
int
sparse_group
=
0
;
if
(
config
->
bagging_fraction
<
1.0
&&
config
->
bagging_freq
>
0
)
{
for
(
int
i
=
0
;
i
<
train_data
->
num_feature_groups
();
++
i
)
{
bag_data_cnt_
=
if
(
train_data
->
FeatureGroupIsSparse
(
i
))
{
static_cast
<
data_size_t
>
(
config
->
bagging_fraction
*
num_data_
);
++
sparse_group
;
bag_data_indices_
.
resize
(
num_data_
)
;
}
tmp_indices_
.
resize
(
num_data_
);
}
offsets_buf_
.
resize
(
num_threads_
);
is_use_subset_
=
false
;
left_cnts_buf_
.
resize
(
num_threads_
)
;
const
int
group_threshold_usesubset
=
100
;
right_cnts_buf_
.
resize
(
num_threads_
)
;
const
int
sparse_group_threshold_usesubset
=
train_data
->
num_feature_groups
()
/
4
;
left_write_pos_buf_
.
resize
(
num_threads_
)
;
if
(
average_bag_rate
<=
0.5
right_write_pos_buf_
.
resize
(
num_threads_
);
&&
(
train_data
->
num_feature_groups
()
<
group_threshold_usesubset
||
sparse_group
<
sparse_group_threshold_usesubset
))
{
double
average_bag_rate
=
config
->
bagging_fraction
/
config
->
bagging_freq
;
tmp_subset_
.
reset
(
new
Dataset
(
bag_data_cnt_
))
;
int
sparse_group
=
0
;
tmp_subset_
->
CopyFeatureMapperFrom
(
train_data
);
for
(
int
i
=
0
;
i
<
train_data_
->
num_feature_groups
();
++
i
)
{
is_use_subset_
=
true
;
if
(
train_data_
->
FeatureGroupIsSparse
(
i
))
{
Log
::
Debug
(
"use subset for bagging"
)
;
++
sparse_group
;
}
}
}
else
{
bag_data_cnt_
=
num_data_
;
bag_data_indices_
.
clear
();
tmp_indices_
.
clear
();
is_use_subset_
=
false
;
}
}
}
is_use_subset_
=
false
;
train_data_
=
train_data
;
const
int
group_threshold_usesubset
=
100
;
if
(
train_data_
!=
nullptr
)
{
const
int
sparse_group_threshold_usesubset
=
train_data_
->
num_feature_groups
()
/
4
;
// reset config for tree learner
if
(
average_bag_rate
<=
0.5
tree_learner_
->
ResetConfig
(
&
new_config
->
tree_config
);
&&
(
train_data_
->
num_feature_groups
()
<
group_threshold_usesubset
||
sparse_group
<
sparse_group_threshold_usesubset
))
{
class_need_train_
=
std
::
vector
<
bool
>
(
num_tree_per_iteration_
,
true
);
tmp_subset_
.
reset
(
new
Dataset
(
bag_data_cnt_
));
if
(
objective_function_
!=
nullptr
&&
objective_function_
->
SkipEmptyClass
())
{
tmp_subset_
->
CopyFeatureMapperFrom
(
train_data_
);
CHECK
(
num_tree_per_iteration_
==
num_class_
);
is_use_subset_
=
true
;
// + 1 here for the binary classification
Log
::
Debug
(
"use subset for bagging"
);
class_default_output_
=
std
::
vector
<
double
>
(
num_tree_per_iteration_
,
0.0
f
);
auto
label
=
train_data_
->
metadata
().
label
();
if
(
num_tree_per_iteration_
>
1
)
{
// multi-class
std
::
vector
<
data_size_t
>
cnt_per_class
(
num_tree_per_iteration_
,
0
);
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
int
index
=
static_cast
<
int
>
(
label
[
i
]);
CHECK
(
index
<
num_tree_per_iteration_
);
++
cnt_per_class
[
index
];
}
for
(
int
i
=
0
;
i
<
num_tree_per_iteration_
;
++
i
)
{
if
(
cnt_per_class
[
i
]
==
num_data_
)
{
class_need_train_
[
i
]
=
false
;
class_default_output_
[
i
]
=
-
std
::
log
(
kEpsilon
);
}
else
if
(
cnt_per_class
[
i
]
==
0
)
{
class_need_train_
[
i
]
=
false
;
class_default_output_
[
i
]
=
-
std
::
log
(
1.0
f
/
kEpsilon
-
1.0
f
);
}
}
}
else
{
// binary class
data_size_t
cnt_pos
=
0
;
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
if
(
label
[
i
]
>
0
)
{
++
cnt_pos
;
}
}
if
(
cnt_pos
==
0
)
{
class_need_train_
[
0
]
=
false
;
class_default_output_
[
0
]
=
-
std
::
log
(
1.0
f
/
kEpsilon
-
1.0
f
);
}
else
if
(
cnt_pos
==
num_data_
)
{
class_need_train_
[
0
]
=
false
;
class_default_output_
[
0
]
=
-
std
::
log
(
kEpsilon
);
}
}
}
}
}
else
{
bag_data_cnt_
=
num_data_
;
bag_data_indices_
.
clear
();
tmp_indices_
.
clear
();
is_use_subset_
=
false
;
}
}
gbdt_config_
.
reset
(
new_config
.
release
());
}
}
void
GBDT
::
AddValidDataset
(
const
Dataset
*
valid_data
,
void
GBDT
::
AddValidDataset
(
const
Dataset
*
valid_data
,
...
@@ -358,7 +405,7 @@ double LabelAverage(const float* label, data_size_t num_data) {
...
@@ -358,7 +405,7 @@ double LabelAverage(const float* label, data_size_t num_data) {
Network
::
Allreduce
(
reinterpret_cast
<
char
*>
(
&
init_score
),
Network
::
Allreduce
(
reinterpret_cast
<
char
*>
(
&
init_score
),
sizeof
(
init_score
),
sizeof
(
init_score
),
sizeof
(
init_score
),
sizeof
(
init_score
),
reinterpret_cast
<
char
*>
(
&
global_init_score
),
reinterpret_cast
<
char
*>
(
&
global_init_score
),
[](
const
char
*
src
,
char
*
dst
,
int
len
)
{
[]
(
const
char
*
src
,
char
*
dst
,
int
len
)
{
int
used_size
=
0
;
int
used_size
=
0
;
const
int
type_size
=
sizeof
(
double
);
const
int
type_size
=
sizeof
(
double
);
const
double
*
p1
;
const
double
*
p1
;
...
@@ -833,7 +880,7 @@ bool GBDT::SaveModelToIfElse(int num_iteration, const char* filename) const {
...
@@ -833,7 +880,7 @@ bool GBDT::SaveModelToIfElse(int num_iteration, const char* filename) const {
std
::
ifstream
ifs
(
filename
);
std
::
ifstream
ifs
(
filename
);
if
(
ifs
.
good
())
{
if
(
ifs
.
good
())
{
std
::
string
origin
((
std
::
istreambuf_iterator
<
char
>
(
ifs
)),
std
::
string
origin
((
std
::
istreambuf_iterator
<
char
>
(
ifs
)),
(
std
::
istreambuf_iterator
<
char
>
()));
(
std
::
istreambuf_iterator
<
char
>
()));
output_file
.
open
(
filename
);
output_file
.
open
(
filename
);
output_file
<<
"#define USE_HARD_CODE 0"
<<
std
::
endl
;
output_file
<<
"#define USE_HARD_CODE 0"
<<
std
::
endl
;
output_file
<<
"#ifndef USE_HARD_CODE"
<<
std
::
endl
;
output_file
<<
"#ifndef USE_HARD_CODE"
<<
std
::
endl
;
...
@@ -1027,8 +1074,8 @@ std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
...
@@ -1027,8 +1074,8 @@ std::vector<std::pair<size_t, std::string>> GBDT::FeatureImportance() const {
}
}
// sort the importance
// sort the importance
std
::
sort
(
pairs
.
begin
(),
pairs
.
end
(),
std
::
sort
(
pairs
.
begin
(),
pairs
.
end
(),
[](
const
std
::
pair
<
size_t
,
std
::
string
>&
lhs
,
[]
(
const
std
::
pair
<
size_t
,
std
::
string
>&
lhs
,
const
std
::
pair
<
size_t
,
std
::
string
>&
rhs
)
{
const
std
::
pair
<
size_t
,
std
::
string
>&
rhs
)
{
return
lhs
.
first
>
rhs
.
first
;
return
lhs
.
first
>
rhs
.
first
;
});
});
return
pairs
;
return
pairs
;
...
...
src/boosting/gbdt.h
View file @
1e7ccbbb
...
@@ -63,14 +63,10 @@ public:
...
@@ -63,14 +63,10 @@ public:
num_iteration_for_pred_
=
static_cast
<
int
>
(
models_
.
size
())
/
num_tree_per_iteration_
;
num_iteration_for_pred_
=
static_cast
<
int
>
(
models_
.
size
())
/
num_tree_per_iteration_
;
}
}
/*!
void
ResetTrainingData
(
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
objective_function
,
* \brief Reset training data for current boosting
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
override
;
* \param train_data Training data
* \param objective_function Training objective function
* \param training_metrics Training metric
*/
void
ResetTrainingData
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
objective_function
,
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
override
;
void
ResetConfig
(
const
BoostingConfig
*
config
)
override
;
/*!
/*!
* \brief Adding a validation dataset
* \brief Adding a validation dataset
* \param valid_data Validation dataset
* \param valid_data Validation dataset
...
@@ -258,6 +254,7 @@ public:
...
@@ -258,6 +254,7 @@ public:
virtual
const
char
*
SubModelName
()
const
override
{
return
"tree"
;
}
virtual
const
char
*
SubModelName
()
const
override
{
return
"tree"
;
}
protected:
protected:
void
ResetBaggingConfig
(
const
BoostingConfig
*
config
);
/*!
/*!
* \brief Implement bagging logic
* \brief Implement bagging logic
* \param iter Current interation
* \param iter Current interation
...
...
src/boosting/goss.hpp
View file @
1e7ccbbb
...
@@ -41,21 +41,28 @@ public:
...
@@ -41,21 +41,28 @@ public:
void
Init
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
objective_function
,
void
Init
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
objective_function
,
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
override
{
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
override
{
GBDT
::
Init
(
config
,
train_data
,
objective_function
,
training_metrics
);
GBDT
::
Init
(
config
,
train_data
,
objective_function
,
training_metrics
);
ResetGoss
();
}
void
ResetTrainingData
(
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
objective_function
,
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
override
{
GBDT
::
ResetTrainingData
(
train_data
,
objective_function
,
training_metrics
);
ResetGoss
();
}
void
ResetConfig
(
const
BoostingConfig
*
config
)
override
{
GBDT
::
ResetConfig
(
config
);
ResetGoss
();
}
void
ResetGoss
()
{
CHECK
(
gbdt_config_
->
top_rate
+
gbdt_config_
->
other_rate
<=
1.0
f
);
CHECK
(
gbdt_config_
->
top_rate
+
gbdt_config_
->
other_rate
<=
1.0
f
);
CHECK
(
gbdt_config_
->
top_rate
>
0.0
f
&&
gbdt_config_
->
other_rate
>
0.0
f
);
CHECK
(
gbdt_config_
->
top_rate
>
0.0
f
&&
gbdt_config_
->
other_rate
>
0.0
f
);
if
(
gbdt_config_
->
bagging_freq
>
0
&&
gbdt_config_
->
bagging_fraction
!=
1.0
f
)
{
if
(
gbdt_config_
->
bagging_freq
>
0
&&
gbdt_config_
->
bagging_fraction
!=
1.0
f
)
{
Log
::
Fatal
(
"cannot use bagging in GOSS"
);
Log
::
Fatal
(
"cannot use bagging in GOSS"
);
}
}
Log
::
Info
(
"using GOSS"
);
Log
::
Info
(
"using GOSS"
);
}
void
ResetTrainingData
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
objective_function
,
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
override
{
if
(
config
->
bagging_freq
>
0
&&
config
->
bagging_fraction
!=
1.0
f
)
{
Log
::
Fatal
(
"cannot use bagging in GOSS"
);
}
GBDT
::
ResetTrainingData
(
config
,
train_data
,
objective_function
,
training_metrics
);
if
(
train_data_
==
nullptr
)
{
return
;
}
bag_data_indices_
.
resize
(
num_data_
);
bag_data_indices_
.
resize
(
num_data_
);
tmp_indices_
.
resize
(
num_data_
);
tmp_indices_
.
resize
(
num_data_
);
tmp_indice_right_
.
resize
(
num_data_
);
tmp_indice_right_
.
resize
(
num_data_
);
...
@@ -66,8 +73,8 @@ public:
...
@@ -66,8 +73,8 @@ public:
right_write_pos_buf_
.
resize
(
num_threads_
);
right_write_pos_buf_
.
resize
(
num_threads_
);
is_use_subset_
=
false
;
is_use_subset_
=
false
;
if
(
config
->
top_rate
+
config
->
other_rate
<=
0.5
)
{
if
(
gbdt_
config
_
->
top_rate
+
gbdt_
config
_
->
other_rate
<=
0.5
)
{
auto
bag_data_cnt
=
static_cast
<
data_size_t
>
((
config
->
top_rate
+
config
->
other_rate
)
*
num_data_
);
auto
bag_data_cnt
=
static_cast
<
data_size_t
>
((
gbdt_
config
_
->
top_rate
+
gbdt_
config
_
->
other_rate
)
*
num_data_
);
tmp_subset_
.
reset
(
new
Dataset
(
bag_data_cnt
));
tmp_subset_
.
reset
(
new
Dataset
(
bag_data_cnt
));
tmp_subset_
->
CopyFeatureMapperFrom
(
train_data_
);
tmp_subset_
->
CopyFeatureMapperFrom
(
train_data_
);
is_use_subset_
=
true
;
is_use_subset_
=
true
;
...
...
src/c_api.cpp
View file @
1e7ccbbb
...
@@ -51,11 +51,12 @@ public:
...
@@ -51,11 +51,12 @@ public:
boosting_
.
reset
(
Boosting
::
CreateBoosting
(
config_
.
boosting_type
,
nullptr
));
boosting_
.
reset
(
Boosting
::
CreateBoosting
(
config_
.
boosting_type
,
nullptr
));
train_data_
=
train_data
;
CreateObjectiveAndMetrics
();
// initialize the boosting
// initialize the boosting
boosting_
->
Init
(
&
config_
.
boosting_config
,
nullptr
,
objective_fun_
.
get
(),
boosting_
->
Init
(
&
config_
.
boosting_config
,
train_data_
,
objective_fun_
.
get
(),
Common
::
ConstPtrInVectorWrapper
<
Metric
>
(
train_metric_
));
Common
::
ConstPtrInVectorWrapper
<
Metric
>
(
train_metric_
));
ResetTrainingData
(
train_data
);
}
}
void
MergeFrom
(
const
Booster
*
other
)
{
void
MergeFrom
(
const
Booster
*
other
)
{
...
@@ -67,9 +68,7 @@ public:
...
@@ -67,9 +68,7 @@ public:
}
}
void
ResetTrainingData
(
const
Dataset
*
train_data
)
{
void
CreateObjectiveAndMetrics
()
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
train_data_
=
train_data
;
// create objective function
// create objective function
objective_fun_
.
reset
(
ObjectiveFunction
::
CreateObjectiveFunction
(
config_
.
objective_type
,
objective_fun_
.
reset
(
ObjectiveFunction
::
CreateObjectiveFunction
(
config_
.
objective_type
,
config_
.
objective_config
));
config_
.
objective_config
));
...
@@ -91,9 +90,17 @@ public:
...
@@ -91,9 +90,17 @@ public:
train_metric_
.
push_back
(
std
::
move
(
metric
));
train_metric_
.
push_back
(
std
::
move
(
metric
));
}
}
train_metric_
.
shrink_to_fit
();
train_metric_
.
shrink_to_fit
();
// reset the boosting
}
boosting_
->
ResetTrainingData
(
&
config_
.
boosting_config
,
train_data_
,
objective_fun_
.
get
(),
Common
::
ConstPtrInVectorWrapper
<
Metric
>
(
train_metric_
));
void
ResetTrainingData
(
const
Dataset
*
train_data
)
{
if
(
train_data
!=
train_data_
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
train_data_
=
train_data
;
CreateObjectiveAndMetrics
();
// reset the boosting
boosting_
->
ResetTrainingData
(
train_data_
,
objective_fun_
.
get
(),
Common
::
ConstPtrInVectorWrapper
<
Metric
>
(
train_metric_
));
}
}
}
void
ResetConfig
(
const
char
*
parameters
)
{
void
ResetConfig
(
const
char
*
parameters
)
{
...
@@ -125,10 +132,11 @@ public:
...
@@ -125,10 +132,11 @@ public:
if
(
objective_fun_
!=
nullptr
)
{
if
(
objective_fun_
!=
nullptr
)
{
objective_fun_
->
Init
(
train_data_
->
metadata
(),
train_data_
->
num_data
());
objective_fun_
->
Init
(
train_data_
->
metadata
(),
train_data_
->
num_data
());
}
}
boosting_
->
ResetTrainingData
(
train_data_
,
objective_fun_
.
get
(),
Common
::
ConstPtrInVectorWrapper
<
Metric
>
(
train_metric_
));
}
}
boosting_
->
ResetTrainingData
(
&
config_
.
boosting_config
,
train_data_
,
boosting_
->
ResetConfig
(
&
config_
.
boosting_config
);
objective_fun_
.
get
(),
Common
::
ConstPtrInVectorWrapper
<
Metric
>
(
train_metric_
));
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment