Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
16d1853d
Commit
16d1853d
authored
Dec 01, 2016
by
Guolin Ke
Committed by
GitHub
Dec 01, 2016
Browse files
Merge pull request #94 from Microsoft/python-package
Python package (#11)
parents
65e711a2
29cf97e9
Changes
40
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
974 additions
and
399 deletions
+974
-399
python-package/setup.py
python-package/setup.py
+33
-0
src/application/application.cpp
src/application/application.cpp
+35
-33
src/boosting/boosting.cpp
src/boosting/boosting.cpp
+1
-1
src/boosting/dart.hpp
src/boosting/dart.hpp
+9
-13
src/boosting/gbdt.cpp
src/boosting/gbdt.cpp
+122
-90
src/boosting/gbdt.h
src/boosting/gbdt.h
+65
-16
src/c_api.cpp
src/c_api.cpp
+293
-144
src/io/config.cpp
src/io/config.cpp
+11
-11
src/io/dataset.cpp
src/io/dataset.cpp
+35
-6
src/io/dataset_loader.cpp
src/io/dataset_loader.cpp
+54
-29
src/io/metadata.cpp
src/io/metadata.cpp
+129
-0
src/metric/binary_metric.hpp
src/metric/binary_metric.hpp
+6
-11
src/metric/multiclass_metric.hpp
src/metric/multiclass_metric.hpp
+5
-6
src/metric/rank_metric.hpp
src/metric/rank_metric.hpp
+2
-5
src/metric/regression_metric.hpp
src/metric/regression_metric.hpp
+4
-6
src/network/linkers_socket.cpp
src/network/linkers_socket.cpp
+5
-4
src/objective/binary_objective.hpp
src/objective/binary_objective.hpp
+3
-0
tests/c_api_test/test.py
tests/c_api_test/test.py
+19
-24
tests/python_package_test/test_basic.py
tests/python_package_test/test_basic.py
+22
-0
tests/python_package_test/test_sklearn.py
tests/python_package_test/test_sklearn.py
+121
-0
No files found.
python-package/setup.py
0 → 100644
View file @
16d1853d
# pylint: disable=invalid-name, exec-used
"""Setup lightgbm package."""
from
__future__
import
absolute_import
import
sys
import
os
from
setuptools
import
setup
,
find_packages
# import subprocess
sys
.
path
.
insert
(
0
,
'.'
)
CURRENT_DIR
=
os
.
path
.
dirname
(
__file__
)
libpath_py
=
os
.
path
.
join
(
CURRENT_DIR
,
'lightgbm/libpath.py'
)
libpath
=
{
'__file__'
:
libpath_py
}
exec
(
compile
(
open
(
libpath_py
,
"rb"
).
read
(),
libpath_py
,
'exec'
),
libpath
,
libpath
)
LIB_PATH
=
libpath
[
'find_lib_path'
]()
print
(
"Install lib_lightgbm from: %s"
%
LIB_PATH
)
setup
(
name
=
'lightgbm'
,
version
=
0.1
,
description
=
"LightGBM Python Package"
,
install_requires
=
[
'numpy'
,
'scipy'
,
],
maintainer
=
'Guolin Ke'
,
maintainer_email
=
'guolin.ke@microsoft.com'
,
zip_safe
=
False
,
packages
=
find_packages
(),
include_package_data
=
True
,
data_files
=
[(
'lightgbm'
,
LIB_PATH
)],
url
=
'https://github.com/Microsoft/LightGBM'
)
src/application/application.cpp
View file @
16d1853d
...
...
@@ -108,7 +108,7 @@ void Application::LoadData() {
// prediction is needed if using input initial model(continued train)
PredictFunction
predict_fun
=
nullptr
;
// need to continue training
if
(
boosting_
->
NumberOf
Sub
Model
s
()
>
0
)
{
if
(
boosting_
->
NumberOf
Total
Model
()
>
0
)
{
Predictor
predictor
(
boosting_
.
get
(),
true
,
false
);
predict_fun
=
predictor
.
GetPredictFunction
();
}
...
...
@@ -139,12 +139,16 @@ void Application::LoadData() {
for
(
auto
metric_type
:
config_
.
metric_types
)
{
auto
metric
=
std
::
unique_ptr
<
Metric
>
(
Metric
::
CreateMetric
(
metric_type
,
config_
.
metric_config
));
if
(
metric
==
nullptr
)
{
continue
;
}
metric
->
Init
(
"training"
,
train_data_
->
metadata
(),
train_data_
->
num_data
());
metric
->
Init
(
train_data_
->
metadata
(),
train_data_
->
num_data
());
train_metric_
.
push_back
(
std
::
move
(
metric
));
}
}
train_metric_
.
shrink_to_fit
();
if
(
config_
.
metric_types
.
size
()
>
0
)
{
// only when have metrics then need to construct validation data
// Add validation data, if it exists
for
(
size_t
i
=
0
;
i
<
config_
.
io_config
.
valid_data_filenames
.
size
();
++
i
)
{
// add
...
...
@@ -164,8 +168,7 @@ void Application::LoadData() {
for
(
auto
metric_type
:
config_
.
metric_types
)
{
auto
metric
=
std
::
unique_ptr
<
Metric
>
(
Metric
::
CreateMetric
(
metric_type
,
config_
.
metric_config
));
if
(
metric
==
nullptr
)
{
continue
;
}
metric
->
Init
(
config_
.
io_config
.
valid_data_filenames
[
i
].
c_str
(),
valid_datas_
.
back
()
->
metadata
(),
metric
->
Init
(
valid_datas_
.
back
()
->
metadata
(),
valid_datas_
.
back
()
->
num_data
());
valid_metrics_
.
back
().
push_back
(
std
::
move
(
metric
));
}
...
...
@@ -173,6 +176,7 @@ void Application::LoadData() {
}
valid_datas_
.
shrink_to_fit
();
valid_metrics_
.
shrink_to_fit
();
}
auto
end_time
=
std
::
chrono
::
high_resolution_clock
::
now
();
// output used time on each iteration
Log
::
Info
(
"Finished loading data in %f seconds"
,
...
...
@@ -209,7 +213,7 @@ void Application::InitTrain() {
Common
::
ConstPtrInVectorWrapper
<
Metric
>
(
train_metric_
));
// add validation data into boosting
for
(
size_t
i
=
0
;
i
<
valid_datas_
.
size
();
++
i
)
{
boosting_
->
AddDataset
(
valid_datas_
[
i
].
get
(),
boosting_
->
Add
Valid
Dataset
(
valid_datas_
[
i
].
get
(),
Common
::
ConstPtrInVectorWrapper
<
Metric
>
(
valid_metrics_
[
i
]));
}
Log
::
Info
(
"Finished initializing training"
);
...
...
@@ -227,17 +231,15 @@ void Application::Train() {
// output used time per iteration
Log
::
Info
(
"%f seconds elapsed, finished iteration %d"
,
std
::
chrono
::
duration
<
double
,
std
::
milli
>
(
end_time
-
start_time
)
*
1e-3
,
iter
+
1
);
boosting_
->
SaveModelToFile
(
NO_LIMIT
,
is_finished
,
config_
.
io_config
.
output_model
.
c_str
());
}
is_finished
=
true
;
// save model to file
boosting_
->
SaveModelToFile
(
NO_LIMIT
,
is_finished
,
config_
.
io_config
.
output_model
.
c_str
());
boosting_
->
SaveModelToFile
(
-
1
,
config_
.
io_config
.
output_model
.
c_str
());
Log
::
Info
(
"Finished training"
);
}
void
Application
::
Predict
()
{
boosting_
->
SetNum
UsedModel
(
config_
.
io_config
.
num_
model
_predict
);
boosting_
->
SetNum
IterationForPred
(
config_
.
io_config
.
num_
iteration
_predict
);
// create predictor
Predictor
predictor
(
boosting_
.
get
(),
config_
.
io_config
.
is_predict_raw_score
,
config_
.
io_config
.
is_predict_leaf_index
);
...
...
src/boosting/boosting.cpp
View file @
16d1853d
...
...
@@ -15,7 +15,7 @@ BoostingType GetBoostingTypeFromModelFile(const char* filename) {
return
BoostingType
::
kUnknow
;
}
void
LoadFileToBoosting
(
Boosting
*
boosting
,
const
char
*
filename
)
{
void
Boosting
::
LoadFileToBoosting
(
Boosting
*
boosting
,
const
char
*
filename
)
{
if
(
boosting
!=
nullptr
)
{
TextReader
<
size_t
>
model_reader
(
filename
,
true
);
model_reader
.
ReadAllLines
();
...
...
src/boosting/dart.hpp
View file @
16d1853d
...
...
@@ -43,6 +43,7 @@ public:
* \brief one training iteration
*/
bool
TrainOneIter
(
const
score_t
*
gradient
,
const
score_t
*
hessian
,
bool
is_eval
)
override
{
is_update_score_cur_iter_
=
false
;
GBDT
::
TrainOneIter
(
gradient
,
hessian
,
false
);
// normalize
Normalize
();
...
...
@@ -58,22 +59,15 @@ public:
* \return training score
*/
const
score_t
*
GetTrainingScore
(
data_size_t
*
out_len
)
override
{
if
(
!
is_update_score_cur_iter_
)
{
// only drop one time in one iteration
DroppingTrees
();
is_update_score_cur_iter_
=
true
;
}
*
out_len
=
train_score_updater_
->
num_data
()
*
num_class_
;
return
train_score_updater_
->
score
();
}
/*!
* \brief save model to file
* \param num_used_model number of model that want to save, -1 means save all
* \param is_finish is training finished or not
* \param filename filename that want to save to
*/
void
SaveModelToFile
(
int
num_used_model
,
bool
is_finish
,
const
char
*
filename
)
override
{
// only save model once when is_finish = true
if
(
is_finish
&&
saved_model_size_
<
0
)
{
GBDT
::
SaveModelToFile
(
num_used_model
,
is_finish
,
filename
);
}
}
/*!
* \brief Get Type name of this boosting object
*/
...
...
@@ -133,6 +127,8 @@ private:
double
drop_rate_
;
/*! \brief Random generator, used to select dropping trees */
Random
random_for_drop_
;
/*! \brief Flag that the score is update on current iter or not*/
bool
is_update_score_cur_iter_
;
};
}
// namespace LightGBM
...
...
src/boosting/gbdt.cpp
View file @
16d1853d
...
...
@@ -16,7 +16,9 @@
namespace
LightGBM
{
GBDT
::
GBDT
()
:
saved_model_size_
(
-
1
),
num_used_model_
(
0
)
{
GBDT
::
GBDT
()
:
num_iteration_for_pred_
(
0
),
num_init_iteration_
(
0
)
{
}
...
...
@@ -26,47 +28,66 @@ GBDT::~GBDT() {
void
GBDT
::
Init
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
object_function
,
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
{
gbdt_config_
=
config
;
iter_
=
0
;
saved_model_size_
=
-
1
;
num_used_model_
=
0
;
num_iteration_for_pred_
=
0
;
max_feature_idx_
=
0
;
num_class_
=
config
->
num_class
;
train_data_
=
nullptr
;
ResetTrainingData
(
config
,
train_data
,
object_function
,
training_metrics
);
}
void
GBDT
::
ResetTrainingData
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
object_function
,
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
{
if
(
train_data_
!=
nullptr
&&
!
train_data_
->
CheckAlign
(
*
train_data
))
{
Log
::
Fatal
(
"cannot reset training data, since new training data has different bin mappers"
);
}
gbdt_config_
=
config
;
early_stopping_round_
=
gbdt_config_
->
early_stopping_round
;
shrinkage_rate_
=
gbdt_config_
->
learning_rate
;
train_data_
=
train_data
;
num_class_
=
config
->
num_class
;
random_
=
Random
(
gbdt_config_
->
bagging_seed
);
// create tree learner
tree_learner_
.
clear
();
for
(
int
i
=
0
;
i
<
num_class_
;
++
i
)
{
auto
new_tree_learner
=
std
::
unique_ptr
<
TreeLearner
>
(
TreeLearner
::
CreateTreeLearner
(
gbdt_config_
->
tree_learner_type
,
gbdt_config_
->
tree_config
));
new_tree_learner
->
Init
(
train_data
_
);
new_tree_learner
->
Init
(
train_data
);
// init tree learner
tree_learner_
.
push_back
(
std
::
move
(
new_tree_learner
));
}
tree_learner_
.
shrink_to_fit
();
object_function_
=
object_function
;
// push training metrics
training_metrics_
.
clear
();
for
(
const
auto
&
metric
:
training_metrics
)
{
training_metrics_
.
push_back
(
metric
);
}
training_metrics_
.
shrink_to_fit
();
// create score tracker
train_score_updater_
.
reset
(
new
ScoreUpdater
(
train_data_
,
num_class_
));
num_data_
=
train_data_
->
num_data
();
// create buffer for gradients and hessians
if
(
object_function_
!=
nullptr
)
{
gradients_
=
std
::
vector
<
score_t
>
(
num_data_
*
num_class_
);
hessians_
=
std
::
vector
<
score_t
>
(
num_data_
*
num_class_
);
}
sigmoid_
=
-
1.0
f
;
if
(
object_function_
!=
nullptr
&&
std
::
string
(
object_function_
->
GetName
())
==
std
::
string
(
"binary"
))
{
// only binary classification need sigmoid transform
sigmoid_
=
gbdt_config_
->
sigmoid
;
}
if
(
train_data_
!=
train_data
)
{
// not same training data, need reset score and others
// create score tracker
train_score_updater_
.
reset
(
new
ScoreUpdater
(
train_data
,
num_class_
));
// update score
for
(
int
i
=
0
;
i
<
iter_
;
++
i
)
{
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
auto
curr_tree
=
(
i
+
num_init_iteration_
)
*
num_class_
+
curr_class
;
train_score_updater_
->
AddScore
(
models_
[
curr_tree
].
get
(),
curr_class
);
}
}
num_data_
=
train_data
->
num_data
();
// create buffer for gradients and hessians
if
(
object_function_
!=
nullptr
)
{
gradients_
=
std
::
vector
<
score_t
>
(
num_data_
*
num_class_
);
hessians_
=
std
::
vector
<
score_t
>
(
num_data_
*
num_class_
);
}
// get max feature index
max_feature_idx_
=
train_data
_
->
num_total_features
()
-
1
;
max_feature_idx_
=
train_data
->
num_total_features
()
-
1
;
// get label index
label_idx_
=
train_data
_
->
label_idx
();
label_idx_
=
train_data
->
label_idx
();
// if need bagging, create buffer
if
(
gbdt_config_
->
bagging_fraction
<
1.0
&&
gbdt_config_
->
bagging_freq
>
0
)
{
out_of_bag_data_indices_
=
std
::
vector
<
data_size_t
>
(
num_data_
);
...
...
@@ -77,18 +98,24 @@ void GBDT::Init(const BoostingConfig* config, const Dataset* train_data, const O
bag_data_cnt_
=
num_data_
;
bag_data_indices_
.
clear
();
}
// initialize random generator
random_
=
Random
(
gbdt_config_
->
bagging_seed
);
}
train_data_
=
train_data
;
}
void
GBDT
::
AddDataset
(
const
Dataset
*
valid_data
,
void
GBDT
::
Add
Valid
Dataset
(
const
Dataset
*
valid_data
,
const
std
::
vector
<
const
Metric
*>&
valid_metrics
)
{
if
(
iter_
>
0
)
{
Log
::
Fatal
(
"
C
annot add validation data
after
training
started
"
);
if
(
!
train_data_
->
CheckAlign
(
*
valid_data
)
)
{
Log
::
Fatal
(
"
c
annot add validation data
, since it has different bin mappers with
training
data
"
);
}
// for a validation dataset, we need its score and metric
auto
new_score_updater
=
std
::
unique_ptr
<
ScoreUpdater
>
(
new
ScoreUpdater
(
valid_data
,
num_class_
));
// update score
for
(
int
i
=
0
;
i
<
iter_
;
++
i
)
{
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
auto
curr_tree
=
(
i
+
num_init_iteration_
)
*
num_class_
+
curr_class
;
new_score_updater
->
AddScore
(
models_
[
curr_tree
].
get
(),
curr_class
);
}
}
valid_score_updater_
.
push_back
(
std
::
move
(
new_score_updater
));
valid_metrics_
.
emplace_back
();
if
(
early_stopping_round_
>
0
)
{
...
...
@@ -204,6 +231,25 @@ bool GBDT::TrainOneIter(const score_t* gradient, const score_t* hessian, bool is
}
void
GBDT
::
RollbackOneIter
()
{
if
(
iter_
==
0
)
{
return
;
}
int
cur_iter
=
iter_
+
num_init_iteration_
-
1
;
// reset score
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
auto
curr_tree
=
cur_iter
*
num_class_
+
curr_class
;
models_
[
curr_tree
]
->
Shrinkage
(
-
1.0
);
train_score_updater_
->
AddScore
(
models_
[
curr_tree
].
get
(),
curr_class
);
for
(
auto
&
score_updater
:
valid_score_updater_
)
{
score_updater
->
AddScore
(
models_
[
curr_tree
].
get
(),
curr_class
);
}
}
// remove model
for
(
int
curr_class
=
0
;
curr_class
<
num_class_
;
++
curr_class
)
{
models_
.
pop_back
();
}
--
iter_
;
}
bool
GBDT
::
EvalAndCheckEarlyStopping
()
{
bool
is_met_early_stopping
=
false
;
// print message for metric
...
...
@@ -236,7 +282,7 @@ bool GBDT::OutputMetric(int iter) {
auto
name
=
sub_metric
->
GetName
();
auto
scores
=
sub_metric
->
Eval
(
train_score_updater_
->
score
());
for
(
size_t
k
=
0
;
k
<
name
.
size
();
++
k
)
{
Log
::
Info
(
"Iteration:
%d, %s : %f"
,
iter
,
name
[
k
].
c_str
(),
scores
[
k
]);
Log
::
Info
(
"Iteration:%d,
training
%s : %f"
,
iter
,
name
[
k
].
c_str
(),
scores
[
k
]);
}
}
}
...
...
@@ -248,7 +294,7 @@ bool GBDT::OutputMetric(int iter) {
if
((
iter
%
gbdt_config_
->
output_freq
)
==
0
)
{
auto
name
=
valid_metrics_
[
i
][
j
]
->
GetName
();
for
(
size_t
k
=
0
;
k
<
name
.
size
();
++
k
)
{
Log
::
Info
(
"Iteration:
%d
,
%s : %f"
,
iter
,
name
[
k
].
c_str
(),
test_scores
[
k
]);
Log
::
Info
(
"Iteration:
%d, valid_
%d %s : %f"
,
iter
,
i
+
1
,
name
[
k
].
c_str
(),
test_scores
[
k
]);
}
}
if
(
!
ret
&&
early_stopping_round_
>
0
)
{
...
...
@@ -296,24 +342,23 @@ const score_t* GBDT::GetTrainingScore(data_size_t* out_len) {
return
train_score_updater_
->
score
();
}
void
GBDT
::
GetPredictAt
(
int
data_idx
,
score_t
*
out_result
,
data_size_t
*
out_len
)
const
{
void
GBDT
::
GetPredictAt
(
int
data_idx
,
score_t
*
out_result
,
data_size_t
*
out_len
)
{
CHECK
(
data_idx
>=
0
&&
data_idx
<=
static_cast
<
int
>
(
valid_metrics_
.
size
()));
std
::
vector
<
double
>
ret
;
const
score_t
*
raw_scores
=
nullptr
;
data_size_t
num_data
=
0
;
if
(
data_idx
==
0
)
{
raw_scores
=
train_score_updater_
->
score
(
);
raw_scores
=
GetTrainingScore
(
out_len
);
num_data
=
train_score_updater_
->
num_data
();
}
else
{
auto
used_idx
=
data_idx
-
1
;
raw_scores
=
valid_score_updater_
[
used_idx
]
->
score
();
num_data
=
valid_score_updater_
[
used_idx
]
->
num_data
();
}
*
out_len
=
num_data
*
num_class_
;
}
if
(
num_class_
>
1
)
{
#pragma omp parallel for schedule(
guided
)
#pragma omp parallel for schedule(
static
)
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
std
::
vector
<
double
>
tmp_result
;
for
(
int
j
=
0
;
j
<
num_class_
;
++
j
)
{
...
...
@@ -325,12 +370,12 @@ void GBDT::GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len)
}
}
}
else
if
(
sigmoid_
>
0.0
f
){
#pragma omp parallel for schedule(
guided
)
#pragma omp parallel for schedule(
static
)
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
out_result
[
i
]
=
static_cast
<
score_t
>
(
1.0
f
/
(
1.0
f
+
std
::
exp
(
-
2.0
f
*
sigmoid_
*
raw_scores
[
i
])));
}
}
else
{
#pragma omp parallel for schedule(
guided
)
#pragma omp parallel for schedule(
static
)
for
(
data_size_t
i
=
0
;
i
<
num_data
;
++
i
)
{
out_result
[
i
]
=
raw_scores
[
i
];
}
...
...
@@ -348,55 +393,41 @@ void GBDT::Boosting() {
GetGradients
(
GetTrainingScore
(
&
num_score
),
gradients_
.
data
(),
hessians_
.
data
());
}
void
GBDT
::
SaveModelToFile
(
int
num_
used_model
,
bool
is_finish
,
const
char
*
filename
)
{
/
/ first time to this function, open file
if
(
saved_model_size_
<
0
)
{
model_
output_file
_
.
open
(
filename
);
void
GBDT
::
SaveModelToFile
(
int
num_
iteration
,
const
char
*
filename
)
const
{
/
*! \brief File to write models */
std
::
ofstream
output_file
;
output_file
.
open
(
filename
);
// output model type
model_
output_file
_
<<
Name
()
<<
std
::
endl
;
output_file
<<
Name
()
<<
std
::
endl
;
// output number of class
model_
output_file
_
<<
"num_class="
<<
num_class_
<<
std
::
endl
;
output_file
<<
"num_class="
<<
num_class_
<<
std
::
endl
;
// output label index
model_
output_file
_
<<
"label_index="
<<
label_idx_
<<
std
::
endl
;
output_file
<<
"label_index="
<<
label_idx_
<<
std
::
endl
;
// output max_feature_idx
model_
output_file
_
<<
"max_feature_idx="
<<
max_feature_idx_
<<
std
::
endl
;
output_file
<<
"max_feature_idx="
<<
max_feature_idx_
<<
std
::
endl
;
// output objective name
if
(
object_function_
!=
nullptr
)
{
model_
output_file
_
<<
"objective="
<<
object_function_
->
GetName
()
<<
std
::
endl
;
output_file
<<
"objective="
<<
object_function_
->
GetName
()
<<
std
::
endl
;
}
// output sigmoid parameter
model_output_file_
<<
"sigmoid="
<<
sigmoid_
<<
std
::
endl
;
model_output_file_
<<
std
::
endl
;
saved_model_size_
=
0
;
}
// already saved
if
(
!
model_output_file_
.
is_open
())
{
return
;
}
if
(
num_used_model
==
NO_LIMIT
)
{
output_file
<<
"sigmoid="
<<
sigmoid_
<<
std
::
endl
;
output_file
<<
std
::
endl
;
int
num_used_model
=
0
;
if
(
num_iteration
<=
0
)
{
num_used_model
=
static_cast
<
int
>
(
models_
.
size
());
}
else
{
num_used_model
=
num_
used_model
*
num_class_
;
num_used_model
=
num_
iteration
*
num_class_
;
}
int
rest
=
num_used_model
-
early_stopping_round_
*
num_class_
;
num_used_model
=
std
::
min
(
num_used_model
,
static_cast
<
int
>
(
models_
.
size
()))
;
// output tree models
for
(
int
i
=
saved_model_size_
;
i
<
rest
;
++
i
)
{
model_
output_file
_
<<
"Tree="
<<
i
<<
std
::
endl
;
model_
output_file
_
<<
models_
[
i
]
->
ToString
()
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
num_used_model
;
++
i
)
{
output_file
<<
"Tree="
<<
i
<<
std
::
endl
;
output_file
<<
models_
[
i
]
->
ToString
()
<<
std
::
endl
;
}
saved_model_size_
=
std
::
max
(
saved_model_size_
,
rest
);
model_output_file_
.
flush
();
// training finished, can close file
if
(
is_finish
)
{
for
(
int
i
=
saved_model_size_
;
i
<
num_used_model
;
++
i
)
{
model_output_file_
<<
"Tree="
<<
i
<<
std
::
endl
;
model_output_file_
<<
models_
[
i
]
->
ToString
()
<<
std
::
endl
;
}
model_output_file_
<<
std
::
endl
<<
FeatureImportance
()
<<
std
::
endl
;
model_output_file_
.
close
();
}
output_file
<<
std
::
endl
<<
FeatureImportance
()
<<
std
::
endl
;
output_file
.
close
();
}
void
GBDT
::
LoadModelFromString
(
const
std
::
string
&
model_str
)
{
...
...
@@ -452,7 +483,8 @@ void GBDT::LoadModelFromString(const std::string& model_str) {
}
}
Log
::
Info
(
"Finished loading %d models"
,
models_
.
size
());
num_used_model_
=
static_cast
<
int
>
(
models_
.
size
())
/
num_class_
;
num_iteration_for_pred_
=
static_cast
<
int
>
(
models_
.
size
())
/
num_class_
;
num_init_iteration_
=
num_iteration_for_pred_
;
}
std
::
string
GBDT
::
FeatureImportance
()
const
{
...
...
@@ -486,7 +518,7 @@ std::string GBDT::FeatureImportance() const {
std
::
vector
<
double
>
GBDT
::
PredictRaw
(
const
double
*
value
)
const
{
std
::
vector
<
double
>
ret
(
num_class_
,
0.0
f
);
for
(
int
i
=
0
;
i
<
num_
used_model
_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_
iteration_for_pred
_
;
++
i
)
{
for
(
int
j
=
0
;
j
<
num_class_
;
++
j
)
{
ret
[
j
]
+=
models_
[
i
*
num_class_
+
j
]
->
Predict
(
value
);
}
...
...
@@ -496,7 +528,7 @@ std::vector<double> GBDT::PredictRaw(const double* value) const {
std
::
vector
<
double
>
GBDT
::
Predict
(
const
double
*
value
)
const
{
std
::
vector
<
double
>
ret
(
num_class_
,
0.0
f
);
for
(
int
i
=
0
;
i
<
num_
used_model
_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_
iteration_for_pred
_
;
++
i
)
{
for
(
int
j
=
0
;
j
<
num_class_
;
++
j
)
{
ret
[
j
]
+=
models_
[
i
*
num_class_
+
j
]
->
Predict
(
value
);
}
...
...
@@ -512,7 +544,7 @@ std::vector<double> GBDT::Predict(const double* value) const {
std
::
vector
<
int
>
GBDT
::
PredictLeafIndex
(
const
double
*
value
)
const
{
std
::
vector
<
int
>
ret
;
for
(
int
i
=
0
;
i
<
num_
used_model
_
;
++
i
)
{
for
(
int
i
=
0
;
i
<
num_
iteration_for_pred
_
;
++
i
)
{
for
(
int
j
=
0
;
j
<
num_class_
;
++
j
)
{
ret
.
push_back
(
models_
[
i
*
num_class_
+
j
]
->
PredictLeafIndex
(
value
));
}
...
...
src/boosting/gbdt.h
View file @
16d1853d
...
...
@@ -35,12 +35,53 @@ public:
void
Init
(
const
BoostingConfig
*
gbdt_config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
object_function
,
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
override
;
/*!
* \brief Merge model from other boosting object
Will insert to the front of current boosting object
* \param other
*/
void
MergeFrom
(
const
Boosting
*
other
)
override
{
auto
other_gbdt
=
reinterpret_cast
<
const
GBDT
*>
(
other
);
// tmp move to other vector
auto
original_models
=
std
::
move
(
models_
);
models_
=
std
::
vector
<
std
::
unique_ptr
<
Tree
>>
();
// push model from other first
for
(
const
auto
&
tree
:
other_gbdt
->
models_
)
{
auto
new_tree
=
std
::
unique_ptr
<
Tree
>
(
new
Tree
(
*
(
tree
.
get
())));
models_
.
push_back
(
std
::
move
(
new_tree
));
}
num_init_iteration_
=
static_cast
<
int
>
(
models_
.
size
())
/
num_class_
;
// push model in current object
for
(
const
auto
&
tree
:
original_models
)
{
auto
new_tree
=
std
::
unique_ptr
<
Tree
>
(
new
Tree
(
*
(
tree
.
get
())));
models_
.
push_back
(
std
::
move
(
new_tree
));
}
num_iteration_for_pred_
=
static_cast
<
int
>
(
models_
.
size
())
/
num_class_
;
}
/*!
* \brief Reset training data for current boosting
* \param train_data Training data
* \param object_function Training objective function
* \param training_metrics Training metric
*/
void
ResetTrainingData
(
const
BoostingConfig
*
config
,
const
Dataset
*
train_data
,
const
ObjectiveFunction
*
object_function
,
const
std
::
vector
<
const
Metric
*>&
training_metrics
)
override
;
/*!
* \brief Reset shrinkage_rate data for current boosting
* \param shrinkage_rate Configs for boosting
*/
void
ResetShrinkageRate
(
double
shrinkage_rate
)
override
{
shrinkage_rate_
=
shrinkage_rate
;
}
/*!
* \brief Adding a validation dataset
* \param valid_data Validation dataset
* \param valid_metrics Metrics for validation dataset
*/
void
AddDataset
(
const
Dataset
*
valid_data
,
void
Add
Valid
Dataset
(
const
Dataset
*
valid_data
,
const
std
::
vector
<
const
Metric
*>&
valid_metrics
)
override
;
/*!
* \brief Training logic
...
...
@@ -51,6 +92,13 @@ public:
*/
virtual
bool
TrainOneIter
(
const
score_t
*
gradient
,
const
score_t
*
hessian
,
bool
is_eval
)
override
;
/*!
* \brief Rollback one iteration
*/
void
RollbackOneIter
()
override
;
int
GetCurrentIteration
()
const
override
{
return
iter_
+
num_init_iteration_
;
}
bool
EvalAndCheckEarlyStopping
()
override
;
/*!
...
...
@@ -73,7 +121,7 @@ public:
* \param result used to store prediction result, should allocate memory before call this function
* \param out_len lenght of returned score
*/
void
GetPredictAt
(
int
data_idx
,
score_t
*
out_result
,
data_size_t
*
out_len
)
const
override
;
void
GetPredictAt
(
int
data_idx
,
score_t
*
out_result
,
data_size_t
*
out_len
)
override
;
/*!
* \brief Prediction for one record without sigmoid transformation
...
...
@@ -98,11 +146,11 @@ public:
/*!
* \brief save model to file
* \param num_used_model number of model that want to save, -1 means save all
* \param is_finish is training finished or not
* \param num_iterations Iterations that want to save, -1 means save all
* \param filename filename that want to save to
*/
virtual
void
SaveModelToFile
(
int
num_used_model
,
bool
is_finish
,
const
char
*
filename
)
override
;
virtual
void
SaveModelToFile
(
int
num_iterations
,
const
char
*
filename
)
const
override
;
/*!
* \brief Restore from a serialized string
*/
...
...
@@ -119,11 +167,12 @@ public:
*/
inline
int
LabelIdx
()
const
override
{
return
label_idx_
;
}
/*!
* \brief Get number of weak sub-models
* \return Number of weak sub-models
*/
inline
int
NumberOf
Sub
Model
s
()
const
override
{
return
static_cast
<
int
>
(
models_
.
size
());
}
inline
int
NumberOf
Total
Model
()
const
override
{
return
static_cast
<
int
>
(
models_
.
size
());
}
/*!
* \brief Get number of classes
...
...
@@ -132,14 +181,16 @@ public:
inline
int
NumberOfClasses
()
const
override
{
return
num_class_
;
}
/*!
* \brief Set number of
used model
for prediction
* \brief Set number of
iterations
for prediction
*/
inline
void
SetNum
UsedModel
(
int
num_used_model
)
{
if
(
num_
used_model
>
=
0
)
{
num_
used_model_
=
static_cast
<
int
>
(
num_used_model
/
num_class_
)
;
inline
void
SetNum
IterationForPred
(
int
num_iteration
)
override
{
if
(
num_
iteration
>
0
)
{
num_
iteration_for_pred_
=
num_iteration
;
}
else
{
num_
used_model
_
=
static_cast
<
int
>
(
models_
.
size
())
/
num_class_
;
num_
iteration_for_pred
_
=
static_cast
<
int
>
(
models_
.
size
())
/
num_class_
;
}
num_iteration_for_pred_
=
std
::
min
(
num_iteration_for_pred_
,
static_cast
<
int
>
(
models_
.
size
())
/
num_class_
);
}
/*!
...
...
@@ -233,14 +284,12 @@ protected:
double
sigmoid_
;
/*! \brief Index of label column */
data_size_t
label_idx_
;
/*! \brief Saved number of models */
int
saved_model_size_
;
/*! \brief File to write models */
std
::
ofstream
model_output_file_
;
/*! \brief number of used model */
int
num_
used_model
_
;
int
num_
iteration_for_pred
_
;
/*! \brief Shrinkage rate for one iteration */
double
shrinkage_rate_
;
/*! \brief Number of loaded initial models */
int
num_init_iteration_
;
};
}
// namespace LightGBM
...
...
src/c_api.cpp
View file @
16d1853d
...
...
@@ -16,6 +16,7 @@
#include <cstring>
#include <memory>
#include <stdexcept>
#include <mutex>
#include "./application/predictor.hpp"
...
...
@@ -28,75 +29,88 @@ public:
}
Booster
(
const
Dataset
*
train_data
,
std
::
vector
<
const
Dataset
*>
valid_data
,
std
::
vector
<
std
::
string
>
valid_names
,
const
char
*
parameters
)
:
train_data_
(
train_data
),
valid_datas_
(
valid_data
)
{
config_
.
LoadFromString
(
parameters
);
const
char
*
parameters
)
{
auto
param
=
ConfigBase
::
Str2Map
(
parameters
);
config_
.
Set
(
param
);
// create boosting
if
(
config_
.
io_config
.
input_model
.
size
()
>
0
)
{
Log
::
Warning
(
"continued train from model is not support for c_api, \
please use continued train with input score"
);
}
boosting_
.
reset
(
Boosting
::
CreateBoosting
(
config_
.
boosting_type
,
""
));
// create objective function
objective_fun_
.
reset
(
ObjectiveFunction
::
CreateObjectiveFunction
(
config_
.
objective_type
,
config_
.
objective_config
));
if
(
objective_fun_
==
nullptr
)
{
Log
::
Warning
(
"Using self-defined objective functions"
);
boosting_
.
reset
(
Boosting
::
CreateBoosting
(
config_
.
boosting_type
,
nullptr
));
ConstructObjectAndTrainingMetrics
(
train_data
);
// initialize the boosting
boosting_
->
Init
(
&
config_
.
boosting_config
,
train_data
,
objective_fun_
.
get
(),
Common
::
ConstPtrInVectorWrapper
<
Metric
>
(
train_metric_
));
}
// create training metric
for
(
auto
metric_type
:
config_
.
metric_types
)
{
auto
metric
=
std
::
unique_ptr
<
Metric
>
(
Metric
::
CreateMetric
(
metric_type
,
config_
.
metric_config
));
if
(
metric
==
nullptr
)
{
continue
;
}
metric
->
Init
(
"training"
,
train_data_
->
metadata
(),
train_data_
->
num_data
());
train_metric_
.
push_back
(
std
::
move
(
metric
));
void
MergeFrom
(
const
Booster
*
other
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
boosting_
->
MergeFrom
(
other
->
boosting_
.
get
());
}
train_metric_
.
shrink_to_fit
();
// add metric for validation data
for
(
size_t
i
=
0
;
i
<
valid_datas_
.
size
();
++
i
)
{
~
Booster
()
{
}
void
ResetTrainingData
(
const
Dataset
*
train_data
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
train_data_
=
train_data
;
ConstructObjectAndTrainingMetrics
(
train_data_
);
// initialize the boosting
boosting_
->
ResetTrainingData
(
&
config_
.
boosting_config
,
train_data_
,
objective_fun_
.
get
(),
Common
::
ConstPtrInVectorWrapper
<
Metric
>
(
train_metric_
));
}
void
ResetConfig
(
const
char
*
parameters
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
auto
param
=
ConfigBase
::
Str2Map
(
parameters
);
if
(
param
.
count
(
"num_class"
))
{
Log
::
Fatal
(
"cannot change num class during training"
);
}
if
(
param
.
count
(
"boosting_type"
))
{
Log
::
Fatal
(
"cannot change boosting_type during training"
);
}
config_
.
Set
(
param
);
if
(
param
.
size
()
==
1
&&
(
param
.
count
(
"learning_rate"
)
||
param
.
count
(
"shrinkage_rate"
)))
{
// only need to set learning rate
boosting_
->
ResetShrinkageRate
(
config_
.
boosting_config
.
learning_rate
);
}
else
{
ResetTrainingData
(
train_data_
);
}
}
void
AddValidData
(
const
Dataset
*
valid_data
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
valid_metrics_
.
emplace_back
();
for
(
auto
metric_type
:
config_
.
metric_types
)
{
auto
metric
=
std
::
unique_ptr
<
Metric
>
(
Metric
::
CreateMetric
(
metric_type
,
config_
.
metric_config
));
if
(
metric
==
nullptr
)
{
continue
;
}
metric
->
Init
(
valid_names
[
i
].
c_str
(),
valid_datas_
[
i
]
->
metadata
(),
valid_datas_
[
i
]
->
num_data
());
metric
->
Init
(
valid_data
->
metadata
(),
valid_data
->
num_data
());
valid_metrics_
.
back
().
push_back
(
std
::
move
(
metric
));
}
valid_metrics_
.
back
().
shrink_to_fit
();
boosting_
->
AddValidDataset
(
valid_data
,
Common
::
ConstPtrInVectorWrapper
<
Metric
>
(
valid_metrics_
.
back
()));
}
valid_metrics_
.
shrink_to_fit
();
// initialize the objective function
if
(
objective_fun_
!=
nullptr
)
{
objective_fun_
->
Init
(
train_data_
->
metadata
(),
train_data_
->
num_data
());
}
// initialize the boosting
boosting_
->
Init
(
&
config_
.
boosting_config
,
train_data_
,
objective_fun_
.
get
(),
Common
::
ConstPtrInVectorWrapper
<
Metric
>
(
train_metric_
));
// add validation data into boosting
for
(
size_t
i
=
0
;
i
<
valid_datas_
.
size
();
++
i
)
{
boosting_
->
AddDataset
(
valid_datas_
[
i
],
Common
::
ConstPtrInVectorWrapper
<
Metric
>
(
valid_metrics_
[
i
]));
}
}
~
Booster
()
{
}
bool
TrainOneIter
()
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
return
boosting_
->
TrainOneIter
(
nullptr
,
nullptr
,
false
);
}
bool
TrainOneIter
(
const
float
*
gradients
,
const
float
*
hessians
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
return
boosting_
->
TrainOneIter
(
gradients
,
hessians
,
false
);
}
void
PrepareForPrediction
(
int
num_used_model
,
int
predict_type
)
{
boosting_
->
SetNumUsedModel
(
num_used_model
);
void
RollbackOneIter
()
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
boosting_
->
RollbackOneIter
();
}
void
PrepareForPrediction
(
int
num_iteration
,
int
predict_type
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
boosting_
->
SetNumIterationForPred
(
num_iteration
);
bool
is_predict_leaf
=
false
;
bool
is_raw_score
=
false
;
if
(
predict_type
==
C_API_PREDICT_LEAF_INDEX
)
{
...
...
@@ -109,6 +123,10 @@ public:
predictor_
.
reset
(
new
Predictor
(
boosting_
.
get
(),
is_raw_score
,
is_predict_leaf
));
}
void GetPredictAt(int data_idx, score_t* out_result, data_size_t* out_len) {
  // Delegate to the boosting object, which writes the predictions for the
  // dataset at data_idx into out_result and their count into out_len.
  boosting_->GetPredictAt(data_idx, out_result, out_len);
}
std::vector<double> Predict(const std::vector<std::pair<int, double>>& features) {
  // Fetch the prediction closure once, then apply it to this single row of
  // (feature index, value) pairs.
  auto predict_fun = predictor_->GetPredictFunction();
  return predict_fun(features);
}
...
...
@@ -117,25 +135,60 @@ public:
predictor_
->
Predict
(
data_filename
,
result_filename
,
data_has_header
);
}
void
SaveModelToFile
(
int
num_
used_model
,
const
char
*
filename
)
{
boosting_
->
SaveModelToFile
(
num_
used_model
,
true
,
filename
);
void
SaveModelToFile
(
int
num_
iteration
,
const
char
*
filename
)
{
boosting_
->
SaveModelToFile
(
num_
iteration
,
filename
);
}
const
Boosting
*
GetBoosting
()
const
{
return
boosting_
.
get
();
}
int GetEvalCounts() const {
  // Total number of evaluation results is the sum of the name counts over
  // all training metrics (one metric may report several values).
  int total = 0;
  for (const auto& metric : train_metric_) {
    total += static_cast<int>(metric->GetName().size());
  }
  return total;
}
const float* GetTrainingScore(int* out_len) const {
  // The score buffer is owned by the boosting object; only a view is
  // returned here, with its length written to out_len.
  return boosting_->GetTrainingScore(out_len);
}
int GetEvalNames(char** out_strs) const {
  // Copy every metric name into the caller-provided string buffers and
  // return how many were written.
  // NOTE(review): out_strs[i] must be pre-allocated large enough for the
  // corresponding name; no size is passed, so this cannot be checked here.
  int written = 0;
  for (const auto& metric : train_metric_) {
    for (const auto& name : metric->GetName()) {
      std::strcpy(out_strs[written], name.c_str());
      ++written;
    }
  }
  return written;
}
const
inline
int
NumberOfClasses
()
const
{
return
boosting_
->
NumberOfClasses
();
}
// Expose the underlying boosting model as a non-owning pointer.
const Boosting* GetBoosting() const { return boosting_.get(); }
private:
void
ConstructObjectAndTrainingMetrics
(
const
Dataset
*
train_data
)
{
// create objective function
objective_fun_
.
reset
(
ObjectiveFunction
::
CreateObjectiveFunction
(
config_
.
objective_type
,
config_
.
objective_config
));
if
(
objective_fun_
==
nullptr
)
{
Log
::
Warning
(
"Using self-defined objective functions"
);
}
// create training metric
train_metric_
.
clear
();
for
(
auto
metric_type
:
config_
.
metric_types
)
{
auto
metric
=
std
::
unique_ptr
<
Metric
>
(
Metric
::
CreateMetric
(
metric_type
,
config_
.
metric_config
));
if
(
metric
==
nullptr
)
{
continue
;
}
metric
->
Init
(
train_data
->
metadata
(),
train_data
->
num_data
());
train_metric_
.
push_back
(
std
::
move
(
metric
));
}
train_metric_
.
shrink_to_fit
();
// initialize the objective function
if
(
objective_fun_
!=
nullptr
)
{
objective_fun_
->
Init
(
train_data
->
metadata
(),
train_data
->
num_data
());
}
}
const
Dataset
*
train_data_
;
std
::
unique_ptr
<
Boosting
>
boosting_
;
/*! \brief All configs */
OverallConfig
config_
;
/*! \brief Training data */
const
Dataset
*
train_data_
;
/*! \brief Validation data */
std
::
vector
<
const
Dataset
*>
valid_datas_
;
/*! \brief Metric for training data */
std
::
vector
<
std
::
unique_ptr
<
Metric
>>
train_metric_
;
/*! \brief Metrics for validation data */
...
...
@@ -144,7 +197,8 @@ private:
std
::
unique_ptr
<
ObjectiveFunction
>
objective_fun_
;
/*! \brief Using predictor for prediction task */
std
::
unique_ptr
<
Predictor
>
predictor_
;
/*! \brief mutex for threading safe call */
std
::
mutex
mutex_
;
};
}
...
...
@@ -152,17 +206,18 @@ private:
using
namespace
LightGBM
;
DllExport
const
char
*
LGBM_GetLastError
()
{
return
LastErrorMsg
()
.
c_str
()
;
return
LastErrorMsg
();
}
DllExport
int
LGBM_
Create
DatasetFromFile
(
const
char
*
filename
,
DllExport
int
LGBM_Dataset
Create
FromFile
(
const
char
*
filename
,
const
char
*
parameters
,
const
DatesetHandle
*
reference
,
DatesetHandle
*
out
)
{
API_BEGIN
();
OverallConfig
config
;
config
.
LoadFromString
(
parameters
);
DatasetLoader
loader
(
config
.
io_config
,
nullptr
);
auto
param
=
ConfigBase
::
Str2Map
(
parameters
);
IOConfig
io_config
;
io_config
.
Set
(
param
);
DatasetLoader
loader
(
io_config
,
nullptr
);
loader
.
SetHeader
(
filename
);
if
(
reference
==
nullptr
)
{
*
out
=
loader
.
LoadFromFile
(
filename
);
...
...
@@ -173,16 +228,7 @@ DllExport int LGBM_CreateDatasetFromFile(const char* filename,
API_END
();
}
DllExport
int
LGBM_CreateDatasetFromBinaryFile
(
const
char
*
filename
,
DatesetHandle
*
out
)
{
API_BEGIN
();
OverallConfig
config
;
DatasetLoader
loader
(
config
.
io_config
,
nullptr
);
*
out
=
loader
.
LoadFromBinFile
(
filename
,
0
,
1
);
API_END
();
}
DllExport
int
LGBM_CreateDatasetFromMat
(
const
void
*
data
,
DllExport
int
LGBM_DatasetCreateFromMat
(
const
void
*
data
,
int
data_type
,
int32_t
nrow
,
int32_t
ncol
,
...
...
@@ -191,15 +237,16 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
const
DatesetHandle
*
reference
,
DatesetHandle
*
out
)
{
API_BEGIN
();
OverallConfig
config
;
config
.
LoadFromString
(
parameters
);
DatasetLoader
loader
(
config
.
io_config
,
nullptr
);
auto
param
=
ConfigBase
::
Str2Map
(
parameters
);
IOConfig
io_config
;
io_config
.
Set
(
param
);
DatasetLoader
loader
(
io_config
,
nullptr
);
std
::
unique_ptr
<
Dataset
>
ret
;
auto
get_row_fun
=
RowFunctionFromDenseMatric
(
data
,
nrow
,
ncol
,
data_type
,
is_row_major
);
if
(
reference
==
nullptr
)
{
// sample data first
Random
rand
(
config
.
io_config
.
data_random_seed
);
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
config
.
io_config
.
bin_construct_sample_cnt
?
nrow
:
config
.
io_config
.
bin_construct_sample_cnt
);
Random
rand
(
io_config
.
data_random_seed
);
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
io_config
.
bin_construct_sample_cnt
?
nrow
:
io_config
.
bin_construct_sample_cnt
);
auto
sample_indices
=
rand
.
Sample
(
nrow
,
sample_cnt
);
std
::
vector
<
std
::
vector
<
double
>>
sample_values
(
ncol
);
for
(
size_t
i
=
0
;
i
<
sample_indices
.
size
();
++
i
)
{
...
...
@@ -213,10 +260,10 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
}
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
));
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
,
config
.
io_config
.
num_class
));
ret
.
reset
(
new
Dataset
(
nrow
,
io_config
.
num_class
));
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
*
reference
),
config
.
io_config
.
is_enable_sparse
);
io_config
.
is_enable_sparse
);
}
#pragma omp parallel for schedule(guided)
...
...
@@ -230,7 +277,7 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
API_END
();
}
DllExport
int
LGBM_
Create
DatasetFromCSR
(
const
void
*
indptr
,
DllExport
int
LGBM_Dataset
Create
FromCSR
(
const
void
*
indptr
,
int
indptr_type
,
const
int32_t
*
indices
,
const
void
*
data
,
...
...
@@ -242,16 +289,17 @@ DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,
const
DatesetHandle
*
reference
,
DatesetHandle
*
out
)
{
API_BEGIN
();
OverallConfig
config
;
config
.
LoadFromString
(
parameters
);
DatasetLoader
loader
(
config
.
io_config
,
nullptr
);
auto
param
=
ConfigBase
::
Str2Map
(
parameters
);
IOConfig
io_config
;
io_config
.
Set
(
param
);
DatasetLoader
loader
(
io_config
,
nullptr
);
std
::
unique_ptr
<
Dataset
>
ret
;
auto
get_row_fun
=
RowFunctionFromCSR
(
indptr
,
indptr_type
,
indices
,
data
,
data_type
,
nindptr
,
nelem
);
int32_t
nrow
=
static_cast
<
int32_t
>
(
nindptr
-
1
);
if
(
reference
==
nullptr
)
{
// sample data first
Random
rand
(
config
.
io_config
.
data_random_seed
);
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
config
.
io_config
.
bin_construct_sample_cnt
?
nrow
:
config
.
io_config
.
bin_construct_sample_cnt
);
Random
rand
(
io_config
.
data_random_seed
);
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
io_config
.
bin_construct_sample_cnt
?
nrow
:
io_config
.
bin_construct_sample_cnt
);
auto
sample_indices
=
rand
.
Sample
(
nrow
,
sample_cnt
);
std
::
vector
<
std
::
vector
<
double
>>
sample_values
;
for
(
size_t
i
=
0
;
i
<
sample_indices
.
size
();
++
i
)
{
...
...
@@ -274,10 +322,10 @@ DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,
CHECK
(
num_col
>=
static_cast
<
int
>
(
sample_values
.
size
()));
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
));
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
,
config
.
io_config
.
num_class
));
ret
.
reset
(
new
Dataset
(
nrow
,
io_config
.
num_class
));
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
*
reference
),
config
.
io_config
.
is_enable_sparse
);
io_config
.
is_enable_sparse
);
}
#pragma omp parallel for schedule(guided)
...
...
@@ -291,7 +339,7 @@ DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,
API_END
();
}
DllExport
int
LGBM_
Create
DatasetFromCSC
(
const
void
*
col_ptr
,
DllExport
int
LGBM_Dataset
Create
FromCSC
(
const
void
*
col_ptr
,
int
col_ptr_type
,
const
int32_t
*
indices
,
const
void
*
data
,
...
...
@@ -303,17 +351,18 @@ DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,
const
DatesetHandle
*
reference
,
DatesetHandle
*
out
)
{
API_BEGIN
();
OverallConfig
config
;
config
.
LoadFromString
(
parameters
);
DatasetLoader
loader
(
config
.
io_config
,
nullptr
);
auto
param
=
ConfigBase
::
Str2Map
(
parameters
);
IOConfig
io_config
;
io_config
.
Set
(
param
);
DatasetLoader
loader
(
io_config
,
nullptr
);
std
::
unique_ptr
<
Dataset
>
ret
;
auto
get_col_fun
=
ColumnFunctionFromCSC
(
col_ptr
,
col_ptr_type
,
indices
,
data
,
data_type
,
ncol_ptr
,
nelem
);
int32_t
nrow
=
static_cast
<
int32_t
>
(
num_row
);
if
(
reference
==
nullptr
)
{
Log
::
Warning
(
"Construct from CSC format is not efficient"
);
// sample data first
Random
rand
(
config
.
io_config
.
data_random_seed
);
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
config
.
io_config
.
bin_construct_sample_cnt
?
nrow
:
config
.
io_config
.
bin_construct_sample_cnt
);
Random
rand
(
io_config
.
data_random_seed
);
const
int
sample_cnt
=
static_cast
<
int
>
(
nrow
<
io_config
.
bin_construct_sample_cnt
?
nrow
:
io_config
.
bin_construct_sample_cnt
);
auto
sample_indices
=
rand
.
Sample
(
nrow
,
sample_cnt
);
std
::
vector
<
std
::
vector
<
double
>>
sample_values
(
ncol_ptr
-
1
);
#pragma omp parallel for schedule(guided)
...
...
@@ -323,10 +372,10 @@ DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,
}
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
));
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
,
config
.
io_config
.
num_class
));
ret
.
reset
(
new
Dataset
(
nrow
,
io_config
.
num_class
));
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
*
reference
),
config
.
io_config
.
is_enable_sparse
);
io_config
.
is_enable_sparse
);
}
#pragma omp parallel for schedule(guided)
...
...
@@ -340,6 +389,26 @@ DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,
API_END
();
}
DllExport int LGBM_DatasetGetSubset(const DatesetHandle* handle,
                                    const int32_t* used_row_indices,
                                    int32_t num_used_row_indices,
                                    const char* parameters,
                                    DatesetHandle* out) {
  API_BEGIN();
  // Parse IO options from the textual parameter string.
  auto param = ConfigBase::Str2Map(parameters);
  IOConfig io_config;
  io_config.Set(param);
  const Dataset* full_dataset = reinterpret_cast<const Dataset*>(*handle);
  // Materialize the requested row subset as a brand-new dataset that the
  // caller owns (released through LGBM_DatasetFree).
  std::unique_ptr<Dataset> subset(full_dataset->Subset(
      used_row_indices, num_used_row_indices, io_config.is_enable_sparse));
  subset->FinishLoad();
  *out = subset.release();
  API_END();
}
DllExport
int
LGBM_DatasetFree
(
DatesetHandle
handle
)
{
API_BEGIN
();
delete
reinterpret_cast
<
Dataset
*>
(
handle
);
...
...
@@ -387,6 +456,7 @@ DllExport int LGBM_DatasetGetField(DatesetHandle handle,
is_success
=
true
;
}
if
(
!
is_success
)
{
throw
std
::
runtime_error
(
"Field not found"
);
}
if
(
*
out_ptr
==
nullptr
)
{
*
out_len
=
0
;
}
API_END
();
}
...
...
@@ -410,28 +480,24 @@ DllExport int LGBM_DatasetGetNumFeature(DatesetHandle handle,
// ---- start of booster
DllExport
int
LGBM_BoosterCreate
(
const
DatesetHandle
train_data
,
const
DatesetHandle
valid_datas
[],
const
char
*
valid_names
[],
int
n_valid_datas
,
const
char
*
parameters
,
BoosterHandle
*
out
)
{
API_BEGIN
();
const
Dataset
*
p_train_data
=
reinterpret_cast
<
const
Dataset
*>
(
train_data
);
std
::
vector
<
const
Dataset
*>
p_valid_datas
;
std
::
vector
<
std
::
string
>
p_valid_names
;
for
(
int
i
=
0
;
i
<
n_valid_datas
;
++
i
)
{
p_valid_datas
.
emplace_back
(
reinterpret_cast
<
const
Dataset
*>
(
valid_datas
[
i
]));
p_valid_names
.
emplace_back
(
valid_names
[
i
]);
}
*
out
=
new
Booster
(
p_train_data
,
p_valid_datas
,
p_valid_names
,
parameters
);
auto
ret
=
std
::
unique_ptr
<
Booster
>
(
new
Booster
(
p_train_data
,
parameters
));
*
out
=
ret
.
release
();
API_END
();
}
DllExport
int
LGBM_Booster
Load
FromModelfile
(
DllExport
int
LGBM_Booster
Create
FromModelfile
(
const
char
*
filename
,
int64_t
*
out_num_iterations
,
BoosterHandle
*
out
)
{
API_BEGIN
();
*
out
=
new
Booster
(
filename
);
auto
ret
=
std
::
unique_ptr
<
Booster
>
(
new
Booster
(
filename
));
*
out_num_iterations
=
static_cast
<
int64_t
>
(
ret
->
GetBoosting
()
->
NumberOfTotalModel
()
/
ret
->
GetBoosting
()
->
NumberOfClasses
());
*
out
=
ret
.
release
();
API_END
();
}
...
...
@@ -441,6 +507,47 @@ DllExport int LGBM_BoosterFree(BoosterHandle handle) {
API_END
();
}
DllExport int LGBM_BoosterMerge(BoosterHandle handle,
                                BoosterHandle other_handle) {
  API_BEGIN();
  // Merge the model held by other_handle into handle.
  Booster* target = reinterpret_cast<Booster*>(handle);
  Booster* source = reinterpret_cast<Booster*>(other_handle);
  target->MergeFrom(source);
  API_END();
}
DllExport int LGBM_BoosterAddValidData(BoosterHandle handle,
                                       const DatesetHandle valid_data) {
  API_BEGIN();
  // Register an additional validation dataset; the booster keeps only a
  // non-owning pointer to it.
  Booster* booster = reinterpret_cast<Booster*>(handle);
  booster->AddValidData(reinterpret_cast<const Dataset*>(valid_data));
  API_END();
}
DllExport int LGBM_BoosterResetTrainingData(BoosterHandle handle,
                                            const DatesetHandle train_data) {
  API_BEGIN();
  // Swap in a new (non-owned) training dataset for this booster.
  Booster* booster = reinterpret_cast<Booster*>(handle);
  booster->ResetTrainingData(reinterpret_cast<const Dataset*>(train_data));
  API_END();
}
DllExport int LGBM_BoosterResetParameter(BoosterHandle handle,
                                         const char* parameters) {
  API_BEGIN();
  // Re-apply configuration from a textual parameter string.
  reinterpret_cast<Booster*>(handle)->ResetConfig(parameters);
  API_END();
}
DllExport int LGBM_BoosterGetNumClasses(BoosterHandle handle,
                                        int64_t* out_len) {
  API_BEGIN();
  // Report the model's class count through the generic length out-parameter.
  Booster* booster = reinterpret_cast<Booster*>(handle);
  *out_len = booster->GetBoosting()->NumberOfClasses();
  API_END();
}
DllExport
int
LGBM_BoosterUpdateOneIter
(
BoosterHandle
handle
,
int
*
is_finished
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
...
...
@@ -466,14 +573,50 @@ DllExport int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle,
API_END
();
}
DllExport
int
LGBM_BoosterEval
(
BoosterHandle
handle
,
int
data
,
DllExport int LGBM_BoosterRollbackOneIter(BoosterHandle handle) {
  API_BEGIN();
  // Discard the most recently trained boosting iteration.
  reinterpret_cast<Booster*>(handle)->RollbackOneIter();
  API_END();
}
DllExport int LGBM_BoosterGetCurrentIteration(BoosterHandle handle,
                                              int64_t* out_iteration) {
  API_BEGIN();
  // Expose how many boosting iterations have been performed so far.
  Booster* booster = reinterpret_cast<Booster*>(handle);
  *out_iteration = booster->GetBoosting()->GetCurrentIteration();
  API_END();
}
/*!
* \brief Get the total number of evaluation results
* \return 0 on success; use LGBM_GetLastError on failure
*/
DllExport int LGBM_BoosterGetEvalCounts(BoosterHandle handle,
                                        int64_t* out_len) {
  API_BEGIN();
  Booster* booster = reinterpret_cast<Booster*>(handle);
  *out_len = booster->GetEvalCounts();
  API_END();
}
/*!
* \brief Get the names of the evaluation results
*        (doc comment was a copy-paste of LGBM_BoosterGetEvalCounts' and
*        wrongly said "Get number of eval").
* \return 0 on success; use LGBM_GetLastError on failure
*/
DllExport int LGBM_BoosterGetEvalNames(BoosterHandle handle,
                                       int64_t* out_len,
                                       char** out_strs) {
  API_BEGIN();
  // Names are copied into caller-allocated buffers; the count written goes
  // into out_len.
  Booster* booster = reinterpret_cast<Booster*>(handle);
  *out_len = booster->GetEvalNames(out_strs);
  API_END();
}
DllExport
int
LGBM_BoosterGetEval
(
BoosterHandle
handle
,
int
data_idx
,
int64_t
*
out_len
,
float
*
out_results
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
auto
boosting
=
ref_booster
->
GetBoosting
();
auto
result_buf
=
boosting
->
GetEvalAt
(
data
);
auto
result_buf
=
boosting
->
GetEvalAt
(
data
_idx
);
*
out_len
=
static_cast
<
int64_t
>
(
result_buf
.
size
());
for
(
size_t
i
=
0
;
i
<
result_buf
.
size
();
++
i
)
{
(
out_results
)[
i
]
=
static_cast
<
float
>
(
result_buf
[
i
]);
...
...
@@ -481,39 +624,27 @@ DllExport int LGBM_BoosterEval(BoosterHandle handle,
API_END
();
}
DllExport int LGBM_BoosterGetScore(BoosterHandle handle,
                                   int64_t* out_len,
                                   const float** out_result) {
  API_BEGIN();
  Booster* booster = reinterpret_cast<Booster*>(handle);
  int len = 0;
  // The score buffer is owned by the booster; the caller must not free it.
  *out_result = booster->GetTrainingScore(&len);
  *out_len = static_cast<int64_t>(len);
  API_END();
}
DllExport
int
LGBM_BoosterGetPredict
(
BoosterHandle
handle
,
int
data
,
int
data
_idx
,
int64_t
*
out_len
,
float
*
out_result
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
auto
boosting
=
ref_booster
->
GetBoosting
();
int
len
=
0
;
boost
ing
->
GetPredictAt
(
data
,
out_result
,
&
len
);
ref_
boost
er
->
GetPredictAt
(
data
_idx
,
out_result
,
&
len
);
*
out_len
=
static_cast
<
int64_t
>
(
len
);
API_END
();
}
DllExport
int
LGBM_BoosterPredictForFile
(
BoosterHandle
handle
,
int
predict_type
,
int64_t
n_used_trees
,
int
data_has_header
,
const
char
*
data_filename
,
int
data_has_header
,
int
predict_type
,
int64_t
num_iteration
,
const
char
*
result_filename
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
ref_booster
->
PrepareForPrediction
(
static_cast
<
int
>
(
n
_used_trees
),
predict_type
);
ref_booster
->
PrepareForPrediction
(
static_cast
<
int
>
(
n
um_iteration
),
predict_type
);
bool
bool_data_has_header
=
data_has_header
>
0
?
true
:
false
;
ref_booster
->
PredictForFile
(
data_filename
,
result_filename
,
bool_data_has_header
);
API_END
();
...
...
@@ -529,23 +660,32 @@ DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
int64_t
nelem
,
int64_t
,
int
predict_type
,
int64_t
n_used_trees
,
double
*
out_result
)
{
int64_t
num_iteration
,
int64_t
*
out_len
,
float
*
out_result
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
ref_booster
->
PrepareForPrediction
(
static_cast
<
int
>
(
n
_used_trees
),
predict_type
);
ref_booster
->
PrepareForPrediction
(
static_cast
<
int
>
(
n
um_iteration
),
predict_type
);
auto
get_row_fun
=
RowFunctionFromCSR
(
indptr
,
indptr_type
,
indices
,
data
,
data_type
,
nindptr
,
nelem
);
int
num_class
=
ref_booster
->
NumberOfClasses
();
int
num_preb_in_one_row
=
ref_booster
->
GetBoosting
()
->
NumberOfClasses
();
if
(
predict_type
==
C_API_PREDICT_LEAF_INDEX
)
{
if
(
num_iteration
>
0
)
{
num_preb_in_one_row
*=
static_cast
<
int
>
(
num_iteration
);
}
else
{
num_preb_in_one_row
*=
ref_booster
->
GetBoosting
()
->
NumberOfTotalModel
()
/
num_preb_in_one_row
;
}
}
int
nrow
=
static_cast
<
int
>
(
nindptr
-
1
);
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
nrow
;
++
i
)
{
auto
one_row
=
get_row_fun
(
i
);
auto
predicton_result
=
ref_booster
->
Predict
(
one_row
);
for
(
int
j
=
0
;
j
<
num
_c
l
as
s
;
++
j
)
{
out_result
[
i
*
num_
class
+
j
]
=
predicton_result
[
j
];
for
(
int
j
=
0
;
j
<
static
_cas
t
<
int
>
(
predicton_result
.
size
())
;
++
j
)
{
out_result
[
i
*
num_
preb_in_one_row
+
j
]
=
static_cast
<
float
>
(
predicton_result
[
j
]
)
;
}
}
*
out_len
=
nrow
*
num_preb_in_one_row
;
API_END
();
}
...
...
@@ -556,31 +696,40 @@ DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
int32_t
ncol
,
int
is_row_major
,
int
predict_type
,
int64_t
n_used_trees
,
double
*
out_result
)
{
int64_t
num_iteration
,
int64_t
*
out_len
,
float
*
out_result
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
ref_booster
->
PrepareForPrediction
(
static_cast
<
int
>
(
n
_used_trees
),
predict_type
);
ref_booster
->
PrepareForPrediction
(
static_cast
<
int
>
(
n
um_iteration
),
predict_type
);
auto
get_row_fun
=
RowPairFunctionFromDenseMatric
(
data
,
nrow
,
ncol
,
data_type
,
is_row_major
);
int
num_class
=
ref_booster
->
NumberOfClasses
();
int
num_preb_in_one_row
=
ref_booster
->
GetBoosting
()
->
NumberOfClasses
();
if
(
predict_type
==
C_API_PREDICT_LEAF_INDEX
)
{
if
(
num_iteration
>
0
)
{
num_preb_in_one_row
*=
static_cast
<
int
>
(
num_iteration
);
}
else
{
num_preb_in_one_row
*=
ref_booster
->
GetBoosting
()
->
NumberOfTotalModel
()
/
num_preb_in_one_row
;
}
}
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
nrow
;
++
i
)
{
auto
one_row
=
get_row_fun
(
i
);
auto
predicton_result
=
ref_booster
->
Predict
(
one_row
);
for
(
int
j
=
0
;
j
<
num
_c
l
as
s
;
++
j
)
{
out_result
[
i
*
num_
class
+
j
]
=
predicton_result
[
j
];
for
(
int
j
=
0
;
j
<
static
_cas
t
<
int
>
(
predicton_result
.
size
())
;
++
j
)
{
out_result
[
i
*
num_
preb_in_one_row
+
j
]
=
static_cast
<
float
>
(
predicton_result
[
j
]
)
;
}
}
*
out_len
=
nrow
*
num_preb_in_one_row
;
API_END
();
}
DllExport
int
LGBM_BoosterSaveModel
(
BoosterHandle
handle
,
int
num_
used_model
,
int
num_
iteration
,
const
char
*
filename
)
{
API_BEGIN
();
Booster
*
ref_booster
=
reinterpret_cast
<
Booster
*>
(
handle
);
ref_booster
->
SaveModelToFile
(
num_
used_model
,
filename
);
ref_booster
->
SaveModelToFile
(
num_
iteration
,
filename
);
API_END
();
}
...
...
src/io/config.cpp
View file @
16d1853d
...
...
@@ -5,14 +5,14 @@
#include <vector>
#include <string>
#include <unordered_
map
>
#include <unordered_
set
>
#include <algorithm>
namespace
LightGBM
{
void
OverallConfig
::
LoadFromString
(
const
char
*
str
)
{
std
::
unordered_map
<
std
::
string
,
std
::
string
>
ConfigBase
::
Str2Map
(
const
char
*
parameters
)
{
std
::
unordered_map
<
std
::
string
,
std
::
string
>
params
;
auto
args
=
Common
::
Split
(
str
,
"
\t\n\r
"
);
auto
args
=
Common
::
Split
(
parameters
,
"
\t\n\r
"
);
for
(
auto
arg
:
args
)
{
std
::
vector
<
std
::
string
>
tmp_strs
=
Common
::
Split
(
arg
.
c_str
(),
'='
);
if
(
tmp_strs
.
size
()
==
2
)
{
...
...
@@ -27,7 +27,7 @@ void OverallConfig::LoadFromString(const char* str) {
}
}
ParameterAlias
::
KeyAliasTransform
(
&
params
);
S
et
(
params
)
;
r
et
urn
params
;
}
void
OverallConfig
::
Set
(
const
std
::
unordered_map
<
std
::
string
,
std
::
string
>&
params
)
{
...
...
@@ -95,16 +95,15 @@ void OverallConfig::GetMetricType(const std::unordered_map<std::string, std::str
// split
std
::
vector
<
std
::
string
>
metrics
=
Common
::
Split
(
value
.
c_str
(),
','
);
// remove dumplicate
std
::
unordered_
map
<
std
::
string
,
int
>
metric_
map
s
;
std
::
unordered_
set
<
std
::
string
>
metric_
set
s
;
for
(
auto
&
metric
:
metrics
)
{
std
::
transform
(
metric
.
begin
(),
metric
.
end
(),
metric
.
begin
(),
Common
::
tolower
);
if
(
metric_
map
s
.
count
(
metric
)
<=
0
)
{
metric_
maps
[
metric
]
=
1
;
if
(
metric_
set
s
.
count
(
metric
)
<=
0
)
{
metric_
sets
.
insert
(
metric
)
;
}
}
for
(
auto
&
pair
:
metric_maps
)
{
std
::
string
sub_metric_str
=
pair
.
first
;
metric_types
.
push_back
(
sub_metric_str
);
for
(
auto
&
metric
:
metric_sets
)
{
metric_types
.
push_back
(
metric
);
}
metric_types
.
shrink_to_fit
();
}
...
...
@@ -183,7 +182,7 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetInt
(
params
,
"data_random_seed"
,
&
data_random_seed
);
GetString
(
params
,
"data"
,
&
data_filename
);
GetInt
(
params
,
"verbose"
,
&
verbosity
);
GetInt
(
params
,
"num_
model
_predict"
,
&
num_
model
_predict
);
GetInt
(
params
,
"num_
iteration
_predict"
,
&
num_
iteration
_predict
);
GetInt
(
params
,
"bin_construct_sample_cnt"
,
&
bin_construct_sample_cnt
);
GetBool
(
params
,
"is_pre_partition"
,
&
is_pre_partition
);
GetBool
(
params
,
"is_enable_sparse"
,
&
is_enable_sparse
);
...
...
@@ -214,6 +213,7 @@ void ObjectiveConfig::Set(const std::unordered_map<std::string, std::string>& pa
CHECK
(
max_position
>
0
);
GetInt
(
params
,
"num_class"
,
&
num_class
);
CHECK
(
num_class
>=
1
);
GetDouble
(
params
,
"scale_pos_weight"
,
&
scale_pos_weight
);
std
::
string
tmp_str
=
""
;
if
(
GetString
(
params
,
"label_gain"
,
&
tmp_str
))
{
label_gain
=
Common
::
StringToDoubleArray
(
tmp_str
,
','
);
...
...
src/io/dataset.cpp
View file @
16d1853d
...
...
@@ -14,17 +14,16 @@
namespace
LightGBM
{
const
char
*
Dataset
::
binary_file_token
=
"______LightGBM_Binary_File_Token______
\n
"
;
Dataset::Dataset() {
  // Empty dataset: single class, no rows, built in memory rather than
  // loaded from a binary file.
  num_class_ = 1;
  num_data_ = 0;
  is_loading_from_binfile_ = false;
}
Dataset::Dataset(data_size_t num_data, int num_class) {
  num_class_ = num_class;
  num_data_ = num_data;
  is_loading_from_binfile_ = false;
  // presumably -1/-1 marks "no weight column" and "no query column" —
  // confirm against Metadata::Init(num_data, num_class, weight_idx, query_idx).
  metadata_.Init(num_data_, num_class_, -1, -1);
}
...
...
@@ -56,6 +55,21 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset, bool is_enable_spars
num_features_
=
static_cast
<
int
>
(
features_
.
size
());
num_total_features_
=
dataset
->
num_total_features_
;
feature_names_
=
dataset
->
feature_names_
;
label_idx_
=
dataset
->
label_idx_
;
}
Dataset* Dataset::Subset(const data_size_t* used_indices,
                         data_size_t num_used_indices,
                         bool is_enable_sparse) const {
  std::unique_ptr<Dataset> subset(new Dataset(num_used_indices, num_class_));
  subset->CopyFeatureMapperFrom(this, is_enable_sparse);
  // Copy the selected rows feature by feature; features are independent,
  // so the outer loop is parallelized.
#pragma omp parallel for schedule(guided)
  for (int fidx = 0; fidx < num_features_; ++fidx) {
    auto iterator = features_[fidx]->bin_data()->GetIterator(0);
    for (data_size_t i = 0; i < num_used_indices; ++i) {
      subset->features_[fidx]->PushBin(0, i, iterator->Get(used_indices[i]));
    }
  }
  // Slice the metadata (labels/weights/queries) down to the same rows.
  subset->metadata_.Init(metadata_, used_indices, num_used_indices);
  // Ownership transfers to the caller.
  return subset.release();
}
bool
Dataset
::
SetFloatField
(
const
char
*
field_name
,
const
float
*
field_data
,
data_size_t
num_element
)
{
...
...
@@ -78,6 +92,8 @@ bool Dataset::SetIntField(const char* field_name, const int* field_data, data_si
name
=
Common
::
Trim
(
name
);
if
(
name
==
std
::
string
(
"query"
)
||
name
==
std
::
string
(
"group"
))
{
metadata_
.
SetQueryBoundaries
(
field_data
,
num_element
);
}
else
if
(
name
==
std
::
string
(
"query_id"
)
||
name
==
std
::
string
(
"group_id"
))
{
metadata_
.
SetQueryId
(
field_data
,
num_element
);
}
else
{
return
false
;
}
...
...
@@ -107,7 +123,7 @@ bool Dataset::GetIntField(const char* field_name, int64_t* out_len, const int**
name
=
Common
::
Trim
(
name
);
if
(
name
==
std
::
string
(
"query"
)
||
name
==
std
::
string
(
"group"
))
{
*
out_ptr
=
metadata_
.
query_boundaries
();
*
out_len
=
num_
data_
;
*
out_len
=
meta
data_
.
num_queries
()
;
}
else
{
return
false
;
}
...
...
@@ -115,15 +131,27 @@ bool Dataset::GetIntField(const char* field_name, int64_t* out_len, const int**
}
void
Dataset
::
SaveBinaryFile
(
const
char
*
bin_filename
)
{
bool
is_file_existed
=
false
;
FILE
*
file
;
#ifdef _MSC_VER
fopen_s
(
&
file
,
bin_filename
,
"rb"
);
#else
file
=
fopen
(
bin_filename
,
"rb"
);
#endif
if
(
!
is_loading_from_binfile_
)
{
if
(
file
!=
NULL
)
{
is_file_existed
=
true
;
Log
::
Warning
(
"File %s existed, cannot save binary to it"
,
bin_filename
);
fclose
(
file
);
}
if
(
!
is_file_existed
)
{
std
::
string
bin_filename_str
(
data_filename_
);
// if not pass a filename, just append ".bin" of original file
if
(
bin_filename
==
nullptr
||
bin_filename
[
0
]
==
'\0'
)
{
bin_filename_str
.
append
(
".bin"
);
bin_filename
=
bin_filename_str
.
c_str
();
}
FILE
*
file
;
#ifdef _MSC_VER
fopen_s
(
&
file
,
bin_filename
,
"wb"
);
#else
...
...
@@ -133,7 +161,8 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
Log
::
Fatal
(
"Cannot write binary data to %s "
,
bin_filename
);
}
Log
::
Info
(
"Saving data to binary file %s"
,
bin_filename
);
size_t
size_of_token
=
std
::
strlen
(
binary_file_token
);
fwrite
(
binary_file_token
,
sizeof
(
char
),
size_of_token
,
file
);
// get size of header
size_t
size_of_header
=
sizeof
(
num_data_
)
+
sizeof
(
num_class_
)
+
sizeof
(
num_features_
)
+
sizeof
(
num_total_features_
)
+
sizeof
(
size_t
)
+
sizeof
(
int
)
*
used_feature_map_
.
size
();
...
...
src/io/dataset_loader.cpp
View file @
16d1853d
...
...
@@ -142,18 +142,18 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
Please use an additional query file or pre-partition the data"
);
}
}
auto
dataset
=
std
::
unique_ptr
<
Dataset
>
(
new
Dataset
());
data_size_t
num_global_data
=
0
;
std
::
vector
<
data_size_t
>
used_data_indices
;
auto
bin_filename
=
CheckCanLoadFromBin
(
filename
);
if
(
bin_filename
.
size
()
==
0
)
{
auto
parser
=
std
::
unique_ptr
<
Parser
>
(
Parser
::
CreateParser
(
filename
,
io_config_
.
has_header
,
0
,
label_idx_
));
if
(
parser
==
nullptr
)
{
Log
::
Fatal
(
"Could not recognize data format of %s"
,
filename
);
}
data_size_t
num_global_data
=
0
;
std
::
vector
<
data_size_t
>
used_data_indices
;
auto
dataset
=
std
::
unique_ptr
<
Dataset
>
(
new
Dataset
());
dataset
->
data_filename_
=
filename
;
dataset
->
num_class_
=
io_config_
.
num_class
;
dataset
->
metadata_
.
Init
(
filename
,
dataset
->
num_class_
);
bool
is_loading_from_binfile
=
CheckCanLoadFromBin
(
filename
);
if
(
!
is_loading_from_binfile
)
{
if
(
!
io_config_
.
use_two_round_loading
)
{
// read data to memory
auto
text_data
=
LoadTextDataToMemory
(
filename
,
dataset
->
metadata_
,
rank
,
num_machines
,
&
num_global_data
,
&
used_data_indices
);
...
...
@@ -185,8 +185,6 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
}
}
else
{
// load data from binary file
std
::
string
bin_filename
(
filename
);
bin_filename
.
append
(
".bin"
);
dataset
.
reset
(
LoadFromBinFile
(
bin_filename
.
c_str
(),
rank
,
num_machines
));
}
// check meta data
...
...
@@ -199,18 +197,18 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
Dataset
*
DatasetLoader
::
LoadFromFileAlignWithOtherDataset
(
const
char
*
filename
,
const
Dataset
*
train_data
)
{
data_size_t
num_global_data
=
0
;
std
::
vector
<
data_size_t
>
used_data_indices
;
auto
dataset
=
std
::
unique_ptr
<
Dataset
>
(
new
Dataset
());
auto
bin_filename
=
CheckCanLoadFromBin
(
filename
);
if
(
bin_filename
.
size
()
==
0
)
{
auto
parser
=
std
::
unique_ptr
<
Parser
>
(
Parser
::
CreateParser
(
filename
,
io_config_
.
has_header
,
0
,
label_idx_
));
if
(
parser
==
nullptr
)
{
Log
::
Fatal
(
"Could not recognize data format of %s"
,
filename
);
}
data_size_t
num_global_data
=
0
;
std
::
vector
<
data_size_t
>
used_data_indices
;
auto
dataset
=
std
::
unique_ptr
<
Dataset
>
(
new
Dataset
());
dataset
->
data_filename_
=
filename
;
dataset
->
num_class_
=
io_config_
.
num_class
;
dataset
->
metadata_
.
Init
(
filename
,
dataset
->
num_class_
);
bool
is_loading_from_binfile
=
CheckCanLoadFromBin
(
filename
);
if
(
!
is_loading_from_binfile
)
{
if
(
!
io_config_
.
use_two_round_loading
)
{
// read data in memory
auto
text_data
=
LoadTextDataToMemory
(
filename
,
dataset
->
metadata_
,
0
,
1
,
&
num_global_data
,
&
used_data_indices
);
...
...
@@ -234,8 +232,6 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
}
}
else
{
// load data from binary file
std
::
string
bin_filename
(
filename
);
bin_filename
.
append
(
".bin"
);
dataset
.
reset
(
LoadFromBinFile
(
bin_filename
.
c_str
(),
0
,
1
));
}
// not need to check validation data
...
...
@@ -261,8 +257,18 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
size_t
buffer_size
=
16
*
1024
*
1024
;
auto
buffer
=
std
::
vector
<
char
>
(
buffer_size
);
// check token
size_t
size_of_token
=
std
::
strlen
(
Dataset
::
binary_file_token
);
size_t
read_cnt
=
fread
(
buffer
.
data
(),
sizeof
(
char
),
size_of_token
,
file
);
if
(
read_cnt
!=
size_of_token
)
{
Log
::
Fatal
(
"Binary file error: token has the wrong size"
);
}
if
(
std
::
string
(
buffer
.
data
())
!=
std
::
string
(
Dataset
::
binary_file_token
))
{
Log
::
Fatal
(
"input file is not LightGBM binary file"
);
}
// read size of header
size_t
read_cnt
=
fread
(
buffer
.
data
(),
sizeof
(
size_t
),
1
,
file
);
read_cnt
=
fread
(
buffer
.
data
(),
sizeof
(
size_t
),
1
,
file
);
if
(
read_cnt
!=
1
)
{
Log
::
Fatal
(
"Binary file error: header has the wrong size"
);
...
...
@@ -401,7 +407,6 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
}
dataset
->
features_
.
shrink_to_fit
();
fclose
(
file
);
dataset
->
is_loading_from_binfile_
=
true
;
return
dataset
.
release
();
}
...
...
@@ -849,7 +854,7 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
}
/*! \brief Check can load from binary file */
bool
DatasetLoader
::
CheckCanLoadFromBin
(
const
char
*
filename
)
{
std
::
string
DatasetLoader
::
CheckCanLoadFromBin
(
const
char
*
filename
)
{
std
::
string
bin_filename
(
filename
);
bin_filename
.
append
(
".bin"
);
...
...
@@ -860,12 +865,32 @@ bool DatasetLoader::CheckCanLoadFromBin(const char* filename) {
#else
file
=
fopen
(
bin_filename
.
c_str
(),
"rb"
);
#endif
if
(
file
==
NULL
)
{
return
false
;
}
else
{
bin_filename
=
std
::
string
(
filename
);
#ifdef _MSC_VER
fopen_s
(
&
file
,
bin_filename
.
c_str
(),
"rb"
);
#else
file
=
fopen
(
bin_filename
.
c_str
(),
"rb"
);
#endif
if
(
file
==
NULL
)
{
Log
::
Fatal
(
"cannot open data file %s"
,
bin_filename
.
c_str
());
}
}
size_t
buffer_size
=
256
;
auto
buffer
=
std
::
vector
<
char
>
(
buffer_size
);
// read size of token
size_t
size_of_token
=
std
::
strlen
(
Dataset
::
binary_file_token
);
size_t
read_cnt
=
fread
(
buffer
.
data
(),
sizeof
(
char
),
size_of_token
,
file
);
fclose
(
file
);
return
true
;
if
(
read_cnt
==
size_of_token
&&
std
::
string
(
buffer
.
data
())
==
std
::
string
(
Dataset
::
binary_file_token
))
{
return
bin_filename
;
}
else
{
return
std
::
string
();
}
}
}
\ No newline at end of file
src/io/metadata.cpp
View file @
16d1853d
...
...
@@ -50,6 +50,69 @@ void Metadata::Init(data_size_t num_data, int num_class, int weight_idx, int que
}
}
void
Metadata
::
Init
(
const
Metadata
&
fullset
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
)
{
num_data_
=
num_used_indices
;
num_class_
=
fullset
.
num_class_
;
label_
=
std
::
vector
<
float
>
(
num_used_indices
);
for
(
data_size_t
i
=
0
;
i
<
num_used_indices
;
i
++
)
{
label_
[
i
]
=
fullset
.
label_
[
used_indices
[
i
]];
}
if
(
fullset
.
weights_
.
size
()
>
0
)
{
weights_
=
std
::
vector
<
float
>
(
num_used_indices
);
num_weights_
=
num_used_indices
;
for
(
data_size_t
i
=
0
;
i
<
num_used_indices
;
i
++
)
{
weights_
[
i
]
=
fullset
.
weights_
[
used_indices
[
i
]];
}
}
else
{
num_weights_
=
0
;
}
if
(
fullset
.
init_score_
.
size
()
>
0
)
{
init_score_
=
std
::
vector
<
float
>
(
num_used_indices
);
num_init_score_
=
num_used_indices
;
for
(
data_size_t
i
=
0
;
i
<
num_used_indices
;
i
++
)
{
init_score_
[
i
]
=
fullset
.
init_score_
[
used_indices
[
i
]];
}
}
else
{
num_init_score_
=
0
;
}
if
(
fullset
.
query_boundaries_
.
size
()
>
0
)
{
std
::
vector
<
data_size_t
>
used_query
;
data_size_t
data_idx
=
0
;
for
(
data_size_t
qid
=
0
;
qid
<
num_queries_
&&
data_idx
<
num_used_indices
;
++
qid
)
{
data_size_t
start
=
fullset
.
query_boundaries_
[
qid
];
data_size_t
end
=
fullset
.
query_boundaries_
[
qid
+
1
];
data_size_t
len
=
end
-
start
;
if
(
used_indices
[
data_idx
]
>
start
)
{
continue
;
}
else
if
(
used_indices
[
data_idx
]
==
start
)
{
if
(
num_used_indices
>=
data_idx
+
len
&&
used_indices
[
data_idx
+
len
-
1
]
==
end
-
1
)
{
used_query
.
push_back
(
qid
);
data_idx
+=
len
;
}
else
{
Log
::
Fatal
(
"Data partition error, data didn't match queries"
);
}
}
else
{
Log
::
Fatal
(
"Data partition error, data didn't match queries"
);
}
}
query_boundaries_
=
std
::
vector
<
data_size_t
>
(
used_query
.
size
()
+
1
);
num_queries_
=
static_cast
<
data_size_t
>
(
used_query
.
size
());
query_boundaries_
[
0
]
=
0
;
for
(
data_size_t
i
=
0
;
i
<
num_queries_
;
++
i
)
{
data_size_t
qid
=
used_query
[
i
];
data_size_t
len
=
fullset
.
query_boundaries_
[
qid
+
1
]
-
fullset
.
query_boundaries_
[
qid
];
query_boundaries_
[
i
+
1
]
=
query_boundaries_
[
i
]
+
len
;
}
}
else
{
num_queries_
=
0
;
}
}
void
Metadata
::
PartitionLabel
(
const
std
::
vector
<
data_size_t
>&
used_indices
)
{
if
(
used_indices
.
size
()
<=
0
)
{
return
;
...
...
@@ -196,6 +259,13 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
void
Metadata
::
SetInitScore
(
const
float
*
init_score
,
data_size_t
len
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
// save to nullptr
if
(
init_score
==
nullptr
||
len
==
0
)
{
init_score_
.
clear
();
num_init_score_
=
0
;
return
;
}
if
(
len
!=
num_data_
*
num_class_
)
{
Log
::
Fatal
(
"Initial score size doesn't match data size"
);
}
...
...
@@ -208,6 +278,10 @@ void Metadata::SetInitScore(const float* init_score, data_size_t len) {
}
void
Metadata
::
SetLabel
(
const
float
*
label
,
data_size_t
len
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
if
(
label
==
nullptr
)
{
Log
::
Fatal
(
"label cannot be nullptr"
);
}
if
(
num_data_
!=
len
)
{
Log
::
Fatal
(
"len of label is not same with #data"
);
}
...
...
@@ -219,6 +293,13 @@ void Metadata::SetLabel(const float* label, data_size_t len) {
}
void
Metadata
::
SetWeights
(
const
float
*
weights
,
data_size_t
len
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
// save to nullptr
if
(
weights
==
nullptr
||
len
==
0
)
{
weights_
.
clear
();
num_weights_
=
0
;
return
;
}
if
(
num_data_
!=
len
)
{
Log
::
Fatal
(
"len of weights is not same with #data"
);
}
...
...
@@ -232,6 +313,13 @@ void Metadata::SetWeights(const float* weights, data_size_t len) {
}
void
Metadata
::
SetQueryBoundaries
(
const
data_size_t
*
query_boundaries
,
data_size_t
len
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
// save to nullptr
if
(
query_boundaries
==
nullptr
||
len
==
0
)
{
query_boundaries_
.
clear
();
num_queries_
=
0
;
return
;
}
data_size_t
sum
=
0
;
for
(
data_size_t
i
=
0
;
i
<
len
;
++
i
)
{
sum
+=
query_boundaries
[
i
];
...
...
@@ -248,6 +336,47 @@ void Metadata::SetQueryBoundaries(const data_size_t* query_boundaries, data_size
LoadQueryWeights
();
}
void
Metadata
::
SetQueryId
(
const
data_size_t
*
query_id
,
data_size_t
len
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
// save to nullptr
if
(
query_id
==
nullptr
||
len
==
0
)
{
query_boundaries_
.
clear
();
queries_
.
clear
();
num_queries_
=
0
;
return
;
}
if
(
num_data_
!=
len
)
{
Log
::
Fatal
(
"len of query id is not same with #data"
);
}
if
(
queries_
.
size
()
>
0
)
{
queries_
.
clear
();
}
queries_
=
std
::
vector
<
data_size_t
>
(
num_data_
);
for
(
data_size_t
i
=
0
;
i
<
num_weights_
;
++
i
)
{
queries_
[
i
]
=
query_id
[
i
];
}
// need convert query_id to boundaries
std
::
vector
<
data_size_t
>
tmp_buffer
;
data_size_t
last_qid
=
-
1
;
data_size_t
cur_cnt
=
0
;
for
(
data_size_t
i
=
0
;
i
<
num_data_
;
++
i
)
{
if
(
last_qid
!=
queries_
[
i
])
{
if
(
cur_cnt
>
0
)
{
tmp_buffer
.
push_back
(
cur_cnt
);
}
cur_cnt
=
0
;
last_qid
=
queries_
[
i
];
}
++
cur_cnt
;
}
tmp_buffer
.
push_back
(
cur_cnt
);
query_boundaries_
=
std
::
vector
<
data_size_t
>
(
tmp_buffer
.
size
()
+
1
);
num_queries_
=
static_cast
<
data_size_t
>
(
tmp_buffer
.
size
());
query_boundaries_
[
0
]
=
0
;
for
(
size_t
i
=
0
;
i
<
tmp_buffer
.
size
();
++
i
)
{
query_boundaries_
[
i
+
1
]
=
query_boundaries_
[
i
]
+
tmp_buffer
[
i
];
}
queries_
.
clear
();
LoadQueryWeights
();
}
void
Metadata
::
LoadWeights
()
{
num_weights_
=
0
;
...
...
src/metric/binary_metric.hpp
View file @
16d1853d
...
...
@@ -29,11 +29,8 @@ public:
}
void
Init
(
const
char
*
test_name
,
const
Metadata
&
metadata
,
data_size_t
num_data
)
override
{
std
::
stringstream
str_buf
;
str_buf
<<
test_name
<<
"'s : "
<<
PointWiseLossCalculator
::
Name
();
name_
.
emplace_back
(
str_buf
.
str
());
void
Init
(
const
Metadata
&
metadata
,
data_size_t
num_data
)
override
{
name_
.
emplace_back
(
PointWiseLossCalculator
::
Name
());
num_data_
=
num_data
;
// get label
...
...
@@ -119,7 +116,7 @@ public:
}
inline
static
const
char
*
Name
()
{
return
"log
loss"
;
return
"logloss"
;
}
};
/*!
...
...
@@ -138,7 +135,7 @@ public:
}
inline
static
const
char
*
Name
()
{
return
"error
rate
"
;
return
"error"
;
}
};
...
...
@@ -162,10 +159,8 @@ public:
return
1.0
f
;
}
void
Init
(
const
char
*
test_name
,
const
Metadata
&
metadata
,
data_size_t
num_data
)
override
{
std
::
stringstream
str_buf
;
str_buf
<<
test_name
<<
"'s : AUC"
;
name_
.
emplace_back
(
str_buf
.
str
());
void
Init
(
const
Metadata
&
metadata
,
data_size_t
num_data
)
override
{
name_
.
emplace_back
(
"auc"
);
num_data_
=
num_data
;
// get label
...
...
src/metric/multiclass_metric.hpp
View file @
16d1853d
...
...
@@ -23,10 +23,9 @@ public:
}
void
Init
(
const
char
*
test_name
,
const
Metadata
&
metadata
,
data_size_t
num_data
)
override
{
std
::
stringstream
str_buf
;
str_buf
<<
test_name
<<
" : "
<<
PointWiseLossCalculator
::
Name
();
name_
.
emplace_back
(
str_buf
.
str
());
void
Init
(
const
Metadata
&
metadata
,
data_size_t
num_data
)
override
{
name_
.
emplace_back
(
PointWiseLossCalculator
::
Name
());
num_data_
=
num_data
;
// get label
label_
=
metadata
.
label
();
...
...
@@ -110,7 +109,7 @@ public:
}
inline
static
const
char
*
Name
()
{
return
"multi
error"
;
return
"multi
_
error"
;
}
};
...
...
@@ -130,7 +129,7 @@ public:
}
inline
static
const
char
*
Name
()
{
return
"multi
logloss"
;
return
"multi
_
logloss"
;
}
};
...
...
src/metric/rank_metric.hpp
View file @
16d1853d
...
...
@@ -33,12 +33,9 @@ public:
~
NDCGMetric
()
{
}
void
Init
(
const
char
*
test_name
,
const
Metadata
&
metadata
,
data_size_t
num_data
)
override
{
void
Init
(
const
Metadata
&
metadata
,
data_size_t
num_data
)
override
{
for
(
auto
k
:
eval_at_
)
{
std
::
stringstream
str_buf
;
str_buf
<<
test_name
<<
"'s : "
;
str_buf
<<
"NDCG@"
+
std
::
to_string
(
k
)
+
" "
;
name_
.
emplace_back
(
str_buf
.
str
());
name_
.
emplace_back
(
std
::
string
(
"ndcg@"
)
+
std
::
to_string
(
k
));
}
num_data_
=
num_data
;
// get label
...
...
src/metric/regression_metric.hpp
View file @
16d1853d
...
...
@@ -31,10 +31,8 @@ public:
return
-
1.0
f
;
}
void
Init
(
const
char
*
test_name
,
const
Metadata
&
metadata
,
data_size_t
num_data
)
override
{
std
::
stringstream
str_buf
;
str_buf
<<
test_name
<<
" : "
<<
PointWiseLossCalculator
::
Name
();
name_
.
emplace_back
(
str_buf
.
str
());
void
Init
(
const
Metadata
&
metadata
,
data_size_t
num_data
)
override
{
name_
.
emplace_back
(
PointWiseLossCalculator
::
Name
());
num_data_
=
num_data
;
// get label
...
...
@@ -103,7 +101,7 @@ public:
}
inline
static
const
char
*
Name
()
{
return
"l2
loss
"
;
return
"l2"
;
}
};
...
...
@@ -116,7 +114,7 @@ public:
return
std
::
fabs
(
score
-
label
);
}
inline
static
const
char
*
Name
()
{
return
"l1
loss
"
;
return
"l1"
;
}
};
...
...
src/network/linkers_socket.cpp
View file @
16d1853d
...
...
@@ -28,10 +28,6 @@ Linkers::Linkers(NetworkConfig config) {
// parser clients from file
ParseMachineList
(
config
.
machine_list_filename
.
c_str
());
if
(
num_machines_
<=
1
)
{
return
;
}
if
(
rank_
==
-
1
)
{
// get ip list of local machine
std
::
unordered_set
<
std
::
string
>
local_ip_list
=
TcpSocket
::
GetLocalIpList
();
...
...
@@ -101,10 +97,15 @@ void Linkers::ParseMachineList(const char * filename) {
client_ips_
.
push_back
(
str_after_split
[
0
]);
client_ports_
.
push_back
(
atoi
(
str_after_split
[
1
].
c_str
()));
}
if
(
client_ips_
.
size
()
==
0
)
{
Log
::
Fatal
(
"Machine list file doesn't contain any ip and port. \
Please check it again"
);
}
if
(
client_ips_
.
size
()
!=
static_cast
<
size_t
>
(
num_machines_
))
{
Log
::
Warning
(
"World size is larger than the machine_list size, change world size to %d"
,
client_ips_
.
size
());
num_machines_
=
static_cast
<
int
>
(
client_ips_
.
size
());
}
}
void
Linkers
::
TryBind
(
int
port
)
{
...
...
src/objective/binary_objective.hpp
View file @
16d1853d
...
...
@@ -18,6 +18,7 @@ public:
if
(
sigmoid_
<=
0.0
)
{
Log
::
Fatal
(
"Sigmoid parameter %f should be greater than zero"
,
sigmoid_
);
}
scale_pos_weight_
=
static_cast
<
score_t
>
(
config
.
scale_pos_weight
);
}
~
BinaryLogloss
()
{}
void
Init
(
const
Metadata
&
metadata
,
data_size_t
num_data
)
override
{
...
...
@@ -55,6 +56,7 @@ public:
label_weights_
[
0
]
=
1.0
f
;
}
}
label_weights_
[
1
]
*=
scale_pos_weight_
;
}
void
GetGradients
(
const
score_t
*
score
,
score_t
*
gradients
,
score_t
*
hessians
)
const
override
{
...
...
@@ -104,6 +106,7 @@ private:
score_t
label_weights_
[
2
];
/*! \brief Weights for data */
const
float
*
weights_
;
score_t
scale_pos_weight_
;
};
}
// namespace LightGBM
...
...
tests/c_api_test/test.py
View file @
16d1853d
...
...
@@ -16,6 +16,8 @@ def LoadDll():
LIB
=
LoadDll
()
LIB
.
LGBM_GetLastError
.
restype
=
ctypes
.
c_char_p
dtype_float32
=
0
dtype_float64
=
1
dtype_int32
=
2
...
...
@@ -33,9 +35,10 @@ def test_load_from_file(filename, reference):
if
reference
!=
None
:
ref
=
ctypes
.
byref
(
reference
)
handle
=
ctypes
.
c_void_p
()
LIB
.
LGBM_
Create
DatasetFromFile
(
c_str
(
filename
),
LIB
.
LGBM_Dataset
Create
FromFile
(
c_str
(
filename
),
c_str
(
'max_bin=15'
),
ref
,
ctypes
.
byref
(
handle
)
)
print
(
LIB
.
LGBM_GetLastError
())
num_data
=
ctypes
.
c_long
()
LIB
.
LGBM_DatasetGetNumData
(
handle
,
ctypes
.
byref
(
num_data
)
)
num_feature
=
ctypes
.
c_long
()
...
...
@@ -46,15 +49,6 @@ def test_load_from_file(filename, reference):
def
test_save_to_binary
(
handle
,
filename
):
LIB
.
LGBM_DatasetSaveBinary
(
handle
,
c_str
(
filename
))
def
test_load_from_binary
(
filename
):
handle
=
ctypes
.
c_void_p
()
LIB
.
LGBM_CreateDatasetFromBinaryFile
(
c_str
(
filename
),
ctypes
.
byref
(
handle
)
)
num_data
=
ctypes
.
c_long
()
LIB
.
LGBM_DatasetGetNumData
(
handle
,
ctypes
.
byref
(
num_data
)
)
num_feature
=
ctypes
.
c_long
()
LIB
.
LGBM_DatasetGetNumFeature
(
handle
,
ctypes
.
byref
(
num_feature
)
)
print
(
'#data:%d #feature:%d'
%
(
num_data
.
value
,
num_feature
.
value
)
)
return
handle
def
test_load_from_csr
(
filename
,
reference
):
data
=
[]
...
...
@@ -72,7 +66,7 @@ def test_load_from_csr(filename, reference):
if
reference
!=
None
:
ref
=
ctypes
.
byref
(
reference
)
LIB
.
LGBM_
Create
DatasetFromCSR
(
c_array
(
ctypes
.
c_int
,
csr
.
indptr
),
LIB
.
LGBM_Dataset
Create
FromCSR
(
c_array
(
ctypes
.
c_int
,
csr
.
indptr
),
dtype_int32
,
c_array
(
ctypes
.
c_int
,
csr
.
indices
),
csr
.
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_void_p
)),
...
...
@@ -107,7 +101,7 @@ def test_load_from_csc(filename, reference):
if
reference
!=
None
:
ref
=
ctypes
.
byref
(
reference
)
LIB
.
LGBM_
Create
DatasetFromCSC
(
c_array
(
ctypes
.
c_int
,
csr
.
indptr
),
LIB
.
LGBM_Dataset
Create
FromCSC
(
c_array
(
ctypes
.
c_int
,
csr
.
indptr
),
dtype_int32
,
c_array
(
ctypes
.
c_int
,
csr
.
indices
),
csr
.
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_void_p
)),
...
...
@@ -142,7 +136,7 @@ def test_load_from_mat(filename, reference):
if
reference
!=
None
:
ref
=
ctypes
.
byref
(
reference
)
LIB
.
LGBM_
Create
DatasetFromMat
(
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_void_p
)),
LIB
.
LGBM_Dataset
Create
FromMat
(
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_void_p
)),
dtype_float64
,
mat
.
shape
[
0
],
mat
.
shape
[
1
],
...
...
@@ -170,35 +164,36 @@ def test_dataset():
test_free_dataset
(
test
)
test_save_to_binary
(
train
,
'train.binary.bin'
)
test_free_dataset
(
train
)
train
=
test_load_from_
binary
(
'train.binary.bin'
)
train
=
test_load_from_
file
(
'train.binary.bin'
,
None
)
test_free_dataset
(
train
)
def
test_booster
():
train
=
test_load_from_mat
(
'../../examples/binary_classification/binary.train'
,
None
)
test
=
[
test_load_from_mat
(
'../../examples/binary_classification/binary.test'
,
train
)]
name
=
[
c_str
(
'test'
)]
test
=
test_load_from_mat
(
'../../examples/binary_classification/binary.test'
,
train
)
booster
=
ctypes
.
c_void_p
()
LIB
.
LGBM_BoosterCreate
(
train
,
c_
array
(
ctypes
.
c_void_p
,
test
),
c_array
(
ctypes
.
c_char_p
,
name
),
len
(
test
),
c_str
(
"app=binary metric=auc num_leaves=31 verbose=0"
),
ctypes
.
byref
(
booster
)
)
LIB
.
LGBM_BoosterCreate
(
train
,
c_
str
(
"app=binary metric=auc num_leaves=31 verbose=0"
),
ctypes
.
byref
(
booster
))
LIB
.
LGBM_BoosterAddValidData
(
booster
,
test
)
is_finished
=
ctypes
.
c_int
(
0
)
for
i
in
range
(
100
):
LIB
.
LGBM_BoosterUpdateOneIter
(
booster
,
ctypes
.
byref
(
is_finished
))
result
=
np
.
array
([
0.0
],
dtype
=
np
.
float32
)
out_len
=
ctypes
.
c_ulong
(
0
)
LIB
.
LGBM_BoosterEval
(
booster
,
0
,
ctypes
.
byref
(
out_len
),
result
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_float
)))
LIB
.
LGBM_Booster
Get
Eval
(
booster
,
0
,
ctypes
.
byref
(
out_len
),
result
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_float
)))
print
(
'%d Iteration test AUC %f'
%
(
i
,
result
[
0
]))
LIB
.
LGBM_BoosterSaveModel
(
booster
,
-
1
,
c_str
(
'model.txt'
))
LIB
.
LGBM_BoosterFree
(
booster
)
test_free_dataset
(
train
)
test_free_dataset
(
test
[
0
]
)
test_free_dataset
(
test
)
booster2
=
ctypes
.
c_void_p
()
LIB
.
LGBM_BoosterLoadFromModelfile
(
c_str
(
'model.txt'
),
ctypes
.
byref
(
booster2
))
num_total_model
=
ctypes
.
c_long
()
LIB
.
LGBM_BoosterCreateFromModelfile
(
c_str
(
'model.txt'
),
ctypes
.
byref
(
num_total_model
),
ctypes
.
byref
(
booster2
))
data
=
[]
inp
=
open
(
'../../examples/binary_classification/binary.test'
,
'r'
)
for
line
in
inp
.
readlines
():
data
.
append
(
[
float
(
x
)
for
x
in
line
.
split
(
'
\t
'
)[
1
:]]
)
inp
.
close
()
mat
=
np
.
array
(
data
)
preb
=
np
.
zeros
((
mat
.
shape
[
0
],
1
),
dtype
=
np
.
float64
)
preb
=
np
.
zeros
(
mat
.
shape
[
0
],
dtype
=
np
.
float32
)
num_preb
=
ctypes
.
c_long
()
data
=
np
.
array
(
mat
.
reshape
(
mat
.
size
),
copy
=
False
)
LIB
.
LGBM_BoosterPredictForMat
(
booster2
,
data
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_void_p
)),
...
...
@@ -208,10 +203,10 @@ def test_booster():
1
,
1
,
50
,
ctypes
.
byref
(
num_preb
),
preb
.
ctypes
.
data_as
(
ctypes
.
POINTER
(
ctypes
.
c_double
)))
LIB
.
LGBM_BoosterPredictForFile
(
booster2
,
1
,
50
,
0
,
c_str
(
'../../examples/binary_classification/binary.test'
),
c_str
(
'preb.txt'
))
LIB
.
LGBM_BoosterPredictForFile
(
booster2
,
c_str
(
'../../examples/binary_classification/binary.test'
),
0
,
0
,
50
,
c_str
(
'preb.txt'
))
LIB
.
LGBM_BoosterFree
(
booster2
)
test_dataset
()
test_booster
()
tests/python_package_test/test_basic.py
0 → 100644
View file @
16d1853d
import
numpy
as
np
from
sklearn
import
datasets
,
metrics
,
model_selection
import
lightgbm
as
lgb
X
,
Y
=
datasets
.
make_classification
(
n_samples
=
100000
,
n_features
=
100
)
x_train
,
x_test
,
y_train
,
y_test
=
model_selection
.
train_test_split
(
X
,
Y
,
test_size
=
0.1
)
train_data
=
lgb
.
Dataset
(
x_train
,
max_bin
=
255
,
label
=
y_train
)
valid_data
=
train_data
.
create_valid
(
x_test
,
label
=
y_test
)
config
=
{
"objective"
:
"binary"
,
"metric"
:
"auc"
,
"min_data"
:
1
,
"num_leaves"
:
15
}
bst
=
lgb
.
Booster
(
params
=
config
,
train_set
=
train_data
)
bst
.
add_valid
(
valid_data
,
"valid_1"
)
for
i
in
range
(
100
):
bst
.
update
()
if
i
%
10
==
0
:
print
(
bst
.
eval_train
())
print
(
bst
.
eval_valid
())
bst
.
save_model
(
"model.txt"
)
tests/python_package_test/test_sklearn.py
0 → 100644
View file @
16d1853d
import
numpy
as
np
import
random
import
lightgbm
as
lgb
rng
=
np
.
random
.
RandomState
(
2016
)
def
test_binary_classification
():
from
sklearn
import
datasets
,
metrics
,
model_selection
X
,
y
=
datasets
.
make_classification
(
n_samples
=
10000
,
n_features
=
100
)
x_train
,
x_test
,
y_train
,
y_test
=
model_selection
.
train_test_split
(
X
,
y
,
test_size
=
0.1
,
random_state
=
1
)
lgb_model
=
lgb
.
LGBMClassifier
().
fit
(
x_train
,
y_train
)
from
sklearn.datasets
import
load_digits
digits
=
load_digits
(
2
)
y
=
digits
[
'target'
]
X
=
digits
[
'data'
]
x_train
,
x_test
,
y_train
,
y_test
=
model_selection
.
train_test_split
(
X
,
y
,
test_size
=
0.1
,
random_state
=
1
)
lgb_model
=
lgb
.
LGBMClassifier
().
fit
(
x_train
,
y_train
)
preds
=
lgb_model
.
predict
(
x_test
)
err
=
sum
(
1
for
i
in
range
(
len
(
preds
))
if
int
(
preds
[
i
]
>
0.5
)
!=
y_test
[
i
])
/
float
(
len
(
preds
))
assert
err
<
0.1
def
test_multiclass_classification
():
from
sklearn.datasets
import
load_iris
from
sklearn
import
datasets
,
metrics
,
model_selection
def
check_pred
(
preds
,
labels
):
err
=
sum
(
1
for
i
in
range
(
len
(
preds
))
if
int
(
preds
[
i
]
>
0.5
)
!=
labels
[
i
])
/
float
(
len
(
preds
))
assert
err
<
0.7
X
,
y
=
datasets
.
make_classification
(
n_samples
=
10000
,
n_features
=
100
,
n_classes
=
4
,
n_informative
=
3
)
x_train
,
x_test
,
y_train
,
y_test
=
model_selection
.
train_test_split
(
X
,
y
,
test_size
=
0.1
,
random_state
=
1
)
lgb_model
=
lgb
.
LGBMClassifier
().
fit
(
x_train
,
y_train
)
preds
=
lgb_model
.
predict
(
x_test
)
check_pred
(
preds
,
y_test
)
def
test_regression
():
from
sklearn.metrics
import
mean_squared_error
from
sklearn.datasets
import
load_boston
from
sklearn.cross_validation
import
KFold
from
sklearn
import
datasets
,
metrics
,
model_selection
boston
=
load_boston
()
y
=
boston
[
'target'
]
X
=
boston
[
'data'
]
x_train
,
x_test
,
y_train
,
y_test
=
model_selection
.
train_test_split
(
X
,
y
,
test_size
=
0.1
,
random_state
=
1
)
lgb_model
=
lgb
.
LGBMRegressor
().
fit
(
x_train
,
y_train
)
preds
=
lgb_model
.
predict
(
x_test
)
assert
mean_squared_error
(
preds
,
y_test
)
<
100
def
test_regression_with_custom_objective
():
from
sklearn.metrics
import
mean_squared_error
from
sklearn.datasets
import
load_boston
from
sklearn.cross_validation
import
KFold
from
sklearn
import
datasets
,
metrics
,
model_selection
def
objective_ls
(
y_true
,
y_pred
):
grad
=
(
y_pred
-
y_true
)
hess
=
np
.
ones
(
len
(
y_true
))
return
grad
,
hess
boston
=
load_boston
()
y
=
boston
[
'target'
]
X
=
boston
[
'data'
]
x_train
,
x_test
,
y_train
,
y_test
=
model_selection
.
train_test_split
(
X
,
y
,
test_size
=
0.1
,
random_state
=
1
)
lgb_model
=
lgb
.
LGBMRegressor
(
objective
=
objective_ls
).
fit
(
x_train
,
y_train
)
preds
=
lgb_model
.
predict
(
x_test
)
assert
mean_squared_error
(
preds
,
y_test
)
<
100
def
test_binary_classification_with_custom_objective
():
from
sklearn
import
datasets
,
metrics
,
model_selection
def
logregobj
(
y_true
,
y_pred
):
y_pred
=
1.0
/
(
1.0
+
np
.
exp
(
-
y_pred
))
grad
=
y_pred
-
y_true
hess
=
y_pred
*
(
1.0
-
y_pred
)
return
grad
,
hess
X
,
y
=
datasets
.
make_classification
(
n_samples
=
10000
,
n_features
=
100
)
x_train
,
x_test
,
y_train
,
y_test
=
model_selection
.
train_test_split
(
X
,
y
,
test_size
=
0.1
,
random_state
=
1
)
lgb_model
=
lgb
.
LGBMClassifier
(
objective
=
logregobj
).
fit
(
x_train
,
y_train
)
from
sklearn.datasets
import
load_digits
digits
=
load_digits
(
2
)
y
=
digits
[
'target'
]
X
=
digits
[
'data'
]
x_train
,
x_test
,
y_train
,
y_test
=
model_selection
.
train_test_split
(
X
,
y
,
test_size
=
0.2
,
random_state
=
1
)
lgb_model
=
lgb
.
LGBMClassifier
(
objective
=
logregobj
).
fit
(
x_train
,
y_train
)
preds
=
lgb_model
.
predict
(
x_test
)
err
=
sum
(
1
for
i
in
range
(
len
(
preds
))
if
int
(
preds
[
i
]
>
0.5
)
!=
y_test
[
i
])
/
float
(
len
(
preds
))
assert
err
<
0.1
def
test_early_stopping
():
from
sklearn.metrics
import
mean_squared_error
from
sklearn.datasets
import
load_boston
from
sklearn.cross_validation
import
KFold
from
sklearn
import
datasets
,
metrics
,
model_selection
boston
=
load_boston
()
y
=
boston
[
'target'
]
X
=
boston
[
'data'
]
x_train
,
x_test
,
y_train
,
y_test
=
model_selection
.
train_test_split
(
X
,
y
,
test_size
=
0.1
,
random_state
=
1
)
lgb_model
=
lgb
.
LGBMRegressor
(
n_estimators
=
500
)
\
.
fit
(
x_train
,
y_train
,
eval_set
=
[(
x_test
,
y_test
)],
eval_metric
=
'l2'
,
early_stopping_rounds
=
10
,
verbose
=
10
)
print
(
lgb_model
.
best_iteration
)
test_binary_classification
()
test_multiclass_classification
()
test_regression
()
test_regression_with_custom_objective
()
test_binary_classification_with_custom_objective
()
test_early_stopping
()
\ No newline at end of file
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment