Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
292f972e
Commit
292f972e
authored
Dec 28, 2016
by
Guolin Ke
Browse files
decouple num_class in Dataset class
parent
45e0da2c
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
103 additions
and
111 deletions
+103
-111
include/LightGBM/dataset.h
include/LightGBM/dataset.h
+8
-12
include/LightGBM/dataset_loader.h
include/LightGBM/dataset_loader.h
+3
-1
src/application/application.cpp
src/application/application.cpp
+2
-1
src/boosting/score_updater.hpp
src/boosting/score_updater.hpp
+4
-0
src/c_api.cpp
src/c_api.cpp
+7
-7
src/io/config.cpp
src/io/config.cpp
+22
-26
src/io/dataset.cpp
src/io/dataset.cpp
+4
-8
src/io/dataset_loader.cpp
src/io/dataset_loader.cpp
+15
-20
src/io/metadata.cpp
src/io/metadata.cpp
+38
-36
No files found.
include/LightGBM/dataset.h
View file @
292f972e
...
...
@@ -43,9 +43,8 @@ public:
* \brief Initialization will load query-level information, since it is needed for sampling data
* \param data_filename Filename of data
* \param init_score_filename Filename of initial score
* \param num_class Number of classes
*/
void
Init
(
const
char
*
data_filename
,
const
int
num_class
);
void
Init
(
const
char
*
data_filename
);
/*!
* \brief init as subset
* \param metadata Filename of data
...
...
@@ -64,11 +63,10 @@ public:
/*!
* \brief Initial work, will allocate space for label, weight(if exists) and query(if exists)
* \param num_data Number of training data
* \param num_class Number of classes
* \param weight_idx Index of weight column, < 0 means it does not exist
* \param query_idx Index of query id column, < 0 means it does not exist
*/
void
Init
(
data_size_t
num_data
,
int
num_class
,
int
weight_idx
,
int
query_idx
);
void
Init
(
data_size_t
num_data
,
int
weight_idx
,
int
query_idx
);
/*!
* \brief Partition label by used indices
...
...
@@ -203,6 +201,11 @@ public:
}
}
/*!
* \brief Get size of initial scores
*/
inline
data_size_t
num_init_score
()
const
{
return
num_init_score_
;
}
/*! \brief Disable copy */
Metadata
&
operator
=
(
const
Metadata
&
)
=
delete
;
/*! \brief Disable copy */
...
...
@@ -221,8 +224,6 @@ private:
const
char
*
data_filename_
;
/*! \brief Number of data */
data_size_t
num_data_
;
/*! \brief Number of classes */
int
num_class_
;
/*! \brief Number of weights, used to check correct weight file */
data_size_t
num_weights_
;
/*! \brief Label data */
...
...
@@ -281,7 +282,7 @@ public:
Dataset
();
Dataset
(
data_size_t
num_data
,
int
num_class
);
Dataset
(
data_size_t
num_data
);
/*! \brief Destructor */
~
Dataset
();
...
...
@@ -293,9 +294,6 @@ public:
if
(
num_total_features_
!=
other
.
num_total_features_
)
{
return
false
;
}
if
(
num_class_
!=
other
.
num_class_
)
{
return
false
;
}
if
(
label_idx_
!=
other
.
label_idx_
)
{
return
false
;
}
...
...
@@ -408,8 +406,6 @@ private:
int
num_total_features_
;
/*! \brief Number of total data*/
data_size_t
num_data_
;
/*! \brief Number of classes*/
int
num_class_
;
/*! \brief Store some label level data*/
Metadata
metadata_
;
/*! \brief index of label column */
...
...
include/LightGBM/dataset_loader.h
View file @
292f972e
...
...
@@ -8,7 +8,7 @@ namespace LightGBM {
class
DatasetLoader
{
public:
DatasetLoader
(
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
,
const
char
*
filename
);
DatasetLoader
(
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
,
int
num_class
,
const
char
*
filename
);
~
DatasetLoader
();
...
...
@@ -57,6 +57,8 @@ private:
Random
random_
;
/*! \brief prediction function for initial model */
const
PredictFunction
&
predict_fun_
;
/*! \brief number of classes */
int
num_class_
;
/*! \brief index of label column */
int
label_idx_
;
/*! \brief index of weight column */
...
...
src/application/application.cpp
View file @
292f972e
...
...
@@ -120,7 +120,8 @@ void Application::LoadData() {
GlobalSyncUpByMin
<
int
>
(
config_
.
io_config
.
data_random_seed
);
}
DatasetLoader
dataset_loader
(
config_
.
io_config
,
predict_fun
,
config_
.
io_config
.
data_filename
.
c_str
());
DatasetLoader
dataset_loader
(
config_
.
io_config
,
predict_fun
,
boosting_
->
NumberOfClasses
(),
config_
.
io_config
.
data_filename
.
c_str
());
// load Training data
if
(
config_
.
is_parallel_find_bin
)
{
// load data for parallel training
...
...
src/boosting/score_updater.hpp
View file @
292f972e
...
...
@@ -27,6 +27,10 @@ public:
const
float
*
init_score
=
data
->
metadata
().
init_score
();
// if an initial score exists, start from it
if
(
init_score
!=
nullptr
)
{
if
((
data
->
metadata
().
num_init_score
()
%
num_data_
)
!=
0
||
(
data
->
metadata
().
num_init_score
()
/
num_data_
)
!=
num_class
)
{
Log
::
Fatal
(
"number of class for initial score error"
);
}
for
(
size_t
i
=
0
;
i
<
total_size
;
++
i
)
{
score_
[
i
]
=
init_score
[
i
];
}
...
...
src/c_api.cpp
View file @
292f972e
...
...
@@ -245,7 +245,7 @@ DllExport int LGBM_DatasetCreateFromFile(const char* filename,
auto
param
=
ConfigBase
::
Str2Map
(
parameters
);
IOConfig
io_config
;
io_config
.
Set
(
param
);
DatasetLoader
loader
(
io_config
,
nullptr
,
filename
);
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
filename
);
if
(
reference
==
nullptr
)
{
*
out
=
loader
.
LoadFromFile
(
filename
);
}
else
{
...
...
@@ -284,10 +284,10 @@ DllExport int LGBM_DatasetCreateFromMat(const void* data,
}
}
}
DatasetLoader
loader
(
io_config
,
nullptr
,
nullptr
);
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
nullptr
);
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
));
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
,
io_config
.
num_class
));
ret
.
reset
(
new
Dataset
(
nrow
));
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
reference
),
io_config
.
is_enable_sparse
);
...
...
@@ -346,10 +346,10 @@ DllExport int LGBM_DatasetCreateFromCSR(const void* indptr,
}
}
CHECK
(
num_col
>=
static_cast
<
int
>
(
sample_values
.
size
()));
DatasetLoader
loader
(
io_config
,
nullptr
,
nullptr
);
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
nullptr
);
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
));
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
,
io_config
.
num_class
));
ret
.
reset
(
new
Dataset
(
nrow
));
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
reference
),
io_config
.
is_enable_sparse
);
...
...
@@ -396,10 +396,10 @@ DllExport int LGBM_DatasetCreateFromCSC(const void* col_ptr,
auto
cur_col
=
get_col_fun
(
i
);
sample_values
[
i
]
=
SampleFromOneColumn
(
cur_col
,
sample_indices
);
}
DatasetLoader
loader
(
io_config
,
nullptr
,
nullptr
);
DatasetLoader
loader
(
io_config
,
nullptr
,
1
,
nullptr
);
ret
.
reset
(
loader
.
CostructFromSampleData
(
sample_values
,
sample_cnt
,
nrow
));
}
else
{
ret
.
reset
(
new
Dataset
(
nrow
,
io_config
.
num_class
));
ret
.
reset
(
new
Dataset
(
nrow
));
ret
->
CopyFeatureMapperFrom
(
reinterpret_cast
<
const
Dataset
*>
(
reference
),
io_config
.
is_enable_sparse
);
...
...
src/io/config.cpp
View file @
292f972e
...
...
@@ -24,7 +24,7 @@ std::unordered_map<std::string, std::string> ConfigBase::Str2Map(const char* par
continue
;
}
params
[
key
]
=
value
;
}
else
if
(
Common
::
Trim
(
arg
).
size
()
>
0
){
}
else
if
(
Common
::
Trim
(
arg
).
size
()
>
0
)
{
Log
::
Warning
(
"Unknown parameter %s"
,
arg
.
c_str
());
}
}
...
...
@@ -62,14 +62,11 @@ void OverallConfig::Set(const std::unordered_map<std::string, std::string>& para
if
(
io_config
.
verbosity
==
1
)
{
LightGBM
::
Log
::
ResetLogLevel
(
LightGBM
::
LogLevel
::
Info
);
}
else
if
(
io_config
.
verbosity
==
0
)
{
}
else
if
(
io_config
.
verbosity
==
0
)
{
LightGBM
::
Log
::
ResetLogLevel
(
LightGBM
::
LogLevel
::
Warning
);
}
else
if
(
io_config
.
verbosity
>=
2
)
{
}
else
if
(
io_config
.
verbosity
>=
2
)
{
LightGBM
::
Log
::
ResetLogLevel
(
LightGBM
::
LogLevel
::
Debug
);
}
else
{
}
else
{
LightGBM
::
Log
::
ResetLogLevel
(
LightGBM
::
LogLevel
::
Fatal
);
}
}
...
...
@@ -141,22 +138,21 @@ void OverallConfig::CheckParamConflict() {
// check if objective_type, metric_type, and num_class match
bool
objective_type_multiclass
=
(
objective_type
==
std
::
string
(
"multiclass"
));
int
num_class_check
=
boosting_config
.
num_class
;
if
(
objective_type_multiclass
){
if
(
num_class_check
<=
1
){
Log
::
Fatal
(
"Number of classes should be specified and greater than 1 for multiclass training"
);
}
}
else
{
if
(
task_type
==
TaskType
::
kTrain
&&
num_class_check
!=
1
){
Log
::
Fatal
(
"Number of classes must be 1 for non-multiclass training"
);
}
if
(
objective_type_multiclass
)
{
if
(
num_class_check
<=
2
)
{
Log
::
Fatal
(
"Number of classes should be specified and greater than 2 for multiclass training"
);
}
}
else
{
if
(
task_type
==
TaskType
::
kTrain
&&
num_class_check
!=
1
)
{
Log
::
Fatal
(
"Number of classes must be 1 for non-multiclass training"
);
}
}
for
(
std
::
string
metric_type
:
metric_types
){
bool
metric_type_multiclass
=
(
metric_type
==
std
::
string
(
"multi_logloss"
)
||
metric_type
==
std
::
string
(
"multi_error"
));
if
((
objective_type_multiclass
&&
!
metric_type_multiclass
)
||
(
!
objective_type_multiclass
&&
metric_type_multiclass
)){
Log
::
Fatal
(
"Objective and metrics don't match"
);
}
for
(
std
::
string
metric_type
:
metric_types
)
{
bool
metric_type_multiclass
=
(
metric_type
==
std
::
string
(
"multi_logloss"
)
||
metric_type
==
std
::
string
(
"multi_error"
));
if
((
objective_type_multiclass
&&
!
metric_type_multiclass
)
||
(
!
objective_type_multiclass
&&
metric_type_multiclass
))
{
Log
::
Fatal
(
"Objective and metrics don't match"
);
}
}
if
(
network_config
.
num_machines
>
1
)
{
...
...
@@ -178,7 +174,7 @@ void OverallConfig::CheckParamConflict() {
is_parallel_find_bin
=
true
;
if
(
boosting_config
.
tree_config
.
histogram_pool_size
>=
0
)
{
Log
::
Warning
(
"Histogram LRU queue was enabled (histogram_pool_size=%f). Will disable this to reduce communication costs"
,
boosting_config
.
tree_config
.
histogram_pool_size
);
,
boosting_config
.
tree_config
.
histogram_pool_size
);
// Change pool size to -1 (no limit) when using data parallel to reduce communication costs
boosting_config
.
tree_config
.
histogram_pool_size
=
-
1
;
}
...
...
@@ -278,11 +274,11 @@ void TreeConfig::Set(const std::unordered_map<std::string, std::string>& params)
CHECK
(
min_sum_hessian_in_leaf
>
1.0
f
||
min_data_in_leaf
>
0
);
GetDouble
(
params
,
"lambda_l1"
,
&
lambda_l1
);
CHECK
(
lambda_l1
>=
0.0
f
)
GetDouble
(
params
,
"lambda_l2"
,
&
lambda_l2
);
GetDouble
(
params
,
"lambda_l2"
,
&
lambda_l2
);
CHECK
(
lambda_l2
>=
0.0
f
)
GetDouble
(
params
,
"min_gain_to_split"
,
&
min_gain_to_split
);
GetDouble
(
params
,
"min_gain_to_split"
,
&
min_gain_to_split
);
CHECK
(
min_gain_to_split
>=
0.0
f
)
GetInt
(
params
,
"num_leaves"
,
&
num_leaves
);
GetInt
(
params
,
"num_leaves"
,
&
num_leaves
);
CHECK
(
num_leaves
>
1
);
GetInt
(
params
,
"feature_fraction_seed"
,
&
feature_fraction_seed
);
GetDouble
(
params
,
"feature_fraction"
,
&
feature_fraction
);
...
...
src/io/dataset.cpp
View file @
292f972e
...
...
@@ -17,14 +17,12 @@ namespace LightGBM {
const
char
*
Dataset
::
binary_file_token
=
"______LightGBM_Binary_File_Token______
\n
"
;
Dataset
::
Dataset
()
{
num_class_
=
1
;
num_data_
=
0
;
}
Dataset
::
Dataset
(
data_size_t
num_data
,
int
num_class
)
{
num_class_
=
num_class
;
Dataset
::
Dataset
(
data_size_t
num_data
)
{
num_data_
=
num_data
;
metadata_
.
Init
(
num_data_
,
num_class_
,
-
1
,
-
1
);
metadata_
.
Init
(
num_data_
,
-
1
,
-
1
);
}
Dataset
::~
Dataset
()
{
...
...
@@ -50,7 +48,6 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset, bool is_enable_spars
));
}
features_
.
shrink_to_fit
();
num_class_
=
dataset
->
num_class_
;
used_feature_map_
=
dataset
->
used_feature_map_
;
num_features_
=
static_cast
<
int
>
(
features_
.
size
());
num_total_features_
=
dataset
->
num_total_features_
;
...
...
@@ -59,7 +56,7 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset, bool is_enable_spars
}
Dataset
*
Dataset
::
Subset
(
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
,
bool
is_enable_sparse
)
const
{
auto
ret
=
std
::
unique_ptr
<
Dataset
>
(
new
Dataset
(
num_used_indices
,
num_class_
));
auto
ret
=
std
::
unique_ptr
<
Dataset
>
(
new
Dataset
(
num_used_indices
));
ret
->
CopyFeatureMapperFrom
(
this
,
is_enable_sparse
);
#pragma omp parallel for schedule(guided)
for
(
int
fidx
=
0
;
fidx
<
num_features_
;
++
fidx
)
{
...
...
@@ -169,7 +166,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
size_t
size_of_token
=
std
::
strlen
(
binary_file_token
);
fwrite
(
binary_file_token
,
sizeof
(
char
),
size_of_token
,
file
);
// get size of header
size_t
size_of_header
=
sizeof
(
num_data_
)
+
sizeof
(
num_class_
)
+
sizeof
(
num_features_
)
+
sizeof
(
num_total_features_
)
size_t
size_of_header
=
sizeof
(
num_data_
)
+
sizeof
(
num_features_
)
+
sizeof
(
num_total_features_
)
+
sizeof
(
size_t
)
+
sizeof
(
int
)
*
used_feature_map_
.
size
();
// size of feature names
for
(
int
i
=
0
;
i
<
num_total_features_
;
++
i
)
{
...
...
@@ -178,7 +175,6 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
fwrite
(
&
size_of_header
,
sizeof
(
size_of_header
),
1
,
file
);
// write header
fwrite
(
&
num_data_
,
sizeof
(
num_data_
),
1
,
file
);
fwrite
(
&
num_class_
,
sizeof
(
num_class_
),
1
,
file
);
fwrite
(
&
num_features_
,
sizeof
(
num_features_
),
1
,
file
);
fwrite
(
&
num_total_features_
,
sizeof
(
num_features_
),
1
,
file
);
size_t
num_used_feature_map
=
used_feature_map_
.
size
();
...
...
src/io/dataset_loader.cpp
View file @
292f972e
...
...
@@ -8,8 +8,8 @@
namespace
LightGBM
{
DatasetLoader
::
DatasetLoader
(
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
,
const
char
*
filename
)
:
io_config_
(
io_config
),
random_
(
io_config_
.
data_random_seed
),
predict_fun_
(
predict_fun
)
{
DatasetLoader
::
DatasetLoader
(
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
,
int
num_class
,
const
char
*
filename
)
:
io_config_
(
io_config
),
random_
(
io_config_
.
data_random_seed
),
predict_fun_
(
predict_fun
)
,
num_class_
(
num_class
)
{
label_idx_
=
0
;
weight_idx_
=
NO_SPECIFIC
;
group_idx_
=
NO_SPECIFIC
;
...
...
@@ -177,8 +177,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
Log
::
Fatal
(
"Could not recognize data format of %s"
,
filename
);
}
dataset
->
data_filename_
=
filename
;
dataset
->
num_class_
=
io_config_
.
num_class
;
dataset
->
metadata_
.
Init
(
filename
,
dataset
->
num_class_
);
dataset
->
metadata_
.
Init
(
filename
);
if
(
!
io_config_
.
use_two_round_loading
)
{
// read data to memory
auto
text_data
=
LoadTextDataToMemory
(
filename
,
dataset
->
metadata_
,
rank
,
num_machines
,
&
num_global_data
,
&
used_data_indices
);
...
...
@@ -188,7 +187,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
// construct feature bin mappers
ConstructBinMappersFromTextData
(
rank
,
num_machines
,
sample_data
,
parser
.
get
(),
dataset
.
get
());
// initialize label
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
io_config_
.
num_class
,
weight_idx_
,
group_idx_
);
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
weight_idx_
,
group_idx_
);
// extract features
ExtractFeaturesFromMemory
(
text_data
,
parser
.
get
(),
dataset
.
get
());
text_data
.
clear
();
...
...
@@ -203,7 +202,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
// construct feature bin mappers
ConstructBinMappersFromTextData
(
rank
,
num_machines
,
sample_data
,
parser
.
get
(),
dataset
.
get
());
// initialize label
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
dataset
->
num_class_
,
weight_idx_
,
group_idx_
);
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
weight_idx_
,
group_idx_
);
// extract features
ExtractFeaturesFromFile
(
filename
,
parser
.
get
(),
used_data_indices
,
dataset
.
get
());
...
...
@@ -232,14 +231,13 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
Log
::
Fatal
(
"Could not recognize data format of %s"
,
filename
);
}
dataset
->
data_filename_
=
filename
;
dataset
->
num_class_
=
io_config_
.
num_class
;
dataset
->
metadata_
.
Init
(
filename
,
dataset
->
num_class_
);
dataset
->
metadata_
.
Init
(
filename
);
if
(
!
io_config_
.
use_two_round_loading
)
{
// read data in memory
auto
text_data
=
LoadTextDataToMemory
(
filename
,
dataset
->
metadata_
,
0
,
1
,
&
num_global_data
,
&
used_data_indices
);
dataset
->
num_data_
=
static_cast
<
data_size_t
>
(
text_data
.
size
());
// initialize label
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
dataset
->
num_class_
,
weight_idx_
,
group_idx_
);
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
weight_idx_
,
group_idx_
);
dataset
->
CopyFeatureMapperFrom
(
train_data
,
io_config_
.
is_enable_sparse
);
// extract features
ExtractFeaturesFromMemory
(
text_data
,
parser
.
get
(),
dataset
.
get
());
...
...
@@ -250,7 +248,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
dataset
->
num_data_
=
static_cast
<
data_size_t
>
(
text_reader
.
CountLine
());
num_global_data
=
dataset
->
num_data_
;
// initialize label
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
dataset
->
num_class_
,
weight_idx_
,
group_idx_
);
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
weight_idx_
,
group_idx_
);
dataset
->
CopyFeatureMapperFrom
(
train_data
,
io_config_
.
is_enable_sparse
);
// extract features
ExtractFeaturesFromFile
(
filename
,
parser
.
get
(),
used_data_indices
,
dataset
.
get
());
...
...
@@ -316,8 +314,6 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
const
char
*
mem_ptr
=
buffer
.
data
();
dataset
->
num_data_
=
*
(
reinterpret_cast
<
const
data_size_t
*>
(
mem_ptr
));
mem_ptr
+=
sizeof
(
dataset
->
num_data_
);
dataset
->
num_class_
=
*
(
reinterpret_cast
<
const
int
*>
(
mem_ptr
));
mem_ptr
+=
sizeof
(
dataset
->
num_class_
);
dataset
->
num_features_
=
*
(
reinterpret_cast
<
const
int
*>
(
mem_ptr
));
mem_ptr
+=
sizeof
(
dataset
->
num_features_
);
dataset
->
num_total_features_
=
*
(
reinterpret_cast
<
const
int
*>
(
mem_ptr
));
...
...
@@ -448,7 +444,6 @@ Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>&
}
auto
dataset
=
std
::
unique_ptr
<
Dataset
>
(
new
Dataset
());
dataset
->
num_class_
=
io_config_
.
num_class
;
dataset
->
features_
.
clear
();
dataset
->
num_data_
=
num_data
;
// -1 means this feature is not used
...
...
@@ -482,7 +477,7 @@ Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>&
}
dataset
->
feature_names_
=
feature_names_
;
dataset
->
num_features_
=
static_cast
<
int
>
(
dataset
->
features_
.
size
());
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
dataset
->
num_class_
,
NO_SPECIFIC
,
NO_SPECIFIC
);
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
NO_SPECIFIC
,
NO_SPECIFIC
);
return
dataset
.
release
();
}
...
...
@@ -798,7 +793,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
}
}
else
{
// if initial scores need to be predicted with the initial model
std
::
vector
<
score_t
>
init_score
(
dataset
->
num_data_
*
dataset
->
num_class_
);
std
::
vector
<
score_t
>
init_score
(
dataset
->
num_data_
*
num_class_
);
#pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
for
(
data_size_t
i
=
0
;
i
<
dataset
->
num_data_
;
++
i
)
{
const
int
tid
=
omp_get_thread_num
();
...
...
@@ -807,7 +802,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
parser
->
ParseOneLine
(
text_data
[
i
].
c_str
(),
&
oneline_features
,
&
tmp_label
);
// set initial score
std
::
vector
<
double
>
oneline_init_score
=
predict_fun_
(
oneline_features
);
for
(
int
k
=
0
;
k
<
dataset
->
num_class_
;
++
k
)
{
for
(
int
k
=
0
;
k
<
num_class_
;
++
k
)
{
init_score
[
k
*
dataset
->
num_data_
+
i
]
=
static_cast
<
float
>
(
oneline_init_score
[
k
]);
}
// set label
...
...
@@ -833,7 +828,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
}
}
// metadata_ will manage space of init_score
dataset
->
metadata_
.
SetInitScore
(
init_score
.
data
(),
dataset
->
num_data_
*
dataset
->
num_class_
);
dataset
->
metadata_
.
SetInitScore
(
init_score
.
data
(),
dataset
->
num_data_
*
num_class_
);
}
dataset
->
FinishLoad
();
// text data can be freed after the feature values are loaded
...
...
@@ -844,7 +839,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_dat
void
DatasetLoader
::
ExtractFeaturesFromFile
(
const
char
*
filename
,
const
Parser
*
parser
,
const
std
::
vector
<
data_size_t
>&
used_data_indices
,
Dataset
*
dataset
)
{
std
::
vector
<
score_t
>
init_score
;
if
(
predict_fun_
!=
nullptr
)
{
init_score
=
std
::
vector
<
score_t
>
(
dataset
->
num_data_
*
dataset
->
num_class_
);
init_score
=
std
::
vector
<
score_t
>
(
dataset
->
num_data_
*
num_class_
);
}
std
::
function
<
void
(
data_size_t
,
const
std
::
vector
<
std
::
string
>&
)
>
process_fun
=
[
this
,
&
init_score
,
&
parser
,
&
dataset
]
...
...
@@ -860,7 +855,7 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
// set initial score
if
(
!
init_score
.
empty
())
{
std
::
vector
<
double
>
oneline_init_score
=
predict_fun_
(
oneline_features
);
for
(
int
k
=
0
;
k
<
dataset
->
num_class_
;
++
k
)
{
for
(
int
k
=
0
;
k
<
num_class_
;
++
k
)
{
init_score
[
k
*
dataset
->
num_data_
+
start_idx
+
i
]
=
static_cast
<
float
>
(
oneline_init_score
[
k
]);
}
}
...
...
@@ -894,7 +889,7 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
// metadata_ will manage space of init_score
if
(
!
init_score
.
empty
())
{
dataset
->
metadata_
.
SetInitScore
(
init_score
.
data
(),
dataset
->
num_data_
*
dataset
->
num_class_
);
dataset
->
metadata_
.
SetInitScore
(
init_score
.
data
(),
dataset
->
num_data_
*
num_class_
);
}
dataset
->
FinishLoad
();
}
...
...
src/io/metadata.cpp
View file @
292f972e
...
...
@@ -10,9 +10,8 @@ namespace LightGBM {
Metadata
::
Metadata
()
{
}
void
Metadata
::
Init
(
const
char
*
data_filename
,
const
int
num_class
)
{
void
Metadata
::
Init
(
const
char
*
data_filename
)
{
data_filename_
=
data_filename
;
num_class_
=
num_class
;
// lambdarank needs query data in order to partition data in parallel learning
LoadQueryBoundaries
();
LoadWeights
();
...
...
@@ -20,15 +19,11 @@ void Metadata::Init(const char * data_filename, const int num_class) {
LoadInitialScore
();
}
Metadata
::~
Metadata
()
{
}
void
Metadata
::
Init
(
data_size_t
num_data
,
int
num_class
,
int
weight_idx
,
int
query_idx
)
{
void
Metadata
::
Init
(
data_size_t
num_data
,
int
weight_idx
,
int
query_idx
)
{
num_data_
=
num_data
;
num_class_
=
num_class
;
label_
=
std
::
vector
<
float
>
(
num_data_
);
if
(
weight_idx
>=
0
)
{
if
(
!
weights_
.
empty
())
{
...
...
@@ -52,7 +47,6 @@ void Metadata::Init(data_size_t num_data, int num_class, int weight_idx, int que
void
Metadata
::
Init
(
const
Metadata
&
fullset
,
const
data_size_t
*
used_indices
,
data_size_t
num_used_indices
)
{
num_data_
=
num_used_indices
;
num_class_
=
fullset
.
num_class_
;
label_
=
std
::
vector
<
float
>
(
num_used_indices
);
for
(
data_size_t
i
=
0
;
i
<
num_used_indices
;
i
++
)
{
...
...
@@ -70,10 +64,13 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, da
}
if
(
!
fullset
.
init_score_
.
empty
())
{
init_score_
=
std
::
vector
<
float
>
(
num_used_indices
);
num_init_score_
=
num_used_indices
;
for
(
data_size_t
i
=
0
;
i
<
num_used_indices
;
i
++
)
{
init_score_
[
i
]
=
fullset
.
init_score_
[
used_indices
[
i
]];
int
num_class
=
static_cast
<
int
>
(
fullset
.
num_init_score_
)
/
fullset
.
num_data_
;
init_score_
=
std
::
vector
<
float
>
(
num_used_indices
*
num_class
);
num_init_score_
=
num_used_indices
*
num_class
;
for
(
int
k
=
0
;
k
<
num_class
;
++
k
)
{
for
(
data_size_t
i
=
0
;
i
<
num_used_indices
;
i
++
)
{
init_score_
[
k
*
num_data_
+
i
]
=
fullset
.
init_score_
[
k
*
fullset
.
num_data_
+
used_indices
[
i
]];
}
}
}
else
{
num_init_score_
=
0
;
...
...
@@ -168,7 +165,7 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
}
// contain initial score file
if
(
!
init_score_
.
empty
()
&&
num_init_score_
!=
num_data_
)
{
if
(
!
init_score_
.
empty
()
&&
(
num_init_score_
%
num_data_
)
!=
0
)
{
init_score_
.
clear
();
num_init_score_
=
0
;
Log
::
Fatal
(
"Initial score size doesn't match data size"
);
...
...
@@ -189,7 +186,7 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
}
// contain initial score file
if
(
!
init_score_
.
empty
()
&&
num_init_score_
!=
num_all_data
)
{
if
(
!
init_score_
.
empty
()
&&
(
num_init_score_
%
num_all_data
)
!=
0
)
{
init_score_
.
clear
();
num_init_score_
=
0
;
Log
::
Fatal
(
"Initial score size doesn't match data size"
);
...
...
@@ -242,9 +239,10 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
// get local initial scores
if
(
!
init_score_
.
empty
())
{
auto
old_scores
=
init_score_
;
num_init_score_
=
num_data_
;
init_score_
=
std
::
vector
<
float
>
(
num_init_score_
*
num_class_
);
for
(
int
k
=
0
;
k
<
num_class_
;
++
k
){
int
num_class
=
num_init_score_
/
num_all_data
;
num_init_score_
=
num_data_
*
num_class
;
init_score_
=
std
::
vector
<
float
>
(
num_init_score_
);
for
(
int
k
=
0
;
k
<
num_class
;
++
k
){
for
(
size_t
i
=
0
;
i
<
used_data_indices
.
size
();
++
i
)
{
init_score_
[
k
*
num_data_
+
i
]
=
old_scores
[
k
*
num_all_data
+
used_data_indices
[
i
]];
}
...
...
@@ -266,11 +264,11 @@ void Metadata::SetInitScore(const float* init_score, data_size_t len) {
num_init_score_
=
0
;
return
;
}
if
(
len
!=
num_data_
*
num_class_
)
{
if
(
(
len
%
num_data_
)
!=
0
)
{
Log
::
Fatal
(
"Initial score size doesn't match data size"
);
}
if
(
!
init_score_
.
empty
())
{
init_score_
.
clear
();
}
num_init_score_
=
num_data_
;
num_init_score_
=
len
;
init_score_
=
std
::
vector
<
float
>
(
len
);
for
(
data_size_t
i
=
0
;
i
<
len
;
++
i
)
{
init_score_
[
i
]
=
init_score
[
i
];
...
...
@@ -410,28 +408,32 @@ void Metadata::LoadInitialScore() {
return
;
}
Log
::
Info
(
"Loading initial scores..."
);
num_init_score_
=
static_cast
<
data_size_t
>
(
reader
.
Lines
().
size
());
init_score_
=
std
::
vector
<
float
>
(
num_init_score_
*
num_class_
);
// use the first line to count the number of classes
int
num_class
=
static_cast
<
int
>
(
Common
::
Split
(
reader
.
Lines
()[
0
].
c_str
(),
'\t'
).
size
());
data_size_t
num_line
=
static_cast
<
data_size_t
>
(
reader
.
Lines
().
size
());
num_init_score_
=
static_cast
<
data_size_t
>
(
num_line
*
num_class
);
init_score_
=
std
::
vector
<
float
>
(
num_init_score_
);
double
tmp
=
0.0
f
;
if
(
num_class
_
==
1
){
for
(
data_size_t
i
=
0
;
i
<
num_in
it_score_
;
++
i
)
{
Common
::
Atof
(
reader
.
Lines
()[
i
].
c_str
(),
&
tmp
);
init_score_
[
i
]
=
static_cast
<
float
>
(
tmp
);
}
if
(
num_class
==
1
)
{
for
(
data_size_t
i
=
0
;
i
<
num_
l
in
e
;
++
i
)
{
Common
::
Atof
(
reader
.
Lines
()[
i
].
c_str
(),
&
tmp
);
init_score_
[
i
]
=
static_cast
<
float
>
(
tmp
);
}
}
else
{
std
::
vector
<
std
::
string
>
oneline_init_score
;
for
(
data_size_t
i
=
0
;
i
<
num_init_score_
;
++
i
)
{
oneline_init_score
=
Common
::
Split
(
reader
.
Lines
()[
i
].
c_str
(),
'\t'
);
if
(
static_cast
<
int
>
(
oneline_init_score
.
size
())
!=
num_class_
){
Log
::
Fatal
(
"Invalid initial score file. Redundant or insufficient columns."
);
}
for
(
int
k
=
0
;
k
<
num_class_
;
++
k
)
{
Common
::
Atof
(
oneline_init_score
[
k
].
c_str
(),
&
tmp
);
init_score_
[
k
*
num_init_score_
+
i
]
=
static_cast
<
float
>
(
tmp
);
}
std
::
vector
<
std
::
string
>
oneline_init_score
;
for
(
data_size_t
i
=
0
;
i
<
num_line
;
++
i
)
{
oneline_init_score
=
Common
::
Split
(
reader
.
Lines
()[
i
].
c_str
(),
'\t'
);
if
(
static_cast
<
int
>
(
oneline_init_score
.
size
())
!=
num_class
)
{
Log
::
Fatal
(
"Invalid initial score file. Redundant or insufficient columns."
);
}
for
(
int
k
=
0
;
k
<
num_class
;
++
k
)
{
Common
::
Atof
(
oneline_init_score
[
k
].
c_str
(),
&
tmp
);
init_score_
[
k
*
num_line
+
i
]
=
static_cast
<
float
>
(
tmp
);
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment