Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
1c08e71e
Commit
1c08e71e
authored
Nov 04, 2016
by
Guolin Ke
Browse files
use dataset_loader to load data
parent
8696709e
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
898 additions
and
1067 deletions
+898
-1067
include/LightGBM/application.h
include/LightGBM/application.h
+3
-0
include/LightGBM/config.h
include/LightGBM/config.h
+0
-2
include/LightGBM/dataset.h
include/LightGBM/dataset.h
+10
-149
src/application/application.cpp
src/application/application.cpp
+11
-16
src/c_api.cpp
src/c_api.cpp
+4
-1
src/io/config.cpp
src/io/config.cpp
+0
-1
src/io/dataset.cpp
src/io/dataset.cpp
+21
-888
src/io/dataset_loader.cpp
src/io/dataset_loader.cpp
+833
-0
src/io/metadata.cpp
src/io/metadata.cpp
+8
-10
windows/LightGBM.vcxproj
windows/LightGBM.vcxproj
+2
-0
windows/LightGBM.vcxproj.filters
windows/LightGBM.vcxproj.filters
+6
-0
No files found.
include/LightGBM/application.h
View file @
1c08e71e
...
...
@@ -8,6 +8,7 @@
namespace
LightGBM
{
class
DatasetLoader
;
class
Dataset
;
class
Boosting
;
class
ObjectiveFunction
;
...
...
@@ -59,6 +60,8 @@ private:
/*! \brief All configs */
OverallConfig
config_
;
/*! \brief Dataset loader */
DatasetLoader
*
dataset_loader_
;
/*! \brief Training data */
Dataset
*
train_data_
;
/*! \brief Validation data */
...
...
include/LightGBM/config.h
View file @
1c08e71e
...
...
@@ -93,7 +93,6 @@ public:
std
::
string
output_model
=
"LightGBM_model.txt"
;
std
::
string
output_result
=
"LightGBM_predict_result.txt"
;
std
::
string
input_model
=
""
;
std
::
string
input_init_score
=
""
;
int
verbosity
=
1
;
int
num_model_predict
=
-
1
;
bool
is_pre_partition
=
false
;
...
...
@@ -318,7 +317,6 @@ struct ParameterAlias {
{
"model_out"
,
"output_model"
},
{
"model_input"
,
"input_model"
},
{
"model_in"
,
"input_model"
},
{
"init_score"
,
"input_init_score"
},
{
"predict_result"
,
"output_result"
},
{
"prediction_result"
,
"output_result"
},
{
"valid"
,
"valid_data"
},
...
...
include/LightGBM/dataset.h
View file @
1c08e71e
#ifndef LIGHTGBM_DATA_H_
#define LIGHTGBM_DATA_H_
#ifndef LIGHTGBM_DATA
SET
_H_
#define LIGHTGBM_DATA
SET
_H_
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/text_reader.h>
...
...
@@ -18,6 +18,7 @@ namespace LightGBM {
/*! \brief forward declaration */
class
Feature
;
class
BinMapper
;
class
DatasetLoader
;
/*!
* \brief This class is used to store some meta(non-feature) data for training data,
...
...
@@ -44,13 +45,7 @@ public:
* \param init_score_filename Filename of initial score
* \param num_class Number of classes
*/
void
Init
(
const
char
*
data_filename
,
const
char
*
init_score_filename
,
const
int
num_class
);
/*!
* \brief Initialize, only load initial score
* \param init_score_filename Filename of initial score
* \param num_class Number of classes
*/
void
Init
(
const
char
*
init_score_filename
,
const
int
num_class
);
void
Init
(
const
char
*
data_filename
,
const
int
num_class
);
/*!
* \brief Initial with binary memory
* \param memory Pointer to memory
...
...
@@ -177,11 +172,10 @@ public:
* \return Pointer of initial scores
*/
inline
const
float
*
init_score
()
const
{
return
init_score_
;
}
/*! \brief Load initial scores from file */
void
LoadInitialScore
();
private:
/*! \brief Load initial scores from file */
void
LoadInitialScore
();
/*! \brief Load wights from file */
void
LoadWeights
();
/*! \brief Load query boundaries from file */
...
...
@@ -190,8 +184,6 @@ private:
void
LoadQueryWeights
();
/*! \brief Filename of current data */
const
char
*
data_filename_
;
/*! \brief Filename of initial scores */
const
char
*
init_score_filename_
;
/*! \brief Number of data */
data_size_t
num_data_
;
/*! \brief Number of classes */
...
...
@@ -251,79 +243,16 @@ using PredictFunction =
*/
class
Dataset
{
public:
/*!
* \brief Constructor
* \param data_filename Filename of dataset
* \param init_score_filename Filename of initial score
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset
(
const
char
*
data_filename
,
const
char
*
init_score_filename
,
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
);
friend
DatasetLoader
;
/*!
* \brief Constructor
* \param data_filename Filename of dataset
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset
(
const
char
*
data_filename
,
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
)
:
Dataset
(
data_filename
,
""
,
io_config
,
predict_fun
)
{
}
/*!
* \brief Constructor, without filename, used to load data from memory
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset
(
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
);
Dataset
();
/*! \brief Destructor */
~
Dataset
();
/*! \brief Init Dataset with specific binmapper */
void
InitByBinMapper
(
std
::
vector
<
const
BinMapper
*>
bin_mappers
,
data_size_t
num_data
);
/*! \brief push raw data into dataset */
void
PushData
(
const
std
::
vector
<
std
::
vector
<
std
::
pair
<
int
,
float
>>>&
datas
,
data_size_t
start_idx
,
bool
is_finished
);
void
SetField
(
const
char
*
field_name
,
const
void
*
field_data
,
data_size_t
num_element
,
int
type
);
/*!
* \brief Load training data on parallel training
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
* \param use_two_round_loading True if need to use two round loading
*/
void
LoadTrainData
(
int
rank
,
int
num_machines
,
bool
is_pre_partition
,
bool
use_two_round_loading
);
/*!
* \brief Load training data on single machine training
* \param use_two_round_loading True if need to use two round loading
*/
inline
void
LoadTrainData
(
bool
use_two_round_loading
)
{
LoadTrainData
(
0
,
1
,
false
,
use_two_round_loading
);
}
/*!
* \brief Load data and use bin mapper from other data set, general this function is used to extract feature for validation data
* \param train_set Other loaded data set
* \param use_two_round_loading True if need to use two round loading
*/
void
LoadValidationData
(
const
Dataset
*
train_set
,
bool
use_two_round_loading
);
/*!
* \brief Load data set from binary file
* \param bin_filename filename of bin data
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
*/
void
LoadDataFromBinFile
(
const
char
*
bin_filename
,
int
rank
,
int
num_machines
,
bool
is_pre_partition
);
/*!
* \brief Save current dataset into binary file, will save to "filename.bin"
*/
...
...
@@ -331,6 +260,8 @@ public:
std
::
vector
<
const
BinMapper
*>
GetBinMappers
()
const
;
void
CopyFeatureMetadataTo
(
Dataset
*
dataset
,
bool
is_enable_sparse
)
const
;
/*!
* \brief Get a feature pointer for specific index
* \param i Index for feature
...
...
@@ -365,57 +296,7 @@ public:
Dataset
(
const
Dataset
&
)
=
delete
;
private:
/*!
* \brief Load data content on memory. if num_machines > 1 and !is_pre_partition, will partition data
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
*/
void
LoadDataToMemory
(
int
rank
,
int
num_machines
,
bool
is_pre_partition
);
/*!
* \brief Sample data from memory, need load data to memory first
* \param out_data Store the sampled data
*/
void
SampleDataFromMemory
(
std
::
vector
<
std
::
string
>*
out_data
);
/*!
* \brief Sample data from file
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
* \param out_data Store the sampled data
*/
void
SampleDataFromFile
(
int
rank
,
int
num_machines
,
bool
is_pre_partition
,
std
::
vector
<
std
::
string
>*
out_data
);
/*!
* \brief Get feature bin mapper from sampled data.
* if num_machines > 1, differnt machines will construct bin mapper for different features, then have a global sync up
* \param rank Rank of local machine
* \param num_machines Total number of all machines
*/
void
ConstructBinMappers
(
int
rank
,
int
num_machines
,
const
std
::
vector
<
std
::
string
>&
sample_data
);
/*! \brief Extract local features from memory */
void
ExtractFeaturesFromMemory
();
/*! \brief Extract local features from file */
void
ExtractFeaturesFromFile
();
/*! \brief Check can load from binary file */
void
CheckCanLoadFromBin
();
/*! \brief Check this data set is null or not */
void
CheckDataset
();
/*! \brief Filename of data */
const
char
*
data_filename_
;
/*! \brief A reader class that can read text data */
TextReader
<
data_size_t
>*
text_reader_
;
/*! \brief A parser class that can parse data */
Parser
*
parser_
;
/*! \brief Store used features */
std
::
vector
<
Feature
*>
features_
;
/*! \brief Mapper from real feature index to used index*/
...
...
@@ -430,32 +311,12 @@ private:
int
num_class_
;
/*! \brief Store some label level data*/
Metadata
metadata_
;
/*! \brief Random generator*/
Random
random_
;
/*! \brief The maximal number of bin that feature values will bucket in */
int
max_bin_
;
/*! \brief True if enable sparse */
bool
is_enable_sparse_
;
/*! \brief True if dataset is loaded from binary file */
bool
is_loading_from_binfile_
;
/*! \brief Number of global data, used for distributed learning */
size_t
global_num_data_
=
0
;
/*! \brief used to local used data indices */
std
::
vector
<
data_size_t
>
used_data_indices_
;
/*! \brief prediction function for initial model */
const
PredictFunction
&
predict_fun_
;
/*! \brief index of label column */
int
label_idx_
=
0
;
/*! \brief index of weight column */
int
weight_idx_
=
-
1
;
/*! \brief index of group column */
int
group_idx_
=
-
1
;
/*! \brief Mapper from real feature index to used index*/
std
::
unordered_set
<
int
>
ignore_features_
;
/*! \brief store feature names */
std
::
vector
<
std
::
string
>
feature_names_
;
/*! \brief store feature names */
int
bin_construct_sample_cnt_
;
};
}
// namespace LightGBM
...
...
src/application/application.cpp
View file @
1c08e71e
...
...
@@ -5,6 +5,7 @@
#include <LightGBM/network.h>
#include <LightGBM/dataset.h>
#include <LightGBM/dataset_loader.h>
#include <LightGBM/boosting.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
...
...
@@ -26,7 +27,7 @@
namespace
LightGBM
{
Application
::
Application
(
int
argc
,
char
**
argv
)
:
train_data_
(
nullptr
),
boosting_
(
nullptr
),
objective_fun_
(
nullptr
)
{
:
dataset_loader_
(
nullptr
),
train_data_
(
nullptr
),
boosting_
(
nullptr
),
objective_fun_
(
nullptr
)
{
LoadParameters
(
argc
,
argv
);
// set number of threads for openmp
if
(
config_
.
num_threads
>
0
)
{
...
...
@@ -35,6 +36,7 @@ Application::Application(int argc, char** argv)
}
Application
::~
Application
()
{
if
(
dataset_loader_
!=
nullptr
)
{
delete
dataset_loader_
;
}
if
(
train_data_
!=
nullptr
)
{
delete
train_data_
;
}
for
(
auto
&
data
:
valid_datas_
)
{
if
(
data
!=
nullptr
)
{
delete
data
;
}
...
...
@@ -141,19 +143,17 @@ void Application::LoadData() {
config_
.
io_config
.
data_random_seed
=
GlobalSyncUpByMin
<
int
>
(
config_
.
io_config
.
data_random_seed
);
}
train_data_
=
new
Dataset
(
config_
.
io_config
.
data_filename
.
c_str
(),
config_
.
io_config
.
input_init_score
.
c_str
(),
config_
.
io_config
,
predict_fun
);
dataset_loader_
=
new
DatasetLoader
(
config_
.
io_config
,
predict_fun
);
dataset_loader_
->
SetHeadder
(
config_
.
io_config
.
data_filename
.
c_str
());
// load Training data
if
(
config_
.
is_parallel_find_bin
)
{
// load data for parallel training
train_data_
->
LoadTrainData
(
Network
::
rank
(),
Network
::
num_machines
(),
config_
.
io_config
.
is_pre_partition
,
config_
.
io_config
.
use_two_round_loading
);
train_data_
=
dataset_loader_
->
LoadFromFile
(
config_
.
io_config
.
data_filename
.
c_str
(),
Network
::
rank
(),
Network
::
num_machines
());
}
else
{
// load data for single machine
train_data_
->
LoadTrainData
(
config_
.
io_config
.
use_two_round_loading
);
train_data_
=
dataset_loader_
->
LoadFromFile
(
config_
.
io_config
.
data_filename
.
c_str
(),
0
,
1
);
}
// need save binary file
if
(
config_
.
io_config
.
is_save_binary_file
)
{
...
...
@@ -173,13 +173,8 @@ void Application::LoadData() {
// Add validation data, if it exists
for
(
size_t
i
=
0
;
i
<
config_
.
io_config
.
valid_data_filenames
.
size
();
++
i
)
{
// add
valid_datas_
.
push_back
(
new
Dataset
(
config_
.
io_config
.
valid_data_filenames
[
i
].
c_str
(),
config_
.
io_config
,
predict_fun
));
// load validation data like train data
valid_datas_
.
back
()
->
LoadValidationData
(
train_data_
,
config_
.
io_config
.
use_two_round_loading
);
valid_datas_
.
push_back
(
dataset_loader_
->
LoadFromFileLikeOthers
(
config_
.
io_config
.
valid_data_filenames
[
i
].
c_str
(),
train_data_
));
// need save binary file
if
(
config_
.
io_config
.
is_save_binary_file
)
{
valid_datas_
.
back
()
->
SaveBinaryFile
(
nullptr
);
...
...
src/c_api.cpp
View file @
1c08e71e
#include <LightGBM/c_api.h>
#include <LightGBM/dataset.h>
#include <LightGBM/boosting.h>
...
...
@@ -10,6 +9,7 @@
#include <vector>
#include <string>
#include <cstring>
#include <memory>
namespace
LightGBM
{
...
...
@@ -100,3 +100,6 @@ private:
};
}
using
namespace
LightGBM
;
src/io/config.cpp
View file @
1c08e71e
...
...
@@ -202,7 +202,6 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetString
(
params
,
"output_model"
,
&
output_model
);
GetString
(
params
,
"input_model"
,
&
input_model
);
GetString
(
params
,
"output_result"
,
&
output_result
);
GetString
(
params
,
"input_init_score"
,
&
input_init_score
);
std
::
string
tmp_str
=
""
;
if
(
GetString
(
params
,
"valid_data"
,
&
tmp_str
))
{
valid_data_filenames
=
Common
::
Split
(
tmp_str
.
c_str
(),
','
);
...
...
src/io/dataset.cpp
View file @
1c08e71e
...
...
@@ -15,295 +15,32 @@
namespace
LightGBM
{
Dataset
::
Dataset
(
const
char
*
data_filename
,
const
char
*
init_score_filename
,
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
)
:
data_filename_
(
data_filename
),
random_
(
io_config
.
data_random_seed
),
max_bin_
(
io_config
.
max_bin
),
is_enable_sparse_
(
io_config
.
is_enable_sparse
),
predict_fun_
(
predict_fun
),
bin_construct_sample_cnt_
(
io_config
.
bin_construct_sample_cnt
)
{
num_class_
=
io_config
.
num_class
;
if
(
io_config
.
enable_load_from_binary_file
)
{
CheckCanLoadFromBin
();
}
if
(
is_loading_from_binfile_
&&
predict_fun
!=
nullptr
)
{
Log
::
Info
(
"Cannot initialize prediction by using a binary file, using text file instead"
);
is_loading_from_binfile_
=
false
;
}
if
(
!
is_loading_from_binfile_
)
{
// load weight, query information and initialize score
metadata_
.
Init
(
data_filename
,
init_score_filename
,
num_class_
);
// create text reader
text_reader_
=
new
TextReader
<
data_size_t
>
(
data_filename
,
io_config
.
has_header
);
std
::
unordered_map
<
std
::
string
,
int
>
name2idx
;
// get column names
if
(
io_config
.
has_header
)
{
std
::
string
first_line
=
text_reader_
->
first_line
();
feature_names_
=
Common
::
Split
(
first_line
.
c_str
(),
"
\t
,"
);
for
(
size_t
i
=
0
;
i
<
feature_names_
.
size
();
++
i
)
{
name2idx
[
feature_names_
[
i
]]
=
static_cast
<
int
>
(
i
);
}
}
std
::
string
name_prefix
(
"name:"
);
// load label idx
if
(
io_config
.
label_column
.
size
()
>
0
)
{
if
(
Common
::
StartsWith
(
io_config
.
label_column
,
name_prefix
))
{
std
::
string
name
=
io_config
.
label_column
.
substr
(
name_prefix
.
size
());
if
(
name2idx
.
count
(
name
)
>
0
)
{
label_idx_
=
name2idx
[
name
];
Log
::
Info
(
"Using column %s as label"
,
name
.
c_str
());
}
else
{
Log
::
Fatal
(
"Could not find label column %s in data file"
,
name
.
c_str
());
}
}
else
{
if
(
!
Common
::
AtoiAndCheck
(
io_config
.
label_column
.
c_str
(),
&
label_idx_
))
{
Log
::
Fatal
(
"label_column is not a number, \
if you want to use a column name, \
please add the prefix
\"
name:
\"
to the column name"
);
}
Log
::
Info
(
"Using column number %d as label"
,
label_idx_
);
}
}
if
(
feature_names_
.
size
()
>
0
)
{
// erase label column name
feature_names_
.
erase
(
feature_names_
.
begin
()
+
label_idx_
);
}
// load ignore columns
if
(
io_config
.
ignore_column
.
size
()
>
0
)
{
if
(
Common
::
StartsWith
(
io_config
.
ignore_column
,
name_prefix
))
{
std
::
string
names
=
io_config
.
ignore_column
.
substr
(
name_prefix
.
size
());
for
(
auto
name
:
Common
::
Split
(
names
.
c_str
(),
','
))
{
if
(
name2idx
.
count
(
name
)
>
0
)
{
int
tmp
=
name2idx
[
name
];
// skip for label column
if
(
tmp
>
label_idx_
)
{
tmp
-=
1
;
}
ignore_features_
.
emplace
(
tmp
);
}
else
{
Log
::
Fatal
(
"Could not find ignore column %s in data file"
,
name
.
c_str
());
}
}
}
else
{
for
(
auto
token
:
Common
::
Split
(
io_config
.
ignore_column
.
c_str
(),
','
))
{
int
tmp
=
0
;
if
(
!
Common
::
AtoiAndCheck
(
token
.
c_str
(),
&
tmp
))
{
Log
::
Fatal
(
"ignore_column is not a number, \
if you want to use a column name, \
please add the prefix
\"
name:
\"
to the column name"
);
}
// skip for label column
if
(
tmp
>
label_idx_
)
{
tmp
-=
1
;
}
ignore_features_
.
emplace
(
tmp
);
}
}
}
// load weight idx
if
(
io_config
.
weight_column
.
size
()
>
0
)
{
if
(
Common
::
StartsWith
(
io_config
.
weight_column
,
name_prefix
))
{
std
::
string
name
=
io_config
.
weight_column
.
substr
(
name_prefix
.
size
());
if
(
name2idx
.
count
(
name
)
>
0
)
{
weight_idx_
=
name2idx
[
name
];
Log
::
Info
(
"Using column %s as weight"
,
name
.
c_str
());
}
else
{
Log
::
Fatal
(
"Could not find weight column %s in data file"
,
name
.
c_str
());
}
}
else
{
if
(
!
Common
::
AtoiAndCheck
(
io_config
.
weight_column
.
c_str
(),
&
weight_idx_
))
{
Log
::
Fatal
(
"weight_column is not a number, \
if you want to use a column name, \
please add the prefix
\"
name:
\"
to the column name"
);
}
Log
::
Info
(
"Using column number %d as weight"
,
weight_idx_
);
}
// skip for label column
if
(
weight_idx_
>
label_idx_
)
{
weight_idx_
-=
1
;
}
ignore_features_
.
emplace
(
weight_idx_
);
}
if
(
io_config
.
group_column
.
size
()
>
0
)
{
if
(
Common
::
StartsWith
(
io_config
.
group_column
,
name_prefix
))
{
std
::
string
name
=
io_config
.
group_column
.
substr
(
name_prefix
.
size
());
if
(
name2idx
.
count
(
name
)
>
0
)
{
group_idx_
=
name2idx
[
name
];
Log
::
Info
(
"Using column %s as group/query id"
,
name
.
c_str
());
}
else
{
Log
::
Fatal
(
"Could not find group/query column %s in data file"
,
name
.
c_str
());
}
}
else
{
if
(
!
Common
::
AtoiAndCheck
(
io_config
.
group_column
.
c_str
(),
&
group_idx_
))
{
Log
::
Fatal
(
"group_column is not a number, \
if you want to use a column name, \
please add the prefix
\"
name:
\"
to the column name"
);
}
Log
::
Info
(
"Using column number %d as group/query id"
,
group_idx_
);
}
// skip for label column
if
(
group_idx_
>
label_idx_
)
{
group_idx_
-=
1
;
}
ignore_features_
.
emplace
(
group_idx_
);
}
// create text parser
parser_
=
Parser
::
CreateParser
(
data_filename_
,
io_config
.
has_header
,
0
,
label_idx_
);
if
(
parser_
==
nullptr
)
{
Log
::
Fatal
(
"Could not recognize data format of %s"
,
data_filename_
);
}
}
else
{
// only need to load initialize score, other meta data will be loaded from binary file
metadata_
.
Init
(
init_score_filename
,
num_class_
);
Log
::
Info
(
"Loading data set from binary file"
);
parser_
=
nullptr
;
text_reader_
=
nullptr
;
}
Dataset
::
Dataset
()
{
num_class_
=
1
;
num_data_
=
0
;
is_loading_from_binfile_
=
false
;
}
Dataset
::
Dataset
(
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
)
:
data_filename_
(
""
),
random_
(
io_config
.
data_random_seed
),
max_bin_
(
io_config
.
max_bin
),
is_enable_sparse_
(
io_config
.
is_enable_sparse
),
predict_fun_
(
predict_fun
),
bin_construct_sample_cnt_
(
io_config
.
bin_construct_sample_cnt
)
{
num_class_
=
io_config
.
num_class
;
parser_
=
nullptr
;
text_reader_
=
nullptr
;
}
Dataset
::~
Dataset
()
{
if
(
parser_
!=
nullptr
)
{
delete
parser_
;
}
if
(
text_reader_
!=
nullptr
)
{
delete
text_reader_
;
}
for
(
auto
&
feature
:
features_
)
{
delete
feature
;
}
features_
.
clear
();
}
void
Dataset
::
LoadDataToMemory
(
int
rank
,
int
num_machines
,
bool
is_pre_partition
)
{
used_data_indices_
.
clear
();
if
(
num_machines
==
1
||
is_pre_partition
)
{
// read all lines
num_data_
=
text_reader_
->
ReadAllLines
();
global_num_data_
=
num_data_
;
}
else
{
// need partition data
// get query data
const
data_size_t
*
query_boundaries
=
metadata_
.
query_boundaries
();
if
(
query_boundaries
==
nullptr
)
{
// if not contain query data, minimal sample unit is one record
global_num_data_
=
text_reader_
->
ReadAndFilterLines
([
this
,
rank
,
num_machines
](
data_size_t
)
{
if
(
random_
.
NextInt
(
0
,
num_machines
)
==
rank
)
{
return
true
;
}
else
{
return
false
;
}
},
&
used_data_indices_
);
}
else
{
// if contain query data, minimal sample unit is one query
data_size_t
num_queries
=
metadata_
.
num_queries
();
data_size_t
qid
=
-
1
;
bool
is_query_used
=
false
;
global_num_data_
=
text_reader_
->
ReadAndFilterLines
(
[
this
,
rank
,
num_machines
,
&
qid
,
&
query_boundaries
,
&
is_query_used
,
num_queries
]
(
data_size_t
line_idx
)
{
if
(
qid
>=
num_queries
)
{
Log
::
Fatal
(
"Current query exceeds the range of the query file, please ensure the query file is correct"
);
}
if
(
line_idx
>=
query_boundaries
[
qid
+
1
])
{
// if is new query
is_query_used
=
false
;
if
(
random_
.
NextInt
(
0
,
num_machines
)
==
rank
)
{
is_query_used
=
true
;
}
++
qid
;
}
return
is_query_used
;
},
&
used_data_indices_
);
}
// set number of data
num_data_
=
static_cast
<
data_size_t
>
(
used_data_indices_
.
size
());
}
}
void
Dataset
::
SampleDataFromMemory
(
std
::
vector
<
std
::
string
>*
out_data
)
{
const
size_t
sample_cnt
=
static_cast
<
size_t
>
(
num_data_
<
bin_construct_sample_cnt_
?
num_data_
:
bin_construct_sample_cnt_
);
std
::
vector
<
size_t
>
sample_indices
=
random_
.
Sample
(
num_data_
,
sample_cnt
);
out_data
->
clear
();
for
(
size_t
i
=
0
;
i
<
sample_indices
.
size
();
++
i
)
{
const
size_t
idx
=
sample_indices
[
i
];
out_data
->
push_back
(
text_reader_
->
Lines
()[
idx
]);
}
}
void
Dataset
::
SampleDataFromFile
(
int
rank
,
int
num_machines
,
bool
is_pre_partition
,
std
::
vector
<
std
::
string
>*
out_data
)
{
used_data_indices_
.
clear
();
const
data_size_t
sample_cnt
=
static_cast
<
data_size_t
>
(
bin_construct_sample_cnt_
);
if
(
num_machines
==
1
||
is_pre_partition
)
{
num_data_
=
static_cast
<
data_size_t
>
(
text_reader_
->
SampleFromFile
(
random_
,
sample_cnt
,
out_data
));
global_num_data_
=
num_data_
;
}
else
{
// need partition data
// get query data
const
data_size_t
*
query_boundaries
=
metadata_
.
query_boundaries
();
if
(
query_boundaries
==
nullptr
)
{
// if not contain query file, minimal sample unit is one record
global_num_data_
=
text_reader_
->
SampleAndFilterFromFile
([
this
,
rank
,
num_machines
]
(
data_size_t
)
{
if
(
random_
.
NextInt
(
0
,
num_machines
)
==
rank
)
{
return
true
;
}
else
{
return
false
;
}
},
&
used_data_indices_
,
random_
,
sample_cnt
,
out_data
);
}
else
{
// if contain query file, minimal sample unit is one query
data_size_t
num_queries
=
metadata_
.
num_queries
();
data_size_t
qid
=
-
1
;
bool
is_query_used
=
false
;
global_num_data_
=
text_reader_
->
SampleAndFilterFromFile
(
[
this
,
rank
,
num_machines
,
&
qid
,
&
query_boundaries
,
&
is_query_used
,
num_queries
]
(
data_size_t
line_idx
)
{
if
(
qid
>=
num_queries
)
{
Log
::
Fatal
(
"Query id exceeds the range of the query file, \
please ensure the query file is correct"
);
}
if
(
line_idx
>=
query_boundaries
[
qid
+
1
])
{
// if is new query
is_query_used
=
false
;
if
(
random_
.
NextInt
(
0
,
num_machines
)
==
rank
)
{
is_query_used
=
true
;
}
++
qid
;
}
return
is_query_used
;
},
&
used_data_indices_
,
random_
,
sample_cnt
,
out_data
);
}
num_data_
=
static_cast
<
data_size_t
>
(
used_data_indices_
.
size
());
}
}
void
Dataset
::
InitByBinMapper
(
std
::
vector
<
const
BinMapper
*>
bin_mappers
,
data_size_t
num_data
)
{
num_data_
=
num_data
;
global_num_data_
=
num_data_
;
// initialize label
metadata_
.
Init
(
num_data_
,
num_class_
,
-
1
,
-
1
);
// free old memory
for
(
auto
&
feature
:
features_
)
{
delete
feature
;
}
features_
.
clear
();
used_feature_map_
=
std
::
vector
<
int
>
(
bin_mappers
.
size
(),
-
1
);
for
(
size_t
i
=
0
;
i
<
bin_mappers
.
size
();
++
i
)
{
if
(
bin_mappers
[
i
]
!=
nullptr
)
{
features_
.
push_back
(
new
Feature
(
static_cast
<
int
>
(
i
),
new
BinMapper
(
bin_mappers
[
i
]),
num_data_
,
is_enable_sparse_
));
used_feature_map_
[
i
]
=
static_cast
<
int
>
(
features_
.
size
());
}
void
Dataset
::
CopyFeatureMetadataTo
(
Dataset
*
dataset
,
bool
is_enable_sparse
)
const
{
dataset
->
features_
.
clear
();
// copy feature bin mapper data
for
(
Feature
*
feature
:
features_
)
{
dataset
->
features_
.
push_back
(
new
Feature
(
feature
->
feature_index
(),
new
BinMapper
(
*
feature
->
bin_mapper
()),
dataset
->
num_data_
,
is_enable_sparse
));
}
num_features_
=
static_cast
<
int
>
(
features_
.
size
());
dataset
->
used_feature_map_
=
used_feature_map_
;
dataset
->
num_features_
=
static_cast
<
int
>
(
dataset
->
features_
.
size
());
dataset
->
num_total_features_
=
num_total_features_
;
dataset
->
feature_names_
=
feature_names_
;
}
std
::
vector
<
const
BinMapper
*>
Dataset
::
GetBinMappers
()
const
{
...
...
@@ -314,27 +51,6 @@ std::vector<const BinMapper*> Dataset::GetBinMappers() const {
return
ret
;
}
void
Dataset
::
PushData
(
const
std
::
vector
<
std
::
vector
<
std
::
pair
<
int
,
float
>>>&
datas
,
data_size_t
start_idx
,
bool
is_finished
)
{
// if doesn't need to prediction with initial model
#pragma omp parallel for schedule(guided)
for
(
data_size_t
i
=
0
;
i
<
static_cast
<
int
>
(
datas
.
size
());
++
i
)
{
const
int
tid
=
omp_get_thread_num
();
for
(
auto
&
inner_data
:
datas
[
i
])
{
int
feature_idx
=
used_feature_map_
[
inner_data
.
first
];
if
(
feature_idx
>=
0
)
{
// if is used feature
features_
[
feature_idx
]
->
PushData
(
tid
,
start_idx
+
i
,
inner_data
.
second
);
}
}
}
if
(
is_finished
)
{
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
num_features_
;
++
i
)
{
features_
[
i
]
->
FinishLoad
();
}
}
}
void
Dataset
::
SetField
(
const
char
*
field_name
,
const
void
*
field_data
,
data_size_t
num_element
,
int
type
)
{
std
::
string
name
(
field_name
);
name
=
Common
::
Trim
(
name
);
...
...
@@ -343,7 +59,7 @@ void Dataset::SetField(const char* field_name, const void* field_data, data_size
Log
::
Fatal
(
"type of label should be float"
);
}
metadata_
.
SetLabel
(
static_cast
<
const
float
*>
(
field_data
),
num_element
);
}
else
if
(
name
==
std
::
string
(
"weight"
)
||
name
==
std
::
string
(
"weights"
))
{
}
else
if
(
name
==
std
::
string
(
"weight"
)
||
name
==
std
::
string
(
"weights"
))
{
if
(
type
!=
0
)
{
Log
::
Fatal
(
"type of weights should be float"
);
}
...
...
@@ -363,396 +79,8 @@ void Dataset::SetField(const char* field_name, const void* field_data, data_size
}
}
void
Dataset
::
ConstructBinMappers
(
int
rank
,
int
num_machines
,
const
std
::
vector
<
std
::
string
>&
sample_data
)
{
// sample_values[i][j], means the value of j-th sample on i-th feature
std
::
vector
<
std
::
vector
<
double
>>
sample_values
;
// temp buffer for one line features and label
std
::
vector
<
std
::
pair
<
int
,
double
>>
oneline_features
;
double
label
;
for
(
size_t
i
=
0
;
i
<
sample_data
.
size
();
++
i
)
{
oneline_features
.
clear
();
// parse features
parser_
->
ParseOneLine
(
sample_data
[
i
].
c_str
(),
&
oneline_features
,
&
label
);
// push 0 first, then edit the value according existing feature values
for
(
auto
&
feature_values
:
sample_values
)
{
feature_values
.
push_back
(
0.0
);
}
for
(
std
::
pair
<
int
,
double
>&
inner_data
:
oneline_features
)
{
if
(
static_cast
<
size_t
>
(
inner_data
.
first
)
>=
sample_values
.
size
())
{
// if need expand feature set
size_t
need_size
=
inner_data
.
first
-
sample_values
.
size
()
+
1
;
for
(
size_t
j
=
0
;
j
<
need_size
;
++
j
)
{
// push i+1 0
sample_values
.
emplace_back
(
i
+
1
,
0.0
f
);
}
}
// edit the feature value
sample_values
[
inner_data
.
first
][
i
]
=
inner_data
.
second
;
}
}
features_
.
clear
();
// -1 means doesn't use this feature
used_feature_map_
=
std
::
vector
<
int
>
(
sample_values
.
size
(),
-
1
);
num_total_features_
=
static_cast
<
int
>
(
sample_values
.
size
());
// check the range of label_idx, weight_idx and group_idx
CHECK
(
label_idx_
>=
0
&&
label_idx_
<=
num_total_features_
);
CHECK
(
weight_idx_
<
0
||
weight_idx_
<
num_total_features_
);
CHECK
(
group_idx_
<
0
||
group_idx_
<
num_total_features_
);
// fill feature_names_ if not header
if
(
feature_names_
.
size
()
<=
0
)
{
for
(
int
i
=
0
;
i
<
num_total_features_
;
++
i
)
{
std
::
stringstream
str_buf
;
str_buf
<<
"Column_"
<<
i
;
feature_names_
.
push_back
(
str_buf
.
str
());
}
}
// start find bins
if
(
num_machines
==
1
)
{
std
::
vector
<
BinMapper
*>
bin_mappers
(
sample_values
.
size
());
// if only one machine, find bin locally
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
sample_values
.
size
());
++
i
)
{
if
(
ignore_features_
.
count
(
i
)
>
0
)
{
bin_mappers
[
i
]
=
nullptr
;
continue
;
}
bin_mappers
[
i
]
=
new
BinMapper
();
bin_mappers
[
i
]
->
FindBin
(
&
sample_values
[
i
],
max_bin_
);
}
for
(
size_t
i
=
0
;
i
<
sample_values
.
size
();
++
i
)
{
if
(
bin_mappers
[
i
]
==
nullptr
)
{
Log
::
Warning
(
"Ignoring feature %s"
,
feature_names_
[
i
].
c_str
());
}
else
if
(
!
bin_mappers
[
i
]
->
is_trival
())
{
// map real feature index to used feature index
used_feature_map_
[
i
]
=
static_cast
<
int
>
(
features_
.
size
());
// push new feature
features_
.
push_back
(
new
Feature
(
static_cast
<
int
>
(
i
),
bin_mappers
[
i
],
num_data_
,
is_enable_sparse_
));
}
else
{
// if feature is trival(only 1 bin), free spaces
Log
::
Warning
(
"Ignoring feature %s, only has one value"
,
feature_names_
[
i
].
c_str
());
delete
bin_mappers
[
i
];
}
}
}
else
{
// if have multi-machines, need find bin distributed
// different machines will find bin for different features
// start and len will store the process feature indices for different machines
// machine i will find bins for features in [ strat[i], start[i] + len[i] )
int
*
start
=
new
int
[
num_machines
];
int
*
len
=
new
int
[
num_machines
];
int
total_num_feature
=
static_cast
<
int
>
(
sample_values
.
size
());
int
step
=
(
total_num_feature
+
num_machines
-
1
)
/
num_machines
;
if
(
step
<
1
)
{
step
=
1
;
}
start
[
0
]
=
0
;
for
(
int
i
=
0
;
i
<
num_machines
-
1
;
++
i
)
{
len
[
i
]
=
Common
::
Min
<
int
>
(
step
,
total_num_feature
-
start
[
i
]);
start
[
i
+
1
]
=
start
[
i
]
+
len
[
i
];
}
len
[
num_machines
-
1
]
=
total_num_feature
-
start
[
num_machines
-
1
];
// get size of bin mapper with max_bin_ size
int
type_size
=
BinMapper
::
SizeForSpecificBin
(
max_bin_
);
// since sizes of different feature may not be same, we expand all bin mapper to type_size
int
buffer_size
=
type_size
*
total_num_feature
;
char
*
input_buffer
=
new
char
[
buffer_size
];
char
*
output_buffer
=
new
char
[
buffer_size
];
// find local feature bins and copy to buffer
#pragma omp parallel for schedule(guided)
for
(
int
i
=
0
;
i
<
len
[
rank
];
++
i
)
{
BinMapper
*
bin_mapper
=
new
BinMapper
();
bin_mapper
->
FindBin
(
&
sample_values
[
start
[
rank
]
+
i
],
max_bin_
);
bin_mapper
->
CopyTo
(
input_buffer
+
i
*
type_size
);
// don't need this any more
delete
bin_mapper
;
}
// convert to binary size
for
(
int
i
=
0
;
i
<
num_machines
;
++
i
)
{
start
[
i
]
*=
type_size
;
len
[
i
]
*=
type_size
;
}
// gather global feature bin mappers
Network
::
Allgather
(
input_buffer
,
buffer_size
,
start
,
len
,
output_buffer
);
// restore features bins from buffer
for
(
int
i
=
0
;
i
<
total_num_feature
;
++
i
)
{
if
(
ignore_features_
.
count
(
i
)
>
0
)
{
Log
::
Warning
(
"Ignoring feature %s"
,
feature_names_
[
i
].
c_str
());
continue
;
}
BinMapper
*
bin_mapper
=
new
BinMapper
();
bin_mapper
->
CopyFrom
(
output_buffer
+
i
*
type_size
);
if
(
!
bin_mapper
->
is_trival
())
{
used_feature_map_
[
i
]
=
static_cast
<
int
>
(
features_
.
size
());
features_
.
push_back
(
new
Feature
(
static_cast
<
int
>
(
i
),
bin_mapper
,
num_data_
,
is_enable_sparse_
));
}
else
{
Log
::
Warning
(
"Ignoring feature %s, only has one value"
,
feature_names_
[
i
].
c_str
());
delete
bin_mapper
;
}
}
// free buffer
delete
[]
start
;
delete
[]
len
;
delete
[]
input_buffer
;
delete
[]
output_buffer
;
}
num_features_
=
static_cast
<
int
>
(
features_
.
size
());
}
/*!
* \brief Loads the training data for this Dataset, either from the original text
*        file (one- or two-round loading) or from a previously saved ".bin" file.
*        Populates features_, metadata_ and the per-machine data partition.
* \param rank Rank of this machine (used for data partitioning in parallel training)
* \param num_machines Total number of machines in parallel training
* \param is_pre_partition True if the data file is already partitioned per machine
* \param use_two_round_loading True to sample/parse from file instead of loading all text into memory
*/
void Dataset::LoadTrainData(int rank, int num_machines, bool is_pre_partition, bool use_two_round_loading) {
  // don't support query id in data file when training in parallel
  if (num_machines > 1 && !is_pre_partition) {
    if (group_idx_ > 0) {
      Log::Fatal("Using a query id without pre-partitioning the data file is not supported for parallel training. \
Please use an additional query file or pre-partition the data");
    }
  }
  used_data_indices_.clear();
  if (!is_loading_from_binfile_) {
    if (!use_two_round_loading) {
      // one-round loading: read the whole text file to memory first
      LoadDataToMemory(rank, num_machines, is_pre_partition);
      std::vector<std::string> sample_data;
      // sample data (subset of lines used to find bin boundaries)
      SampleDataFromMemory(&sample_data);
      // construct feature bin mappers
      ConstructBinMappers(rank, num_machines, sample_data);
      // initialize label
      metadata_.Init(num_data_, num_class_, weight_idx_, group_idx_);
      // extract features
      ExtractFeaturesFromMemory();
    } else {
      std::vector<std::string> sample_data;
      // two-round loading: first pass samples lines directly from the file
      SampleDataFromFile(rank, num_machines, is_pre_partition, &sample_data);
      // construct feature bin mappers
      ConstructBinMappers(rank, num_machines, sample_data);
      // initialize label
      metadata_.Init(num_data_, num_class_, weight_idx_, group_idx_);
      // extract features (second pass over the file)
      ExtractFeaturesFromFile();
    }
  } else {
    // binary cache exists: "<data_filename_>.bin"
    std::string bin_filename(data_filename_);
    bin_filename.append(".bin");
    // load data from binary file
    LoadDataFromBinFile(bin_filename.c_str(), rank, num_machines, is_pre_partition);
  }
  // check meta data (and partition weights/queries to this machine's subset)
  metadata_.CheckOrPartition(static_cast<data_size_t>(global_num_data_), used_data_indices_);
  // free memory
  used_data_indices_.clear();
  used_data_indices_.shrink_to_fit();
  // need to check training data
  CheckDataset();
}
/*!
* \brief Loads validation data, reusing the bin mappers / feature mapping of the
*        training set so validation features are binned identically.
* \param train_set Training Dataset whose BinMappers and feature layout are copied
* \param use_two_round_loading True to stream/parse from file instead of loading all text into memory
*/
void Dataset::LoadValidationData(const Dataset* train_set, bool use_two_round_loading) {
  used_data_indices_.clear();
  if (!is_loading_from_binfile_) {
    if (!use_two_round_loading) {
      // read data in memory; validation is never partitioned (rank 0 of 1)
      LoadDataToMemory(0, 1, false);
      // initialize label
      metadata_.Init(num_data_, num_class_, weight_idx_, group_idx_);
      features_.clear();
      // copy feature bin mapper data from the training set (deep copy of BinMapper)
      for (Feature* feature : train_set->features_) {
        features_.push_back(new Feature(feature->feature_index(), new BinMapper(*feature->bin_mapper()), num_data_, is_enable_sparse_));
      }
      used_feature_map_ = train_set->used_feature_map_;
      num_features_ = static_cast<int>(features_.size());
      num_total_features_ = train_set->num_total_features_;
      feature_names_ = train_set->feature_names_;
      // extract features
      ExtractFeaturesFromMemory();
    } else {
      // Get number of lines of data file
      num_data_ = static_cast<data_size_t>(text_reader_->CountLine());
      // initialize label
      metadata_.Init(num_data_, num_class_, weight_idx_, group_idx_);
      features_.clear();
      // copy feature bin mapper data from the training set (deep copy of BinMapper)
      for (Feature* feature : train_set->features_) {
        features_.push_back(new Feature(feature->feature_index(), new BinMapper(*feature->bin_mapper()), num_data_, is_enable_sparse_));
      }
      used_feature_map_ = train_set->used_feature_map_;
      num_features_ = static_cast<int>(features_.size());
      num_total_features_ = train_set->num_total_features_;
      feature_names_ = train_set->feature_names_;
      // extract features
      ExtractFeaturesFromFile();
    }
  } else {
    // binary cache exists: "<data_filename_>.bin"
    std::string bin_filename(data_filename_);
    bin_filename.append(".bin");
    // load from binary file
    LoadDataFromBinFile(bin_filename.c_str(), 0, 1, false);
  }
  // not need to check validation data
  // check meta data
  metadata_.CheckOrPartition(static_cast<data_size_t>(global_num_data_), used_data_indices_);
  // CheckDataset();
}
/*!
* \brief Parses the in-memory text lines, pushing feature values into the Feature
*        bins and setting label / weight / query metadata per record. If a
*        prediction function is set, also computes per-record initial scores.
*        Text lines are freed as they are consumed.
*/
void Dataset::ExtractFeaturesFromMemory() {
  std::vector<std::pair<int, double>> oneline_features;
  double tmp_label = 0.0f;
  if (predict_fun_ == nullptr) {
    // if doesn't need to prediction with initial model
    // NOTE: oneline_features/tmp_label are per-thread copies via private/firstprivate
    #pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < num_data_; ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser_->ParseOneLine(text_reader_->Lines()[i].c_str(), &oneline_features, &tmp_label);
      // set label
      metadata_.SetLabelAt(i, static_cast<float>(tmp_label));
      // free processed line:
      text_reader_->Lines()[i].clear();
      // shrink_to_fit will be very slow in linux, and seems not free memory, disable for now
      // text_reader_->Lines()[i].shrink_to_fit();
      // push data
      for (auto& inner_data : oneline_features) {
        // map raw column index to used feature index; negative means unused column
        int feature_idx = used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature
          features_[feature_idx]->PushData(tid, i, inner_data.second);
        } else {
          if (inner_data.first == weight_idx_) {
            metadata_.SetWeightAt(i, static_cast<float>(inner_data.second));
          } else if (inner_data.first == group_idx_) {
            metadata_.SetQueryAt(i, static_cast<data_size_t>(inner_data.second));
          }
        }
      }
    }
  } else {
    // if need to prediction with initial model
    // init_score laid out class-major: [class][record]
    float* init_score = new float[num_data_ * num_class_];
    #pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < num_data_; ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser_->ParseOneLine(text_reader_->Lines()[i].c_str(), &oneline_features, &tmp_label);
      // set initial score
      std::vector<double> oneline_init_score = predict_fun_(oneline_features);
      for (int k = 0; k < num_class_; ++k) {
        init_score[k * num_data_ + i] = static_cast<float>(oneline_init_score[k]);
      }
      // set label
      metadata_.SetLabelAt(i, static_cast<float>(tmp_label));
      // free processed line:
      text_reader_->Lines()[i].clear();
      // shrink_to_fit will be very slow in linux, and seems not free memory, disable for now
      // text_reader_->Lines()[i].shrink_to_fit();
      // push data
      for (auto& inner_data : oneline_features) {
        int feature_idx = used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature
          features_[feature_idx]->PushData(tid, i, inner_data.second);
        } else {
          if (inner_data.first == weight_idx_) {
            metadata_.SetWeightAt(i, static_cast<float>(inner_data.second));
          } else if (inner_data.first == group_idx_) {
            metadata_.SetQueryAt(i, static_cast<data_size_t>(inner_data.second));
          }
        }
      }
    }
    // NOTE(review): SetInitScore presumably copies the buffer, since we delete it
    // right after — confirm it does not retain the pointer (double free otherwise).
    metadata_.SetInitScore(init_score, num_data_ * num_class_);
    delete[] init_score;
  }
  #pragma omp parallel for schedule(guided)
  for (int i = 0; i < num_features_; ++i) {
    features_[i]->FinishLoad();
  }
  // text data can be free after loaded feature values
  text_reader_->Clear();
}
/*!
* \brief Second-pass feature extraction for two-round loading: streams the data
*        file in chunks through text_reader_ and, for each chunk, parses lines in
*        parallel to fill Feature bins and metadata. If a prediction function is
*        set, also fills the initial-score buffer.
*/
void Dataset::ExtractFeaturesFromFile() {
  float* init_score = nullptr;
  if (predict_fun_ != nullptr) {
    // class-major layout: [class][record]
    init_score = new float[num_data_ * num_class_];
  }
  // callback applied to each chunk of lines; start_idx is the global index of lines[0]
  std::function<void(data_size_t, const std::vector<std::string>&)> process_fun =
    [this, &init_score]
    (data_size_t start_idx, const std::vector<std::string>& lines) {
    std::vector<std::pair<int, double>> oneline_features;
    double tmp_label = 0.0f;
    #pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser_->ParseOneLine(lines[i].c_str(), &oneline_features, &tmp_label);
      // set initial score
      if (init_score != nullptr) {
        std::vector<double> oneline_init_score = predict_fun_(oneline_features);
        for (int k = 0; k < num_class_; ++k) {
          init_score[k * num_data_ + start_idx + i] = static_cast<float>(oneline_init_score[k]);
        }
      }
      // set label
      metadata_.SetLabelAt(start_idx + i, static_cast<float>(tmp_label));
      // push data
      for (auto& inner_data : oneline_features) {
        // map raw column index to used feature index; negative means unused column
        int feature_idx = used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature
          features_[feature_idx]->PushData(tid, start_idx + i, inner_data.second);
        } else {
          if (inner_data.first == weight_idx_) {
            metadata_.SetWeightAt(start_idx + i, static_cast<float>(inner_data.second));
          } else if (inner_data.first == group_idx_) {
            metadata_.SetQueryAt(start_idx + i, static_cast<data_size_t>(inner_data.second));
          }
        }
      }
    }
  };
  if (used_data_indices_.size() > 0) {
    // only need part of data (this machine's partition)
    text_reader_->ReadPartAndProcessParallel(used_data_indices_, process_fun);
  } else {
    // need full data
    text_reader_->ReadAllAndProcessParallel(process_fun);
  }
  // NOTE(review): SetInitScore presumably copies the buffer, since we delete it
  // right after — confirm it does not retain the pointer.
  if (init_score != nullptr) {
    metadata_.SetInitScore(init_score, num_data_ * num_class_);
    delete[] init_score;
  }
  #pragma omp parallel for schedule(guided)
  for (int i = 0; i < num_features_; ++i) {
    features_[i]->FinishLoad();
  }
}
void
Dataset
::
SaveBinaryFile
(
const
char
*
bin_filename
)
{
if
(
!
is_loading_from_binfile_
)
{
// if not pass a filename, just append ".bin" of original file
if
(
bin_filename
==
nullptr
||
bin_filename
[
0
]
==
'\0'
)
{
...
...
@@ -761,11 +89,11 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
bin_filename
=
bin_filename_str
.
c_str
();
}
FILE
*
file
;
#ifdef _MSC_VER
#ifdef _MSC_VER
fopen_s
(
&
file
,
bin_filename
,
"wb"
);
#else
#else
file
=
fopen
(
bin_filename
,
"wb"
);
#endif
#endif
if
(
file
==
NULL
)
{
Log
::
Fatal
(
"Cannot write binary data to %s "
,
bin_filename
);
}
...
...
@@ -773,17 +101,14 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
Log
::
Info
(
"Saving data to binary file %s"
,
data_filename_
);
// get size of header
size_t
size_of_header
=
sizeof
(
global_
num_data_
)
+
sizeof
(
is_enable_sparse
_
)
+
sizeof
(
max_bin_
)
+
sizeof
(
num_data_
)
+
sizeof
(
num_features_
)
+
sizeof
(
num_total_features_
)
+
sizeof
(
size_t
)
+
sizeof
(
int
)
*
used_feature_map_
.
size
();
size_t
size_of_header
=
sizeof
(
num_data_
)
+
sizeof
(
num_features_
)
+
sizeof
(
num_total_features
_
)
+
sizeof
(
size_t
)
+
sizeof
(
int
)
*
used_feature_map_
.
size
();
// size of feature names
for
(
int
i
=
0
;
i
<
num_total_features_
;
++
i
)
{
size_of_header
+=
feature_names_
[
i
].
size
()
+
sizeof
(
int
);
}
fwrite
(
&
size_of_header
,
sizeof
(
size_of_header
),
1
,
file
);
// write header
fwrite
(
&
global_num_data_
,
sizeof
(
global_num_data_
),
1
,
file
);
fwrite
(
&
is_enable_sparse_
,
sizeof
(
is_enable_sparse_
),
1
,
file
);
fwrite
(
&
max_bin_
,
sizeof
(
max_bin_
),
1
,
file
);
fwrite
(
&
num_data_
,
sizeof
(
num_data_
),
1
,
file
);
fwrite
(
&
num_features_
,
sizeof
(
num_features_
),
1
,
file
);
fwrite
(
&
num_total_features_
,
sizeof
(
num_features_
),
1
,
file
);
...
...
@@ -817,196 +142,4 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
}
}
/*!
* \brief Probes for a binary cache of the data file ("<data_filename_>.bin")
*        and records the result in is_loading_from_binfile_. The file is only
*        opened to test existence/readability, then closed immediately.
*/
void Dataset::CheckCanLoadFromBin() {
  std::string bin_filename(data_filename_);
  bin_filename.append(".bin");
  FILE* file;
#ifdef _MSC_VER
  fopen_s(&file, bin_filename.c_str(), "rb");
#else
  file = fopen(bin_filename.c_str(), "rb");
#endif
  // nullptr (not NULL) for consistency with the rest of the file
  if (file == nullptr) {
    is_loading_from_binfile_ = false;
  } else {
    is_loading_from_binfile_ = true;
    fclose(file);
  }
}
/*!
* \brief Restores a Dataset from a binary cache file previously written by
*        SaveBinaryFile. Layout read here must mirror the write order:
*        header size, header (globals + feature map + feature names),
*        metadata size, metadata, then per-feature [size, payload] records.
*        Also performs per-machine row sampling when running distributed
*        without pre-partitioned data.
* \param bin_filename Path of the ".bin" cache file
* \param rank This machine's rank (for random partitioning)
* \param num_machines Number of machines in parallel training
* \param is_pre_partition True if data was already partitioned per machine
*/
void Dataset::LoadDataFromBinFile(const char* bin_filename, int rank, int num_machines, bool is_pre_partition) {
  FILE* file;
#ifdef _MSC_VER
  fopen_s(&file, bin_filename, "rb");
#else
  file = fopen(bin_filename, "rb");
#endif
  if (file == NULL) {
    Log::Fatal("Cannot read binary data from %s", bin_filename);
  }
  // buffer to read binary file (grown on demand below)
  size_t buffer_size = 16 * 1024 * 1024;
  char* buffer = new char[buffer_size];
  // read size of header
  size_t read_cnt = fread(buffer, sizeof(size_t), 1, file);
  if (read_cnt != 1) {
    Log::Fatal("Binary file error: header has the wrong size");
  }
  size_t size_of_head = *(reinterpret_cast<size_t*>(buffer));
  // re-allocmate space if not enough
  if (size_of_head > buffer_size) {
    delete[] buffer;
    buffer_size = size_of_head;
    buffer = new char[buffer_size];
  }
  // read header
  read_cnt = fread(buffer, 1, size_of_head, file);
  if (read_cnt != size_of_head) {
    Log::Fatal("Binary file error: header is incorrect");
  }
  // get header: fields are unpacked in the exact order they were written
  const char* mem_ptr = buffer;
  global_num_data_ = *(reinterpret_cast<const size_t*>(mem_ptr));
  mem_ptr += sizeof(global_num_data_);
  is_enable_sparse_ = *(reinterpret_cast<const bool*>(mem_ptr));
  mem_ptr += sizeof(is_enable_sparse_);
  max_bin_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(max_bin_);
  num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
  mem_ptr += sizeof(num_data_);
  num_features_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(num_features_);
  num_total_features_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(num_total_features_);
  size_t num_used_feature_map = *(reinterpret_cast<const size_t*>(mem_ptr));
  mem_ptr += sizeof(num_used_feature_map);
  const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr);
  used_feature_map_.clear();
  for (size_t i = 0; i < num_used_feature_map; ++i) {
    used_feature_map_.push_back(tmp_feature_map[i]);
  }
  mem_ptr += sizeof(int) * num_used_feature_map;
  // get feature names
  feature_names_.clear();
  // each name is serialized as [int length][length chars]
  for (int i = 0; i < num_total_features_; ++i) {
    int str_len = *(reinterpret_cast<const int*>(mem_ptr));
    mem_ptr += sizeof(int);
    std::stringstream str_buf;
    for (int j = 0; j < str_len; ++j) {
      char tmp_char = *(reinterpret_cast<const char*>(mem_ptr));
      mem_ptr += sizeof(char);
      str_buf << tmp_char;
    }
    feature_names_.emplace_back(str_buf.str());
  }
  // read size of meta data
  read_cnt = fread(buffer, sizeof(size_t), 1, file);
  if (read_cnt != 1) {
    Log::Fatal("Binary file error: meta data has the wrong size");
  }
  size_t size_of_metadata = *(reinterpret_cast<size_t*>(buffer));
  // re-allocate space if not enough
  if (size_of_metadata > buffer_size) {
    delete[] buffer;
    buffer_size = size_of_metadata;
    buffer = new char[buffer_size];
  }
  // read meta data
  read_cnt = fread(buffer, 1, size_of_metadata, file);
  if (read_cnt != size_of_metadata) {
    Log::Fatal("Binary file error: meta data is incorrect");
  }
  // load meta data
  metadata_.LoadFromMemory(buffer);
  used_data_indices_.clear();
  // NOTE(review): global_num_data_ was already read from the header above and is
  // overwritten here with num_data_ — confirm this is intentional.
  global_num_data_ = num_data_;
  // sample local used data if need to partition
  if (num_machines > 1 && !is_pre_partition) {
    const data_size_t* query_boundaries = metadata_.query_boundaries();
    if (query_boundaries == nullptr) {
      // if not contain query file, minimal sample unit is one record
      for (data_size_t i = 0; i < num_data_; ++i) {
        if (random_.NextInt(0, num_machines) == rank) {
          used_data_indices_.push_back(i);
        }
      }
    } else {
      // if contain query file, minimal sample unit is one query
      data_size_t num_queries = metadata_.num_queries();
      data_size_t qid = -1;
      bool is_query_used = false;
      for (data_size_t i = 0; i < num_data_; ++i) {
        if (qid >= num_queries) {
          Log::Fatal("Current query exceeds the range of the query file, please ensure the query file is correct");
        }
        if (i >= query_boundaries[qid + 1]) {
          // if is new query: decide once per query whether this rank keeps it
          is_query_used = false;
          if (random_.NextInt(0, num_machines) == rank) {
            is_query_used = true;
          }
          ++qid;
        }
        if (is_query_used) {
          used_data_indices_.push_back(i);
        }
      }
    }
    num_data_ = static_cast<data_size_t>(used_data_indices_.size());
  }
  metadata_.PartitionLabel(used_data_indices_);
  // read feature data
  for (int i = 0; i < num_features_; ++i) {
    // read feature size
    read_cnt = fread(buffer, sizeof(size_t), 1, file);
    if (read_cnt != 1) {
      Log::Fatal("Binary file error: feature %d has the wrong size", i);
    }
    size_t size_of_feature = *(reinterpret_cast<size_t*>(buffer));
    // re-allocate space if not enough
    if (size_of_feature > buffer_size) {
      delete[] buffer;
      buffer_size = size_of_feature;
      buffer = new char[buffer_size];
    }
    read_cnt = fread(buffer, 1, size_of_feature, file);
    if (read_cnt != size_of_feature) {
      // NOTE(review): read_cnt is size_t but formatted with %d — consider %zu
      Log::Fatal("Binary file error: feature %d is incorrect, read count: %d", i, read_cnt);
    }
    features_.push_back(new Feature(buffer, static_cast<data_size_t>(global_num_data_), used_data_indices_));
  }
  delete[] buffer;
  fclose(file);
}
/*!
* \brief Validates the loaded training data: it must contain at least one record
*        and at least one usable (non-trivial) feature. Aborts via Log::Fatal
*        otherwise.
*/
void Dataset::CheckDataset() {
  if (num_data_ <= 0) {
    Log::Fatal("Data file %s is empty", data_filename_);
  }
  // empty() instead of "size() <= 0": size() is unsigned, so <= 0 is just == 0
  if (features_.empty()) {
    Log::Fatal("No usable features in data file %s", data_filename_);
  }
}
}
// namespace LightGBM
src/io/dataset_loader.cpp
0 → 100644
View file @
1c08e71e
#include <omp.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/dataset_loader.h>
#include <LightGBM/feature.h>
#include <LightGBM/network.h>
namespace
LightGBM
{
/*!
* \brief Constructs a loader bound to an I/O configuration and an optional
*        prediction function (used to compute initial scores while loading).
* \param io_config I/O options (header, column roles, two-round loading, ...);
*                  stored by reference — must outlive this loader
* \param predict_fun Callback producing per-class initial scores for one parsed
*                    row; stored by reference — must outlive this loader
*/
DatasetLoader::DatasetLoader(const IOConfig& io_config, const PredictFunction& predict_fun)
  :io_config_(io_config), predict_fun_(predict_fun) {
}

/*! \brief Destructor; the loader owns no resources of its own. */
DatasetLoader::~DatasetLoader() {
}
/*!
* \brief Resolves label / weight / group / ignored column indices from the
*        configured column specifiers, which may be numeric indices or
*        "name:<column>" references into the file's header row. Also fills
*        feature_names_ from the header (label column removed). Weight and
*        group columns are added to ignore_features_ so they are not treated
*        as features.
* \param filename Data file whose header row (if any) supplies column names
* \note "SetHeadder" is a typo for "SetHeader" — renaming would require
*       updating all callers.
*/
void DatasetLoader::SetHeadder(const char* filename) {
  TextReader<data_size_t> text_reader(filename, io_config_.has_header);
  // column name -> raw column index (including the label column)
  std::unordered_map<std::string, int> name2idx;
  // get column names
  if (io_config_.has_header) {
    std::string first_line = text_reader.first_line();
    feature_names_ = Common::Split(first_line.c_str(), "\t,");
    for (size_t i = 0; i < feature_names_.size(); ++i) {
      name2idx[feature_names_[i]] = static_cast<int>(i);
    }
  }
  std::string name_prefix("name:");
  // load label idx
  if (io_config_.label_column.size() > 0) {
    if (Common::StartsWith(io_config_.label_column, name_prefix)) {
      // specifier is "name:<column name>"
      std::string name = io_config_.label_column.substr(name_prefix.size());
      if (name2idx.count(name) > 0) {
        label_idx_ = name2idx[name];
        Log::Info("Using column %s as label", name.c_str());
      } else {
        Log::Fatal("Could not find label column %s in data file", name.c_str());
      }
    } else {
      // specifier is a plain column number
      if (!Common::AtoiAndCheck(io_config_.label_column.c_str(), &label_idx_)) {
        Log::Fatal("label_column is not a number, \
if you want to use a column name, \
please add the prefix \"name:\" to the column name");
      }
      Log::Info("Using column number %d as label", label_idx_);
    }
  }
  if (feature_names_.size() > 0) {
    // erase label column name
    feature_names_.erase(feature_names_.begin() + label_idx_);
  }
  // load ignore columns
  // indices below are shifted down by one when they come after the label
  // column, since the label is removed from the feature index space
  if (io_config_.ignore_column.size() > 0) {
    if (Common::StartsWith(io_config_.ignore_column, name_prefix)) {
      std::string names = io_config_.ignore_column.substr(name_prefix.size());
      for (auto name : Common::Split(names.c_str(), ',')) {
        if (name2idx.count(name) > 0) {
          int tmp = name2idx[name];
          // skip for label column
          if (tmp > label_idx_) {
            tmp -= 1;
          }
          ignore_features_.emplace(tmp);
        } else {
          Log::Fatal("Could not find ignore column %s in data file", name.c_str());
        }
      }
    } else {
      for (auto token : Common::Split(io_config_.ignore_column.c_str(), ',')) {
        int tmp = 0;
        if (!Common::AtoiAndCheck(token.c_str(), &tmp)) {
          Log::Fatal("ignore_column is not a number, \
if you want to use a column name, \
please add the prefix \"name:\" to the column name");
        }
        // skip for label column
        if (tmp > label_idx_) {
          tmp -= 1;
        }
        ignore_features_.emplace(tmp);
      }
    }
  }
  // load weight idx
  if (io_config_.weight_column.size() > 0) {
    if (Common::StartsWith(io_config_.weight_column, name_prefix)) {
      std::string name = io_config_.weight_column.substr(name_prefix.size());
      if (name2idx.count(name) > 0) {
        weight_idx_ = name2idx[name];
        Log::Info("Using column %s as weight", name.c_str());
      } else {
        Log::Fatal("Could not find weight column %s in data file", name.c_str());
      }
    } else {
      if (!Common::AtoiAndCheck(io_config_.weight_column.c_str(), &weight_idx_)) {
        Log::Fatal("weight_column is not a number, \
if you want to use a column name, \
please add the prefix \"name:\" to the column name");
      }
      Log::Info("Using column number %d as weight", weight_idx_);
    }
    // skip for label column
    if (weight_idx_ > label_idx_) {
      weight_idx_ -= 1;
    }
    // weight column is metadata, not a feature
    ignore_features_.emplace(weight_idx_);
  }
  // load group/query idx
  if (io_config_.group_column.size() > 0) {
    if (Common::StartsWith(io_config_.group_column, name_prefix)) {
      std::string name = io_config_.group_column.substr(name_prefix.size());
      if (name2idx.count(name) > 0) {
        group_idx_ = name2idx[name];
        Log::Info("Using column %s as group/query id", name.c_str());
      } else {
        Log::Fatal("Could not find group/query column %s in data file", name.c_str());
      }
    } else {
      if (!Common::AtoiAndCheck(io_config_.group_column.c_str(), &group_idx_)) {
        Log::Fatal("group_column is not a number, \
if you want to use a column name, \
please add the prefix \"name:\" to the column name");
      }
      Log::Info("Using column number %d as group/query id", group_idx_);
    }
    // skip for label column
    if (group_idx_ > label_idx_) {
      group_idx_ -= 1;
    }
    // group column is metadata, not a feature
    ignore_features_.emplace(group_idx_);
  }
}
/*!
* \brief Loads a training Dataset from a data file, choosing between one-round
*        loading (all text in memory), two-round loading (sample + stream), or
*        a pre-built binary cache ("<filename>.bin") if one exists.
* \param filename Path of the data file
* \param rank This machine's rank (for data partitioning in parallel training)
* \param num_machines Number of machines in parallel training
* \return Heap-allocated Dataset; caller takes ownership
*/
Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_machines) {
  // don't support query id in data file when training in parallel
  if (num_machines > 1 && !io_config_.is_pre_partition) {
    if (group_idx_ > 0) {
      Log::Fatal("Using a query id without pre-partitioning the data file is not supported for parallel training. \
Please use an additional query file or pre-partition the data");
    }
  }
  auto parser = Parser::CreateParser(filename, io_config_.has_header, 0, label_idx_);
  if (parser == nullptr) {
    Log::Fatal("Could not recognize data format of %s", filename);
  }
  data_size_t num_global_data = 0;
  std::vector<data_size_t> used_data_indices;
  Dataset* dataset = new Dataset();
  dataset->data_filename_ = filename;
  dataset->num_class_ = io_config_.num_class;
  dataset->metadata_.Init(filename, dataset->num_class_);
  bool is_loading_from_binfile = CheckCanLoadFromBin(filename);
  if (!is_loading_from_binfile) {
    if (!io_config_.use_two_round_loading) {
      // read data to memory
      auto text_data = LoadTextDataToMemory(filename, dataset->metadata_, rank, num_machines, &num_global_data, &used_data_indices);
      dataset->num_data_ = static_cast<data_size_t>(text_data.size());
      // sample data
      auto sample_data = SampleTextDataFromMemory(text_data);
      // construct feature bin mappers
      ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser, dataset);
      // initialize label
      // (dataset->num_class_ for consistency with the two-round branch; it was
      //  assigned from io_config_.num_class above, so the value is identical)
      dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, weight_idx_, group_idx_);
      // extract features
      ExtractFeaturesFromMemory(text_data, parser, dataset);
      text_data.clear();
    } else {
      // sample data from file
      auto sample_data = SampleTextDataFromFile(filename, dataset->metadata_, rank, num_machines, &num_global_data, &used_data_indices);
      if (used_data_indices.size() > 0) {
        // partitioned: this machine only keeps the sampled row subset
        dataset->num_data_ = static_cast<data_size_t>(used_data_indices.size());
      } else {
        dataset->num_data_ = num_global_data;
      }
      // construct feature bin mappers
      ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser, dataset);
      // initialize label
      dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, weight_idx_, group_idx_);
      // extract features
      ExtractFeaturesFromFile(filename, parser, used_data_indices, dataset);
    }
  } else {
    // load data from binary file; the freshly constructed dataset is replaced
    delete dataset;
    std::string bin_filename(filename);
    bin_filename.append(".bin");
    dataset = LoadFromBinFile(bin_filename.c_str(), rank, num_machines);
  }
  // check meta data
  dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices);
  // need to check training data
  CheckDataset(dataset);
  delete parser;
  return dataset;
}
/*!
* \brief Loads a Dataset (typically validation data) whose feature bin mappers
*        are copied from an existing Dataset, so both are binned identically.
*        Falls back to a binary cache ("<filename>.bin") if one exists.
* \param filename Path of the data file
* \param other Reference Dataset whose feature metadata is copied
* \return Heap-allocated Dataset; caller takes ownership
*/
Dataset* DatasetLoader::LoadFromFileLikeOthers(const char* filename, const Dataset* other) {
  auto parser = Parser::CreateParser(filename, io_config_.has_header, 0, label_idx_);
  if (parser == nullptr) {
    Log::Fatal("Could not recognize data format of %s", filename);
  }
  data_size_t num_global_data = 0;
  std::vector<data_size_t> used_data_indices;
  Dataset* dataset = new Dataset();
  dataset->data_filename_ = filename;
  dataset->num_class_ = io_config_.num_class;
  dataset->metadata_.Init(filename, dataset->num_class_);
  bool is_loading_from_binfile = CheckCanLoadFromBin(filename);
  if (!is_loading_from_binfile) {
    if (!io_config_.use_two_round_loading) {
      // read data in memory; no partitioning for validation data (rank 0 of 1)
      auto text_data = LoadTextDataToMemory(filename, dataset->metadata_, 0, 1, &num_global_data, &used_data_indices);
      dataset->num_data_ = static_cast<data_size_t>(text_data.size());
      // initialize label
      dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, weight_idx_, group_idx_);
      // bin mappers / feature layout come from the reference dataset
      other->CopyFeatureMetadataTo(dataset, io_config_.is_enable_sparse);
      // extract features
      ExtractFeaturesFromMemory(text_data, parser, dataset);
      text_data.clear();
    } else {
      TextReader<data_size_t> text_reader(filename, io_config_.has_header);
      // Get number of lines of data file
      dataset->num_data_ = static_cast<data_size_t>(text_reader.CountLine());
      num_global_data = dataset->num_data_;
      // initialize label
      dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, weight_idx_, group_idx_);
      // bin mappers / feature layout come from the reference dataset
      other->CopyFeatureMetadataTo(dataset, io_config_.is_enable_sparse);
      // extract features
      ExtractFeaturesFromFile(filename, parser, used_data_indices, dataset);
    }
  } else {
    // load data from binary file; the freshly constructed dataset is replaced
    delete dataset;
    std::string bin_filename(filename);
    bin_filename.append(".bin");
    dataset = LoadFromBinFile(bin_filename.c_str(), 0, 1);
  }
  // not need to check validation data
  // check meta data
  dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices);
  delete parser;
  return dataset;
}
/*!
* \brief Restores a Dataset from a binary cache file written by
*        Dataset::SaveBinaryFile. Reads header, feature map, feature names,
*        metadata and per-feature payloads in write order, and performs
*        per-machine row sampling when running distributed without
*        pre-partitioned data.
* \param bin_filename Path of the ".bin" cache file
* \param rank This machine's rank (for random partitioning)
* \param num_machines Number of machines in parallel training
* \return Heap-allocated Dataset with is_loading_from_binfile_ set; caller takes ownership
*/
Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int num_machines) {
  Dataset* dataset = new Dataset();
  FILE* file;
#ifdef _MSC_VER
  fopen_s(&file, bin_filename, "rb");
#else
  file = fopen(bin_filename, "rb");
#endif
  if (file == NULL) {
    Log::Fatal("Could not read binary data from %s", bin_filename);
  }
  // buffer to read binary file (grown on demand below)
  size_t buffer_size = 16 * 1024 * 1024;
  char* buffer = new char[buffer_size];
  // read size of header
  size_t read_cnt = fread(buffer, sizeof(size_t), 1, file);
  if (read_cnt != 1) {
    Log::Fatal("Binary file error: header has the wrong size");
  }
  size_t size_of_head = *(reinterpret_cast<size_t*>(buffer));
  // re-allocmate space if not enough
  if (size_of_head > buffer_size) {
    delete[] buffer;
    buffer_size = size_of_head;
    buffer = new char[buffer_size];
  }
  // read header
  read_cnt = fread(buffer, 1, size_of_head, file);
  if (read_cnt != size_of_head) {
    Log::Fatal("Binary file error: header is incorrect");
  }
  // get header: fields are unpacked in the exact order they were written
  const char* mem_ptr = buffer;
  dataset->num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
  mem_ptr += sizeof(dataset->num_data_);
  dataset->num_features_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(dataset->num_features_);
  dataset->num_total_features_ = *(reinterpret_cast<const int*>(mem_ptr));
  mem_ptr += sizeof(dataset->num_total_features_);
  size_t num_used_feature_map = *(reinterpret_cast<const size_t*>(mem_ptr));
  mem_ptr += sizeof(num_used_feature_map);
  const int* tmp_feature_map = reinterpret_cast<const int*>(mem_ptr);
  dataset->used_feature_map_.clear();
  for (size_t i = 0; i < num_used_feature_map; ++i) {
    dataset->used_feature_map_.push_back(tmp_feature_map[i]);
  }
  mem_ptr += sizeof(int) * num_used_feature_map;
  // get feature names
  // NOTE(review): names are restored into the LOADER's feature_names_, not the
  // dataset's — confirm this is intentional (the dataset is the return value).
  feature_names_.clear();
  // each name is serialized as [int length][length chars]
  for (int i = 0; i < dataset->num_total_features_; ++i) {
    int str_len = *(reinterpret_cast<const int*>(mem_ptr));
    mem_ptr += sizeof(int);
    std::stringstream str_buf;
    for (int j = 0; j < str_len; ++j) {
      char tmp_char = *(reinterpret_cast<const char*>(mem_ptr));
      mem_ptr += sizeof(char);
      str_buf << tmp_char;
    }
    feature_names_.emplace_back(str_buf.str());
  }
  // read size of meta data
  read_cnt = fread(buffer, sizeof(size_t), 1, file);
  if (read_cnt != 1) {
    Log::Fatal("Binary file error: meta data has the wrong size");
  }
  size_t size_of_metadata = *(reinterpret_cast<size_t*>(buffer));
  // re-allocate space if not enough
  if (size_of_metadata > buffer_size) {
    delete[] buffer;
    buffer_size = size_of_metadata;
    buffer = new char[buffer_size];
  }
  // read meta data
  read_cnt = fread(buffer, 1, size_of_metadata, file);
  if (read_cnt != size_of_metadata) {
    Log::Fatal("Binary file error: meta data is incorrect");
  }
  // load meta data
  dataset->metadata_.LoadFromMemory(buffer);
  std::vector<data_size_t> used_data_indices;
  data_size_t num_global_data = dataset->num_data_;
  // sample local used data if need to partition
  if (num_machines > 1 && !io_config_.is_pre_partition) {
    const data_size_t* query_boundaries = dataset->metadata_.query_boundaries();
    if (query_boundaries == nullptr) {
      // if not contain query file, minimal sample unit is one record
      for (data_size_t i = 0; i < dataset->num_data_; ++i) {
        if (random_.NextInt(0, num_machines) == rank) {
          used_data_indices.push_back(i);
        }
      }
    } else {
      // if contain query file, minimal sample unit is one query
      data_size_t num_queries = dataset->metadata_.num_queries();
      data_size_t qid = -1;
      bool is_query_used = false;
      for (data_size_t i = 0; i < dataset->num_data_; ++i) {
        if (qid >= num_queries) {
          Log::Fatal("Current query exceeds the range of the query file, please ensure the query file is correct");
        }
        if (i >= query_boundaries[qid + 1]) {
          // if is new query: decide once per query whether this rank keeps it
          is_query_used = false;
          if (random_.NextInt(0, num_machines) == rank) {
            is_query_used = true;
          }
          ++qid;
        }
        if (is_query_used) {
          used_data_indices.push_back(i);
        }
      }
    }
    dataset->num_data_ = static_cast<data_size_t>(used_data_indices.size());
  }
  dataset->metadata_.PartitionLabel(used_data_indices);
  // read feature data
  for (int i = 0; i < dataset->num_features_; ++i) {
    // read feature size
    read_cnt = fread(buffer, sizeof(size_t), 1, file);
    if (read_cnt != 1) {
      Log::Fatal("Binary file error: feature %d has the wrong size", i);
    }
    size_t size_of_feature = *(reinterpret_cast<size_t*>(buffer));
    // re-allocate space if not enough
    if (size_of_feature > buffer_size) {
      delete[] buffer;
      buffer_size = size_of_feature;
      buffer = new char[buffer_size];
    }
    read_cnt = fread(buffer, 1, size_of_feature, file);
    if (read_cnt != size_of_feature) {
      // NOTE(review): read_cnt is size_t but formatted with %d — consider %zu
      Log::Fatal("Binary file error: feature %d is incorrect, read count: %d", i, read_cnt);
    }
    dataset->features_.push_back(new Feature(buffer, num_global_data, used_data_indices));
  }
  delete[] buffer;
  fclose(file);
  dataset->is_loading_from_binfile_ = true;
  return dataset;
}
// ---- private functions ----
/*!
* \brief Validate a freshly loaded dataset; aborts via Log::Fatal on failure.
* \param dataset Dataset to check (must be non-null)
*/
void DatasetLoader::CheckDataset(const Dataset* dataset) {
  // a dataset without any rows cannot be used for training or prediction
  if (dataset->num_data_ <= 0) {
    Log::Fatal("Data file %s is empty", dataset->data_filename_);
  }
  // all features may have been dropped (ignored or trivial single-bin features)
  if (dataset->features_.empty()) {
    Log::Fatal("No usable features in data file %s", dataset->data_filename_);
  }
}
// Read the whole text data file into memory. When running on multiple machines
// without pre-partitioned data, keep only the rows assigned to this rank.
// *num_global_data receives the total row count over all machines;
// *used_data_indices receives the global indices of the locally kept rows
// (left empty when every row is kept).
std::vector<std::string> DatasetLoader::LoadTextDataToMemory(const char* filename, const Metadata& metadata,
  int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices) {
  TextReader<data_size_t> text_reader(filename, io_config_.has_header);
  used_data_indices->clear();
  if (num_machines == 1 || io_config_.is_pre_partition) {
    // read all lines
    *num_global_data = text_reader.ReadAllLines();
  } else {
    // need partition data
    // get query data
    const data_size_t* query_boundaries = metadata.query_boundaries();
    if (query_boundaries == nullptr) {
      // if not contain query data, minimal sample unit is one record:
      // each record is assigned to exactly one machine at random
      *num_global_data = text_reader.ReadAndFilterLines([this, rank, num_machines](data_size_t) {
        if (random_.NextInt(0, num_machines) == rank) {
          return true;
        } else {
          return false;
        }
      }, used_data_indices);
    } else {
      // if contain query data, minimal sample unit is one query, so that all
      // records of a query stay on the same machine (needed for ranking tasks)
      data_size_t num_queries = metadata.num_queries();
      // current query id; starts at -1 so the first row crosses query_boundaries[0]
      data_size_t qid = -1;
      // whether the current query was assigned to this rank
      bool is_query_used = false;
      *num_global_data = text_reader.ReadAndFilterLines(
        [this, rank, num_machines, &qid, &query_boundaries, &is_query_used, num_queries]
        (data_size_t line_idx) {
        if (qid >= num_queries) {
          Log::Fatal("Current query exceeds the range of the query file, please ensure the query file is correct");
        }
        if (line_idx >= query_boundaries[qid + 1]) {
          // if is new query: re-draw whether this rank keeps it
          is_query_used = false;
          if (random_.NextInt(0, num_machines) == rank) {
            is_query_used = true;
          }
          ++qid;
        }
        return is_query_used;
      }, used_data_indices);
    }
  }
  // move the reader's internal line buffer out to the caller without copying
  return std::move(text_reader.Lines());
}
/*!
* \brief Randomly sample lines from data already loaded in memory.
*        Used to collect a subset of rows for bin-boundary construction.
* \param data All text lines of the data file
* \return At most io_config_.bin_construct_sample_cnt randomly chosen lines
*/
std::vector<std::string> DatasetLoader::SampleTextDataFromMemory(const std::vector<std::string>& data) {
  // take at most bin_construct_sample_cnt lines; if fewer are available, take them all
  // (cast avoids a signed/unsigned comparison between size_t and int)
  const size_t sample_cnt = static_cast<size_t>(
      data.size() < static_cast<size_t>(io_config_.bin_construct_sample_cnt)
      ? data.size() : io_config_.bin_construct_sample_cnt);
  // draw sample_cnt distinct indices in [0, data.size())
  std::vector<size_t> sample_indices = random_.Sample(data.size(), sample_cnt);
  std::vector<std::string> out;
  // reserve up front to avoid repeated reallocation while copying the lines
  out.reserve(sample_indices.size());
  for (const size_t idx : sample_indices) {
    out.push_back(data[idx]);
  }
  return out;
}
// Sample lines for bin construction directly from the file, without loading
// the whole file into memory. When running on multiple machines without
// pre-partitioned data, the sampled lines come only from the rows assigned to
// this rank. *num_global_data receives the total row count over all machines;
// *used_data_indices receives the global indices of the locally kept rows.
std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filename, const Metadata& metadata,
  int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices) {
  const data_size_t sample_cnt = static_cast<data_size_t>(io_config_.bin_construct_sample_cnt);
  TextReader<data_size_t> text_reader(filename, io_config_.has_header);
  std::vector<std::string> out_data;
  if (num_machines == 1 || io_config_.is_pre_partition) {
    // keep every row locally; just reservoir-sample from the file
    *num_global_data = static_cast<data_size_t>(text_reader.SampleFromFile(random_, sample_cnt, &out_data));
  } else {
    // need partition data
    // get query data
    const data_size_t* query_boundaries = metadata.query_boundaries();
    if (query_boundaries == nullptr) {
      // if not contain query file, minimal sample unit is one record:
      // each record is assigned to exactly one machine at random
      *num_global_data = text_reader.SampleAndFilterFromFile([this, rank, num_machines]
      (data_size_t) {
        if (random_.NextInt(0, num_machines) == rank) {
          return true;
        } else {
          return false;
        }
      }, used_data_indices, random_, sample_cnt, &out_data);
    } else {
      // if contain query file, minimal sample unit is one query, so that all
      // records of a query stay on the same machine (needed for ranking tasks)
      data_size_t num_queries = metadata.num_queries();
      // current query id; starts at -1 so the first row crosses query_boundaries[0]
      data_size_t qid = -1;
      // whether the current query was assigned to this rank
      bool is_query_used = false;
      *num_global_data = text_reader.SampleAndFilterFromFile(
        [this, rank, num_machines, &qid, &query_boundaries, &is_query_used, num_queries]
        (data_size_t line_idx) {
        if (qid >= num_queries) {
          // NOTE: the string below uses a line continuation, so the message is
          // emitted as a single line
          Log::Fatal("Query id exceeds the range of the query file, \
please ensure the query file is correct");
        }
        if (line_idx >= query_boundaries[qid + 1]) {
          // if is new query: re-draw whether this rank keeps it
          is_query_used = false;
          if (random_.NextInt(0, num_machines) == rank) {
            is_query_used = true;
          }
          ++qid;
        }
        return is_query_used;
      }, used_data_indices, random_, sample_cnt, &out_data);
    }
  }
  return out_data;
}
// Build the per-feature BinMapper objects (feature-value discretization) from
// a set of sampled text rows, and register the resulting usable features on
// the dataset. On a single machine every feature's bins are found locally; on
// multiple machines the features are split across ranks, each rank bins its
// share, and the results are exchanged with an Allgather.
void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
  const std::vector<std::string>& sample_data, const Parser* parser, Dataset* dataset) {
  // sample_values[i][j], means the value of j-th sample on i-th feature
  std::vector<std::vector<double>> sample_values;
  // temp buffer for one line features and label
  std::vector<std::pair<int, double>> oneline_features;
  double label;
  for (size_t i = 0; i < sample_data.size(); ++i) {
    oneline_features.clear();
    // parse features
    parser->ParseOneLine(sample_data[i].c_str(), &oneline_features, &label);
    // push 0 first, then edit the value according existing feature values
    for (auto& feature_values : sample_values) {
      feature_values.push_back(0.0);
    }
    for (std::pair<int, double>& inner_data : oneline_features) {
      if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
        // if need expand feature set (the parsed line referenced a feature
        // index we have not seen yet)
        size_t need_size = inner_data.first - sample_values.size() + 1;
        for (size_t j = 0; j < need_size; ++j) {
          // each new feature column gets i+1 zeros, one per sample seen so far
          sample_values.emplace_back(i + 1, 0.0f);
        }
      }
      // edit the feature value
      sample_values[inner_data.first][i] = inner_data.second;
    }
  }
  dataset->features_.clear();
  // -1 means doesn't use this feature
  dataset->used_feature_map_ = std::vector<int>(sample_values.size(), -1);
  dataset->num_total_features_ = static_cast<int>(sample_values.size());
  // check the range of label_idx, weight_idx and group_idx
  CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_);
  CHECK(weight_idx_ < 0 || weight_idx_ < dataset->num_total_features_);
  CHECK(group_idx_ < 0 || group_idx_ < dataset->num_total_features_);
  // fill feature_names_ if not header: synthesize "Column_<i>" names
  if (feature_names_.size() <= 0) {
    for (int i = 0; i < dataset->num_total_features_; ++i) {
      std::stringstream str_buf;
      str_buf << "Column_" << i;
      feature_names_.push_back(str_buf.str());
    }
  }
  dataset->feature_names_ = feature_names_;
  // start find bins
  if (num_machines == 1) {
    std::vector<BinMapper*> bin_mappers(sample_values.size());
    // if only one machine, find bin locally
    #pragma omp parallel for schedule(guided)
    for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
      if (ignore_features_.count(i) > 0) {
        // feature explicitly ignored by configuration
        bin_mappers[i] = nullptr;
        continue;
      }
      bin_mappers[i] = new BinMapper();
      bin_mappers[i]->FindBin(&sample_values[i], io_config_.max_bin);
    }
    for (size_t i = 0; i < sample_values.size(); ++i) {
      if (bin_mappers[i] == nullptr) {
        Log::Warning("Ignoring feature %s", feature_names_[i].c_str());
      } else if (!bin_mappers[i]->is_trival()) {
        // map real feature index to used feature index
        dataset->used_feature_map_[i] = static_cast<int>(dataset->features_.size());
        // push new feature (the Feature takes ownership of the BinMapper)
        dataset->features_.push_back(new Feature(static_cast<int>(i), bin_mappers[i],
          dataset->num_data_, io_config_.is_enable_sparse));
      } else {
        // if feature is trival(only 1 bin), free spaces
        Log::Warning("Ignoring feature %s, only has one value", feature_names_[i].c_str());
        delete bin_mappers[i];
      }
    }
  } else {
    // if have multi-machines, need find bin distributed
    // different machines will find bin for different features
    // start and len will store the process feature indices for different machines
    // machine i will find bins for features in [ start[i], start[i] + len[i] )
    int* start = new int[num_machines];
    int* len = new int[num_machines];
    int total_num_feature = static_cast<int>(sample_values.size());
    // ceiling division: features per machine
    int step = (total_num_feature + num_machines - 1) / num_machines;
    if (step < 1) { step = 1; }
    start[0] = 0;
    for (int i = 0; i < num_machines - 1; ++i) {
      len[i] = Common::Min<int>(step, total_num_feature - start[i]);
      start[i + 1] = start[i] + len[i];
    }
    // last machine takes whatever remains
    len[num_machines - 1] = total_num_feature - start[num_machines - 1];
    // get size of bin mapper with max_bin_ size
    int type_size = BinMapper::SizeForSpecificBin(io_config_.max_bin);
    // since sizes of different feature may not be same, we expand all bin mapper to type_size
    int buffer_size = type_size * total_num_feature;
    char* input_buffer = new char[buffer_size];
    char* output_buffer = new char[buffer_size];
    // find local feature bins and copy to buffer
    #pragma omp parallel for schedule(guided)
    for (int i = 0; i < len[rank]; ++i) {
      BinMapper* bin_mapper = new BinMapper();
      bin_mapper->FindBin(&sample_values[start[rank] + i], io_config_.max_bin);
      bin_mapper->CopyTo(input_buffer + i * type_size);
      // don't need this any more
      delete bin_mapper;
    }
    // convert to binary size (element counts -> byte offsets/lengths)
    for (int i = 0; i < num_machines; ++i) {
      start[i] *= type_size;
      len[i] *= type_size;
    }
    // gather global feature bin mappers
    Network::Allgather(input_buffer, buffer_size, start, len, output_buffer);
    // restore features bins from buffer
    for (int i = 0; i < total_num_feature; ++i) {
      if (ignore_features_.count(i) > 0) {
        Log::Warning("Ignoring feature %s", feature_names_[i].c_str());
        continue;
      }
      BinMapper* bin_mapper = new BinMapper();
      bin_mapper->CopyFrom(output_buffer + i * type_size);
      if (!bin_mapper->is_trival()) {
        // map real feature index to used feature index; Feature takes ownership
        dataset->used_feature_map_[i] = static_cast<int>(dataset->features_.size());
        dataset->features_.push_back(new Feature(static_cast<int>(i), bin_mapper,
          dataset->num_data_, io_config_.is_enable_sparse));
      } else {
        Log::Warning("Ignoring feature %s, only has one value", feature_names_[i].c_str());
        delete bin_mapper;
      }
    }
    // free buffer
    delete[] start;
    delete[] len;
    delete[] input_buffer;
    delete[] output_buffer;
  }
  dataset->num_features_ = static_cast<int>(dataset->features_.size());
}
/*!
* \brief Extract local features from text lines already loaded in memory.
*        Parses every line, pushes feature values into the dataset's binned
*        feature columns, sets labels/weights/query ids on the metadata, and
*        (when an initial model is given via predict_fun_) computes per-row
*        initial scores. Consumed lines are cleared to release memory.
*/
void DatasetLoader::ExtractFeaturesFromMemory(std::vector<std::string>& text_data, const Parser* parser, Dataset* dataset) {
  std::vector<std::pair<int, double>> oneline_features;
  double tmp_label = 0.0f;
  if (predict_fun_ == nullptr) {
    // if doesn't need to prediction with initial model
    // (oneline_features/tmp_label are per-thread copies via the OpenMP clauses)
    #pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < dataset->num_data_; ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser->ParseOneLine(text_data[i].c_str(), &oneline_features, &tmp_label);
      // set label
      dataset->metadata_.SetLabelAt(i, static_cast<float>(tmp_label));
      // free processed line:
      text_data[i].clear();
      // shrink_to_fit will be very slow in linux, and seems not free memory, disable for now
      // text_reader_->Lines()[i].shrink_to_fit();
      // push data
      for (auto& inner_data : oneline_features) {
        int feature_idx = dataset->used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature
          dataset->features_[feature_idx]->PushData(tid, i, inner_data.second);
        } else {
          // columns not used as features may still carry weight / query id
          if (inner_data.first == weight_idx_) {
            dataset->metadata_.SetWeightAt(i, static_cast<float>(inner_data.second));
          } else if (inner_data.first == group_idx_) {
            dataset->metadata_.SetQueryAt(i, static_cast<data_size_t>(inner_data.second));
          }
        }
      }
    }
  } else {
    // if need to prediction with initial model
    // init_score layout: score of class k for row i is at [k * num_data_ + i]
    float* init_score = new float[dataset->num_data_ * dataset->num_class_];
    #pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < dataset->num_data_; ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser->ParseOneLine(text_data[i].c_str(), &oneline_features, &tmp_label);
      // set initial score
      std::vector<double> oneline_init_score = predict_fun_(oneline_features);
      for (int k = 0; k < dataset->num_class_; ++k) {
        init_score[k * dataset->num_data_ + i] = static_cast<float>(oneline_init_score[k]);
      }
      // set label
      dataset->metadata_.SetLabelAt(i, static_cast<float>(tmp_label));
      // free processed line:
      text_data[i].clear();
      // shrink_to_fit will be very slow in linux, and seems not free memory, disable for now
      // text_reader_->Lines()[i].shrink_to_fit();
      // push data
      for (auto& inner_data : oneline_features) {
        int feature_idx = dataset->used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature
          dataset->features_[feature_idx]->PushData(tid, i, inner_data.second);
        } else {
          // columns not used as features may still carry weight / query id
          if (inner_data.first == weight_idx_) {
            dataset->metadata_.SetWeightAt(i, static_cast<float>(inner_data.second));
          } else if (inner_data.first == group_idx_) {
            dataset->metadata_.SetQueryAt(i, static_cast<data_size_t>(inner_data.second));
          }
        }
      }
    }
    // SetInitScore presumably copies the buffer (it is freed right below) —
    // metadata_ manages its own copy of the init score
    dataset->metadata_.SetInitScore(init_score, dataset->num_data_ * dataset->num_class_);
    delete[] init_score;
  }
  // finalize every feature column after all rows are pushed
  #pragma omp parallel for schedule(guided)
  for (int i = 0; i < dataset->num_features_; ++i) {
    dataset->features_[i]->FinishLoad();
  }
  // text data can be free after loaded feature values
  text_data.clear();
}
/*!
* \brief Extract local features directly from a text file.
*        Reads either all rows or only used_data_indices, parses each chunk of
*        lines via process_fun (run in parallel), pushes feature values into
*        the dataset's binned feature columns, sets labels/weights/query ids,
*        and optionally computes initial scores from predict_fun_.
*/
void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser* parser,
  const std::vector<data_size_t>& used_data_indices, Dataset* dataset) {
  float* init_score = nullptr;
  if (predict_fun_ != nullptr) {
    // init_score layout: score of class k for row i is at [k * num_data_ + i]
    init_score = new float[dataset->num_data_ * dataset->num_class_];
  }
  // callback invoked by the text reader for each chunk of lines;
  // start_idx is the row index of lines[0] within the local dataset
  std::function<void(data_size_t, const std::vector<std::string>&)> process_fun =
    [this, &init_score, &parser, &dataset]
  (data_size_t start_idx, const std::vector<std::string>& lines) {
    std::vector<std::pair<int, double>> oneline_features;
    double tmp_label = 0.0f;
    // oneline_features/tmp_label are per-thread copies via the OpenMP clauses
    #pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label)
    for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); ++i) {
      const int tid = omp_get_thread_num();
      oneline_features.clear();
      // parser
      parser->ParseOneLine(lines[i].c_str(), &oneline_features, &tmp_label);
      // set initial score
      if (init_score != nullptr) {
        std::vector<double> oneline_init_score = predict_fun_(oneline_features);
        for (int k = 0; k < dataset->num_class_; ++k) {
          init_score[k * dataset->num_data_ + start_idx + i] = static_cast<float>(oneline_init_score[k]);
        }
      }
      // set label
      dataset->metadata_.SetLabelAt(start_idx + i, static_cast<float>(tmp_label));
      // push data
      for (auto& inner_data : oneline_features) {
        int feature_idx = dataset->used_feature_map_[inner_data.first];
        if (feature_idx >= 0) {
          // if is used feature
          dataset->features_[feature_idx]->PushData(tid, start_idx + i, inner_data.second);
        } else {
          // columns not used as features may still carry weight / query id
          if (inner_data.first == weight_idx_) {
            dataset->metadata_.SetWeightAt(start_idx + i, static_cast<float>(inner_data.second));
          } else if (inner_data.first == group_idx_) {
            dataset->metadata_.SetQueryAt(start_idx + i, static_cast<data_size_t>(inner_data.second));
          }
        }
      }
    }
  };
  TextReader<data_size_t> text_reader(filename, io_config_.has_header);
  if (used_data_indices.size() > 0) {
    // only need part of data
    text_reader.ReadPartAndProcessParallel(used_data_indices, process_fun);
  } else {
    // need full data
    text_reader.ReadAllAndProcessParallel(process_fun);
  }
  // SetInitScore presumably copies the buffer (it is freed right below) —
  // metadata_ manages its own copy of the init score
  if (init_score != nullptr) {
    dataset->metadata_.SetInitScore(init_score, dataset->num_data_ * dataset->num_class_);
    delete[] init_score;
  }
  // finalize every feature column after all rows are pushed
  #pragma omp parallel for schedule(guided)
  for (int i = 0; i < dataset->num_features_; ++i) {
    dataset->features_[i]->FinishLoad();
  }
}
/*!
* \brief Check whether a pre-built binary version of the data file exists.
*        The binary cache uses the original file name with a ".bin" suffix.
* \param filename Path of the original (text) data file
* \return true if "<filename>.bin" can be opened for reading, false otherwise
*/
bool DatasetLoader::CheckCanLoadFromBin(const char* filename) {
  std::string bin_filename(filename);
  bin_filename.append(".bin");
  // initialize so the pointer is well-defined on every #ifdef branch
  FILE* file = nullptr;
#ifdef _MSC_VER
  // MSVC deprecates plain fopen; use the secure variant
  fopen_s(&file, bin_filename.c_str(), "rb");
#else
  file = fopen(bin_filename.c_str(), "rb");
#endif
  if (file == nullptr) {
    return false;
  }
  // only probing for existence/readability, so close immediately
  fclose(file);
  return true;
}
}
\ No newline at end of file
src/io/metadata.cpp
View file @
1c08e71e
...
...
@@ -14,9 +14,8 @@ Metadata::Metadata()
}
void
Metadata
::
Init
(
const
char
*
data_filename
,
const
char
*
init_score_filename
,
const
int
num_class
)
{
void
Metadata
::
Init
(
const
char
*
data_filename
,
const
int
num_class
)
{
data_filename_
=
data_filename
;
init_score_filename_
=
init_score_filename
;
num_class_
=
num_class
;
// for lambdarank, it needs query data for partition data in parallel learning
LoadQueryBoundaries
();
...
...
@@ -25,11 +24,6 @@ void Metadata::Init(const char * data_filename, const char* init_score_filename,
LoadInitialScore
();
}
void
Metadata
::
Init
(
const
char
*
init_score_filename
,
const
int
num_class
)
{
init_score_filename_
=
init_score_filename
;
num_class_
=
num_class
;
LoadInitialScore
();
}
Metadata
::~
Metadata
()
{
...
...
@@ -294,10 +288,14 @@ void Metadata::LoadWeights() {
void
Metadata
::
LoadInitialScore
()
{
num_init_score_
=
0
;
if
(
init_score_filename_
[
0
]
==
'\0'
)
{
return
;
}
TextReader
<
size_t
>
reader
(
init_score_filename_
,
false
);
std
::
string
init_score_filename
(
data_filename_
);
// default weight file name
init_score_filename
.
append
(
".init"
);
TextReader
<
size_t
>
reader
(
init_score_filename
.
c_str
(),
false
);
reader
.
ReadAllLines
();
if
(
reader
.
Lines
().
size
()
<=
0
)
{
return
;
}
Log
::
Info
(
"Loading initial scores..."
);
num_init_score_
=
static_cast
<
data_size_t
>
(
reader
.
Lines
().
size
());
...
...
windows/LightGBM.vcxproj
View file @
1c08e71e
...
...
@@ -161,6 +161,7 @@
<ClInclude
Include=
"..\include\LightGBM\config.h"
/>
<ClInclude
Include=
"..\include\LightGBM\c_api.h"
/>
<ClInclude
Include=
"..\include\LightGBM\dataset.h"
/>
<ClInclude
Include=
"..\include\LightGBM\dataset_loader.h"
/>
<ClInclude
Include=
"..\include\LightGBM\feature.h"
/>
<ClInclude
Include=
"..\include\LightGBM\meta.h"
/>
<ClInclude
Include=
"..\include\LightGBM\metric.h"
/>
...
...
@@ -208,6 +209,7 @@
<ClCompile
Include=
"..\src\io\bin.cpp"
/>
<ClCompile
Include=
"..\src\io\config.cpp"
/>
<ClCompile
Include=
"..\src\io\dataset.cpp"
/>
<ClCompile
Include=
"..\src\io\dataset_loader.cpp"
/>
<ClCompile
Include=
"..\src\io\metadata.cpp"
/>
<ClCompile
Include=
"..\src\io\parser.cpp"
/>
<ClCompile
Include=
"..\src\io\tree.cpp"
/>
...
...
windows/LightGBM.vcxproj.filters
View file @
1c08e71e
...
...
@@ -168,6 +168,9 @@
<ClInclude
Include=
"..\include\LightGBM\c_api.h"
>
<Filter>
include\LightGBM
</Filter>
</ClInclude>
<ClInclude
Include=
"..\include\LightGBM\dataset_loader.h"
>
<Filter>
include\LightGBM
</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile
Include=
"..\src\application\application.cpp"
>
...
...
@@ -236,5 +239,8 @@
<ClCompile
Include=
"..\src\c_api.cpp"
>
<Filter>
src
</Filter>
</ClCompile>
<ClCompile
Include=
"..\src\io\dataset_loader.cpp"
>
<Filter>
src\io
</Filter>
</ClCompile>
</ItemGroup>
</Project>
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment