Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
1c08e71e
Commit
1c08e71e
authored
Nov 04, 2016
by
Guolin Ke
Browse files
use dataset_loader to load data
parent
8696709e
Changes
11
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
898 additions
and
1067 deletions
+898
-1067
include/LightGBM/application.h
include/LightGBM/application.h
+3
-0
include/LightGBM/config.h
include/LightGBM/config.h
+0
-2
include/LightGBM/dataset.h
include/LightGBM/dataset.h
+10
-149
src/application/application.cpp
src/application/application.cpp
+11
-16
src/c_api.cpp
src/c_api.cpp
+4
-1
src/io/config.cpp
src/io/config.cpp
+0
-1
src/io/dataset.cpp
src/io/dataset.cpp
+21
-888
src/io/dataset_loader.cpp
src/io/dataset_loader.cpp
+833
-0
src/io/metadata.cpp
src/io/metadata.cpp
+8
-10
windows/LightGBM.vcxproj
windows/LightGBM.vcxproj
+2
-0
windows/LightGBM.vcxproj.filters
windows/LightGBM.vcxproj.filters
+6
-0
No files found.
include/LightGBM/application.h
View file @
1c08e71e
...
...
@@ -8,6 +8,7 @@
namespace
LightGBM
{
class
DatasetLoader
;
class
Dataset
;
class
Boosting
;
class
ObjectiveFunction
;
...
...
@@ -59,6 +60,8 @@ private:
/*! \brief All configs */
OverallConfig
config_
;
/*! \brief Dataset loader */
DatasetLoader
*
dataset_loader_
;
/*! \brief Training data */
Dataset
*
train_data_
;
/*! \brief Validation data */
...
...
include/LightGBM/config.h
View file @
1c08e71e
...
...
@@ -93,7 +93,6 @@ public:
std
::
string
output_model
=
"LightGBM_model.txt"
;
std
::
string
output_result
=
"LightGBM_predict_result.txt"
;
std
::
string
input_model
=
""
;
std
::
string
input_init_score
=
""
;
int
verbosity
=
1
;
int
num_model_predict
=
-
1
;
bool
is_pre_partition
=
false
;
...
...
@@ -318,7 +317,6 @@ struct ParameterAlias {
{
"model_out"
,
"output_model"
},
{
"model_input"
,
"input_model"
},
{
"model_in"
,
"input_model"
},
{
"init_score"
,
"input_init_score"
},
{
"predict_result"
,
"output_result"
},
{
"prediction_result"
,
"output_result"
},
{
"valid"
,
"valid_data"
},
...
...
include/LightGBM/dataset.h
View file @
1c08e71e
#ifndef LIGHTGBM_DATA_H_
#define LIGHTGBM_DATA_H_
#ifndef LIGHTGBM_DATA
SET
_H_
#define LIGHTGBM_DATA
SET
_H_
#include <LightGBM/utils/random.h>
#include <LightGBM/utils/text_reader.h>
...
...
@@ -18,6 +18,7 @@ namespace LightGBM {
/*! \brief forward declaration */
class
Feature
;
class
BinMapper
;
class
DatasetLoader
;
/*!
* \brief This class is used to store some meta(non-feature) data for training data,
...
...
@@ -44,13 +45,7 @@ public:
* \param init_score_filename Filename of initial score
* \param num_class Number of classes
*/
void
Init
(
const
char
*
data_filename
,
const
char
*
init_score_filename
,
const
int
num_class
);
/*!
* \brief Initialize, only load initial score
* \param init_score_filename Filename of initial score
* \param num_class Number of classes
*/
void
Init
(
const
char
*
init_score_filename
,
const
int
num_class
);
void
Init
(
const
char
*
data_filename
,
const
int
num_class
);
/*!
* \brief Initialize from binary memory
* \param memory Pointer to memory
...
...
@@ -178,10 +173,9 @@ public:
*/
inline
const
float
*
init_score
()
const
{
return
init_score_
;
}
private:
/*! \brief Load initial scores from file */
void
LoadInitialScore
();
private:
/*! \brief Load weights from file */
void
LoadWeights
();
/*! \brief Load query boundaries from file */
...
...
@@ -190,8 +184,6 @@ private:
void
LoadQueryWeights
();
/*! \brief Filename of current data */
const
char
*
data_filename_
;
/*! \brief Filename of initial scores */
const
char
*
init_score_filename_
;
/*! \brief Number of data */
data_size_t
num_data_
;
/*! \brief Number of classes */
...
...
@@ -251,79 +243,16 @@ using PredictFunction =
*/
class
Dataset
{
public:
/*!
* \brief Constructor
* \param data_filename Filename of dataset
* \param init_score_filename Filename of initial score
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset
(
const
char
*
data_filename
,
const
char
*
init_score_filename
,
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
);
friend
DatasetLoader
;
/*!
* \brief Constructor
* \param data_filename Filename of dataset
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset
(
const
char
*
data_filename
,
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
)
:
Dataset
(
data_filename
,
""
,
io_config
,
predict_fun
)
{
}
/*!
* \brief Constructor, without filename, used to load data from memory
* \param io_config configs for IO
* \param predict_fun Used for initial model, will give a prediction score based on this function, then set as initial score
*/
Dataset
(
const
IOConfig
&
io_config
,
const
PredictFunction
&
predict_fun
);
Dataset
();
/*! \brief Destructor */
~
Dataset
();
/*! \brief Init Dataset with specific binmapper */
void
InitByBinMapper
(
std
::
vector
<
const
BinMapper
*>
bin_mappers
,
data_size_t
num_data
);
/*! \brief push raw data into dataset */
void
PushData
(
const
std
::
vector
<
std
::
vector
<
std
::
pair
<
int
,
float
>>>&
datas
,
data_size_t
start_idx
,
bool
is_finished
);
void
SetField
(
const
char
*
field_name
,
const
void
*
field_data
,
data_size_t
num_element
,
int
type
);
/*!
* \brief Load training data on parallel training
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
* \param use_two_round_loading True if need to use two round loading
*/
void
LoadTrainData
(
int
rank
,
int
num_machines
,
bool
is_pre_partition
,
bool
use_two_round_loading
);
/*!
* \brief Load training data on single machine training
* \param use_two_round_loading True if need to use two round loading
*/
inline
void
LoadTrainData
(
bool
use_two_round_loading
)
{
LoadTrainData
(
0
,
1
,
false
,
use_two_round_loading
);
}
/*!
* \brief Load data and use the bin mappers from another data set; generally this function is used to extract features for validation data
* \param train_set Other loaded data set
* \param use_two_round_loading True if need to use two round loading
*/
void
LoadValidationData
(
const
Dataset
*
train_set
,
bool
use_two_round_loading
);
/*!
* \brief Load data set from binary file
* \param bin_filename filename of bin data
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
*/
void
LoadDataFromBinFile
(
const
char
*
bin_filename
,
int
rank
,
int
num_machines
,
bool
is_pre_partition
);
/*!
* \brief Save current dataset into binary file, will save to "filename.bin"
*/
...
...
@@ -331,6 +260,8 @@ public:
std
::
vector
<
const
BinMapper
*>
GetBinMappers
()
const
;
void
CopyFeatureMetadataTo
(
Dataset
*
dataset
,
bool
is_enable_sparse
)
const
;
/*!
* \brief Get a feature pointer for specific index
* \param i Index for feature
...
...
@@ -365,57 +296,7 @@ public:
Dataset
(
const
Dataset
&
)
=
delete
;
private:
/*!
* \brief Load data content on memory. if num_machines > 1 and !is_pre_partition, will partition data
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
*/
void
LoadDataToMemory
(
int
rank
,
int
num_machines
,
bool
is_pre_partition
);
/*!
* \brief Sample data from memory, need load data to memory first
* \param out_data Store the sampled data
*/
void
SampleDataFromMemory
(
std
::
vector
<
std
::
string
>*
out_data
);
/*!
* \brief Sample data from file
* \param rank Rank of local machine
* \param num_machines Total number of all machines
* \param is_pre_partition True if data file is pre-partitioned
* \param out_data Store the sampled data
*/
void
SampleDataFromFile
(
int
rank
,
int
num_machines
,
bool
is_pre_partition
,
std
::
vector
<
std
::
string
>*
out_data
);
/*!
* \brief Get feature bin mapper from sampled data.
* if num_machines > 1, different machines will construct bin mappers for different features, then have a global sync up
* \param rank Rank of local machine
* \param num_machines Total number of all machines
*/
void
ConstructBinMappers
(
int
rank
,
int
num_machines
,
const
std
::
vector
<
std
::
string
>&
sample_data
);
/*! \brief Extract local features from memory */
void
ExtractFeaturesFromMemory
();
/*! \brief Extract local features from file */
void
ExtractFeaturesFromFile
();
/*! \brief Check can load from binary file */
void
CheckCanLoadFromBin
();
/*! \brief Check this data set is null or not */
void
CheckDataset
();
/*! \brief Filename of data */
const
char
*
data_filename_
;
/*! \brief A reader class that can read text data */
TextReader
<
data_size_t
>*
text_reader_
;
/*! \brief A parser class that can parse data */
Parser
*
parser_
;
/*! \brief Store used features */
std
::
vector
<
Feature
*>
features_
;
/*! \brief Mapper from real feature index to used index*/
...
...
@@ -430,32 +311,12 @@ private:
int
num_class_
;
/*! \brief Store some label level data*/
Metadata
metadata_
;
/*! \brief Random generator*/
Random
random_
;
/*! \brief The maximal number of bin that feature values will bucket in */
int
max_bin_
;
/*! \brief True if enable sparse */
bool
is_enable_sparse_
;
/*! \brief True if dataset is loaded from binary file */
bool
is_loading_from_binfile_
;
/*! \brief Number of global data, used for distributed learning */
size_t
global_num_data_
=
0
;
/*! \brief used to local used data indices */
std
::
vector
<
data_size_t
>
used_data_indices_
;
/*! \brief prediction function for initial model */
const
PredictFunction
&
predict_fun_
;
/*! \brief index of label column */
int
label_idx_
=
0
;
/*! \brief index of weight column */
int
weight_idx_
=
-
1
;
/*! \brief index of group column */
int
group_idx_
=
-
1
;
/*! \brief Set of feature indices (presumably the features to ignore when loading; TODO confirm) */
std
::
unordered_set
<
int
>
ignore_features_
;
/*! \brief store feature names */
std
::
vector
<
std
::
string
>
feature_names_
;
/*! \brief Sample count used for bin construction (presumably the number of sampled rows; TODO confirm) */
int
bin_construct_sample_cnt_
;
};
}
// namespace LightGBM
...
...
src/application/application.cpp
View file @
1c08e71e
...
...
@@ -5,6 +5,7 @@
#include <LightGBM/network.h>
#include <LightGBM/dataset.h>
#include <LightGBM/dataset_loader.h>
#include <LightGBM/boosting.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
...
...
@@ -26,7 +27,7 @@
namespace
LightGBM
{
Application
::
Application
(
int
argc
,
char
**
argv
)
:
train_data_
(
nullptr
),
boosting_
(
nullptr
),
objective_fun_
(
nullptr
)
{
:
dataset_loader_
(
nullptr
),
train_data_
(
nullptr
),
boosting_
(
nullptr
),
objective_fun_
(
nullptr
)
{
LoadParameters
(
argc
,
argv
);
// set number of threads for openmp
if
(
config_
.
num_threads
>
0
)
{
...
...
@@ -35,6 +36,7 @@ Application::Application(int argc, char** argv)
}
Application
::~
Application
()
{
if
(
dataset_loader_
!=
nullptr
)
{
delete
dataset_loader_
;
}
if
(
train_data_
!=
nullptr
)
{
delete
train_data_
;
}
for
(
auto
&
data
:
valid_datas_
)
{
if
(
data
!=
nullptr
)
{
delete
data
;
}
...
...
@@ -141,19 +143,17 @@ void Application::LoadData() {
config_
.
io_config
.
data_random_seed
=
GlobalSyncUpByMin
<
int
>
(
config_
.
io_config
.
data_random_seed
);
}
train_data_
=
new
Dataset
(
config_
.
io_config
.
data_filename
.
c_str
(),
config_
.
io_config
.
input_init_score
.
c_str
(),
config_
.
io_config
,
predict_fun
);
dataset_loader_
=
new
DatasetLoader
(
config_
.
io_config
,
predict_fun
);
dataset_loader_
->
SetHeadder
(
config_
.
io_config
.
data_filename
.
c_str
());
// load Training data
if
(
config_
.
is_parallel_find_bin
)
{
// load data for parallel training
train_data_
->
LoadTrainData
(
Network
::
rank
(),
Network
::
num_machines
(),
config_
.
io_config
.
is_pre_partition
,
config_
.
io_config
.
use_two_round_loading
);
train_data_
=
dataset_loader_
->
LoadFromFile
(
config_
.
io_config
.
data_filename
.
c_str
(),
Network
::
rank
(),
Network
::
num_machines
());
}
else
{
// load data for single machine
train_data_
->
LoadTrainData
(
config_
.
io_config
.
use_two_round_loading
);
train_data_
=
dataset_loader_
->
LoadFromFile
(
config_
.
io_config
.
data_filename
.
c_str
(),
0
,
1
);
}
// need save binary file
if
(
config_
.
io_config
.
is_save_binary_file
)
{
...
...
@@ -173,13 +173,8 @@ void Application::LoadData() {
// Add validation data, if it exists
for
(
size_t
i
=
0
;
i
<
config_
.
io_config
.
valid_data_filenames
.
size
();
++
i
)
{
// add
valid_datas_
.
push_back
(
new
Dataset
(
config_
.
io_config
.
valid_data_filenames
[
i
].
c_str
(),
config_
.
io_config
,
predict_fun
));
// load validation data like train data
valid_datas_
.
back
()
->
LoadValidationData
(
train_data_
,
config_
.
io_config
.
use_two_round_loading
);
valid_datas_
.
push_back
(
dataset_loader_
->
LoadFromFileLikeOthers
(
config_
.
io_config
.
valid_data_filenames
[
i
].
c_str
(),
train_data_
));
// need save binary file
if
(
config_
.
io_config
.
is_save_binary_file
)
{
valid_datas_
.
back
()
->
SaveBinaryFile
(
nullptr
);
...
...
src/c_api.cpp
View file @
1c08e71e
#include <LightGBM/c_api.h>
#include <LightGBM/dataset.h>
#include <LightGBM/boosting.h>
...
...
@@ -10,6 +9,7 @@
#include <vector>
#include <string>
#include <cstring>
#include <memory>
namespace
LightGBM
{
...
...
@@ -100,3 +100,6 @@ private:
};
}
using
namespace
LightGBM
;
src/io/config.cpp
View file @
1c08e71e
...
...
@@ -202,7 +202,6 @@ void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetString
(
params
,
"output_model"
,
&
output_model
);
GetString
(
params
,
"input_model"
,
&
input_model
);
GetString
(
params
,
"output_result"
,
&
output_result
);
GetString
(
params
,
"input_init_score"
,
&
input_init_score
);
std
::
string
tmp_str
=
""
;
if
(
GetString
(
params
,
"valid_data"
,
&
tmp_str
))
{
valid_data_filenames
=
Common
::
Split
(
tmp_str
.
c_str
(),
','
);
...
...
src/io/dataset.cpp
View file @
1c08e71e
This diff is collapsed.
Click to expand it.
src/io/dataset_loader.cpp
0 → 100644
View file @
1c08e71e
This diff is collapsed.
Click to expand it.
src/io/metadata.cpp
View file @
1c08e71e
...
...
@@ -14,9 +14,8 @@ Metadata::Metadata()
}
void
Metadata
::
Init
(
const
char
*
data_filename
,
const
char
*
init_score_filename
,
const
int
num_class
)
{
void
Metadata
::
Init
(
const
char
*
data_filename
,
const
int
num_class
)
{
data_filename_
=
data_filename
;
init_score_filename_
=
init_score_filename
;
num_class_
=
num_class
;
// for lambdarank, it needs query data for partition data in parallel learning
LoadQueryBoundaries
();
...
...
@@ -25,11 +24,6 @@ void Metadata::Init(const char * data_filename, const char* init_score_filename,
LoadInitialScore
();
}
void
Metadata
::
Init
(
const
char
*
init_score_filename
,
const
int
num_class
)
{
init_score_filename_
=
init_score_filename
;
num_class_
=
num_class
;
LoadInitialScore
();
}
Metadata
::~
Metadata
()
{
...
...
@@ -294,10 +288,14 @@ void Metadata::LoadWeights() {
void
Metadata
::
LoadInitialScore
()
{
num_init_score_
=
0
;
if
(
init_score_filename_
[
0
]
==
'\0'
)
{
return
;
}
TextReader
<
size_t
>
reader
(
init_score_filename_
,
false
);
std
::
string
init_score_filename
(
data_filename_
);
// default initial score file name: data filename with ".init" appended
init_score_filename
.
append
(
".init"
);
TextReader
<
size_t
>
reader
(
init_score_filename
.
c_str
(),
false
);
reader
.
ReadAllLines
();
if
(
reader
.
Lines
().
size
()
<=
0
)
{
return
;
}
Log
::
Info
(
"Loading initial scores..."
);
num_init_score_
=
static_cast
<
data_size_t
>
(
reader
.
Lines
().
size
());
...
...
windows/LightGBM.vcxproj
View file @
1c08e71e
...
...
@@ -161,6 +161,7 @@
<ClInclude
Include=
"..\include\LightGBM\config.h"
/>
<ClInclude
Include=
"..\include\LightGBM\c_api.h"
/>
<ClInclude
Include=
"..\include\LightGBM\dataset.h"
/>
<ClInclude
Include=
"..\include\LightGBM\dataset_loader.h"
/>
<ClInclude
Include=
"..\include\LightGBM\feature.h"
/>
<ClInclude
Include=
"..\include\LightGBM\meta.h"
/>
<ClInclude
Include=
"..\include\LightGBM\metric.h"
/>
...
...
@@ -208,6 +209,7 @@
<ClCompile
Include=
"..\src\io\bin.cpp"
/>
<ClCompile
Include=
"..\src\io\config.cpp"
/>
<ClCompile
Include=
"..\src\io\dataset.cpp"
/>
<ClCompile
Include=
"..\src\io\dataset_loader.cpp"
/>
<ClCompile
Include=
"..\src\io\metadata.cpp"
/>
<ClCompile
Include=
"..\src\io\parser.cpp"
/>
<ClCompile
Include=
"..\src\io\tree.cpp"
/>
...
...
windows/LightGBM.vcxproj.filters
View file @
1c08e71e
...
...
@@ -168,6 +168,9 @@
<ClInclude
Include=
"..\include\LightGBM\c_api.h"
>
<Filter>
include\LightGBM
</Filter>
</ClInclude>
<ClInclude
Include=
"..\include\LightGBM\dataset_loader.h"
>
<Filter>
include\LightGBM
</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile
Include=
"..\src\application\application.cpp"
>
...
...
@@ -236,5 +239,8 @@
<ClCompile
Include=
"..\src\c_api.cpp"
>
<Filter>
src
</Filter>
</ClCompile>
<ClCompile
Include=
"..\src\io\dataset_loader.cpp"
>
<Filter>
src\io
</Filter>
</ClCompile>
</ItemGroup>
</Project>
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment