Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tianlh
LightGBM-DCU
Commits
b3975555
Commit
b3975555
authored
Nov 28, 2019
by
ashok-ponnuswami-msft
Committed by
Guolin Ke
Nov 29, 2019
Browse files
Add more debug logging to show data load progress. (#2587)
parent
483a9bba
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
40 additions
and
9 deletions
+40
-9
include/LightGBM/config.h
include/LightGBM/config.h
+2
-0
include/LightGBM/utils/text_reader.h
include/LightGBM/utils/text_reader.h
+23
-2
src/application/application.cpp
src/application/application.cpp
+3
-1
src/io/dataset_loader.cpp
src/io/dataset_loader.cpp
+4
-3
src/metric/dcg_calculator.cpp
src/metric/dcg_calculator.cpp
+7
-2
src/treelearner/serial_tree_learner.cpp
src/treelearner/serial_tree_learner.cpp
+1
-1
No files found.
include/LightGBM/config.h
View file @
b3975555
...
@@ -856,6 +856,8 @@ struct Config {
...
@@ -856,6 +856,8 @@ struct Config {
#pragma endregion
#pragma endregion
size_t
file_load_progress_interval_bytes
=
size_t
(
10
)
*
1024
*
1024
*
1024
;
bool
is_parallel
=
false
;
bool
is_parallel
=
false
;
bool
is_parallel_find_bin
=
false
;
bool
is_parallel_find_bin
=
false
;
LIGHTGBM_EXPORT
void
Set
(
const
std
::
unordered_map
<
std
::
string
,
std
::
string
>&
params
);
LIGHTGBM_EXPORT
void
Set
(
const
std
::
unordered_map
<
std
::
string
,
std
::
string
>&
params
);
...
...
include/LightGBM/utils/text_reader.h
View file @
b3975555
...
@@ -28,8 +28,8 @@ class TextReader {
...
@@ -28,8 +28,8 @@ class TextReader {
* \param filename Filename of data
* \param filename Filename of data
* \param is_skip_first_line True if need to skip header
* \param is_skip_first_line True if need to skip header
*/
*/
TextReader
(
const
char
*
filename
,
bool
is_skip_first_line
)
:
TextReader
(
const
char
*
filename
,
bool
is_skip_first_line
,
size_t
progress_interval_bytes
=
SIZE_MAX
)
:
filename_
(
filename
),
is_skip_first_line_
(
is_skip_first_line
)
{
filename_
(
filename
),
is_skip_first_line_
(
is_skip_first_line
)
,
read_progress_interval_bytes_
(
progress_interval_bytes
)
{
if
(
is_skip_first_line_
)
{
if
(
is_skip_first_line_
)
{
auto
reader
=
VirtualFileReader
::
Make
(
filename
);
auto
reader
=
VirtualFileReader
::
Make
(
filename
);
if
(
!
reader
->
Init
())
{
if
(
!
reader
->
Init
())
{
...
@@ -86,6 +86,7 @@ class TextReader {
...
@@ -86,6 +86,7 @@ class TextReader {
INDEX_T
ReadAllAndProcess
(
const
std
::
function
<
void
(
INDEX_T
,
const
char
*
,
size_t
)
>&
process_fun
)
{
INDEX_T
ReadAllAndProcess
(
const
std
::
function
<
void
(
INDEX_T
,
const
char
*
,
size_t
)
>&
process_fun
)
{
last_line_
=
""
;
last_line_
=
""
;
INDEX_T
total_cnt
=
0
;
INDEX_T
total_cnt
=
0
;
size_t
bytes_read
=
0
;
PipelineReader
::
Read
(
filename_
,
skip_bytes_
,
PipelineReader
::
Read
(
filename_
,
skip_bytes_
,
[
&
]
[
&
]
(
const
char
*
buffer_process
,
size_t
read_cnt
)
{
(
const
char
*
buffer_process
,
size_t
read_cnt
)
{
...
@@ -119,6 +120,15 @@ class TextReader {
...
@@ -119,6 +120,15 @@ class TextReader {
if
(
last_i
!=
read_cnt
)
{
if
(
last_i
!=
read_cnt
)
{
last_line_
.
append
(
buffer_process
+
last_i
,
read_cnt
-
last_i
);
last_line_
.
append
(
buffer_process
+
last_i
,
read_cnt
-
last_i
);
}
}
size_t
prev_bytes_read
=
bytes_read
;
bytes_read
+=
read_cnt
;
if
(
prev_bytes_read
/
read_progress_interval_bytes_
<
bytes_read
/
read_progress_interval_bytes_
)
{
const
size_t
gbs
=
size_t
(
1024
)
*
1024
*
1024
;
Log
::
Debug
(
"Read %.1f GBs from %s."
,
1.0
*
bytes_read
/
gbs
,
filename_
);
}
return
cnt
;
return
cnt
;
});
});
// if last line of file doesn't contain end of line
// if last line of file doesn't contain end of line
...
@@ -227,6 +237,7 @@ class TextReader {
...
@@ -227,6 +237,7 @@ class TextReader {
INDEX_T
ReadAllAndProcessParallelWithFilter
(
const
std
::
function
<
void
(
INDEX_T
,
const
std
::
vector
<
std
::
string
>&
)
>&
process_fun
,
const
std
::
function
<
bool
(
INDEX_T
,
INDEX_T
)
>&
filter_fun
)
{
INDEX_T
ReadAllAndProcessParallelWithFilter
(
const
std
::
function
<
void
(
INDEX_T
,
const
std
::
vector
<
std
::
string
>&
)
>&
process_fun
,
const
std
::
function
<
bool
(
INDEX_T
,
INDEX_T
)
>&
filter_fun
)
{
last_line_
=
""
;
last_line_
=
""
;
INDEX_T
total_cnt
=
0
;
INDEX_T
total_cnt
=
0
;
size_t
bytes_read
=
0
;
INDEX_T
used_cnt
=
0
;
INDEX_T
used_cnt
=
0
;
PipelineReader
::
Read
(
filename_
,
skip_bytes_
,
PipelineReader
::
Read
(
filename_
,
skip_bytes_
,
[
&
]
[
&
]
...
@@ -270,6 +281,15 @@ class TextReader {
...
@@ -270,6 +281,15 @@ class TextReader {
if
(
last_i
!=
read_cnt
)
{
if
(
last_i
!=
read_cnt
)
{
last_line_
.
append
(
buffer_process
+
last_i
,
read_cnt
-
last_i
);
last_line_
.
append
(
buffer_process
+
last_i
,
read_cnt
-
last_i
);
}
}
size_t
prev_bytes_read
=
bytes_read
;
bytes_read
+=
read_cnt
;
if
(
prev_bytes_read
/
read_progress_interval_bytes_
<
bytes_read
/
read_progress_interval_bytes_
)
{
const
size_t
gbs
=
size_t
(
1024
)
*
1024
*
1024
;
Log
::
Debug
(
"Read %.1f GBs from %s."
,
1.0
*
bytes_read
/
gbs
,
filename_
);
}
return
cnt
;
return
cnt
;
});
});
// if last line of file doesn't contain end of line
// if last line of file doesn't contain end of line
...
@@ -313,6 +333,7 @@ class TextReader {
...
@@ -313,6 +333,7 @@ class TextReader {
std
::
string
first_line_
=
""
;
std
::
string
first_line_
=
""
;
/*! \brief is skip first line */
/*! \brief is skip first line */
bool
is_skip_first_line_
=
false
;
bool
is_skip_first_line_
=
false
;
size_t
read_progress_interval_bytes_
;
/*! \brief is skip first line */
/*! \brief is skip first line */
int
skip_bytes_
=
0
;
int
skip_bytes_
=
0
;
};
};
...
...
src/application/application.cpp
View file @
b3975555
...
@@ -96,6 +96,7 @@ void Application::LoadData() {
...
@@ -96,6 +96,7 @@ void Application::LoadData() {
config_
.
data_random_seed
=
Network
::
GlobalSyncUpByMin
(
config_
.
data_random_seed
);
config_
.
data_random_seed
=
Network
::
GlobalSyncUpByMin
(
config_
.
data_random_seed
);
}
}
Log
::
Debug
(
"Loading train file..."
);
DatasetLoader
dataset_loader
(
config_
,
predict_fun
,
DatasetLoader
dataset_loader
(
config_
,
predict_fun
,
config_
.
num_class
,
config_
.
data
.
c_str
());
config_
.
num_class
,
config_
.
data
.
c_str
());
// load Training data
// load Training data
...
@@ -124,12 +125,12 @@ void Application::LoadData() {
...
@@ -124,12 +125,12 @@ void Application::LoadData() {
}
}
train_metric_
.
shrink_to_fit
();
train_metric_
.
shrink_to_fit
();
if
(
!
config_
.
metric
.
empty
())
{
if
(
!
config_
.
metric
.
empty
())
{
// only when have metrics then need to construct validation data
// only when have metrics then need to construct validation data
// Add validation data, if it exists
// Add validation data, if it exists
for
(
size_t
i
=
0
;
i
<
config_
.
valid
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
config_
.
valid
.
size
();
++
i
)
{
Log
::
Debug
(
"Loading validation file #%zu..."
,
(
i
+
1
));
// add
// add
auto
new_dataset
=
std
::
unique_ptr
<
Dataset
>
(
auto
new_dataset
=
std
::
unique_ptr
<
Dataset
>
(
dataset_loader
.
LoadFromFileAlignWithOtherDataset
(
dataset_loader
.
LoadFromFileAlignWithOtherDataset
(
...
@@ -194,6 +195,7 @@ void Application::InitTrain() {
...
@@ -194,6 +195,7 @@ void Application::InitTrain() {
for
(
size_t
i
=
0
;
i
<
valid_datas_
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
valid_datas_
.
size
();
++
i
)
{
boosting_
->
AddValidDataset
(
valid_datas_
[
i
].
get
(),
boosting_
->
AddValidDataset
(
valid_datas_
[
i
].
get
(),
Common
::
ConstPtrInVectorWrapper
<
Metric
>
(
valid_metrics_
[
i
]));
Common
::
ConstPtrInVectorWrapper
<
Metric
>
(
valid_metrics_
[
i
]));
Log
::
Debug
(
"Number of data points in validation set #%zu: %zu"
,
i
+
1
,
valid_datas_
[
i
]
->
num_data
());
}
}
Log
::
Info
(
"Finished initializing training"
);
Log
::
Info
(
"Finished initializing training"
);
}
}
...
...
src/io/dataset_loader.cpp
View file @
b3975555
...
@@ -210,6 +210,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, const char* initscore
...
@@ -210,6 +210,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, const char* initscore
ConstructBinMappersFromTextData
(
rank
,
num_machines
,
sample_data
,
parser
.
get
(),
dataset
.
get
());
ConstructBinMappersFromTextData
(
rank
,
num_machines
,
sample_data
,
parser
.
get
(),
dataset
.
get
());
// initialize label
// initialize label
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
weight_idx_
,
group_idx_
);
dataset
->
metadata_
.
Init
(
dataset
->
num_data_
,
weight_idx_
,
group_idx_
);
Log
::
Debug
(
"Making second pass..."
);
// extract features
// extract features
ExtractFeaturesFromFile
(
filename
,
parser
.
get
(),
used_data_indices
,
dataset
.
get
());
ExtractFeaturesFromFile
(
filename
,
parser
.
get
(),
used_data_indices
,
dataset
.
get
());
}
}
...
@@ -758,7 +759,7 @@ void DatasetLoader::CheckDataset(const Dataset* dataset) {
...
@@ -758,7 +759,7 @@ void DatasetLoader::CheckDataset(const Dataset* dataset) {
std
::
vector
<
std
::
string
>
DatasetLoader
::
LoadTextDataToMemory
(
const
char
*
filename
,
const
Metadata
&
metadata
,
std
::
vector
<
std
::
string
>
DatasetLoader
::
LoadTextDataToMemory
(
const
char
*
filename
,
const
Metadata
&
metadata
,
int
rank
,
int
num_machines
,
int
*
num_global_data
,
int
rank
,
int
num_machines
,
int
*
num_global_data
,
std
::
vector
<
data_size_t
>*
used_data_indices
)
{
std
::
vector
<
data_size_t
>*
used_data_indices
)
{
TextReader
<
data_size_t
>
text_reader
(
filename
,
config_
.
header
);
TextReader
<
data_size_t
>
text_reader
(
filename
,
config_
.
header
,
config_
.
file_load_progress_interval_bytes
);
used_data_indices
->
clear
();
used_data_indices
->
clear
();
if
(
num_machines
==
1
||
config_
.
pre_partition
)
{
if
(
num_machines
==
1
||
config_
.
pre_partition
)
{
// read all lines
// read all lines
...
@@ -821,7 +822,7 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen
...
@@ -821,7 +822,7 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen
int
rank
,
int
num_machines
,
int
*
num_global_data
,
int
rank
,
int
num_machines
,
int
*
num_global_data
,
std
::
vector
<
data_size_t
>*
used_data_indices
)
{
std
::
vector
<
data_size_t
>*
used_data_indices
)
{
const
data_size_t
sample_cnt
=
static_cast
<
data_size_t
>
(
config_
.
bin_construct_sample_cnt
);
const
data_size_t
sample_cnt
=
static_cast
<
data_size_t
>
(
config_
.
bin_construct_sample_cnt
);
TextReader
<
data_size_t
>
text_reader
(
filename
,
config_
.
header
);
TextReader
<
data_size_t
>
text_reader
(
filename
,
config_
.
header
,
config_
.
file_load_progress_interval_bytes
);
std
::
vector
<
std
::
string
>
out_data
;
std
::
vector
<
std
::
string
>
out_data
;
if
(
num_machines
==
1
||
config_
.
pre_partition
)
{
if
(
num_machines
==
1
||
config_
.
pre_partition
)
{
*
num_global_data
=
static_cast
<
data_size_t
>
(
text_reader
.
SampleFromFile
(
&
random_
,
sample_cnt
,
&
out_data
));
*
num_global_data
=
static_cast
<
data_size_t
>
(
text_reader
.
SampleFromFile
(
&
random_
,
sample_cnt
,
&
out_data
));
...
@@ -1187,7 +1188,7 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
...
@@ -1187,7 +1188,7 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
}
}
OMP_THROW_EX
();
OMP_THROW_EX
();
};
};
TextReader
<
data_size_t
>
text_reader
(
filename
,
config_
.
header
);
TextReader
<
data_size_t
>
text_reader
(
filename
,
config_
.
header
,
config_
.
file_load_progress_interval_bytes
);
if
(
!
used_data_indices
.
empty
())
{
if
(
!
used_data_indices
.
empty
())
{
// only need part of data
// only need part of data
text_reader
.
ReadPartAndProcessParallel
(
used_data_indices
,
process_fun
);
text_reader
.
ReadPartAndProcessParallel
(
used_data_indices
,
process_fun
);
...
...
src/metric/dcg_calculator.cpp
View file @
b3975555
...
@@ -158,8 +158,13 @@ void DCGCalculator::CheckLabel(const label_t* label, data_size_t num_data) {
...
@@ -158,8 +158,13 @@ void DCGCalculator::CheckLabel(const label_t* label, data_size_t num_data) {
Log
::
Fatal
(
"label should be int type (met %f) for ranking task,
\n
"
Log
::
Fatal
(
"label should be int type (met %f) for ranking task,
\n
"
"for the gain of label, please set the label_gain parameter"
,
label
[
i
]);
"for the gain of label, please set the label_gain parameter"
,
label
[
i
]);
}
}
if
(
static_cast
<
size_t
>
(
label
[
i
])
>=
label_gain_
.
size
()
||
label
[
i
]
<
0
)
{
Log
::
Fatal
(
"label (%d) excel the max range %d"
,
label
[
i
],
label_gain_
.
size
());
if
(
label
[
i
]
<
0
)
{
Log
::
Fatal
(
"Label should be non-negative (met %f) for ranking task"
,
label
[
i
]);
}
if
(
static_cast
<
size_t
>
(
label
[
i
])
>=
label_gain_
.
size
())
{
Log
::
Fatal
(
"Label %zu is not less than the number of label mappings (%zu)"
,
static_cast
<
size_t
>
(
label
[
i
]),
label_gain_
.
size
());
}
}
}
}
}
}
...
...
src/treelearner/serial_tree_learner.cpp
View file @
b3975555
...
@@ -103,7 +103,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
...
@@ -103,7 +103,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
}
}
}
}
}
}
Log
::
Info
(
"Number of data: %d, number of used features: %d"
,
num_data_
,
num_features_
);
Log
::
Info
(
"Number of data
points in the train set
: %d, number of used features: %d"
,
num_data_
,
num_features_
);
if
(
CostEfficientGradientBoosting
::
IsEnable
(
config_
))
{
if
(
CostEfficientGradientBoosting
::
IsEnable
(
config_
))
{
cegb_
.
reset
(
new
CostEfficientGradientBoosting
(
this
));
cegb_
.
reset
(
new
CostEfficientGradientBoosting
(
this
));
cegb_
->
Init
();
cegb_
->
Init
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment