Commit b3975555 authored by ashok-ponnuswami-msft, committed by Guolin Ke
Browse files

Add more debug logging to show data load progress. (#2587)

parent 483a9bba
......@@ -856,6 +856,8 @@ struct Config {
#pragma endregion
size_t file_load_progress_interval_bytes = size_t(10) * 1024 * 1024 * 1024;
bool is_parallel = false;
bool is_parallel_find_bin = false;
LIGHTGBM_EXPORT void Set(const std::unordered_map<std::string, std::string>& params);
......
......@@ -28,8 +28,8 @@ class TextReader {
* \param filename Filename of data
* \param is_skip_first_line True if need to skip header
*/
TextReader(const char* filename, bool is_skip_first_line):
filename_(filename), is_skip_first_line_(is_skip_first_line) {
TextReader(const char* filename, bool is_skip_first_line, size_t progress_interval_bytes = SIZE_MAX):
filename_(filename), is_skip_first_line_(is_skip_first_line), read_progress_interval_bytes_(progress_interval_bytes) {
if (is_skip_first_line_) {
auto reader = VirtualFileReader::Make(filename);
if (!reader->Init()) {
......@@ -86,6 +86,7 @@ class TextReader {
INDEX_T ReadAllAndProcess(const std::function<void(INDEX_T, const char*, size_t)>& process_fun) {
last_line_ = "";
INDEX_T total_cnt = 0;
size_t bytes_read = 0;
PipelineReader::Read(filename_, skip_bytes_,
[&]
(const char* buffer_process, size_t read_cnt) {
......@@ -119,6 +120,15 @@ class TextReader {
if (last_i != read_cnt) {
last_line_.append(buffer_process + last_i, read_cnt - last_i);
}
size_t prev_bytes_read = bytes_read;
bytes_read += read_cnt;
if (prev_bytes_read / read_progress_interval_bytes_ < bytes_read / read_progress_interval_bytes_)
{
const size_t gbs = size_t(1024) * 1024 * 1024;
Log::Debug("Read %.1f GBs from %s.", 1.0 * bytes_read / gbs, filename_);
}
return cnt;
});
// if last line of file doesn't contain end of line
......@@ -227,6 +237,7 @@ class TextReader {
INDEX_T ReadAllAndProcessParallelWithFilter(const std::function<void(INDEX_T, const std::vector<std::string>&)>& process_fun, const std::function<bool(INDEX_T, INDEX_T)>& filter_fun) {
last_line_ = "";
INDEX_T total_cnt = 0;
size_t bytes_read = 0;
INDEX_T used_cnt = 0;
PipelineReader::Read(filename_, skip_bytes_,
[&]
......@@ -270,6 +281,15 @@ class TextReader {
if (last_i != read_cnt) {
last_line_.append(buffer_process + last_i, read_cnt - last_i);
}
size_t prev_bytes_read = bytes_read;
bytes_read += read_cnt;
if (prev_bytes_read / read_progress_interval_bytes_ < bytes_read / read_progress_interval_bytes_)
{
const size_t gbs = size_t(1024) * 1024 * 1024;
Log::Debug("Read %.1f GBs from %s.", 1.0 * bytes_read / gbs, filename_);
}
return cnt;
});
// if last line of file doesn't contain end of line
......@@ -313,6 +333,7 @@ class TextReader {
std::string first_line_ = "";
/*! \brief is skip first line */
bool is_skip_first_line_ = false;
size_t read_progress_interval_bytes_;
/*! \brief is skip first line */
int skip_bytes_ = 0;
};
......
......@@ -96,6 +96,7 @@ void Application::LoadData() {
config_.data_random_seed = Network::GlobalSyncUpByMin(config_.data_random_seed);
}
Log::Debug("Loading train file...");
DatasetLoader dataset_loader(config_, predict_fun,
config_.num_class, config_.data.c_str());
// load Training data
......@@ -124,12 +125,12 @@ void Application::LoadData() {
}
train_metric_.shrink_to_fit();
if (!config_.metric.empty()) {
// only when have metrics then need to construct validation data
// Add validation data, if it exists
for (size_t i = 0; i < config_.valid.size(); ++i) {
Log::Debug("Loading validation file #%zu...", (i + 1));
// add
auto new_dataset = std::unique_ptr<Dataset>(
dataset_loader.LoadFromFileAlignWithOtherDataset(
......@@ -194,6 +195,7 @@ void Application::InitTrain() {
for (size_t i = 0; i < valid_datas_.size(); ++i) {
boosting_->AddValidDataset(valid_datas_[i].get(),
Common::ConstPtrInVectorWrapper<Metric>(valid_metrics_[i]));
Log::Debug("Number of data points in validation set #%zu: %zu", i + 1, valid_datas_[i]->num_data());
}
Log::Info("Finished initializing training");
}
......
......@@ -210,6 +210,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, const char* initscore
ConstructBinMappersFromTextData(rank, num_machines, sample_data, parser.get(), dataset.get());
// initialize label
dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_);
Log::Debug("Making second pass...");
// extract features
ExtractFeaturesFromFile(filename, parser.get(), used_data_indices, dataset.get());
}
......@@ -758,7 +759,7 @@ void DatasetLoader::CheckDataset(const Dataset* dataset) {
std::vector<std::string> DatasetLoader::LoadTextDataToMemory(const char* filename, const Metadata& metadata,
int rank, int num_machines, int* num_global_data,
std::vector<data_size_t>* used_data_indices) {
TextReader<data_size_t> text_reader(filename, config_.header);
TextReader<data_size_t> text_reader(filename, config_.header, config_.file_load_progress_interval_bytes);
used_data_indices->clear();
if (num_machines == 1 || config_.pre_partition) {
// read all lines
......@@ -821,7 +822,7 @@ std::vector<std::string> DatasetLoader::SampleTextDataFromFile(const char* filen
int rank, int num_machines, int* num_global_data,
std::vector<data_size_t>* used_data_indices) {
const data_size_t sample_cnt = static_cast<data_size_t>(config_.bin_construct_sample_cnt);
TextReader<data_size_t> text_reader(filename, config_.header);
TextReader<data_size_t> text_reader(filename, config_.header, config_.file_load_progress_interval_bytes);
std::vector<std::string> out_data;
if (num_machines == 1 || config_.pre_partition) {
*num_global_data = static_cast<data_size_t>(text_reader.SampleFromFile(&random_, sample_cnt, &out_data));
......@@ -1187,7 +1188,7 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
}
OMP_THROW_EX();
};
TextReader<data_size_t> text_reader(filename, config_.header);
TextReader<data_size_t> text_reader(filename, config_.header, config_.file_load_progress_interval_bytes);
if (!used_data_indices.empty()) {
// only need part of data
text_reader.ReadPartAndProcessParallel(used_data_indices, process_fun);
......
......@@ -158,8 +158,13 @@ void DCGCalculator::CheckLabel(const label_t* label, data_size_t num_data) {
Log::Fatal("label should be int type (met %f) for ranking task,\n"
"for the gain of label, please set the label_gain parameter", label[i]);
}
if (static_cast<size_t>(label[i]) >= label_gain_.size() || label[i] < 0) {
Log::Fatal("label (%d) excel the max range %d", label[i], label_gain_.size());
if (label[i] < 0) {
Log::Fatal("Label should be non-negative (met %f) for ranking task", label[i]);
}
if (static_cast<size_t>(label[i]) >= label_gain_.size()) {
Log::Fatal("Label %zu is not less than the number of label mappings (%zu)", static_cast<size_t>(label[i]), label_gain_.size());
}
}
}
......
......@@ -103,7 +103,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
}
}
}
Log::Info("Number of data: %d, number of used features: %d", num_data_, num_features_);
Log::Info("Number of data points in the train set: %d, number of used features: %d", num_data_, num_features_);
if (CostEfficientGradientBoosting::IsEnable(config_)) {
cegb_.reset(new CostEfficientGradientBoosting(this));
cegb_->Init();
......
Markdown is supported
0% — Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment