Unverified Commit 2c958dd4 authored by Chen Yufei, committed by GitHub
Browse files

Check max_bin, etc. match config when using binary (#3592)

* Check max_bin, etc. match config when using binary.

* Check max_bin_by_feature, bin_construct_sample_cnt matching config.
parent d83b973b
...@@ -44,7 +44,7 @@ class DatasetLoader { ...@@ -44,7 +44,7 @@ class DatasetLoader {
void SetHeader(const char* filename); void SetHeader(const char* filename);
void CheckDataset(const Dataset* dataset); void CheckDataset(const Dataset* dataset, bool is_load_from_binary);
std::vector<std::string> LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices); std::vector<std::string> LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
......
...@@ -186,6 +186,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac ...@@ -186,6 +186,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
data_size_t num_global_data = 0; data_size_t num_global_data = 0;
std::vector<data_size_t> used_data_indices; std::vector<data_size_t> used_data_indices;
auto bin_filename = CheckCanLoadFromBin(filename); auto bin_filename = CheckCanLoadFromBin(filename);
bool is_load_from_binary = false;
if (bin_filename.size() == 0) { if (bin_filename.size() == 0) {
auto parser = std::unique_ptr<Parser>(Parser::CreateParser(filename, config_.header, 0, label_idx_)); auto parser = std::unique_ptr<Parser>(Parser::CreateParser(filename, config_.header, 0, label_idx_));
if (parser == nullptr) { if (parser == nullptr) {
...@@ -229,12 +230,15 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac ...@@ -229,12 +230,15 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
} }
} else { } else {
// load data from binary file // load data from binary file
is_load_from_binary = true;
Log::Info("Load from binary file %s", bin_filename.c_str());
dataset.reset(LoadFromBinFile(filename, bin_filename.c_str(), rank, num_machines, &num_global_data, &used_data_indices)); dataset.reset(LoadFromBinFile(filename, bin_filename.c_str(), rank, num_machines, &num_global_data, &used_data_indices));
} }
// check meta data // check meta data
dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices); dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices);
// need to check training data // need to check training data
CheckDataset(dataset.get()); CheckDataset(dataset.get(), is_load_from_binary);
return dataset.release(); return dataset.release();
} }
...@@ -707,7 +711,7 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values, ...@@ -707,7 +711,7 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values,
// ---- private functions ---- // ---- private functions ----
void DatasetLoader::CheckDataset(const Dataset* dataset) { void DatasetLoader::CheckDataset(const Dataset* dataset, bool is_load_from_binary) {
if (dataset->num_data_ <= 0) { if (dataset->num_data_ <= 0) {
Log::Fatal("Data file %s is empty", dataset->data_filename_.c_str()); Log::Fatal("Data file %s is empty", dataset->data_filename_.c_str());
} }
...@@ -736,6 +740,38 @@ void DatasetLoader::CheckDataset(const Dataset* dataset) { ...@@ -736,6 +740,38 @@ void DatasetLoader::CheckDataset(const Dataset* dataset) {
if (!is_feature_order_by_group) { if (!is_feature_order_by_group) {
Log::Fatal("Features in dataset should be ordered by group"); Log::Fatal("Features in dataset should be ordered by group");
} }
// A dataset restored from a binary file carries the binning-related settings it
// was built with. Compare each of them against the current config and report any
// mismatch through Log::Fatal, rather than continuing with inconsistent settings.
if (is_load_from_binary) {
  if (dataset->max_bin_ != config_.max_bin) {
    Log::Fatal("Dataset max_bin %d != config %d", dataset->max_bin_, config_.max_bin);
  }
  if (dataset->min_data_in_bin_ != config_.min_data_in_bin) {
    Log::Fatal("Dataset min_data_in_bin %d != config %d", dataset->min_data_in_bin_, config_.min_data_in_bin);
  }
  if (dataset->use_missing_ != config_.use_missing) {
    Log::Fatal("Dataset use_missing %d != config %d", dataset->use_missing_, config_.use_missing);
  }
  if (dataset->zero_as_missing_ != config_.zero_as_missing) {
    Log::Fatal("Dataset zero_as_missing %d != config %d", dataset->zero_as_missing_, config_.zero_as_missing);
  }
  if (dataset->bin_construct_sample_cnt_ != config_.bin_construct_sample_cnt) {
    Log::Fatal("Dataset bin_construct_sample_cnt %d != config %d", dataset->bin_construct_sample_cnt_, config_.bin_construct_sample_cnt);
  }
  // Sizes are compared first so std::equal never reads past the end of the
  // config vector.
  if ((dataset->max_bin_by_feature_.size() != config_.max_bin_by_feature.size()) ||
      !std::equal(dataset->max_bin_by_feature_.begin(), dataset->max_bin_by_feature_.end(),
                  config_.max_bin_by_feature.begin())) {
    Log::Fatal("Dataset max_bin_by_feature does not match with config");
  }
  // The label column can only be cross-checked when config_.label_column is
  // accepted by Common::AtoiAndCheck (presumably: when it parses as an integer
  // index -- confirm against Common).
  int label_idx = -1;
  if (Common::AtoiAndCheck(config_.label_column.c_str(), &label_idx)) {
    if (dataset->label_idx_ != label_idx) {
      // Report the label indices that were actually compared.
      Log::Fatal("Dataset label_idx %d != config %d", dataset->label_idx_, label_idx);
    }
  } else {
    Log::Info("Recommend use integer for label index when loading data from binary for sanity check.");
  }
}
} }
std::vector<std::string> DatasetLoader::LoadTextDataToMemory(const char* filename, const Metadata& metadata, std::vector<std::string> DatasetLoader::LoadTextDataToMemory(const char* filename, const Metadata& metadata,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment