"tests/git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "17ecfab3353a15384e19bfde85ebd23be1acc00d"
Unverified Commit 2c958dd4 authored by Chen Yufei's avatar Chen Yufei Committed by GitHub
Browse files

Check max_bin, etc. match config when using binary (#3592)

* Check max_bin, etc. match config when using binary.

* Check max_bin_by_feature, bin_construct_sample_cnt matching config.
parent d83b973b
......@@ -44,7 +44,7 @@ class DatasetLoader {
void SetHeader(const char* filename);
void CheckDataset(const Dataset* dataset);
void CheckDataset(const Dataset* dataset, bool is_load_from_binary);
std::vector<std::string> LoadTextDataToMemory(const char* filename, const Metadata& metadata, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
......
......@@ -186,6 +186,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
data_size_t num_global_data = 0;
std::vector<data_size_t> used_data_indices;
auto bin_filename = CheckCanLoadFromBin(filename);
bool is_load_from_binary = false;
if (bin_filename.size() == 0) {
auto parser = std::unique_ptr<Parser>(Parser::CreateParser(filename, config_.header, 0, label_idx_));
if (parser == nullptr) {
......@@ -229,12 +230,15 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
}
} else {
// load data from binary file
is_load_from_binary = true;
Log::Info("Load from binary file %s", bin_filename.c_str());
dataset.reset(LoadFromBinFile(filename, bin_filename.c_str(), rank, num_machines, &num_global_data, &used_data_indices));
}
// check meta data
dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices);
// need to check training data
CheckDataset(dataset.get());
CheckDataset(dataset.get(), is_load_from_binary);
return dataset.release();
}
......@@ -707,7 +711,7 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values,
// ---- private functions ----
void DatasetLoader::CheckDataset(const Dataset* dataset) {
void DatasetLoader::CheckDataset(const Dataset* dataset, bool is_load_from_binary) {
if (dataset->num_data_ <= 0) {
Log::Fatal("Data file %s is empty", dataset->data_filename_.c_str());
}
......@@ -736,6 +740,38 @@ void DatasetLoader::CheckDataset(const Dataset* dataset) {
if (!is_feature_order_by_group) {
Log::Fatal("Features in dataset should be ordered by group");
}
if (is_load_from_binary) {
if (dataset->max_bin_ != config_.max_bin) {
Log::Fatal("Dataset max_bin %d != config %d", dataset->max_bin_, config_.max_bin);
}
if (dataset->min_data_in_bin_ != config_.min_data_in_bin) {
Log::Fatal("Dataset min_data_in_bin %d != config %d", dataset->min_data_in_bin_, config_.min_data_in_bin);
}
if (dataset->use_missing_ != config_.use_missing) {
Log::Fatal("Dataset use_missing %d != config %d", dataset->use_missing_, config_.use_missing);
}
if (dataset->zero_as_missing_ != config_.zero_as_missing) {
Log::Fatal("Dataset zero_as_missing %d != config %d", dataset->zero_as_missing_, config_.zero_as_missing);
}
if (dataset->bin_construct_sample_cnt_ != config_.bin_construct_sample_cnt) {
Log::Fatal("Dataset bin_construct_sample_cnt %d != config %d", dataset->bin_construct_sample_cnt_, config_.bin_construct_sample_cnt);
}
if ((dataset->max_bin_by_feature_.size() != config_.max_bin_by_feature.size()) ||
!std::equal(dataset->max_bin_by_feature_.begin(), dataset->max_bin_by_feature_.end(),
config_.max_bin_by_feature.begin())) {
Log::Fatal("Dataset max_bin_by_feature does not match with config");
}
int label_idx = -1;
if (Common::AtoiAndCheck(config_.label_column.c_str(), &label_idx)) {
if (dataset->label_idx_ != label_idx) {
Log::Fatal("Dataset label_idx %d != config %d", dataset->zero_as_missing_, config_.zero_as_missing);
}
} else {
Log::Info("Recommend use integer for label index when loading data from binary for sanity check.");
}
}
}
std::vector<std::string> DatasetLoader::LoadTextDataToMemory(const char* filename, const Metadata& metadata,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment