#include #include #include #include #include #include #include #include #include #include namespace LightGBM { const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______\n"; Dataset::Dataset() { data_filename_ = "noname"; num_data_ = 0; } Dataset::Dataset(data_size_t num_data) { data_filename_ = "noname"; num_data_ = num_data; metadata_.Init(num_data_, -1, -1); } Dataset::~Dataset() { } void Dataset::FinishLoad() { #pragma omp parallel for schedule(guided) for (int i = 0; i < num_features_; ++i) { features_[i]->FinishLoad(); } } void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) { features_.clear(); num_features_ = dataset->num_features_; bool is_enable_sparse = false; for (int i = 0; i < num_features_; ++i) { if (dataset->features_[i]->is_sparse()) { is_enable_sparse = true; break; } } // copy feature bin mapper data for(int i = 0;i < num_features_;++i){ features_.emplace_back(new Feature(dataset->features_[i]->feature_index(), new BinMapper(*(dataset->features_[i]->bin_mapper())), num_data_, is_enable_sparse)); } features_.shrink_to_fit(); used_feature_map_ = dataset->used_feature_map_; num_total_features_ = dataset->num_total_features_; feature_names_ = dataset->feature_names_; label_idx_ = dataset->label_idx_; } void Dataset::ReSize(data_size_t num_data) { if (num_data_ != num_data) { num_data_ = num_data; #pragma omp parallel for schedule(guided) for (int fidx = 0; fidx < num_features_; ++fidx) { features_[fidx]->ReSize(num_data_); } } } void Dataset::CopySubset(const Dataset* fullset, const data_size_t* used_indices, data_size_t num_used_indices, bool need_meta_data) { CHECK(num_used_indices == num_data_); #pragma omp parallel for schedule(guided) for (int fidx = 0; fidx < num_features_; ++fidx) { features_[fidx]->CopySubset(fullset->features_[fidx].get(), used_indices, num_used_indices); } if (need_meta_data) { metadata_.Init(metadata_, used_indices, num_used_indices); } } bool Dataset::SetFloatField(const char* field_name, const float* field_data, data_size_t num_element) { std::string name(field_name); name = Common::Trim(name); if (name == std::string("label") || name == std::string("target")) { metadata_.SetLabel(field_data, num_element); } else if (name == std::string("weight") || name == std::string("weights")) { metadata_.SetWeights(field_data, num_element); } else { return false; } return true; } bool Dataset::SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element) { std::string name(field_name); name = Common::Trim(name); if (name == std::string("init_score")) { metadata_.SetInitScore(field_data, num_element); } else { return false; } return true; } bool Dataset::SetIntField(const char* field_name, const int* field_data, data_size_t num_element) { std::string name(field_name); name = Common::Trim(name); if (name == std::string("query") || name == std::string("group")) { metadata_.SetQuery(field_data, num_element); } else if (name == std::string("query_id") || name == std::string("group_id")) { metadata_.SetQueryId(field_data, num_element); } else { return false; } return true; } bool Dataset::GetFloatField(const char* field_name, data_size_t* out_len, const float** out_ptr) { std::string name(field_name); name = Common::Trim(name); if (name == std::string("label") || name == std::string("target")) { *out_ptr = metadata_.label(); *out_len = num_data_; } else if (name == std::string("weight") || name == std::string("weights")) { *out_ptr = metadata_.weights(); *out_len = num_data_; } else { return false; } return true; } bool Dataset::GetDoubleField(const char* field_name, data_size_t* out_len, const double** out_ptr) { std::string name(field_name); name = Common::Trim(name); if (name == std::string("init_score")) { *out_ptr = metadata_.init_score(); *out_len = static_cast(metadata_.num_init_score()); } else { return false; } return true; } bool Dataset::GetIntField(const char* field_name, data_size_t* out_len, const int** out_ptr) { std::string name(field_name); name = Common::Trim(name); if (name == std::string("query") || name == std::string("group")) { *out_ptr = metadata_.query_boundaries(); *out_len = metadata_.num_queries() + 1; } else { return false; } return true; } void Dataset::SaveBinaryFile(const char* bin_filename) { if (bin_filename != nullptr && std::string(bin_filename) == std::string(data_filename_)) { Log::Warning("Bianry file %s already existed", bin_filename); return; } // if not pass a filename, just append ".bin" of original file std::string bin_filename_str(data_filename_); if (bin_filename == nullptr || bin_filename[0] == '\0') { bin_filename_str.append(".bin"); bin_filename = bin_filename_str.c_str(); } bool is_file_existed = false; FILE* file; #ifdef _MSC_VER fopen_s(&file, bin_filename, "rb"); #else file = fopen(bin_filename, "rb"); #endif if (file != NULL) { is_file_existed = true; Log::Warning("File %s existed, cannot save binary to it", bin_filename); fclose(file); } if (!is_file_existed) { #ifdef _MSC_VER fopen_s(&file, bin_filename, "wb"); #else file = fopen(bin_filename, "wb"); #endif if (file == NULL) { Log::Fatal("Cannot write binary data to %s ", bin_filename); } Log::Info("Saving data to binary file %s", bin_filename); size_t size_of_token = std::strlen(binary_file_token); fwrite(binary_file_token, sizeof(char), size_of_token, file); // get size of header size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_) + sizeof(size_t) + sizeof(int) * used_feature_map_.size(); // size of feature names for (int i = 0; i < num_total_features_; ++i) { size_of_header += feature_names_[i].size() + sizeof(int); } fwrite(&size_of_header, sizeof(size_of_header), 1, file); // write header fwrite(&num_data_, sizeof(num_data_), 1, file); fwrite(&num_features_, sizeof(num_features_), 1, file); fwrite(&num_total_features_, sizeof(num_features_), 1, file); size_t num_used_feature_map = used_feature_map_.size(); fwrite(&num_used_feature_map, sizeof(num_used_feature_map), 1, file); fwrite(used_feature_map_.data(), sizeof(int), num_used_feature_map, file); // write feature names for (int i = 0; i < num_total_features_; ++i) { int str_len = static_cast(feature_names_[i].size()); fwrite(&str_len, sizeof(int), 1, file); const char* c_str = feature_names_[i].c_str(); fwrite(c_str, sizeof(char), str_len, file); } // get size of meta data size_t size_of_metadata = metadata_.SizesInByte(); fwrite(&size_of_metadata, sizeof(size_of_metadata), 1, file); // write meta data metadata_.SaveBinaryToFile(file); // write feature data for (int i = 0; i < num_features_; ++i) { // get size of feature size_t size_of_feature = features_[i]->SizesInByte(); fwrite(&size_of_feature, sizeof(size_of_feature), 1, file); // write feature features_[i]->SaveBinaryToFile(file); } fclose(file); } } } // namespace LightGBM