Commit 522e9993 authored by Guolin Ke's avatar Guolin Ke
Browse files

support identifying binary files from file content

parent 0ae51f14
...@@ -62,15 +62,6 @@ DllExport int LGBM_CreateDatasetFromFile(const char* filename, ...@@ -62,15 +62,6 @@ DllExport int LGBM_CreateDatasetFromFile(const char* filename,
const DatesetHandle* reference, const DatesetHandle* reference,
DatesetHandle* out); DatesetHandle* out);
/*!
* \brief load a dataset from a binary file, as the command-line LightGBM does
* \param filename the name of the file
* \param out a loaded dataset
* \return 0 when succeed, -1 when failure happens
*/
DllExport int LGBM_CreateDatasetFromBinaryFile(const char* filename,
DatesetHandle* out);
/*! /*!
* \brief create a dataset from CSR format * \brief create a dataset from CSR format
* \param indptr pointer to row headers * \param indptr pointer to row headers
......
...@@ -402,6 +402,8 @@ private: ...@@ -402,6 +402,8 @@ private:
int label_idx_ = 0; int label_idx_ = 0;
/*! \brief store feature names */ /*! \brief store feature names */
std::vector<std::string> feature_names_; std::vector<std::string> feature_names_;
/*! \brief token written at the start of a binary dataset file, used to identify it */
static const char* binary_file_token;
}; };
} // namespace LightGBM } // namespace LightGBM
......
...@@ -49,7 +49,7 @@ private: ...@@ -49,7 +49,7 @@ private:
void ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset); void ExtractFeaturesFromFile(const char* filename, const Parser* parser, const std::vector<data_size_t>& used_data_indices, Dataset* dataset);
/*! \brief Check can load from binary file */ /*! \brief Check can load from binary file */
bool CheckCanLoadFromBin(const char* filename); std::string CheckCanLoadFromBin(const char* filename);
const IOConfig& io_config_; const IOConfig& io_config_;
/*! \brief Random generator*/ /*! \brief Random generator*/
......
...@@ -223,15 +223,6 @@ DllExport int LGBM_CreateDatasetFromFile(const char* filename, ...@@ -223,15 +223,6 @@ DllExport int LGBM_CreateDatasetFromFile(const char* filename,
API_END(); API_END();
} }
/*!
 * \brief Create a dataset by loading it from a binary file.
 * \param filename path passed unchanged to DatasetLoader::LoadFromBinFile
 * \param out receives the pointer returned by LoadFromBinFile
 * \return status code produced by the API_BEGIN/API_END wrapper; the header
 *         documents 0 on success and -1 on failure (the macros are not visible
 *         here - confirm)
 */
DllExport int LGBM_CreateDatasetFromBinaryFile(const char* filename,
DatesetHandle* out) {
API_BEGIN();
// Default-constructed configuration; only its io_config member is given to the loader.
OverallConfig config;
// NOTE(review): the second constructor argument is nullptr; its meaning is not visible here.
DatasetLoader loader(config.io_config, nullptr);
// LoadFromBinFile takes (bin_filename, rank, num_machines): load as rank 0 of 1 machine.
*out = loader.LoadFromBinFile(filename, 0, 1);
API_END();
}
DllExport int LGBM_CreateDatasetFromMat(const void* data, DllExport int LGBM_CreateDatasetFromMat(const void* data,
int data_type, int data_type,
int32_t nrow, int32_t nrow,
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
namespace LightGBM { namespace LightGBM {
const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______\n";
Dataset::Dataset() { Dataset::Dataset() {
num_class_ = 1; num_class_ = 1;
...@@ -135,7 +136,8 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { ...@@ -135,7 +136,8 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
Log::Fatal("Cannot write binary data to %s ", bin_filename); Log::Fatal("Cannot write binary data to %s ", bin_filename);
} }
Log::Info("Saving data to binary file %s", bin_filename); Log::Info("Saving data to binary file %s", bin_filename);
size_t size_of_token = std::strlen(binary_file_token);
fwrite(binary_file_token, sizeof(char), size_of_token, file);
// get size of header // get size of header
size_t size_of_header = sizeof(num_data_) + sizeof(num_class_) + sizeof(num_features_) + sizeof(num_total_features_) size_t size_of_header = sizeof(num_data_) + sizeof(num_class_) + sizeof(num_features_) + sizeof(num_total_features_)
+ sizeof(size_t) + sizeof(int) * used_feature_map_.size(); + sizeof(size_t) + sizeof(int) * used_feature_map_.size();
......
...@@ -152,8 +152,8 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac ...@@ -152,8 +152,8 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
dataset->data_filename_ = filename; dataset->data_filename_ = filename;
dataset->num_class_ = io_config_.num_class; dataset->num_class_ = io_config_.num_class;
dataset->metadata_.Init(filename, dataset->num_class_); dataset->metadata_.Init(filename, dataset->num_class_);
bool is_loading_from_binfile = CheckCanLoadFromBin(filename); auto bin_filename = CheckCanLoadFromBin(filename);
if (!is_loading_from_binfile) { if (bin_filename.size() == 0) {
if (!io_config_.use_two_round_loading) { if (!io_config_.use_two_round_loading) {
// read data to memory // read data to memory
auto text_data = LoadTextDataToMemory(filename, dataset->metadata_, rank, num_machines,&num_global_data, &used_data_indices); auto text_data = LoadTextDataToMemory(filename, dataset->metadata_, rank, num_machines,&num_global_data, &used_data_indices);
...@@ -185,8 +185,6 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac ...@@ -185,8 +185,6 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
} }
} else { } else {
// load data from binary file // load data from binary file
std::string bin_filename(filename);
bin_filename.append(".bin");
dataset.reset(LoadFromBinFile(bin_filename.c_str(), rank, num_machines)); dataset.reset(LoadFromBinFile(bin_filename.c_str(), rank, num_machines));
} }
// check meta data // check meta data
...@@ -209,8 +207,8 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, ...@@ -209,8 +207,8 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
dataset->data_filename_ = filename; dataset->data_filename_ = filename;
dataset->num_class_ = io_config_.num_class; dataset->num_class_ = io_config_.num_class;
dataset->metadata_.Init(filename, dataset->num_class_); dataset->metadata_.Init(filename, dataset->num_class_);
bool is_loading_from_binfile = CheckCanLoadFromBin(filename); auto bin_filename = CheckCanLoadFromBin(filename);
if (!is_loading_from_binfile) { if (bin_filename.size() == 0) {
if (!io_config_.use_two_round_loading) { if (!io_config_.use_two_round_loading) {
// read data in memory // read data in memory
auto text_data = LoadTextDataToMemory(filename, dataset->metadata_, 0, 1, &num_global_data, &used_data_indices); auto text_data = LoadTextDataToMemory(filename, dataset->metadata_, 0, 1, &num_global_data, &used_data_indices);
...@@ -234,8 +232,6 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, ...@@ -234,8 +232,6 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
} }
} else { } else {
// load data from binary file // load data from binary file
std::string bin_filename(filename);
bin_filename.append(".bin");
dataset.reset(LoadFromBinFile(bin_filename.c_str(), 0, 1)); dataset.reset(LoadFromBinFile(bin_filename.c_str(), 0, 1));
} }
// not need to check validation data // not need to check validation data
...@@ -261,8 +257,18 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int ...@@ -261,8 +257,18 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
size_t buffer_size = 16 * 1024 * 1024; size_t buffer_size = 16 * 1024 * 1024;
auto buffer = std::vector<char>(buffer_size); auto buffer = std::vector<char>(buffer_size);
// check token
size_t size_of_token = std::strlen(Dataset::binary_file_token);
size_t read_cnt = fread(buffer.data(), sizeof(char), size_of_token, file);
if (read_cnt != size_of_token) {
Log::Fatal("Binary file error: token has the wrong size");
}
if (std::string(buffer.data()) != std::string(Dataset::binary_file_token)) {
Log::Fatal("input file is not LightGBM binary file");
}
// read size of header // read size of header
size_t read_cnt = fread(buffer.data(), sizeof(size_t), 1, file); read_cnt = fread(buffer.data(), sizeof(size_t), 1, file);
if (read_cnt != 1) { if (read_cnt != 1) {
Log::Fatal("Binary file error: header has the wrong size"); Log::Fatal("Binary file error: header has the wrong size");
...@@ -849,7 +855,7 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser* ...@@ -849,7 +855,7 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser*
} }
/*! \brief Check can load from binary file */ /*! \brief Check can load from binary file */
bool DatasetLoader::CheckCanLoadFromBin(const char* filename) { std::string DatasetLoader::CheckCanLoadFromBin(const char* filename) {
std::string bin_filename(filename); std::string bin_filename(filename);
bin_filename.append(".bin"); bin_filename.append(".bin");
...@@ -860,12 +866,32 @@ bool DatasetLoader::CheckCanLoadFromBin(const char* filename) { ...@@ -860,12 +866,32 @@ bool DatasetLoader::CheckCanLoadFromBin(const char* filename) {
#else #else
file = fopen(bin_filename.c_str(), "rb"); file = fopen(bin_filename.c_str(), "rb");
#endif #endif
if (file == NULL) { if (file == NULL) {
return false; bin_filename = std::string(filename);
} else { #ifdef _MSC_VER
fopen_s(&file, bin_filename.c_str(), "rb");
#else
file = fopen(bin_filename.c_str(), "rb");
#endif
if (file == NULL) {
Log::Fatal("cannot open data file %s", bin_filename.c_str());
}
}
size_t buffer_size = 256;
auto buffer = std::vector<char>(buffer_size);
// read as many bytes as the token is long from the head of the file
size_t size_of_token = std::strlen(Dataset::binary_file_token);
size_t read_cnt = fread(buffer.data(), sizeof(char), size_of_token, file);
fclose(file); fclose(file);
return true; if (read_cnt == size_of_token
&& std::string(buffer.data()) == std::string(Dataset::binary_file_token)) {
return bin_filename;
} else {
return std::string();
} }
} }
} }
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment