Commit 33344088 authored by Guolin Ke
Browse files

clean code

parent 56f6f30f
...@@ -246,6 +246,8 @@ public: ...@@ -246,6 +246,8 @@ public:
Dataset(); Dataset();
explicit Dataset(data_size_t num_data);
/*! \brief Destructor */ /*! \brief Destructor */
~Dataset(); ~Dataset();
...@@ -267,7 +269,7 @@ public: ...@@ -267,7 +269,7 @@ public:
} }
} }
inline void PushOneCol(int tid, data_size_t col_idx, const std::vector<std::pair<int, double>>& feature_values) { inline void PushOneColumn(int tid, data_size_t col_idx, const std::vector<std::pair<int, double>>& feature_values) {
int feature_idx = used_feature_map_[col_idx]; int feature_idx = used_feature_map_[col_idx];
if (feature_idx >= 0) { if (feature_idx >= 0) {
for (auto& inner_data : feature_values) { for (auto& inner_data : feature_values) {
...@@ -276,10 +278,6 @@ public: ...@@ -276,10 +278,6 @@ public:
} }
} }
/*!
* \brief Overwrite the stored data count
* \param num_data New value assigned to num_data_
*/
inline void SetNumData(data_size_t num_data) {
num_data_ = num_data;
}
void FinishLoad(); void FinishLoad();
void SetField(const char* field_name, const void* field_data, data_size_t num_element, int type); void SetField(const char* field_name, const void* field_data, data_size_t num_element, int type);
...@@ -293,7 +291,7 @@ public: ...@@ -293,7 +291,7 @@ public:
std::vector<const BinMapper*> GetBinMappers() const; std::vector<const BinMapper*> GetBinMappers() const;
void CopyFeatureMetadataTo(Dataset *dataset, bool is_enable_sparse) const; void CopyFeatureBinMapperTo(Dataset *dataset, bool is_enable_sparse) const;
/*! /*!
* \brief Get a feature pointer for specific index * \brief Get a feature pointer for specific index
......
...@@ -12,7 +12,7 @@ public: ...@@ -12,7 +12,7 @@ public:
~DatasetLoader(); ~DatasetLoader();
void SetHeadder(const char* filename); void SetHeader(const char* filename);
Dataset* LoadFromFile(const char* filename, int rank, int num_machines); Dataset* LoadFromFile(const char* filename, int rank, int num_machines);
...@@ -20,7 +20,7 @@ public: ...@@ -20,7 +20,7 @@ public:
return LoadFromFile(filename, 0, 1); return LoadFromFile(filename, 0, 1);
} }
Dataset* LoadFromFileLikeOthers(const char* filename, const Dataset* other); Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);
Dataset* LoadFromBinFile(const char* bin_filename, int rank, int num_machines); Dataset* LoadFromBinFile(const char* bin_filename, int rank, int num_machines);
...@@ -51,7 +51,6 @@ private: ...@@ -51,7 +51,6 @@ private:
/*! \brief Check can load from binary file */ /*! \brief Check can load from binary file */
bool CheckCanLoadFromBin(const char* filename); bool CheckCanLoadFromBin(const char* filename);
const IOConfig& io_config_; const IOConfig& io_config_;
/*! \brief Random generator*/ /*! \brief Random generator*/
Random random_; Random random_;
......
...@@ -382,7 +382,7 @@ inline void SortForPair(std::vector<T1>& keys, std::vector<T2>& values, size_t s ...@@ -382,7 +382,7 @@ inline void SortForPair(std::vector<T1>& keys, std::vector<T2>& values, size_t s
} }
inline std::function<std::vector<double>(int row_idx)> inline std::function<std::vector<double>(int row_idx)>
GetRowFunctionFromMat(const void* data, int num_row, int num_col, int float_type, int is_row_major) { RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int float_type, int is_row_major) {
if (float_type == 0) { if (float_type == 0) {
const float* dptr = reinterpret_cast<const float*>(data); const float* dptr = reinterpret_cast<const float*>(data);
if (is_row_major) { if (is_row_major) {
...@@ -432,7 +432,7 @@ GetRowFunctionFromMat(const void* data, int num_row, int num_col, int float_type ...@@ -432,7 +432,7 @@ GetRowFunctionFromMat(const void* data, int num_row, int num_col, int float_type
inline std::function<std::vector<std::pair<int, double>>(int idx)> inline std::function<std::vector<std::pair<int, double>>(int idx)>
GetRowFunctionFromCSR(const int32_t* indptr, const int32_t* indices, const void* data, int float_type, uint64_t nindptr, uint64_t nelem) { RowFunctionFromCSR(const int32_t* indptr, const int32_t* indices, const void* data, int float_type, uint64_t nindptr, uint64_t nelem) {
if (float_type == 0) { if (float_type == 0) {
const float* dptr = reinterpret_cast<const float*>(data); const float* dptr = reinterpret_cast<const float*>(data);
return [&indptr, &indices, &dptr, &nindptr, &nelem](int idx) { return [&indptr, &indices, &dptr, &nindptr, &nelem](int idx) {
...@@ -463,7 +463,7 @@ GetRowFunctionFromCSR(const int32_t* indptr, const int32_t* indices, const void* ...@@ -463,7 +463,7 @@ GetRowFunctionFromCSR(const int32_t* indptr, const int32_t* indices, const void*
} }
inline std::function<std::vector<std::pair<int, double>>(int idx)> inline std::function<std::vector<std::pair<int, double>>(int idx)>
GetColFunctionFromCSC(const int32_t* col_ptr, const int32_t* indices, const void* data, int float_type, uint64_t ncol_ptr, uint64_t nelem) { ColumnFunctionFromCSC(const int32_t* col_ptr, const int32_t* indices, const void* data, int float_type, uint64_t ncol_ptr, uint64_t nelem) {
if (float_type == 0) { if (float_type == 0) {
const float* dptr = reinterpret_cast<const float*>(data); const float* dptr = reinterpret_cast<const float*>(data);
return [&col_ptr, &indices, &dptr, &ncol_ptr, &nelem](int idx) { return [&col_ptr, &indices, &dptr, &ncol_ptr, &nelem](int idx) {
......
...@@ -145,7 +145,7 @@ void Application::LoadData() { ...@@ -145,7 +145,7 @@ void Application::LoadData() {
} }
dataset_loader_ = new DatasetLoader(config_.io_config, predict_fun); dataset_loader_ = new DatasetLoader(config_.io_config, predict_fun);
dataset_loader_->SetHeadder(config_.io_config.data_filename.c_str()); dataset_loader_->SetHeader(config_.io_config.data_filename.c_str());
// load Training data // load Training data
if (config_.is_parallel_find_bin) { if (config_.is_parallel_find_bin) {
// load data for parallel training // load data for parallel training
...@@ -173,7 +173,7 @@ void Application::LoadData() { ...@@ -173,7 +173,7 @@ void Application::LoadData() {
// Add validation data, if it exists // Add validation data, if it exists
for (size_t i = 0; i < config_.io_config.valid_data_filenames.size(); ++i) { for (size_t i = 0; i < config_.io_config.valid_data_filenames.size(); ++i) {
// add // add
valid_datas_.push_back(dataset_loader_->LoadFromFileLikeOthers(config_.io_config.valid_data_filenames[i].c_str(), valid_datas_.push_back(dataset_loader_->LoadFromFileAlignWithOtherDataset(config_.io_config.valid_data_filenames[i].c_str(),
train_data_)); train_data_));
// need save binary file // need save binary file
if (config_.io_config.is_save_binary_file) { if (config_.io_config.is_save_binary_file) {
......
...@@ -126,7 +126,7 @@ DllExport int LGBM_CreateDatasetFromFile(const char* filename, ...@@ -126,7 +126,7 @@ DllExport int LGBM_CreateDatasetFromFile(const char* filename,
if (reference == nullptr) { if (reference == nullptr) {
*out = loader.LoadFromFile(filename); *out = loader.LoadFromFile(filename);
} else { } else {
*out = loader.LoadFromFileLikeOthers(filename, reinterpret_cast<const Dataset*>(*reference)); *out = loader.LoadFromFileAlignWithOtherDataset(filename, reinterpret_cast<const Dataset*>(*reference));
} }
return 0; return 0;
} }
...@@ -154,7 +154,7 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data, ...@@ -154,7 +154,7 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
config.LoadFromString(parameters); config.LoadFromString(parameters);
DatasetLoader loader(config.io_config, nullptr); DatasetLoader loader(config.io_config, nullptr);
Dataset* ret = nullptr; Dataset* ret = nullptr;
auto get_row_fun = Common::GetRowFunctionFromMat(data, nrow, ncol, float_type, is_row_major); auto get_row_fun = Common::RowFunctionFromDenseMatric(data, nrow, ncol, float_type, is_row_major);
if (reference == nullptr) { if (reference == nullptr) {
// sample data first // sample data first
Random rand(config.io_config.data_random_seed); Random rand(config.io_config.data_random_seed);
...@@ -170,10 +170,8 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data, ...@@ -170,10 +170,8 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
} }
ret = loader.CostructFromSampleData(sample_values, nrow); ret = loader.CostructFromSampleData(sample_values, nrow);
} else { } else {
ret = new Dataset(); ret = new Dataset(nrow);
// need to set num_data first reinterpret_cast<const Dataset*>(*reference)->CopyFeatureBinMapperTo(ret, config.io_config.is_enable_sparse);
ret->SetNumData(nrow);
reinterpret_cast<const Dataset*>(*reference)->CopyFeatureMetadataTo(ret, config.io_config.is_enable_sparse);
} }
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
...@@ -202,7 +200,7 @@ DllExport int LGBM_CreateDatasetFromCSR(const int32_t* indptr, ...@@ -202,7 +200,7 @@ DllExport int LGBM_CreateDatasetFromCSR(const int32_t* indptr,
config.LoadFromString(parameters); config.LoadFromString(parameters);
DatasetLoader loader(config.io_config, nullptr); DatasetLoader loader(config.io_config, nullptr);
Dataset* ret = nullptr; Dataset* ret = nullptr;
auto get_row_fun = Common::GetRowFunctionFromCSR(indptr, indices, data, float_type, nindptr, nelem); auto get_row_fun = Common::RowFunctionFromCSR(indptr, indices, data, float_type, nindptr, nelem);
int32_t nrow = static_cast<int32_t>(nindptr - 1); int32_t nrow = static_cast<int32_t>(nindptr - 1);
if (reference == nullptr) { if (reference == nullptr) {
// sample data first // sample data first
...@@ -233,10 +231,8 @@ DllExport int LGBM_CreateDatasetFromCSR(const int32_t* indptr, ...@@ -233,10 +231,8 @@ DllExport int LGBM_CreateDatasetFromCSR(const int32_t* indptr,
CHECK(num_col >= sample_values.size()); CHECK(num_col >= sample_values.size());
ret = loader.CostructFromSampleData(sample_values, nrow); ret = loader.CostructFromSampleData(sample_values, nrow);
} else { } else {
ret = new Dataset(); ret = new Dataset(nrow);
// need to set num_data first reinterpret_cast<const Dataset*>(*reference)->CopyFeatureBinMapperTo(ret, config.io_config.is_enable_sparse);
ret->SetNumData(nrow);
reinterpret_cast<const Dataset*>(*reference)->CopyFeatureMetadataTo(ret, config.io_config.is_enable_sparse);
} }
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
...@@ -266,7 +262,7 @@ DllExport int LGBM_CreateDatasetFromCSC(const int32_t* col_ptr, ...@@ -266,7 +262,7 @@ DllExport int LGBM_CreateDatasetFromCSC(const int32_t* col_ptr,
config.LoadFromString(parameters); config.LoadFromString(parameters);
DatasetLoader loader(config.io_config, nullptr); DatasetLoader loader(config.io_config, nullptr);
Dataset* ret = nullptr; Dataset* ret = nullptr;
auto get_col_fun = Common::GetColFunctionFromCSC(col_ptr, indices, data, float_type, ncol_ptr, nelem); auto get_col_fun = Common::ColumnFunctionFromCSC(col_ptr, indices, data, float_type, ncol_ptr, nelem);
int32_t nrow = static_cast<int32_t>(num_row); int32_t nrow = static_cast<int32_t>(num_row);
if (reference == nullptr) { if (reference == nullptr) {
Log::Warning("Construct from CSC format is not efficient"); Log::Warning("Construct from CSC format is not efficient");
...@@ -282,17 +278,15 @@ DllExport int LGBM_CreateDatasetFromCSC(const int32_t* col_ptr, ...@@ -282,17 +278,15 @@ DllExport int LGBM_CreateDatasetFromCSC(const int32_t* col_ptr,
} }
ret = loader.CostructFromSampleData(sample_values, nrow); ret = loader.CostructFromSampleData(sample_values, nrow);
} else { } else {
ret = new Dataset(); ret = new Dataset(nrow);
// need to set num_data first reinterpret_cast<const Dataset*>(*reference)->CopyFeatureBinMapperTo(ret, config.io_config.is_enable_sparse);
ret->SetNumData(nrow);
reinterpret_cast<const Dataset*>(*reference)->CopyFeatureMetadataTo(ret, config.io_config.is_enable_sparse);
} }
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int i = 0; i < ncol_ptr - 1; ++i) { for (int i = 0; i < ncol_ptr - 1; ++i) {
const int tid = omp_get_thread_num(); const int tid = omp_get_thread_num();
auto one_col = get_col_fun(i); auto one_col = get_col_fun(i);
ret->PushOneCol(tid, i, one_col); ret->PushOneColumn(tid, i, one_col);
} }
ret->FinishLoad(); ret->FinishLoad();
*out = ret; *out = ret;
......
...@@ -21,6 +21,12 @@ Dataset::Dataset() { ...@@ -21,6 +21,12 @@ Dataset::Dataset() {
is_loading_from_binfile_ = false; is_loading_from_binfile_ = false;
} }
/*!
 * \brief Construct a dataset whose data count is supplied by the caller
 * \param num_data Value stored in num_data_
 */
Dataset::Dataset(data_size_t num_data) {
  // Record the caller-supplied size first; the class count starts at 1.
  num_data_ = num_data;
  num_class_ = 1;
  // Clear the "loading from binary file" flag for this construction path.
  is_loading_from_binfile_ = false;
}
Dataset::~Dataset() { Dataset::~Dataset() {
for (auto& feature : features_) { for (auto& feature : features_) {
delete feature; delete feature;
...@@ -35,13 +41,14 @@ void Dataset::FinishLoad() { ...@@ -35,13 +41,14 @@ void Dataset::FinishLoad() {
} }
} }
void Dataset::CopyFeatureMetadataTo(Dataset *dataset, bool is_enable_sparse) const { void Dataset::CopyFeatureBinMapperTo(Dataset* dataset, bool is_enable_sparse) const {
dataset->features_.clear(); dataset->features_.clear();
// copy feature bin mapper data // copy feature bin mapper data
for (Feature* feature : features_) { for (Feature* feature : features_) {
dataset->features_.push_back(new Feature(feature->feature_index(), dataset->features_.push_back(new Feature(feature->feature_index(),
new BinMapper(*feature->bin_mapper()), dataset->num_data_, is_enable_sparse)); new BinMapper(*feature->bin_mapper()), dataset->num_data_, is_enable_sparse));
} }
dataset->num_class_ = num_class_;
dataset->used_feature_map_ = used_feature_map_; dataset->used_feature_map_ = used_feature_map_;
dataset->num_features_ = static_cast<int>(dataset->features_.size()); dataset->num_features_ = static_cast<int>(dataset->features_.size());
dataset->num_total_features_ = num_total_features_; dataset->num_total_features_ = num_total_features_;
...@@ -131,7 +138,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { ...@@ -131,7 +138,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
Log::Info("Saving data to binary file %s", data_filename_); Log::Info("Saving data to binary file %s", data_filename_);
// get size of header // get size of header
size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_) size_t size_of_header = sizeof(num_data_) + sizeof(num_class_) + sizeof(num_features_) + sizeof(num_total_features_)
+ sizeof(size_t) + sizeof(int) * used_feature_map_.size(); + sizeof(size_t) + sizeof(int) * used_feature_map_.size();
// size of feature names // size of feature names
for (int i = 0; i < num_total_features_; ++i) { for (int i = 0; i < num_total_features_; ++i) {
...@@ -140,6 +147,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) { ...@@ -140,6 +147,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
fwrite(&size_of_header, sizeof(size_of_header), 1, file); fwrite(&size_of_header, sizeof(size_of_header), 1, file);
// write header // write header
fwrite(&num_data_, sizeof(num_data_), 1, file); fwrite(&num_data_, sizeof(num_data_), 1, file);
fwrite(&num_class_, sizeof(num_class_), 1, file);
fwrite(&num_features_, sizeof(num_features_), 1, file); fwrite(&num_features_, sizeof(num_features_), 1, file);
fwrite(&num_total_features_, sizeof(num_features_), 1, file); fwrite(&num_total_features_, sizeof(num_features_), 1, file);
size_t num_used_feature_map = used_feature_map_.size(); size_t num_used_feature_map = used_feature_map_.size();
......
...@@ -17,7 +17,7 @@ DatasetLoader::~DatasetLoader() { ...@@ -17,7 +17,7 @@ DatasetLoader::~DatasetLoader() {
} }
void DatasetLoader::SetHeadder(const char* filename) { void DatasetLoader::SetHeader(const char* filename) {
TextReader<data_size_t> text_reader(filename, io_config_.has_header); TextReader<data_size_t> text_reader(filename, io_config_.has_header);
std::unordered_map<std::string, int> name2idx; std::unordered_map<std::string, int> name2idx;
...@@ -200,7 +200,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac ...@@ -200,7 +200,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
Dataset* DatasetLoader::LoadFromFileLikeOthers(const char* filename, const Dataset* other) { Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data) {
auto parser = Parser::CreateParser(filename, io_config_.has_header, 0, label_idx_); auto parser = Parser::CreateParser(filename, io_config_.has_header, 0, label_idx_);
if (parser == nullptr) { if (parser == nullptr) {
Log::Fatal("Could not recognize data format of %s", filename); Log::Fatal("Could not recognize data format of %s", filename);
...@@ -219,7 +219,7 @@ Dataset* DatasetLoader::LoadFromFileLikeOthers(const char* filename, const Datas ...@@ -219,7 +219,7 @@ Dataset* DatasetLoader::LoadFromFileLikeOthers(const char* filename, const Datas
dataset->num_data_ = static_cast<data_size_t>(text_data.size()); dataset->num_data_ = static_cast<data_size_t>(text_data.size());
// initialize label // initialize label
dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, weight_idx_, group_idx_); dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, weight_idx_, group_idx_);
other->CopyFeatureMetadataTo(dataset, io_config_.is_enable_sparse); train_data->CopyFeatureBinMapperTo(dataset, io_config_.is_enable_sparse);
// extract features // extract features
ExtractFeaturesFromMemory(text_data, parser, dataset); ExtractFeaturesFromMemory(text_data, parser, dataset);
text_data.clear(); text_data.clear();
...@@ -230,7 +230,7 @@ Dataset* DatasetLoader::LoadFromFileLikeOthers(const char* filename, const Datas ...@@ -230,7 +230,7 @@ Dataset* DatasetLoader::LoadFromFileLikeOthers(const char* filename, const Datas
num_global_data = dataset->num_data_; num_global_data = dataset->num_data_;
// initialize label // initialize label
dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, weight_idx_, group_idx_); dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, weight_idx_, group_idx_);
other->CopyFeatureMetadataTo(dataset, io_config_.is_enable_sparse); train_data->CopyFeatureBinMapperTo(dataset, io_config_.is_enable_sparse);
// extract features // extract features
ExtractFeaturesFromFile(filename, parser, used_data_indices, dataset); ExtractFeaturesFromFile(filename, parser, used_data_indices, dataset);
} }
...@@ -290,6 +290,8 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int ...@@ -290,6 +290,8 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
const char* mem_ptr = buffer; const char* mem_ptr = buffer;
dataset->num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr)); dataset->num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
mem_ptr += sizeof(dataset->num_data_); mem_ptr += sizeof(dataset->num_data_);
dataset->num_class_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += sizeof(dataset->num_class_);
dataset->num_features_ = *(reinterpret_cast<const int*>(mem_ptr)); dataset->num_features_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += sizeof(dataset->num_features_); mem_ptr += sizeof(dataset->num_features_);
dataset->num_total_features_ = *(reinterpret_cast<const int*>(mem_ptr)); dataset->num_total_features_ = *(reinterpret_cast<const int*>(mem_ptr));
...@@ -415,7 +417,7 @@ Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>& ...@@ -415,7 +417,7 @@ Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>&
} }
Dataset* dataset = new Dataset(); Dataset* dataset = new Dataset();
dataset->num_class_ = io_config_.num_class;
dataset->features_.clear(); dataset->features_.clear();
dataset->num_data_ = num_data; dataset->num_data_ = num_data;
// -1 means doesn't use this feature // -1 means doesn't use this feature
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment