Commit 33344088 authored by Guolin Ke
Browse files

clean code

parent 56f6f30f
......@@ -246,6 +246,8 @@ public:
Dataset();
explicit Dataset(data_size_t num_data);
/*! \brief Destructor */
~Dataset();
......@@ -267,7 +269,7 @@ public:
}
}
inline void PushOneCol(int tid, data_size_t col_idx, const std::vector<std::pair<int, double>>& feature_values) {
inline void PushOneColumn(int tid, data_size_t col_idx, const std::vector<std::pair<int, double>>& feature_values) {
int feature_idx = used_feature_map_[col_idx];
if (feature_idx >= 0) {
for (auto& inner_data : feature_values) {
......@@ -276,10 +278,6 @@ public:
}
}
/*! \brief Record the number of data rows before pushing feature values */
inline void SetNumData(data_size_t num_data) { num_data_ = num_data; }
void FinishLoad();
void SetField(const char* field_name, const void* field_data, data_size_t num_element, int type);
......@@ -293,7 +291,7 @@ public:
std::vector<const BinMapper*> GetBinMappers() const;
void CopyFeatureMetadataTo(Dataset *dataset, bool is_enable_sparse) const;
void CopyFeatureBinMapperTo(Dataset *dataset, bool is_enable_sparse) const;
/*!
* \brief Get a feature pointer for specific index
......
......@@ -12,7 +12,7 @@ public:
~DatasetLoader();
void SetHeadder(const char* filename);
void SetHeader(const char* filename);
Dataset* LoadFromFile(const char* filename, int rank, int num_machines);
......@@ -20,7 +20,7 @@ public:
return LoadFromFile(filename, 0, 1);
}
Dataset* LoadFromFileLikeOthers(const char* filename, const Dataset* other);
Dataset* LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data);
Dataset* LoadFromBinFile(const char* bin_filename, int rank, int num_machines);
......@@ -51,7 +51,6 @@ private:
/*! \brief Check can load from binary file */
bool CheckCanLoadFromBin(const char* filename);
const IOConfig& io_config_;
/*! \brief Random generator*/
Random random_;
......
......@@ -382,7 +382,7 @@ inline void SortForPair(std::vector<T1>& keys, std::vector<T2>& values, size_t s
}
inline std::function<std::vector<double>(int row_idx)>
GetRowFunctionFromMat(const void* data, int num_row, int num_col, int float_type, int is_row_major) {
RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int float_type, int is_row_major) {
if (float_type == 0) {
const float* dptr = reinterpret_cast<const float*>(data);
if (is_row_major) {
......@@ -432,7 +432,7 @@ GetRowFunctionFromMat(const void* data, int num_row, int num_col, int float_type
inline std::function<std::vector<std::pair<int, double>>(int idx)>
GetRowFunctionFromCSR(const int32_t* indptr, const int32_t* indices, const void* data, int float_type, uint64_t nindptr, uint64_t nelem) {
RowFunctionFromCSR(const int32_t* indptr, const int32_t* indices, const void* data, int float_type, uint64_t nindptr, uint64_t nelem) {
if (float_type == 0) {
const float* dptr = reinterpret_cast<const float*>(data);
return [&indptr, &indices, &dptr, &nindptr, &nelem](int idx) {
......@@ -463,7 +463,7 @@ GetRowFunctionFromCSR(const int32_t* indptr, const int32_t* indices, const void*
}
inline std::function<std::vector<std::pair<int, double>>(int idx)>
GetColFunctionFromCSC(const int32_t* col_ptr, const int32_t* indices, const void* data, int float_type, uint64_t ncol_ptr, uint64_t nelem) {
ColumnFunctionFromCSC(const int32_t* col_ptr, const int32_t* indices, const void* data, int float_type, uint64_t ncol_ptr, uint64_t nelem) {
if (float_type == 0) {
const float* dptr = reinterpret_cast<const float*>(data);
return [&col_ptr, &indices, &dptr, &ncol_ptr, &nelem](int idx) {
......
......@@ -145,7 +145,7 @@ void Application::LoadData() {
}
dataset_loader_ = new DatasetLoader(config_.io_config, predict_fun);
dataset_loader_->SetHeadder(config_.io_config.data_filename.c_str());
dataset_loader_->SetHeader(config_.io_config.data_filename.c_str());
// load Training data
if (config_.is_parallel_find_bin) {
// load data for parallel training
......@@ -173,7 +173,7 @@ void Application::LoadData() {
// Add validation data, if it exists
for (size_t i = 0; i < config_.io_config.valid_data_filenames.size(); ++i) {
// add
valid_datas_.push_back(dataset_loader_->LoadFromFileLikeOthers(config_.io_config.valid_data_filenames[i].c_str(),
valid_datas_.push_back(dataset_loader_->LoadFromFileAlignWithOtherDataset(config_.io_config.valid_data_filenames[i].c_str(),
train_data_));
// need save binary file
if (config_.io_config.is_save_binary_file) {
......
......@@ -126,7 +126,7 @@ DllExport int LGBM_CreateDatasetFromFile(const char* filename,
if (reference == nullptr) {
*out = loader.LoadFromFile(filename);
} else {
*out = loader.LoadFromFileLikeOthers(filename, reinterpret_cast<const Dataset*>(*reference));
*out = loader.LoadFromFileAlignWithOtherDataset(filename, reinterpret_cast<const Dataset*>(*reference));
}
return 0;
}
......@@ -154,7 +154,7 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
config.LoadFromString(parameters);
DatasetLoader loader(config.io_config, nullptr);
Dataset* ret = nullptr;
auto get_row_fun = Common::GetRowFunctionFromMat(data, nrow, ncol, float_type, is_row_major);
auto get_row_fun = Common::RowFunctionFromDenseMatric(data, nrow, ncol, float_type, is_row_major);
if (reference == nullptr) {
// sample data first
Random rand(config.io_config.data_random_seed);
......@@ -170,10 +170,8 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
}
ret = loader.CostructFromSampleData(sample_values, nrow);
} else {
ret = new Dataset();
// need to set num_data first
ret->SetNumData(nrow);
reinterpret_cast<const Dataset*>(*reference)->CopyFeatureMetadataTo(ret, config.io_config.is_enable_sparse);
ret = new Dataset(nrow);
reinterpret_cast<const Dataset*>(*reference)->CopyFeatureBinMapperTo(ret, config.io_config.is_enable_sparse);
}
#pragma omp parallel for schedule(guided)
......@@ -202,7 +200,7 @@ DllExport int LGBM_CreateDatasetFromCSR(const int32_t* indptr,
config.LoadFromString(parameters);
DatasetLoader loader(config.io_config, nullptr);
Dataset* ret = nullptr;
auto get_row_fun = Common::GetRowFunctionFromCSR(indptr, indices, data, float_type, nindptr, nelem);
auto get_row_fun = Common::RowFunctionFromCSR(indptr, indices, data, float_type, nindptr, nelem);
int32_t nrow = static_cast<int32_t>(nindptr - 1);
if (reference == nullptr) {
// sample data first
......@@ -233,10 +231,8 @@ DllExport int LGBM_CreateDatasetFromCSR(const int32_t* indptr,
CHECK(num_col >= sample_values.size());
ret = loader.CostructFromSampleData(sample_values, nrow);
} else {
ret = new Dataset();
// need to set num_data first
ret->SetNumData(nrow);
reinterpret_cast<const Dataset*>(*reference)->CopyFeatureMetadataTo(ret, config.io_config.is_enable_sparse);
ret = new Dataset(nrow);
reinterpret_cast<const Dataset*>(*reference)->CopyFeatureBinMapperTo(ret, config.io_config.is_enable_sparse);
}
#pragma omp parallel for schedule(guided)
......@@ -266,7 +262,7 @@ DllExport int LGBM_CreateDatasetFromCSC(const int32_t* col_ptr,
config.LoadFromString(parameters);
DatasetLoader loader(config.io_config, nullptr);
Dataset* ret = nullptr;
auto get_col_fun = Common::GetColFunctionFromCSC(col_ptr, indices, data, float_type, ncol_ptr, nelem);
auto get_col_fun = Common::ColumnFunctionFromCSC(col_ptr, indices, data, float_type, ncol_ptr, nelem);
int32_t nrow = static_cast<int32_t>(num_row);
if (reference == nullptr) {
Log::Warning("Construct from CSC format is not efficient");
......@@ -282,17 +278,15 @@ DllExport int LGBM_CreateDatasetFromCSC(const int32_t* col_ptr,
}
ret = loader.CostructFromSampleData(sample_values, nrow);
} else {
ret = new Dataset();
// need to set num_data first
ret->SetNumData(nrow);
reinterpret_cast<const Dataset*>(*reference)->CopyFeatureMetadataTo(ret, config.io_config.is_enable_sparse);
ret = new Dataset(nrow);
reinterpret_cast<const Dataset*>(*reference)->CopyFeatureBinMapperTo(ret, config.io_config.is_enable_sparse);
}
#pragma omp parallel for schedule(guided)
for (int i = 0; i < ncol_ptr - 1; ++i) {
const int tid = omp_get_thread_num();
auto one_col = get_col_fun(i);
ret->PushOneCol(tid, i, one_col);
ret->PushOneColumn(tid, i, one_col);
}
ret->FinishLoad();
*out = ret;
......
......@@ -21,6 +21,12 @@ Dataset::Dataset() {
is_loading_from_binfile_ = false;
}
/*!
* \brief Construct a Dataset with a known row count.
* \param num_data Number of data rows this dataset will hold
*/
Dataset::Dataset(data_size_t num_data) {
  // constructed from raw/text input, not from a binary cache file
  is_loading_from_binfile_ = false;
  // single class by default; metadata may override later
  num_class_ = 1;
  num_data_ = num_data;
}
Dataset::~Dataset() {
for (auto& feature : features_) {
delete feature;
......@@ -35,13 +41,14 @@ void Dataset::FinishLoad() {
}
}
void Dataset::CopyFeatureMetadataTo(Dataset *dataset, bool is_enable_sparse) const {
void Dataset::CopyFeatureBinMapperTo(Dataset* dataset, bool is_enable_sparse) const {
dataset->features_.clear();
// copy feature bin mapper data
for (Feature* feature : features_) {
dataset->features_.push_back(new Feature(feature->feature_index(),
new BinMapper(*feature->bin_mapper()), dataset->num_data_, is_enable_sparse));
}
dataset->num_class_ = num_class_;
dataset->used_feature_map_ = used_feature_map_;
dataset->num_features_ = static_cast<int>(dataset->features_.size());
dataset->num_total_features_ = num_total_features_;
......@@ -131,7 +138,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
Log::Info("Saving data to binary file %s", data_filename_);
// get size of header
size_t size_of_header = sizeof(num_data_) + sizeof(num_features_) + sizeof(num_total_features_)
size_t size_of_header = sizeof(num_data_) + sizeof(num_class_) + sizeof(num_features_) + sizeof(num_total_features_)
+ sizeof(size_t) + sizeof(int) * used_feature_map_.size();
// size of feature names
for (int i = 0; i < num_total_features_; ++i) {
......@@ -140,6 +147,7 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
fwrite(&size_of_header, sizeof(size_of_header), 1, file);
// write header
fwrite(&num_data_, sizeof(num_data_), 1, file);
fwrite(&num_class_, sizeof(num_class_), 1, file);
fwrite(&num_features_, sizeof(num_features_), 1, file);
fwrite(&num_total_features_, sizeof(num_features_), 1, file);
size_t num_used_feature_map = used_feature_map_.size();
......
......@@ -17,7 +17,7 @@ DatasetLoader::~DatasetLoader() {
}
void DatasetLoader::SetHeadder(const char* filename) {
void DatasetLoader::SetHeader(const char* filename) {
TextReader<data_size_t> text_reader(filename, io_config_.has_header);
std::unordered_map<std::string, int> name2idx;
......@@ -200,7 +200,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
Dataset* DatasetLoader::LoadFromFileLikeOthers(const char* filename, const Dataset* other) {
Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename, const Dataset* train_data) {
auto parser = Parser::CreateParser(filename, io_config_.has_header, 0, label_idx_);
if (parser == nullptr) {
Log::Fatal("Could not recognize data format of %s", filename);
......@@ -219,7 +219,7 @@ Dataset* DatasetLoader::LoadFromFileLikeOthers(const char* filename, const Datas
dataset->num_data_ = static_cast<data_size_t>(text_data.size());
// initialize label
dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, weight_idx_, group_idx_);
other->CopyFeatureMetadataTo(dataset, io_config_.is_enable_sparse);
train_data->CopyFeatureBinMapperTo(dataset, io_config_.is_enable_sparse);
// extract features
ExtractFeaturesFromMemory(text_data, parser, dataset);
text_data.clear();
......@@ -230,7 +230,7 @@ Dataset* DatasetLoader::LoadFromFileLikeOthers(const char* filename, const Datas
num_global_data = dataset->num_data_;
// initialize label
dataset->metadata_.Init(dataset->num_data_, dataset->num_class_, weight_idx_, group_idx_);
other->CopyFeatureMetadataTo(dataset, io_config_.is_enable_sparse);
train_data->CopyFeatureBinMapperTo(dataset, io_config_.is_enable_sparse);
// extract features
ExtractFeaturesFromFile(filename, parser, used_data_indices, dataset);
}
......@@ -290,6 +290,8 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
const char* mem_ptr = buffer;
dataset->num_data_ = *(reinterpret_cast<const data_size_t*>(mem_ptr));
mem_ptr += sizeof(dataset->num_data_);
dataset->num_class_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += sizeof(dataset->num_class_);
dataset->num_features_ = *(reinterpret_cast<const int*>(mem_ptr));
mem_ptr += sizeof(dataset->num_features_);
dataset->num_total_features_ = *(reinterpret_cast<const int*>(mem_ptr));
......@@ -415,7 +417,7 @@ Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>&
}
Dataset* dataset = new Dataset();
dataset->num_class_ = io_config_.num_class;
dataset->features_.clear();
dataset->num_data_ = num_data;
// -1 means doesn't use this feature
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment