Commit c6512e01 authored by Guolin Ke's avatar Guolin Ke

reduce memory cost at sample phase

parent 2e5e9134
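In brief: before this change, every sampled row pushed a value (including explicit zeros) into each feature's sample vector, so the sampling buffers grew with num_features × num_samples. After it, only values with |v| > 1e-15 are stored, and the total sample count is passed alongside (total_sample_cnt / total_sample_size) so that BinMapper::FindBin can recover the number of implicit zeros. A rough standalone sketch of that idea follows; it is not LightGBM's actual API, and the names SampleNonZeros and kZeroThreshold are invented for illustration.

#include <cmath>
#include <cstddef>
#include <vector>

// Illustrative sketch only: keep per-feature sample vectors sparse by
// storing non-zero values, and remember how many rows were sampled in total.
static const double kZeroThreshold = 1e-15;

void SampleNonZeros(const std::vector<std::vector<double>>& rows,
                    std::vector<std::vector<double>>* per_feature,
                    size_t* total_sample_cnt) {
  *total_sample_cnt = rows.size();
  for (const auto& row : rows) {
    if (per_feature->size() < row.size()) per_feature->resize(row.size());
    for (size_t j = 0; j < row.size(); ++j) {
      if (std::fabs(row[j]) > kZeroThreshold) {
        (*per_feature)[j].push_back(row[j]);  // store non-zero samples only
      }
    }
  }
  // For any feature j, the number of sampled zeros is
  //   *total_sample_cnt - (*per_feature)[j].size(),
  // which is what the new total_sample_cnt parameter lets FindBin recover.
}

For a mostly-sparse sample (say 1% non-zero entries), per-feature sample storage shrinks roughly in proportion to the density, which is the memory reduction the commit title refers to.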
@@ -86,7 +86,7 @@ public:
   * \param values (Sampled) values of this feature
   * \param max_bin The maximal number of bin
   */
-  void FindBin(std::vector<double>* values, int max_bin);
+  void FindBin(std::vector<double>* values, size_t total_sample_cnt, int max_bin);
   /*!
   * \brief Use specific number of bin to calculate the size of this class
@@ -24,7 +24,7 @@ public:
   Dataset* LoadFromBinFile(const char* bin_filename, int rank, int num_machines);
-  Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values, data_size_t num_data);
+  Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values, size_t total_sample_size, data_size_t num_data);
   /*! \brief Disable copy */
   DatasetLoader& operator=(const DatasetLoader&) = delete;
@@ -464,7 +464,9 @@ std::string GBDT::FeatureImportance() const {
   // store the importance first
   std::vector<std::pair<size_t, std::string>> pairs;
   for (size_t i = 0; i < feature_importances.size(); ++i) {
-    pairs.emplace_back(feature_importances[i], train_data_->feature_names()[i]);
+    if (feature_importances[i] > 0) {
+      pairs.emplace_back(feature_importances[i], train_data_->feature_names()[i]);
+    }
   }
   // sort the importance
   std::sort(pairs.begin(), pairs.end(),
@@ -206,10 +206,12 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
       auto idx = sample_indices[i];
       auto row = get_row_fun(static_cast<int>(idx));
       for (size_t j = 0; j < row.size(); ++j) {
-        sample_values[j].push_back(row[j]);
+        if (std::fabs(row[j]) > 1e-15) {
+          sample_values[j].push_back(row[j]);
+        }
       }
     }
-    ret = loader.CostructFromSampleData(sample_values, nrow);
+    ret = loader.CostructFromSampleData(sample_values, sample_cnt, nrow);
   } else {
     ret = new Dataset(nrow, config.io_config.num_class);
     ret->CopyFeatureMapperFrom(reinterpret_cast<const Dataset*>(*reference), config.io_config.is_enable_sparse);
@@ -253,25 +255,22 @@ DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,
     for (size_t i = 0; i < sample_indices.size(); ++i) {
       auto idx = sample_indices[i];
       auto row = get_row_fun(static_cast<int>(idx));
-      // push 0 first, then edit the value according existing feature values
-      for (auto& feature_values : sample_values) {
-        feature_values.push_back(0.0);
-      }
       for (std::pair<int, double>& inner_data : row) {
-        if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
-          // if need expand feature set
-          size_t need_size = inner_data.first - sample_values.size() + 1;
-          for (size_t j = 0; j < need_size; ++j) {
-            // push i+1 0
-            sample_values.emplace_back(i + 1, 0.0f);
-          }
-        }
-        // edit the feature value
-        sample_values[inner_data.first][i] = inner_data.second;
+        if (std::fabs(inner_data.second) > 1e-15) {
+          if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
+            // if need expand feature set
+            size_t need_size = inner_data.first - sample_values.size() + 1;
+            for (size_t j = 0; j < need_size; ++j) {
+              sample_values.emplace_back();
+            }
+          }
+          // edit the feature value
+          sample_values[inner_data.first].push_back(inner_data.second);
+        }
       }
     }
     CHECK(num_col >= static_cast<int>(sample_values.size()));
-    ret = loader.CostructFromSampleData(sample_values, nrow);
+    ret = loader.CostructFromSampleData(sample_values, sample_cnt, nrow);
   } else {
     ret = new Dataset(nrow, config.io_config.num_class);
     ret->CopyFeatureMapperFrom(reinterpret_cast<const Dataset*>(*reference), config.io_config.is_enable_sparse);
@@ -319,7 +318,7 @@ DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,
       auto cur_col = get_col_fun(i);
       sample_values[i] = SampleFromOneColumn(cur_col, sample_indices);
     }
-    ret = loader.CostructFromSampleData(sample_values, nrow);
+    ret = loader.CostructFromSampleData(sample_values, sample_cnt, nrow);
   } else {
     ret = new Dataset(nrow, config.io_config.num_class);
     ret->CopyFeatureMapperFrom(reinterpret_cast<const Dataset*>(*reference), config.io_config.is_enable_sparse);
@@ -39,16 +39,24 @@ BinMapper::~BinMapper() {
   delete[] bin_upper_bound_;
 }
 
-void BinMapper::FindBin(std::vector<double>* values, int max_bin) {
+void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, int max_bin) {
   std::vector<double>& ref_values = (*values);
-  size_t sample_size = values->size();
+  size_t sample_size = total_sample_cnt;
+  size_t zero_cnt = total_sample_cnt - ref_values.size();
   // find distinct_values first
   std::vector<double> distinct_values;
   std::vector<int> counts;
   std::sort(ref_values.begin(), ref_values.end());
-  distinct_values.push_back(ref_values[0]);
-  counts.push_back(1);
+  // push 0 first
+  if (zero_cnt > 0) {
+    distinct_values.push_back(0.0f);
+    counts.push_back(static_cast<int>(zero_cnt));
+  }
+  if (ref_values.size() > 0) {
+    distinct_values.push_back(ref_values[0]);
+    counts.push_back(1);
+  }
   for (size_t i = 1; i < ref_values.size(); ++i) {
     if (ref_values[i] != ref_values[i - 1]) {
       distinct_values.push_back(ref_values[i]);
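The zero handling added to FindBin above boils down to the following simplified sketch (ours, not the actual implementation): the implicit zeros are recovered from the total sample count and folded in as a single distinct value with its own count, while the stored non-zero samples are counted as before.

#include <cstddef>
#include <vector>

// Simplified sketch of the zero-count recovery, assuming only non-zero
// samples were stored for this feature and have already been sorted.
void CountDistinctWithImplicitZeros(const std::vector<double>& sorted_non_zeros,
                                    size_t total_sample_cnt,
                                    std::vector<double>* distinct_values,
                                    std::vector<int>* counts) {
  size_t zero_cnt = total_sample_cnt - sorted_non_zeros.size();
  if (zero_cnt > 0) {
    // mirroring the commit, the zero bucket is pushed first
    distinct_values->push_back(0.0);
    counts->push_back(static_cast<int>(zero_cnt));
  }
  for (size_t i = 0; i < sorted_non_zeros.size(); ++i) {
    if (i == 0 || sorted_non_zeros[i] != sorted_non_zeros[i - 1]) {
      distinct_values->push_back(sorted_non_zeros[i]);  // new distinct value
      counts->push_back(1);
    } else {
      counts->back() += 1;  // repeated value, bump its count
    }
  }
}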
@@ -408,12 +408,12 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
   return dataset;
 }
 
-Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>& sample_values, data_size_t num_data) {
+Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>& sample_values, size_t total_sample_size, data_size_t num_data) {
   std::vector<BinMapper*> bin_mappers(sample_values.size());
 #pragma omp parallel for schedule(guided)
   for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
     bin_mappers[i] = new BinMapper();
-    bin_mappers[i]->FindBin(&sample_values[i], io_config_.max_bin);
+    bin_mappers[i]->FindBin(&sample_values[i], total_sample_size, io_config_.max_bin);
   }
 
   Dataset* dataset = new Dataset();
@@ -580,21 +580,17 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
     oneline_features.clear();
     // parse features
     parser->ParseOneLine(sample_data[i].c_str(), &oneline_features, &label);
-    // push 0 first, then edit the value according existing feature values
-    for (auto& feature_values : sample_values) {
-      feature_values.push_back(0.0);
-    }
     for (std::pair<int, double>& inner_data : oneline_features) {
-      if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
-        // if need expand feature set
-        size_t need_size = inner_data.first - sample_values.size() + 1;
-        for (size_t j = 0; j < need_size; ++j) {
-          // push i+1 0
-          sample_values.emplace_back(i + 1, 0.0f);
-        }
-      }
-      // edit the feature value
-      sample_values[inner_data.first][i] = inner_data.second;
+      if (std::fabs(inner_data.second) > 1e-15) {
+        if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
+          // if need expand feature set
+          size_t need_size = inner_data.first - sample_values.size() + 1;
+          for (size_t j = 0; j < need_size; ++j) {
+            sample_values.emplace_back();
+          }
+        }
+        sample_values[inner_data.first].push_back(inner_data.second);
+      }
     }
   }
@@ -629,7 +625,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
       continue;
     }
     bin_mappers[i] = new BinMapper();
-    bin_mappers[i]->FindBin(&sample_values[i], io_config_.max_bin);
+    bin_mappers[i]->FindBin(&sample_values[i], sample_data.size(), io_config_.max_bin);
   }
 
   for (size_t i = 0; i < sample_values.size(); ++i) {
#pragma omp parallel for schedule(guided) #pragma omp parallel for schedule(guided)
for (int i = 0; i < len[rank]; ++i) { for (int i = 0; i < len[rank]; ++i) {
BinMapper* bin_mapper = new BinMapper(); BinMapper* bin_mapper = new BinMapper();
bin_mapper->FindBin(&sample_values[start[rank] + i], io_config_.max_bin); bin_mapper->FindBin(&sample_values[start[rank] + i], sample_data.size(), io_config_.max_bin);
bin_mapper->CopyTo(input_buffer + i * type_size); bin_mapper->CopyTo(input_buffer + i * type_size);
// don't need this any more // don't need this any more
delete bin_mapper; delete bin_mapper;
......