Commit c6512e01 authored by Guolin Ke
Browse files

Reduce memory cost in the sampling phase

parent 2e5e9134
......@@ -86,7 +86,7 @@ public:
* \param values (Sampled) values of this feature
* \param max_bin The maximal number of bin
*/
void FindBin(std::vector<double>* values, int max_bin);
void FindBin(std::vector<double>* values, size_t total_sample_cnt, int max_bin);
/*!
* \brief Use specific number of bin to calculate the size of this class
......
......@@ -24,7 +24,7 @@ public:
Dataset* LoadFromBinFile(const char* bin_filename, int rank, int num_machines);
Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values, data_size_t num_data);
Dataset* CostructFromSampleData(std::vector<std::vector<double>>& sample_values, size_t total_sample_size, data_size_t num_data);
/*! \brief Disable copy */
DatasetLoader& operator=(const DatasetLoader&) = delete;
......
......@@ -464,7 +464,9 @@ std::string GBDT::FeatureImportance() const {
// store the importance first
std::vector<std::pair<size_t, std::string>> pairs;
for (size_t i = 0; i < feature_importances.size(); ++i) {
pairs.emplace_back(feature_importances[i], train_data_->feature_names()[i]);
if (feature_importances[i] > 0) {
pairs.emplace_back(feature_importances[i], train_data_->feature_names()[i]);
}
}
// sort the importance
std::sort(pairs.begin(), pairs.end(),
......
......@@ -206,10 +206,12 @@ DllExport int LGBM_CreateDatasetFromMat(const void* data,
auto idx = sample_indices[i];
auto row = get_row_fun(static_cast<int>(idx));
for (size_t j = 0; j < row.size(); ++j) {
sample_values[j].push_back(row[j]);
if (std::fabs(row[j]) > 1e-15) {
sample_values[j].push_back(row[j]);
}
}
}
ret = loader.CostructFromSampleData(sample_values, nrow);
ret = loader.CostructFromSampleData(sample_values, sample_cnt, nrow);
} else {
ret = new Dataset(nrow, config.io_config.num_class);
ret->CopyFeatureMapperFrom(reinterpret_cast<const Dataset*>(*reference), config.io_config.is_enable_sparse);
......@@ -253,25 +255,22 @@ DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,
for (size_t i = 0; i < sample_indices.size(); ++i) {
auto idx = sample_indices[i];
auto row = get_row_fun(static_cast<int>(idx));
// push 0 first, then edit the value according existing feature values
for (auto& feature_values : sample_values) {
feature_values.push_back(0.0);
}
for (std::pair<int, double>& inner_data : row) {
if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
// if need expand feature set
size_t need_size = inner_data.first - sample_values.size() + 1;
for (size_t j = 0; j < need_size; ++j) {
// push i+1 0
sample_values.emplace_back(i + 1, 0.0f);
if (std::fabs(inner_data.second) > 1e-15) {
if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
// if need expand feature set
size_t need_size = inner_data.first - sample_values.size() + 1;
for (size_t j = 0; j < need_size; ++j) {
sample_values.emplace_back();
}
}
// edit the feature value
sample_values[inner_data.first].push_back(inner_data.second);
}
// edit the feature value
sample_values[inner_data.first][i] = inner_data.second;
}
}
CHECK(num_col >= static_cast<int>(sample_values.size()));
ret = loader.CostructFromSampleData(sample_values, nrow);
ret = loader.CostructFromSampleData(sample_values, sample_cnt, nrow);
} else {
ret = new Dataset(nrow, config.io_config.num_class);
ret->CopyFeatureMapperFrom(reinterpret_cast<const Dataset*>(*reference), config.io_config.is_enable_sparse);
......@@ -319,7 +318,7 @@ DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,
auto cur_col = get_col_fun(i);
sample_values[i] = SampleFromOneColumn(cur_col, sample_indices);
}
ret = loader.CostructFromSampleData(sample_values, nrow);
ret = loader.CostructFromSampleData(sample_values, sample_cnt, nrow);
} else {
ret = new Dataset(nrow, config.io_config.num_class);
ret->CopyFeatureMapperFrom(reinterpret_cast<const Dataset*>(*reference), config.io_config.is_enable_sparse);
......
......@@ -39,16 +39,24 @@ BinMapper::~BinMapper() {
delete[] bin_upper_bound_;
}
void BinMapper::FindBin(std::vector<double>* values, int max_bin) {
void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, int max_bin) {
std::vector<double>& ref_values = (*values);
size_t sample_size = values->size();
size_t sample_size = total_sample_cnt;
size_t zero_cnt = total_sample_cnt - ref_values.size();
// find distinct_values first
std::vector<double> distinct_values;
std::vector<int> counts;
std::sort(ref_values.begin(), ref_values.end());
distinct_values.push_back(ref_values[0]);
counts.push_back(1);
// push 0 first
if (zero_cnt > 0) {
distinct_values.push_back(0.0f);
counts.push_back(static_cast<int>(zero_cnt));
}
if (ref_values.size() > 0) {
distinct_values.push_back(ref_values[0]);
counts.push_back(1);
}
for (size_t i = 1; i < ref_values.size(); ++i) {
if (ref_values[i] != ref_values[i - 1]) {
distinct_values.push_back(ref_values[i]);
......
......@@ -408,12 +408,12 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* bin_filename, int rank, int
return dataset;
}
Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>& sample_values, data_size_t num_data) {
Dataset* DatasetLoader::CostructFromSampleData(std::vector<std::vector<double>>& sample_values, size_t total_sample_size, data_size_t num_data) {
std::vector<BinMapper*> bin_mappers(sample_values.size());
#pragma omp parallel for schedule(guided)
for (int i = 0; i < static_cast<int>(sample_values.size()); ++i) {
bin_mappers[i] = new BinMapper();
bin_mappers[i]->FindBin(&sample_values[i], io_config_.max_bin);
bin_mappers[i]->FindBin(&sample_values[i], total_sample_size, io_config_.max_bin);
}
Dataset* dataset = new Dataset();
......@@ -580,21 +580,17 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
oneline_features.clear();
// parse features
parser->ParseOneLine(sample_data[i].c_str(), &oneline_features, &label);
// push 0 first, then edit the value according existing feature values
for (auto& feature_values : sample_values) {
feature_values.push_back(0.0);
}
for (std::pair<int, double>& inner_data : oneline_features) {
if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
// if need expand feature set
size_t need_size = inner_data.first - sample_values.size() + 1;
for (size_t j = 0; j < need_size; ++j) {
// push i+1 0
sample_values.emplace_back(i + 1, 0.0f);
if (std::fabs(inner_data.second) > 1e-15) {
if (static_cast<size_t>(inner_data.first) >= sample_values.size()) {
// if need expand feature set
size_t need_size = inner_data.first - sample_values.size() + 1;
for (size_t j = 0; j < need_size; ++j) {
sample_values.emplace_back();
}
}
sample_values[inner_data.first].push_back(inner_data.second);
}
// edit the feature value
sample_values[inner_data.first][i] = inner_data.second;
}
}
......@@ -629,7 +625,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
continue;
}
bin_mappers[i] = new BinMapper();
bin_mappers[i]->FindBin(&sample_values[i], io_config_.max_bin);
bin_mappers[i]->FindBin(&sample_values[i], sample_data.size(), io_config_.max_bin);
}
for (size_t i = 0; i < sample_values.size(); ++i) {
......@@ -676,7 +672,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
#pragma omp parallel for schedule(guided)
for (int i = 0; i < len[rank]; ++i) {
BinMapper* bin_mapper = new BinMapper();
bin_mapper->FindBin(&sample_values[start[rank] + i], io_config_.max_bin);
bin_mapper->FindBin(&sample_values[start[rank] + i], sample_data.size(), io_config_.max_bin);
bin_mapper->CopyTo(input_buffer + i * type_size);
// don't need this any more
delete bin_mapper;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment