Commit 3489607f authored by Guolin Ke's avatar Guolin Ke
Browse files

Reduce the function-call cost of constructing a data subset.

parent 6c736da9
......@@ -263,6 +263,8 @@ public:
*/
virtual void Push(int tid, data_size_t idx, uint32_t value) = 0;
virtual void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) = 0;
/*!
* \brief Get the bin iterator of this bin
* \param start_idx start index for the iterator
......
......@@ -351,7 +351,7 @@ public:
*/
LIGHTGBM_EXPORT void SaveBinaryFile(const char* bin_filename);
LIGHTGBM_EXPORT void CopyFeatureMapperFrom(const Dataset* dataset, bool is_enable_sparse);
LIGHTGBM_EXPORT void CopyFeatureMapperFrom(const Dataset* dataset);
/*!
* \brief Get a feature pointer for specific index
......
......@@ -80,12 +80,21 @@ public:
unsigned int bin = bin_mapper_->ValueToBin(value);
bin_data_->Push(tid, line_idx, bin);
}
/*!
* \brief Push an already-computed bin value directly into the underlying bin data,
*        skipping the value-to-bin mapping done by PushData
* \param tid thread id of the calling thread
* \param line_idx data (row) index to write
* \param bin bin value to store
*/
inline void PushBin(int tid, data_size_t line_idx, unsigned int bin) {
bin_data_->Push(tid, line_idx, bin);
}
void ReSize(data_size_t num_data) {
/*!
* \brief Copy a subset of rows from another feature's bin data into this feature,
*        delegating to the Bin implementation's bulk CopySubset
* \param full_feature source feature holding the full dataset's bin data
* \param used_indices row indices (into the full data) to copy
* \param num_used_indices number of entries in used_indices
*/
inline void CopySubset(const Feature* full_feature, const data_size_t* used_indices, data_size_t num_used_indices) {
bin_data_->CopySubset(full_feature->bin_data_.get(), used_indices, num_used_indices);
}
/*!
* \brief Resize the underlying bin data to hold num_data rows
* \param num_data new number of data rows
*/
inline void ReSize(data_size_t num_data) {
bin_data_->ReSize(num_data);
}
/*! \brief True if this feature is stored in a sparse bin */
inline bool is_sparse() const { return is_sparse_; }
/*! \brief Finish loading: flush/finalize the underlying bin data */
inline void FinishLoad() { bin_data_->FinishLoad(); }
/*! \brief Index of this feature */
inline int feature_index() const { return feature_index_; }
......
......@@ -133,9 +133,9 @@ void GBDT::ResetTrainingData(const BoostingConfig* config, const Dataset* train_
right_write_pos_buf_.resize(num_threads_);
double average_bag_rate = new_config->bagging_fraction / new_config->bagging_freq;
is_use_subset_ = false;
if (average_bag_rate < 0.5) {
if (average_bag_rate <= 0.5) {
tmp_subset_.reset(new Dataset(bag_data_cnt_));
tmp_subset_->CopyFeatureMapperFrom(train_data, false);
tmp_subset_->CopyFeatureMapperFrom(train_data);
is_use_subset_ = true;
Log::Debug("use subset for bagging");
}
......
......@@ -335,8 +335,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMat(const void* data,
} else {
ret.reset(new Dataset(nrow));
ret->CopyFeatureMapperFrom(
reinterpret_cast<const Dataset*>(reference),
io_config.is_enable_sparse);
reinterpret_cast<const Dataset*>(reference));
}
#pragma omp parallel for schedule(guided)
......@@ -397,8 +396,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSR(const void* indptr,
} else {
ret.reset(new Dataset(nrow));
ret->CopyFeatureMapperFrom(
reinterpret_cast<const Dataset*>(reference),
io_config.is_enable_sparse);
reinterpret_cast<const Dataset*>(reference));
}
#pragma omp parallel for schedule(guided)
......@@ -450,8 +448,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromCSC(const void* col_ptr,
} else {
ret.reset(new Dataset(nrow));
ret->CopyFeatureMapperFrom(
reinterpret_cast<const Dataset*>(reference),
io_config.is_enable_sparse);
reinterpret_cast<const Dataset*>(reference));
}
#pragma omp parallel for schedule(guided)
......@@ -486,7 +483,7 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetGetSubset(
io_config.Set(param);
auto full_dataset = reinterpret_cast<const Dataset*>(handle);
auto ret = std::unique_ptr<Dataset>(new Dataset(num_used_row_indices));
ret->CopyFeatureMapperFrom(full_dataset, io_config.is_enable_sparse);
ret->CopyFeatureMapperFrom(full_dataset);
ret->CopySubset(full_dataset, used_row_indices, num_used_row_indices, true);
*out = ret.release();
API_END();
......
......@@ -38,9 +38,16 @@ void Dataset::FinishLoad() {
}
}
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset, bool is_enable_sparse) {
void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
features_.clear();
num_features_ = dataset->num_features_;
bool is_enable_sparse = false;
for (int i = 0; i < num_features_; ++i) {
if (dataset->features_[i]->is_sparse()) {
is_enable_sparse = true;
break;
}
}
// copy feature bin mapper data
for(int i = 0;i < num_features_;++i){
features_.emplace_back(new Feature(dataset->features_[i]->feature_index(),
......@@ -69,15 +76,11 @@ void Dataset::CopySubset(const Dataset* fullset, const data_size_t* used_indices
CHECK(num_used_indices == num_data_);
#pragma omp parallel for schedule(guided)
for (int fidx = 0; fidx < num_features_; ++fidx) {
auto iterator = fullset->features_[fidx]->bin_data()->GetIterator(used_indices[0]);
for (data_size_t i = 0; i < num_used_indices; ++i) {
features_[fidx]->PushBin(0, i, iterator->Get(used_indices[i]));
}
features_[fidx]->CopySubset(fullset->features_[fidx].get(), used_indices, num_used_indices);
}
if (need_meta_data) {
metadata_.Init(metadata_, used_indices, num_used_indices);
}
FinishLoad();
}
bool Dataset::SetFloatField(const char* field_name, const float* field_data, data_size_t num_element) {
......
......@@ -238,7 +238,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
dataset->num_data_ = static_cast<data_size_t>(text_data.size());
// initialize label
dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_);
dataset->CopyFeatureMapperFrom(train_data, io_config_.is_enable_sparse);
dataset->CopyFeatureMapperFrom(train_data);
// extract features
ExtractFeaturesFromMemory(text_data, parser.get(), dataset.get());
text_data.clear();
......@@ -249,7 +249,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
num_global_data = dataset->num_data_;
// initialize label
dataset->metadata_.Init(dataset->num_data_, weight_idx_, group_idx_);
dataset->CopyFeatureMapperFrom(train_data, io_config_.is_enable_sparse);
dataset->CopyFeatureMapperFrom(train_data);
// extract features
ExtractFeaturesFromFile(filename, parser.get(), used_data_indices, dataset.get());
}
......
......@@ -147,6 +147,13 @@ public:
}
}
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
auto other_bin = reinterpret_cast<const DenseBin<VAL_T>*>(full_bin);
for (int i = 0; i < num_used_indices; ++i) {
data_[i] = other_bin->data_[used_indices[i]];
}
}
/*!
* \brief Write the raw bin values (num_data_ elements of VAL_T) to an already-opened file
* \param file destination file handle, opened for binary writing
*/
void SaveBinaryToFile(FILE* file) const override {
fwrite(data_.data(), sizeof(VAL_T), num_data_, file);
}
......
......@@ -29,17 +29,18 @@ public:
/*! \brief One ordered entry: the data(row) index it belongs to and its bin value */
struct SparsePair {
  data_size_t ridx = 0;  // data(row) index
  VAL_T bin = 0;         // bin for this data
  // default-construct to (0, 0) via the member initializers above
  SparsePair() = default;
  SparsePair(data_size_t r, VAL_T b) : ridx(r), bin(b) {}
};
OrderedSparseBin(const SparseBin<VAL_T>* bin_data)
:bin_data_(bin_data) {
data_size_t cur_pos = 0;
data_size_t i_delta = -1;
int non_zero_cnt = 0;
while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
ordered_pair_.emplace_back(cur_pos, static_cast<VAL_T>(0));
++non_zero_cnt;
}
ordered_pair_.shrink_to_fit();
ordered_pair_.resize(non_zero_cnt);
}
~OrderedSparseBin() {
......
......@@ -134,25 +134,23 @@ public:
for (size_t i = 0; i < push_buffers_.size(); ++i) {
non_zero_size += push_buffers_[i].size();
}
std::vector<std::pair<data_size_t, VAL_T>> non_zero_pair;
// merge
non_zero_pair_.reserve(non_zero_size);
non_zero_pair.reserve(non_zero_size);
for (size_t i = 0; i < push_buffers_.size(); ++i) {
non_zero_pair_.insert(non_zero_pair_.end(), push_buffers_[i].begin(), push_buffers_[i].end());
non_zero_pair.insert(non_zero_pair.end(), push_buffers_[i].begin(), push_buffers_[i].end());
push_buffers_[i].clear();
push_buffers_[i].shrink_to_fit();
}
push_buffers_.clear();
push_buffers_.shrink_to_fit();
// sort by data index
std::sort(non_zero_pair_.begin(), non_zero_pair_.end(),
std::sort(non_zero_pair.begin(), non_zero_pair.end(),
[](const std::pair<data_size_t, VAL_T>& a, const std::pair<data_size_t, VAL_T>& b) {
return a.first < b.first;
});
// load delta array
LoadFromPair(non_zero_pair_);
// free memory
non_zero_pair_.clear();
non_zero_pair_.shrink_to_fit();
LoadFromPair(non_zero_pair);
}
void LoadFromPair(const std::vector<std::pair<data_size_t, VAL_T>>& non_zero_pair) {
......@@ -264,12 +262,23 @@ public:
}
LoadFromPair(tmp_pair);
}
}
/*!
* \brief Rebuild this sparse bin from a subset of rows of another (full) sparse bin.
*        Selected rows are re-indexed to 0..num_used_indices-1 and only non-zero
*        entries are kept, then re-packed via LoadFromPair.
* \param full_bin source bin; must actually be a SparseBin<VAL_T> with the same VAL_T
* \param used_indices row indices (into full_bin) to copy
*        NOTE(review): the single forward iterator suggests these are expected in
*        ascending order — confirm against callers
* \param num_used_indices number of entries in used_indices
*/
void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
  if (num_used_indices <= 0) {
    // empty subset: avoid reading used_indices[0] below; load an empty pair list
    // so this bin ends up in a consistent "zero rows" state
    std::vector<std::pair<data_size_t, VAL_T>> empty_pair;
    LoadFromPair(empty_pair);
    return;
  }
  // static_cast, not reinterpret_cast: Bin is a polymorphic base of SparseBin and the
  // caller guarantees the concrete type, so the well-defined downcast is the right tool
  auto other_bin = static_cast<const SparseBin<VAL_T>*>(full_bin);
  SparseBinIterator<VAL_T> iterator(other_bin, used_indices[0]);
  // collect (new_row_index, bin) pairs for the non-zero entries only
  std::vector<std::pair<data_size_t, VAL_T>> tmp_pair;
  for (data_size_t i = 0; i < num_used_indices; ++i) {
    VAL_T bin = iterator.InnerGet(used_indices[i]);
    if (bin > 0) {
      tmp_pair.emplace_back(i, bin);
    }
  }
  LoadFromPair(tmp_pair);
}
protected:
data_size_t num_data_;
std::vector<std::pair<data_size_t, VAL_T>> non_zero_pair_;
std::vector<uint8_t> deltas_;
std::vector<VAL_T> vals_;
data_size_t num_vals_;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment