Commit f56699eb authored by Guolin Ke
Browse files

better compression algorithm for sparse bin

parent 80495ca6
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#include "dense_bin.hpp" #include "dense_bin.hpp"
#include "sparse_bin.hpp" #include "sparse_bin.hpp"
#include "ordered_sparse_bin.hpp"
#include <cmath> #include <cmath>
#include <cstring> #include <cstring>
......
...@@ -16,7 +16,7 @@ namespace LightGBM { ...@@ -16,7 +16,7 @@ namespace LightGBM {
template <typename VAL_T> template <typename VAL_T>
class DenseBin: public Bin { class DenseBin: public Bin {
public: public:
explicit DenseBin(data_size_t num_data, int default_bin) DenseBin(data_size_t num_data, int default_bin)
: num_data_(num_data) { : num_data_(num_data) {
data_.resize(num_data_); data_.resize(num_data_);
VAL_T default_bin_T = static_cast<VAL_T>(default_bin); VAL_T default_bin_T = static_cast<VAL_T>(default_bin);
...@@ -37,8 +37,8 @@ public: ...@@ -37,8 +37,8 @@ public:
BinIterator* GetIterator(data_size_t start_idx) const override; BinIterator* GetIterator(data_size_t start_idx) const override;
void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data, void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians, const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
// use 4-way unrolling, will be faster // use 4-way unrolling, will be faster
if (data_indices != nullptr) { // if use part of data if (data_indices != nullptr) { // if use part of data
data_size_t rest = num_data % 4; data_size_t rest = num_data % 4;
...@@ -70,8 +70,7 @@ public: ...@@ -70,8 +70,7 @@ public:
out[bin].sum_hessians += ordered_hessians[i]; out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt; ++out[bin].cnt;
} }
} } else { // use full data
else { // use full data
data_size_t rest = num_data % 4; data_size_t rest = num_data % 4;
data_size_t i = 0; data_size_t i = 0;
for (; i < num_data - rest; i += 4) { for (; i < num_data - rest; i += 4) {
...@@ -105,7 +104,7 @@ public: ...@@ -105,7 +104,7 @@ public:
} }
data_size_t Split(unsigned int threshold, data_size_t* data_indices, data_size_t num_data, data_size_t Split(unsigned int threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override { data_size_t* lte_indices, data_size_t* gt_indices) const override {
data_size_t lte_count = 0; data_size_t lte_count = 0;
data_size_t gt_count = 0; data_size_t gt_count = 0;
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
...@@ -168,5 +167,6 @@ template <typename VAL_T> ...@@ -168,5 +167,6 @@ template <typename VAL_T>
BinIterator* DenseBin<VAL_T>::GetIterator(data_size_t) const { BinIterator* DenseBin<VAL_T>::GetIterator(data_size_t) const {
return new DenseBinIterator<VAL_T>(this); return new DenseBinIterator<VAL_T>(this);
} }
} // namespace LightGBM } // namespace LightGBM
#endif // LightGBM_IO_DENSE_BIN_HPP_ #endif // LightGBM_IO_DENSE_BIN_HPP_
...@@ -10,6 +10,8 @@ ...@@ -10,6 +10,8 @@
#include <mutex> #include <mutex>
#include <algorithm> #include <algorithm>
#include "sparse_bin.hpp"
namespace LightGBM { namespace LightGBM {
/*! /*!
...@@ -21,7 +23,7 @@ namespace LightGBM { ...@@ -21,7 +23,7 @@ namespace LightGBM {
* So we only using ordered bin for sparse situations. * So we only using ordered bin for sparse situations.
*/ */
template <typename VAL_T> template <typename VAL_T>
class OrderedSparseBin:public OrderedBin { class OrderedSparseBin: public OrderedBin {
public: public:
/*! \brief Pair to store one bin entry */ /*! \brief Pair to store one bin entry */
struct SparsePair { struct SparsePair {
...@@ -30,14 +32,12 @@ public: ...@@ -30,14 +32,12 @@ public:
SparsePair(data_size_t r, VAL_T b) : ridx(r), bin(b) {} SparsePair(data_size_t r, VAL_T b) : ridx(r), bin(b) {}
}; };
OrderedSparseBin(const std::vector<uint8_t>& delta, const std::vector<VAL_T>& vals) OrderedSparseBin(const SparseBin<VAL_T>* bin_data)
:delta_(delta), vals_(vals) { :bin_data_(bin_data) {
data_size_t cur_pos = 0; data_size_t cur_pos = 0;
for (size_t i = 0; i < vals_.size(); ++i) { data_size_t i_delta = -1;
cur_pos += delta_[i]; while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
if (vals_[i] > 0) { ordered_pair_.emplace_back(cur_pos, 0);
ordered_pair_.emplace_back(cur_pos, vals_[i]);
}
} }
ordered_pair_.shrink_to_fit(); ordered_pair_.shrink_to_fit();
} }
...@@ -51,26 +51,24 @@ public: ...@@ -51,26 +51,24 @@ public:
leaf_cnt_ = std::vector<data_size_t>(num_leaves, 0); leaf_cnt_ = std::vector<data_size_t>(num_leaves, 0);
if (used_idices == nullptr) { if (used_idices == nullptr) {
// if using all data, copy all non-zero pair // if using all data, copy all non-zero pair
data_size_t cur_pos = 0;
data_size_t j = 0; data_size_t j = 0;
for (size_t i = 0; i < vals_.size(); ++i) { data_size_t cur_pos = 0;
cur_pos += delta_[i]; data_size_t i_delta = -1;
if (vals_[i] > 0) { while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
ordered_pair_[j].ridx = cur_pos; ordered_pair_[j].ridx = cur_pos;
ordered_pair_[j].bin = vals_[i]; ordered_pair_[j].bin = bin_data_->vals_[i_delta];
++j; ++j;
}
} }
leaf_cnt_[0] = static_cast<data_size_t>(ordered_pair_.size()); leaf_cnt_[0] = static_cast<data_size_t>(j);
} else { } else {
// if using part of data(bagging) // if using part of data(bagging)
data_size_t j = 0; data_size_t j = 0;
data_size_t cur_pos = 0; data_size_t cur_pos = 0;
for (size_t i = 0; i < vals_.size(); ++i) { data_size_t i_delta = -1;
cur_pos += delta_[i]; while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
if (vals_[i] > 0 && used_idices[cur_pos]) { if (used_idices[cur_pos]) {
ordered_pair_[j].ridx = cur_pos; ordered_pair_[j].ridx = cur_pos;
ordered_pair_[j].bin = vals_[i]; ordered_pair_[j].bin = bin_data_->vals_[i_delta];
++j; ++j;
} }
} }
...@@ -79,7 +77,7 @@ public: ...@@ -79,7 +77,7 @@ public:
} }
void ConstructHistogram(int leaf, const score_t* gradient, const score_t* hessian, void ConstructHistogram(int leaf, const score_t* gradient, const score_t* hessian,
HistogramBinEntry* out) const override { HistogramBinEntry* out) const override {
// get current leaf boundary // get current leaf boundary
const data_size_t start = leaf_start_[leaf]; const data_size_t start = leaf_start_[leaf];
const data_size_t end = start + leaf_cnt_[leaf]; const data_size_t end = start + leaf_cnt_[leaf];
...@@ -118,9 +116,7 @@ public: ...@@ -118,9 +116,7 @@ public:
OrderedSparseBin<VAL_T>(const OrderedSparseBin<VAL_T>&) = delete; OrderedSparseBin<VAL_T>(const OrderedSparseBin<VAL_T>&) = delete;
private: private:
const std::vector<uint8_t>& delta_; const SparseBin<VAL_T>* bin_data_;
const std::vector<VAL_T>& vals_;
/*! \brief Store non-zero pair , group by leaf */ /*! \brief Store non-zero pair , group by leaf */
std::vector<SparsePair> ordered_pair_; std::vector<SparsePair> ordered_pair_;
/*! \brief leaf_start_[i] means data in i-th leaf start from */ /*! \brief leaf_start_[i] means data in i-th leaf start from */
...@@ -128,5 +124,11 @@ private: ...@@ -128,5 +124,11 @@ private:
/*! \brief leaf_cnt_[i] means number of data in i-th leaf */ /*! \brief leaf_cnt_[i] means number of data in i-th leaf */
std::vector<data_size_t> leaf_cnt_; std::vector<data_size_t> leaf_cnt_;
}; };
/*! \brief Factory: create the OrderedBin companion for this sparse bin.
*         Defined here, after OrderedSparseBin is complete, so sparse_bin.hpp
*         only needs a forward declaration of OrderedSparseBin.
*         The caller takes ownership of the returned object. */
template <typename VAL_T>
OrderedBin* SparseBin<VAL_T>::CreateOrderedBin() const {
return new OrderedSparseBin<VAL_T>(this);
}
} // namespace LightGBM } // namespace LightGBM
#endif // LightGBM_IO_ORDERED_SPARSE_BIN_HPP_ #endif // LightGBM_IO_ORDERED_SPARSE_BIN_HPP_
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
#include <LightGBM/utils/log.h> #include <LightGBM/utils/log.h>
#include <LightGBM/bin.h> #include <LightGBM/bin.h>
#include "ordered_sparse_bin.hpp"
#include <omp.h> #include <omp.h>
...@@ -15,23 +14,50 @@ ...@@ -15,23 +14,50 @@
namespace LightGBM { namespace LightGBM {
template <typename VAL_T>
class SparseBin;
const size_t kNumFastIndex = 64; const size_t kNumFastIndex = 64;
const uint8_t kMaxDelta = 255;
template <typename VAL_T> class SparseBinIterator; template <typename VAL_T>
/*! \brief Iterator over a SparseBin. Lookups scan forward from the current
*         position, so Get should be called with non-decreasing indices;
*         Reset uses the bin's fast index to jump near a new start position. */
class SparseBinIterator: public BinIterator {
public:
/*!
* \brief Create an iterator positioned near start_idx
* \param bin_data Bin to iterate over (not owned; must outlive the iterator)
* \param start_idx First data index that will be queried
*/
SparseBinIterator(const SparseBin<VAL_T>* bin_data, data_size_t start_idx)
: bin_data_(bin_data) {
Reset(start_idx);
}
/*! \brief Bin value at data index idx; 0 for rows with no stored entry */
inline VAL_T InnerGet(data_size_t idx);
inline uint32_t Get(data_size_t idx) override {
return InnerGet(idx);
}
/*! \brief Re-position the iterator near idx via the bin's fast index */
inline void Reset(data_size_t idx);
private:
// underlying sparse bin (not owned)
const SparseBin<VAL_T>* bin_data_;
// data index corresponding to the entry at i_delta_
data_size_t cur_pos_;
// current position in the bin's deltas_/vals_ arrays
data_size_t i_delta_;
};
template <typename VAL_T>
class OrderedSparseBin;
template <typename VAL_T> template <typename VAL_T>
class SparseBin:public Bin { class SparseBin: public Bin {
public: public:
friend class SparseBinIterator<VAL_T>; friend class SparseBinIterator<VAL_T>;
friend class OrderedSparseBin<VAL_T>;
explicit SparseBin(data_size_t num_data, int default_bin) SparseBin(data_size_t num_data, int default_bin)
: num_data_(num_data) { : num_data_(num_data) {
default_bin_ = static_cast<VAL_T>(default_bin); default_bin_ = static_cast<VAL_T>(default_bin);
if (default_bin_ != 0) { if (default_bin_ != 0) {
Log::Info("Warning: sparse feature with negative values, treating negative values as zero"); Log::Info("Warning: sparse feature with negative values, treating negative values as zero");
} }
#pragma omp parallel #pragma omp parallel
#pragma omp master #pragma omp master
{ {
num_threads_ = omp_get_num_threads(); num_threads_ = omp_get_num_threads();
} }
...@@ -51,31 +77,39 @@ public: ...@@ -51,31 +77,39 @@ public:
BinIterator* GetIterator(data_size_t start_idx) const override; BinIterator* GetIterator(data_size_t start_idx) const override;
void ConstructHistogram(const data_size_t*, data_size_t , const score_t* , void ConstructHistogram(const data_size_t*, data_size_t, const score_t*,
const score_t* , HistogramBinEntry*) const override { const score_t*, HistogramBinEntry*) const override {
// Will use OrderedSparseBin->ConstructHistogram() instead // Will use OrderedSparseBin->ConstructHistogram() instead
Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead"); Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
} }
/*!
* \brief Advance to the next non-zero entry of the bin.
*        deltas_ stores index gaps in a variable-length encoding: a gap larger
*        than kMaxDelta is split across several bytes, each continuation byte
*        flagged by a zero in vals_, and is decoded here as
*        d0 + d1*kMaxDelta + d2*kMaxDelta^2 + ...
* \param i_delta In/out position in deltas_/vals_; pass -1 before the first call
* \param cur_pos In/out data index of the current entry
* \return true if a non-zero entry was found, false when the bin is exhausted
*/
inline bool NextNonzero(data_size_t* i_delta,
data_size_t* cur_pos) const {
++(*i_delta);
// deltas_ has num_vals_ + 1 entries (trailing 0), so this read stays in range
*cur_pos += deltas_[*i_delta];
data_size_t factor = 1;
// vals_[i] == 0 marks a continuation byte of a multi-byte delta
while (*i_delta < num_vals_ && vals_[*i_delta] == 0) {
++(*i_delta);
factor *= kMaxDelta;
*cur_pos += deltas_[*i_delta] * factor;
}
// i_delta only ever increments from -1, so it is always >= 0 here;
// the exhausted-bin test reduces to the upper bound alone
return *i_delta < num_vals_;
}
data_size_t Split(unsigned int threshold, data_size_t* data_indices, data_size_t num_data, data_size_t Split(unsigned int threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override { data_size_t* lte_indices, data_size_t* gt_indices) const override {
// not need to split // not need to split
if (num_data <= 0) { return 0; } if (num_data <= 0) { return 0; }
const auto fast_pair = fast_index_[(data_indices[0]) >> fast_index_shift_]; SparseBinIterator<VAL_T> iterator(this, data_indices[0]);
data_size_t j = fast_pair.first;
data_size_t cur_pos = fast_pair.second;
data_size_t lte_count = 0; data_size_t lte_count = 0;
data_size_t gt_count = 0; data_size_t gt_count = 0;
for (data_size_t i = 0; i < num_data; ++i) { for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i]; const data_size_t idx = data_indices[i];
while (cur_pos < idx && j < num_vals_) { VAL_T bin = iterator.InnerGet(idx);
++j;
cur_pos += delta_[j];
}
VAL_T bin = 0;
if (cur_pos == idx && j < num_vals_) {
bin = vals_[j];
}
if (bin > threshold) { if (bin > threshold) {
gt_indices[gt_count++] = idx; gt_indices[gt_count++] = idx;
} else { } else {
...@@ -87,9 +121,7 @@ public: ...@@ -87,9 +121,7 @@ public:
data_size_t num_data() const override { return num_data_; } data_size_t num_data() const override { return num_data_; }
OrderedBin* CreateOrderedBin() const override { OrderedBin* CreateOrderedBin() const override;
return new OrderedSparseBin<VAL_T>(delta_, vals_);
}
void FinishLoad() override { void FinishLoad() override {
// get total non zero size // get total non zero size
...@@ -119,30 +151,29 @@ public: ...@@ -119,30 +151,29 @@ public:
} }
void LoadFromPair(const std::vector<std::pair<data_size_t, VAL_T>>& non_zero_pair) { void LoadFromPair(const std::vector<std::pair<data_size_t, VAL_T>>& non_zero_pair) {
delta_.clear(); deltas_.clear();
vals_.clear(); vals_.clear();
// transform to delta array // transform to delta array
const uint8_t kMaxDelta = 255;
data_size_t last_idx = 0; data_size_t last_idx = 0;
for (size_t i = 0; i < non_zero_pair.size(); ++i) { for (size_t i = 0; i < non_zero_pair.size(); ++i) {
const data_size_t cur_idx = non_zero_pair[i].first; const data_size_t cur_idx = non_zero_pair[i].first;
const VAL_T bin = non_zero_pair[i].second; const VAL_T bin = non_zero_pair[i].second;
data_size_t cur_delta = cur_idx - last_idx; data_size_t cur_delta = cur_idx - last_idx;
while (cur_delta > kMaxDelta) { while (cur_delta > kMaxDelta) {
delta_.push_back(255); deltas_.push_back(cur_delta % kMaxDelta);
vals_.push_back(0); vals_.push_back(0);
cur_delta -= kMaxDelta; cur_delta /= kMaxDelta;
} }
delta_.push_back(static_cast<uint8_t>(cur_delta)); deltas_.push_back(static_cast<uint8_t>(cur_delta));
vals_.push_back(bin); vals_.push_back(bin);
last_idx = cur_idx; last_idx = cur_idx;
} }
// avoid out of range // avoid out of range
delta_.push_back(0); deltas_.push_back(0);
num_vals_ = static_cast<data_size_t>(vals_.size()); num_vals_ = static_cast<data_size_t>(vals_.size());
// reduce memory cost // reduce memory cost
delta_.shrink_to_fit(); deltas_.shrink_to_fit();
vals_.shrink_to_fit(); vals_.shrink_to_fit();
// generate fast index // generate fast index
...@@ -160,26 +191,26 @@ public: ...@@ -160,26 +191,26 @@ public:
++fast_index_shift_; ++fast_index_shift_;
} }
// build fast index // build fast index
data_size_t next_i = 0; data_size_t i_delta = -1;
data_size_t cur_pos = 0; data_size_t cur_pos = 0;
for (data_size_t i = 0; i < num_vals_; ++i) { data_size_t next_threshold = 0;
cur_pos += delta_[i]; while (NextNonzero(&i_delta, &cur_pos)) {
while (next_i < cur_pos) { while (next_threshold < cur_pos) {
fast_index_.emplace_back(i, cur_pos); fast_index_.emplace_back(i_delta, cur_pos);
next_i += pow2_mod_size; next_threshold += pow2_mod_size;
} }
} }
// avoid out of range // avoid out of range
while (next_i < num_data_) { while (next_threshold < num_data_) {
fast_index_.emplace_back(num_vals_ - 1, cur_pos); fast_index_.emplace_back(num_vals_ - 1, cur_pos);
next_i += pow2_mod_size; next_threshold += pow2_mod_size;
} }
fast_index_.shrink_to_fit(); fast_index_.shrink_to_fit();
} }
void SaveBinaryToFile(FILE* file) const override { void SaveBinaryToFile(FILE* file) const override {
fwrite(&num_vals_, sizeof(num_vals_), 1, file); fwrite(&num_vals_, sizeof(num_vals_), 1, file);
fwrite(delta_.data(), sizeof(uint8_t), num_vals_ + 1, file); fwrite(deltas_.data(), sizeof(uint8_t), num_vals_ + 1, file);
fwrite(vals_.data(), sizeof(VAL_T), num_vals_, file); fwrite(vals_.data(), sizeof(VAL_T), num_vals_, file);
} }
...@@ -196,39 +227,33 @@ public: ...@@ -196,39 +227,33 @@ public:
mem_ptr += sizeof(uint8_t) * (tmp_num_vals + 1); mem_ptr += sizeof(uint8_t) * (tmp_num_vals + 1);
const VAL_T* tmp_vals = reinterpret_cast<const VAL_T*>(mem_ptr); const VAL_T* tmp_vals = reinterpret_cast<const VAL_T*>(mem_ptr);
if (local_used_indices.size() <= 0) { deltas_.clear();
delta_.clear(); vals_.clear();
vals_.clear(); num_vals_ = tmp_num_vals;
num_vals_ = tmp_num_vals; for (data_size_t i = 0; i < num_vals_; ++i) {
for (data_size_t i = 0; i < num_vals_; ++i) { deltas_.push_back(tmp_delta[i]);
delta_.push_back(tmp_delta[i]); vals_.push_back(tmp_vals[i]);
vals_.push_back(tmp_vals[i]); }
} deltas_.push_back(0);
delta_.push_back(0); // reduce memory cost
// reduce memory cost deltas_.shrink_to_fit();
delta_.shrink_to_fit(); vals_.shrink_to_fit();
vals_.shrink_to_fit();
if (local_used_indices.size() <= 0) {
// generate fast index // generate fast index
GetFastIndex(); GetFastIndex();
} else { } else {
std::vector<std::pair<data_size_t, VAL_T>> tmp_pair; std::vector<std::pair<data_size_t, VAL_T>> tmp_pair;
data_size_t cur_pos = tmp_delta[0]; data_size_t cur_pos = 0;
data_size_t j = 0; data_size_t j = -1;
for (data_size_t i = 0; i < static_cast<data_size_t>(local_used_indices.size()); ++i) { for (data_size_t i = 0; i < static_cast<data_size_t>(local_used_indices.size()); ++i) {
const data_size_t idx = local_used_indices[i]; const data_size_t idx = local_used_indices[i];
while (cur_pos < idx && j < tmp_num_vals) { while (cur_pos < idx && j < num_vals_) {
++j; NextNonzero(&j, &cur_pos);
cur_pos += tmp_delta[j];
} }
VAL_T bin = 0; if (cur_pos == idx && j < num_vals_) {
if (cur_pos == idx && j < tmp_num_vals) {
bin = tmp_vals[j];
}
if (bin > 0) {
// new row index is i // new row index is i
tmp_pair.emplace_back(i, bin); tmp_pair.emplace_back(i, vals_[j]);
} }
} }
LoadFromPair(tmp_pair); LoadFromPair(tmp_pair);
...@@ -239,7 +264,7 @@ public: ...@@ -239,7 +264,7 @@ public:
private: private:
data_size_t num_data_; data_size_t num_data_;
std::vector<std::pair<data_size_t, VAL_T>> non_zero_pair_; std::vector<std::pair<data_size_t, VAL_T>> non_zero_pair_;
std::vector<uint8_t> delta_; std::vector<uint8_t> deltas_;
std::vector<VAL_T> vals_; std::vector<VAL_T> vals_;
data_size_t num_vals_; data_size_t num_vals_;
int num_threads_; int num_threads_;
...@@ -250,36 +275,30 @@ private: ...@@ -250,36 +275,30 @@ private:
}; };
template <typename VAL_T> template <typename VAL_T>
class SparseBinIterator: public BinIterator { inline VAL_T SparseBinIterator<VAL_T>::InnerGet(data_size_t idx) {
public: while (cur_pos_ < idx && i_delta_ < bin_data_->num_vals_) {
SparseBinIterator(const SparseBin<VAL_T>* bin_data, data_size_t start_idx) bin_data_->NextNonzero(&i_delta_, &cur_pos_);
: bin_data_(bin_data) {
const auto fast_pair = bin_data->fast_index_[start_idx >> bin_data->fast_index_shift_];
i_delta_ = fast_pair.first;
cur_pos_ = fast_pair.second;
} }
uint32_t Get(data_size_t idx) override { if (cur_pos_ == idx && i_delta_ < bin_data_->num_vals_) {
while (cur_pos_ < idx && i_delta_ < bin_data_->num_vals_) { return bin_data_->vals_[i_delta_];
++i_delta_; } else {
cur_pos_ += bin_data_->delta_[i_delta_]; return 0;
}
if (idx == cur_pos_ && i_delta_ >= 0
&& i_delta_ < bin_data_->vals_.size()) {
return bin_data_->vals_[i_delta_];
} else { return 0; }
} }
}
/*! \brief Jump the iterator near start_idx using the bin's precomputed fast
*         index (a coarse map from data index to a position at or before it). */
template <typename VAL_T>
inline void SparseBinIterator<VAL_T>::Reset(data_size_t start_idx) {
  const auto fast_pair = bin_data_->fast_index_[start_idx >> bin_data_->fast_index_shift_];
  i_delta_ = fast_pair.first;
  cur_pos_ = fast_pair.second;
}
/*! \brief Create an iterator over this bin starting near start_idx.
*         The caller takes ownership of the returned object. */
template <typename VAL_T>
BinIterator* SparseBin<VAL_T>::GetIterator(data_size_t start_idx) const {
  return new SparseBinIterator<VAL_T>(this, start_idx);
}
} // namespace LightGBM } // namespace LightGBM
#endif // LightGBM_IO_SPARSE_BIN_HPP_ #endif // LightGBM_IO_SPARSE_BIN_HPP_
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment