Commit f56699eb authored by Guolin Ke's avatar Guolin Ke
Browse files

better compression algorithm for sparse bin

parent 80495ca6
......@@ -3,6 +3,7 @@
#include "dense_bin.hpp"
#include "sparse_bin.hpp"
#include "ordered_sparse_bin.hpp"
#include <cmath>
#include <cstring>
......
......@@ -16,7 +16,7 @@ namespace LightGBM {
template <typename VAL_T>
class DenseBin: public Bin {
public:
explicit DenseBin(data_size_t num_data, int default_bin)
DenseBin(data_size_t num_data, int default_bin)
: num_data_(num_data) {
data_.resize(num_data_);
VAL_T default_bin_T = static_cast<VAL_T>(default_bin);
......@@ -37,8 +37,8 @@ public:
BinIterator* GetIterator(data_size_t start_idx) const override;
void ConstructHistogram(const data_size_t* data_indices, data_size_t num_data,
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
const score_t* ordered_gradients, const score_t* ordered_hessians,
HistogramBinEntry* out) const override {
// use 4-way unrolling, will be faster
if (data_indices != nullptr) { // if use part of data
data_size_t rest = num_data % 4;
......@@ -70,8 +70,7 @@ public:
out[bin].sum_hessians += ordered_hessians[i];
++out[bin].cnt;
}
}
else { // use full data
} else { // use full data
data_size_t rest = num_data % 4;
data_size_t i = 0;
for (; i < num_data - rest; i += 4) {
......@@ -105,7 +104,7 @@ public:
}
data_size_t Split(unsigned int threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
data_size_t* lte_indices, data_size_t* gt_indices) const override {
data_size_t lte_count = 0;
data_size_t gt_count = 0;
for (data_size_t i = 0; i < num_data; ++i) {
......@@ -168,5 +167,6 @@ template <typename VAL_T>
/*! \brief Create an iterator over this dense bin; the start index is unused
* because dense storage supports random access. Caller owns the returned pointer. */
BinIterator* DenseBin<VAL_T>::GetIterator(data_size_t) const {
return new DenseBinIterator<VAL_T>(this);
}
} // namespace LightGBM
#endif // LightGBM_IO_DENSE_BIN_HPP_
......@@ -10,6 +10,8 @@
#include <mutex>
#include <algorithm>
#include "sparse_bin.hpp"
namespace LightGBM {
/*!
......@@ -21,7 +23,7 @@ namespace LightGBM {
* So we only using ordered bin for sparse situations.
*/
template <typename VAL_T>
class OrderedSparseBin:public OrderedBin {
class OrderedSparseBin: public OrderedBin {
public:
/*! \brief Pair to store one bin entry */
struct SparsePair {
......@@ -30,14 +32,12 @@ public:
SparsePair(data_size_t r, VAL_T b) : ridx(r), bin(b) {}
};
OrderedSparseBin(const std::vector<uint8_t>& delta, const std::vector<VAL_T>& vals)
:delta_(delta), vals_(vals) {
OrderedSparseBin(const SparseBin<VAL_T>* bin_data)
:bin_data_(bin_data) {
data_size_t cur_pos = 0;
for (size_t i = 0; i < vals_.size(); ++i) {
cur_pos += delta_[i];
if (vals_[i] > 0) {
ordered_pair_.emplace_back(cur_pos, vals_[i]);
}
data_size_t i_delta = -1;
while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
ordered_pair_.emplace_back(cur_pos, 0);
}
ordered_pair_.shrink_to_fit();
}
......@@ -51,26 +51,24 @@ public:
leaf_cnt_ = std::vector<data_size_t>(num_leaves, 0);
if (used_idices == nullptr) {
// if using all data, copy all non-zero pair
data_size_t cur_pos = 0;
data_size_t j = 0;
for (size_t i = 0; i < vals_.size(); ++i) {
cur_pos += delta_[i];
if (vals_[i] > 0) {
ordered_pair_[j].ridx = cur_pos;
ordered_pair_[j].bin = vals_[i];
++j;
}
data_size_t cur_pos = 0;
data_size_t i_delta = -1;
while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
ordered_pair_[j].ridx = cur_pos;
ordered_pair_[j].bin = bin_data_->vals_[i_delta];
++j;
}
leaf_cnt_[0] = static_cast<data_size_t>(ordered_pair_.size());
leaf_cnt_[0] = static_cast<data_size_t>(j);
} else {
// if using part of data(bagging)
data_size_t j = 0;
data_size_t cur_pos = 0;
for (size_t i = 0; i < vals_.size(); ++i) {
cur_pos += delta_[i];
if (vals_[i] > 0 && used_idices[cur_pos]) {
data_size_t i_delta = -1;
while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
if (used_idices[cur_pos]) {
ordered_pair_[j].ridx = cur_pos;
ordered_pair_[j].bin = vals_[i];
ordered_pair_[j].bin = bin_data_->vals_[i_delta];
++j;
}
}
......@@ -79,7 +77,7 @@ public:
}
void ConstructHistogram(int leaf, const score_t* gradient, const score_t* hessian,
HistogramBinEntry* out) const override {
HistogramBinEntry* out) const override {
// get current leaf boundary
const data_size_t start = leaf_start_[leaf];
const data_size_t end = start + leaf_cnt_[leaf];
......@@ -118,9 +116,7 @@ public:
OrderedSparseBin<VAL_T>(const OrderedSparseBin<VAL_T>&) = delete;
private:
const std::vector<uint8_t>& delta_;
const std::vector<VAL_T>& vals_;
const SparseBin<VAL_T>* bin_data_;
/*! \brief Store non-zero pair , group by leaf */
std::vector<SparsePair> ordered_pair_;
/*! \brief leaf_start_[i] means data in i-th leaf start from */
......@@ -128,5 +124,11 @@ private:
/*! \brief leaf_cnt_[i] means number of data in i-th leaf */
std::vector<data_size_t> leaf_cnt_;
};
/*! \brief Factory for the ordered (grouped-by-leaf) view over this sparse bin.
* Defined here, after OrderedSparseBin, so its full definition is visible.
* Caller owns the returned pointer. */
template <typename VAL_T>
OrderedBin* SparseBin<VAL_T>::CreateOrderedBin() const {
return new OrderedSparseBin<VAL_T>(this);
}
} // namespace LightGBM
#endif // LightGBM_IO_ORDERED_SPARSE_BIN_HPP_
......@@ -4,7 +4,6 @@
#include <LightGBM/utils/log.h>
#include <LightGBM/bin.h>
#include "ordered_sparse_bin.hpp"
#include <omp.h>
......@@ -15,23 +14,50 @@
namespace LightGBM {
template <typename VAL_T>
class SparseBin;
const size_t kNumFastIndex = 64;
const uint8_t kMaxDelta = 255;
template <typename VAL_T> class SparseBinIterator;
/*!
* \brief Forward-only iterator over a SparseBin.
* Walks the delta-compressed (deltas_, vals_) arrays, using the bin's
* fast index (via Reset) to avoid scanning from the beginning.
* NOTE(review): assumes Get/InnerGet is called with non-decreasing idx
* after a Reset — confirm against callers.
*/
template <typename VAL_T>
class SparseBinIterator: public BinIterator {
public:
/*! \brief Position the iterator at (or just before) start_idx. */
SparseBinIterator(const SparseBin<VAL_T>* bin_data, data_size_t start_idx)
: bin_data_(bin_data) {
Reset(start_idx);
}
/*! \brief Return the bin value at row idx (0 if the row holds no entry). */
inline VAL_T InnerGet(data_size_t idx);
inline uint32_t Get(data_size_t idx) override {
return InnerGet(idx);
}
/*! \brief Re-seat the iterator near idx using the bin's fast index. */
inline void Reset(data_size_t idx);
private:
/*! \brief Underlying sparse bin (not owned) */
const SparseBin<VAL_T>* bin_data_;
/*! \brief Row index of the current non-zero entry */
data_size_t cur_pos_;
/*! \brief Index into the bin's deltas_/vals_ arrays */
data_size_t i_delta_;
};
template <typename VAL_T>
class OrderedSparseBin;
template <typename VAL_T>
class SparseBin:public Bin {
class SparseBin: public Bin {
public:
friend class SparseBinIterator<VAL_T>;
friend class OrderedSparseBin<VAL_T>;
explicit SparseBin(data_size_t num_data, int default_bin)
SparseBin(data_size_t num_data, int default_bin)
: num_data_(num_data) {
default_bin_ = static_cast<VAL_T>(default_bin);
if (default_bin_ != 0) {
Log::Info("Warning: sparse feature with negative values, treating negative values as zero");
}
#pragma omp parallel
#pragma omp master
#pragma omp parallel
#pragma omp master
{
num_threads_ = omp_get_num_threads();
}
......@@ -51,31 +77,39 @@ public:
BinIterator* GetIterator(data_size_t start_idx) const override;
void ConstructHistogram(const data_size_t*, data_size_t , const score_t* ,
const score_t* , HistogramBinEntry*) const override {
void ConstructHistogram(const data_size_t*, data_size_t, const score_t*,
const score_t*, HistogramBinEntry*) const override {
// Will use OrderedSparseBin->ConstructHistogram() instead
Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
}
/*!
* \brief Advance to the next non-zero entry of the delta-compressed arrays.
* \param i_delta In/out index into deltas_/vals_; pass -1 before the first call.
* \param cur_pos In/out row index of the current entry; pass 0 before the first call.
* \return true if a non-zero entry was found, false when the arrays are exhausted.
*
* Entries whose val is 0 are continuation chunks of a row-delta larger than
* kMaxDelta: LoadFromPair stores such a delta as base-kMaxDelta digits
* (least-significant first), so they are re-accumulated here with a growing
* factor. Reading deltas_[num_vals_] is safe: a 0 sentinel is appended.
*/
inline bool NextNonzero(data_size_t* i_delta,
data_size_t* cur_pos) const {
  ++(*i_delta);
  *cur_pos += deltas_[*i_delta];
  data_size_t factor = 1;
  while (*i_delta < num_vals_ && vals_[*i_delta] == 0) {
    ++(*i_delta);
    factor *= kMaxDelta;
    *cur_pos += deltas_[*i_delta] * factor;
  }
  // collapsed if/else returning boolean literals into a direct return
  return *i_delta >= 0 && *i_delta < num_vals_;
}
data_size_t Split(unsigned int threshold, data_size_t* data_indices, data_size_t num_data,
data_size_t* lte_indices, data_size_t* gt_indices) const override {
data_size_t* lte_indices, data_size_t* gt_indices) const override {
// not need to split
if (num_data <= 0) { return 0; }
const auto fast_pair = fast_index_[(data_indices[0]) >> fast_index_shift_];
data_size_t j = fast_pair.first;
data_size_t cur_pos = fast_pair.second;
SparseBinIterator<VAL_T> iterator(this, data_indices[0]);
data_size_t lte_count = 0;
data_size_t gt_count = 0;
for (data_size_t i = 0; i < num_data; ++i) {
const data_size_t idx = data_indices[i];
while (cur_pos < idx && j < num_vals_) {
++j;
cur_pos += delta_[j];
}
VAL_T bin = 0;
if (cur_pos == idx && j < num_vals_) {
bin = vals_[j];
}
VAL_T bin = iterator.InnerGet(idx);
if (bin > threshold) {
gt_indices[gt_count++] = idx;
} else {
......@@ -87,9 +121,7 @@ public:
data_size_t num_data() const override { return num_data_; }
OrderedBin* CreateOrderedBin() const override {
return new OrderedSparseBin<VAL_T>(delta_, vals_);
}
OrderedBin* CreateOrderedBin() const override;
void FinishLoad() override {
// get total non zero size
......@@ -119,30 +151,29 @@ public:
}
void LoadFromPair(const std::vector<std::pair<data_size_t, VAL_T>>& non_zero_pair) {
delta_.clear();
deltas_.clear();
vals_.clear();
// transform to delta array
const uint8_t kMaxDelta = 255;
data_size_t last_idx = 0;
for (size_t i = 0; i < non_zero_pair.size(); ++i) {
const data_size_t cur_idx = non_zero_pair[i].first;
const VAL_T bin = non_zero_pair[i].second;
data_size_t cur_delta = cur_idx - last_idx;
while (cur_delta > kMaxDelta) {
delta_.push_back(255);
deltas_.push_back(cur_delta % kMaxDelta);
vals_.push_back(0);
cur_delta -= kMaxDelta;
cur_delta /= kMaxDelta;
}
delta_.push_back(static_cast<uint8_t>(cur_delta));
deltas_.push_back(static_cast<uint8_t>(cur_delta));
vals_.push_back(bin);
last_idx = cur_idx;
}
// avoid out of range
delta_.push_back(0);
deltas_.push_back(0);
num_vals_ = static_cast<data_size_t>(vals_.size());
// reduce memory cost
delta_.shrink_to_fit();
deltas_.shrink_to_fit();
vals_.shrink_to_fit();
// generate fast index
......@@ -160,26 +191,26 @@ public:
++fast_index_shift_;
}
// build fast index
data_size_t next_i = 0;
data_size_t i_delta = -1;
data_size_t cur_pos = 0;
for (data_size_t i = 0; i < num_vals_; ++i) {
cur_pos += delta_[i];
while (next_i < cur_pos) {
fast_index_.emplace_back(i, cur_pos);
next_i += pow2_mod_size;
data_size_t next_threshold = 0;
while (NextNonzero(&i_delta, &cur_pos)) {
while (next_threshold < cur_pos) {
fast_index_.emplace_back(i_delta, cur_pos);
next_threshold += pow2_mod_size;
}
}
// avoid out of range
while (next_i < num_data_) {
while (next_threshold < num_data_) {
fast_index_.emplace_back(num_vals_ - 1, cur_pos);
next_i += pow2_mod_size;
next_threshold += pow2_mod_size;
}
fast_index_.shrink_to_fit();
}
void SaveBinaryToFile(FILE* file) const override {
fwrite(&num_vals_, sizeof(num_vals_), 1, file);
fwrite(delta_.data(), sizeof(uint8_t), num_vals_ + 1, file);
fwrite(deltas_.data(), sizeof(uint8_t), num_vals_ + 1, file);
fwrite(vals_.data(), sizeof(VAL_T), num_vals_, file);
}
......@@ -196,39 +227,33 @@ public:
mem_ptr += sizeof(uint8_t) * (tmp_num_vals + 1);
const VAL_T* tmp_vals = reinterpret_cast<const VAL_T*>(mem_ptr);
if (local_used_indices.size() <= 0) {
delta_.clear();
vals_.clear();
num_vals_ = tmp_num_vals;
for (data_size_t i = 0; i < num_vals_; ++i) {
delta_.push_back(tmp_delta[i]);
vals_.push_back(tmp_vals[i]);
}
delta_.push_back(0);
// reduce memory cost
delta_.shrink_to_fit();
vals_.shrink_to_fit();
deltas_.clear();
vals_.clear();
num_vals_ = tmp_num_vals;
for (data_size_t i = 0; i < num_vals_; ++i) {
deltas_.push_back(tmp_delta[i]);
vals_.push_back(tmp_vals[i]);
}
deltas_.push_back(0);
// reduce memory cost
deltas_.shrink_to_fit();
vals_.shrink_to_fit();
if (local_used_indices.size() <= 0) {
// generate fast index
GetFastIndex();
} else {
std::vector<std::pair<data_size_t, VAL_T>> tmp_pair;
data_size_t cur_pos = tmp_delta[0];
data_size_t j = 0;
data_size_t cur_pos = 0;
data_size_t j = -1;
for (data_size_t i = 0; i < static_cast<data_size_t>(local_used_indices.size()); ++i) {
const data_size_t idx = local_used_indices[i];
while (cur_pos < idx && j < tmp_num_vals) {
++j;
cur_pos += tmp_delta[j];
while (cur_pos < idx && j < num_vals_) {
NextNonzero(&j, &cur_pos);
}
VAL_T bin = 0;
if (cur_pos == idx && j < tmp_num_vals) {
bin = tmp_vals[j];
}
if (bin > 0) {
if (cur_pos == idx && j < num_vals_) {
// new row index is i
tmp_pair.emplace_back(i, bin);
tmp_pair.emplace_back(i, vals_[j]);
}
}
LoadFromPair(tmp_pair);
......@@ -239,7 +264,7 @@ public:
private:
data_size_t num_data_;
std::vector<std::pair<data_size_t, VAL_T>> non_zero_pair_;
std::vector<uint8_t> delta_;
std::vector<uint8_t> deltas_;
std::vector<VAL_T> vals_;
data_size_t num_vals_;
int num_threads_;
......@@ -250,36 +275,30 @@ private:
};
template <typename VAL_T>
class SparseBinIterator: public BinIterator {
public:
SparseBinIterator(const SparseBin<VAL_T>* bin_data, data_size_t start_idx)
: bin_data_(bin_data) {
const auto fast_pair = bin_data->fast_index_[start_idx >> bin_data->fast_index_shift_];
i_delta_ = fast_pair.first;
cur_pos_ = fast_pair.second;
inline VAL_T SparseBinIterator<VAL_T>::InnerGet(data_size_t idx) {
while (cur_pos_ < idx && i_delta_ < bin_data_->num_vals_) {
bin_data_->NextNonzero(&i_delta_, &cur_pos_);
}
uint32_t Get(data_size_t idx) override {
while (cur_pos_ < idx && i_delta_ < bin_data_->num_vals_) {
++i_delta_;
cur_pos_ += bin_data_->delta_[i_delta_];
}
if (idx == cur_pos_ && i_delta_ >= 0
&& i_delta_ < bin_data_->vals_.size()) {
return bin_data_->vals_[i_delta_];
} else { return 0; }
if (cur_pos_ == idx && i_delta_ < bin_data_->num_vals_) {
return bin_data_->vals_[i_delta_];
} else {
return 0;
}
}
private:
const SparseBin<VAL_T>* bin_data_;
data_size_t cur_pos_ = 0;
data_size_t i_delta_ = 0;
};
/*! \brief Re-seat the iterator near start_idx in O(1) using the bin's
* precomputed fast index (a coarse map from row ranges to array positions),
* instead of scanning the compressed arrays from the beginning. */
template <typename VAL_T>
inline void SparseBinIterator<VAL_T>::Reset(data_size_t start_idx) {
const auto fast_pair = bin_data_->fast_index_[start_idx >> bin_data_->fast_index_shift_];
i_delta_ = fast_pair.first;
cur_pos_ = fast_pair.second;
}
/*! \brief Create an iterator positioned near start_idx; caller owns the returned pointer. */
template <typename VAL_T>
BinIterator* SparseBin<VAL_T>::GetIterator(data_size_t start_idx) const {
return new SparseBinIterator<VAL_T>(this, start_idx);
}
} // namespace LightGBM
#endif // LightGBM_IO_SPARSE_BIN_HPP_
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment