sparse_bin.hpp 10.2 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
#ifndef LIGHTGBM_IO_SPARSE_BIN_HPP_
#define LIGHTGBM_IO_SPARSE_BIN_HPP_

#include <LightGBM/utils/log.h>

#include <LightGBM/bin.h>

8
#include <LightGBM/utils/openmp_wrapper.h>
Guolin Ke's avatar
Guolin Ke committed
9
10
11

#include <cstring>
#include <cstdint>
12
#include <limits>
Guolin Ke's avatar
Guolin Ke committed
13
14
15
16
#include <vector>

namespace LightGBM {

17
18
19
template <typename VAL_T>
class SparseBin;

Guolin Ke's avatar
Guolin Ke committed
20
const size_t kNumFastIndex = 64;
21
const uint8_t kMaxDelta = 255;
Guolin Ke's avatar
Guolin Ke committed
22

23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
template <typename VAL_T>
class SparseBinIterator: public BinIterator {
public:
  SparseBinIterator(const SparseBin<VAL_T>* bin_data, data_size_t start_idx)
    : bin_data_(bin_data) {
    Reset(start_idx);
  }

  inline VAL_T InnerGet(data_size_t idx);

  inline uint32_t Get(data_size_t idx) override {
    return InnerGet(idx);
  }

  inline void Reset(data_size_t idx);
private:
  const SparseBin<VAL_T>* bin_data_;
  data_size_t cur_pos_;
  data_size_t i_delta_;
};

template <typename VAL_T>
class OrderedSparseBin;
Guolin Ke's avatar
Guolin Ke committed
46
47

template <typename VAL_T>
48
class SparseBin: public Bin {
Guolin Ke's avatar
Guolin Ke committed
49
50
public:
  friend class SparseBinIterator<VAL_T>;
51
  friend class OrderedSparseBin<VAL_T>;
Guolin Ke's avatar
Guolin Ke committed
52

53
  SparseBin(data_size_t num_data, uint32_t default_bin)
Guolin Ke's avatar
Guolin Ke committed
54
    : num_data_(num_data) {
Guolin Ke's avatar
Guolin Ke committed
55
    default_bin_ = static_cast<VAL_T>(default_bin);
56
57
#pragma omp parallel
#pragma omp master
Guolin Ke's avatar
Guolin Ke committed
58
59
60
61
62
63
64
65
66
    {
      num_threads_ = omp_get_num_threads();
    }
    for (int i = 0; i < num_threads_; ++i) {
      push_buffers_.emplace_back();
    }
  }

  ~SparseBin() {
Guolin Ke's avatar
Guolin Ke committed
67
68
69
70
71

  }

  void ReSize(data_size_t num_data) override {
    num_data_ = num_data;
Guolin Ke's avatar
Guolin Ke committed
72
73
74
  }

  void Push(int tid, data_size_t idx, uint32_t value) override {
75
76
77
78
    auto cur_bin = static_cast<VAL_T>(value);
    if (cur_bin != default_bin_) {
      push_buffers_[tid].emplace_back(idx, cur_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
79
80
81
82
  }

  BinIterator* GetIterator(data_size_t start_idx) const override;

83
84
  void ConstructHistogram(const data_size_t*, data_size_t, const score_t*,
    const score_t*, HistogramBinEntry*) const override {
Guolin Ke's avatar
Guolin Ke committed
85
    // Will use OrderedSparseBin->ConstructHistogram() instead
Guolin Ke's avatar
Guolin Ke committed
86
    Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
Guolin Ke's avatar
Guolin Ke committed
87
88
  }

89
90
  inline bool NextNonzero(data_size_t* i_delta,
    data_size_t* cur_pos) const {
91
    const VAL_T non_data_flag = std::numeric_limits<VAL_T>::max();
92
93
94
    ++(*i_delta);
    *cur_pos += deltas_[*i_delta];
    data_size_t factor = 1;
95
    while (*i_delta < num_vals_ && vals_[*i_delta] == non_data_flag) {
96
97
98
99
100
101
102
103
104
105
106
      ++(*i_delta);
      factor *= kMaxDelta;
      *cur_pos += deltas_[*i_delta] * factor;
    }
    if (*i_delta >= 0 && *i_delta < num_vals_) {
      return true;
    } else {
      return false;
    }
  }

Guolin Ke's avatar
Guolin Ke committed
107
  virtual data_size_t Split(unsigned int threshold, data_size_t* data_indices, data_size_t num_data,
108
    data_size_t* lte_indices, data_size_t* gt_indices) const override {
109
110
    // not need to split
    if (num_data <= 0) { return 0; }
111
    SparseBinIterator<VAL_T> iterator(this, data_indices[0]);
Guolin Ke's avatar
Guolin Ke committed
112
113
    data_size_t lte_count = 0;
    data_size_t gt_count = 0;
114
    for (data_size_t i = 0; i < num_data; ++i) {
Guolin Ke's avatar
Guolin Ke committed
115
      const data_size_t idx = data_indices[i];
116
      VAL_T bin = iterator.InnerGet(idx);
Guolin Ke's avatar
Guolin Ke committed
117
118
119
120
121
122
123
124
125
126
127
      if (bin > threshold) {
        gt_indices[gt_count++] = idx;
      } else {
        lte_indices[lte_count++] = idx;
      }
    }
    return lte_count;
  }

  data_size_t num_data() const override { return num_data_; }

128
  OrderedBin* CreateOrderedBin() const override;
Guolin Ke's avatar
Guolin Ke committed
129
130
131

  void FinishLoad() override {
    // get total non zero size
132
    size_t pair_cnt = 0;
133
    for (size_t i = 0; i < push_buffers_.size(); ++i) {
134
      pair_cnt += push_buffers_[i].size();
Guolin Ke's avatar
Guolin Ke committed
135
    }
136
    std::vector<std::pair<data_size_t, VAL_T>> idx_val_pairs;
Guolin Ke's avatar
Guolin Ke committed
137
    // merge
138
    idx_val_pairs.reserve(pair_cnt);
139
    for (size_t i = 0; i < push_buffers_.size(); ++i) {
140
      idx_val_pairs.insert(idx_val_pairs.end(), push_buffers_[i].begin(), push_buffers_[i].end());
Guolin Ke's avatar
Guolin Ke committed
141
142
143
144
145
146
      push_buffers_[i].clear();
      push_buffers_[i].shrink_to_fit();
    }
    push_buffers_.clear();
    push_buffers_.shrink_to_fit();
    // sort by data index
147
    std::sort(idx_val_pairs.begin(), idx_val_pairs.end(),
Guolin Ke's avatar
Guolin Ke committed
148
149
150
151
      [](const std::pair<data_size_t, VAL_T>& a, const std::pair<data_size_t, VAL_T>& b) {
      return a.first < b.first;
    });
    // load detla array
152
    LoadFromPair(idx_val_pairs);
Guolin Ke's avatar
Guolin Ke committed
153
154
  }

155
  void LoadFromPair(const std::vector<std::pair<data_size_t, VAL_T>>& idx_val_pairs) {
156
    deltas_.clear();
Guolin Ke's avatar
Guolin Ke committed
157
    vals_.clear();
158
    const VAL_T non_data_flag = std::numeric_limits<VAL_T>::max();
Guolin Ke's avatar
Guolin Ke committed
159
160
    // transform to delta array
    data_size_t last_idx = 0;
161
162
163
    for (size_t i = 0; i < idx_val_pairs.size(); ++i) {
      const data_size_t cur_idx = idx_val_pairs[i].first;
      const VAL_T bin = idx_val_pairs[i].second;
Guolin Ke's avatar
Guolin Ke committed
164
165
      data_size_t cur_delta = cur_idx - last_idx;
      while (cur_delta > kMaxDelta) {
166
        deltas_.push_back(cur_delta %  kMaxDelta);
167
        vals_.push_back(non_data_flag);
168
        cur_delta /= kMaxDelta;
Guolin Ke's avatar
Guolin Ke committed
169
      }
170
      deltas_.push_back(static_cast<uint8_t>(cur_delta));
Guolin Ke's avatar
Guolin Ke committed
171
172
173
174
      vals_.push_back(bin);
      last_idx = cur_idx;
    }
    // avoid out of range
175
    deltas_.push_back(0);
Guolin Ke's avatar
Guolin Ke committed
176
177
178
    num_vals_ = static_cast<data_size_t>(vals_.size());

    // reduce memory cost
179
    deltas_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
180
181
182
183
184
185
186
    vals_.shrink_to_fit();

    // generate fast index
    GetFastIndex();
  }

  void GetFastIndex() {
Guolin Ke's avatar
Guolin Ke committed
187

Guolin Ke's avatar
Guolin Ke committed
188
189
190
191
192
193
194
195
196
197
    fast_index_.clear();
    // get shift cnt
    data_size_t mod_size = (num_data_ + kNumFastIndex - 1) / kNumFastIndex;
    data_size_t pow2_mod_size = 1;
    fast_index_shift_ = 0;
    while (pow2_mod_size < mod_size) {
      pow2_mod_size <<= 1;
      ++fast_index_shift_;
    }
    // build fast index
198
    data_size_t i_delta = -1;
Guolin Ke's avatar
Guolin Ke committed
199
    data_size_t cur_pos = 0;
200
201
    data_size_t next_threshold = 0;
    while (NextNonzero(&i_delta, &cur_pos)) {
Guolin Ke's avatar
Guolin Ke committed
202
      while (next_threshold <= cur_pos) {
203
204
        fast_index_.emplace_back(i_delta, cur_pos);
        next_threshold += pow2_mod_size;
Guolin Ke's avatar
Guolin Ke committed
205
206
207
      }
    }
    // avoid out of range
208
    while (next_threshold < num_data_) {
Guolin Ke's avatar
Guolin Ke committed
209
      fast_index_.emplace_back(num_vals_ - 1, cur_pos);
210
      next_threshold += pow2_mod_size;
Guolin Ke's avatar
Guolin Ke committed
211
212
213
214
215
216
    }
    fast_index_.shrink_to_fit();
  }

  void SaveBinaryToFile(FILE* file) const override {
    fwrite(&num_vals_, sizeof(num_vals_), 1, file);
217
    fwrite(deltas_.data(), sizeof(uint8_t), num_vals_ + 1, file);
Guolin Ke's avatar
Guolin Ke committed
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
    fwrite(vals_.data(), sizeof(VAL_T), num_vals_, file);
  }

  size_t SizesInByte() const override {
    return sizeof(num_vals_) + sizeof(uint8_t) * (num_vals_ + 1)
      + sizeof(VAL_T) * num_vals_;
  }

  void LoadFromMemory(const void* memory, const std::vector<data_size_t>& local_used_indices) override {
    const char* mem_ptr = reinterpret_cast<const char*>(memory);
    data_size_t tmp_num_vals = *(reinterpret_cast<const data_size_t*>(mem_ptr));
    mem_ptr += sizeof(tmp_num_vals);
    const uint8_t* tmp_delta = reinterpret_cast<const uint8_t*>(mem_ptr);
    mem_ptr += sizeof(uint8_t) * (tmp_num_vals + 1);
    const VAL_T* tmp_vals = reinterpret_cast<const VAL_T*>(mem_ptr);

234
235
236
237
238
239
240
241
242
243
244
    deltas_.clear();
    vals_.clear();
    num_vals_ = tmp_num_vals;
    for (data_size_t i = 0; i < num_vals_; ++i) {
      deltas_.push_back(tmp_delta[i]);
      vals_.push_back(tmp_vals[i]);
    }
    deltas_.push_back(0);
    // reduce memory cost
    deltas_.shrink_to_fit();
    vals_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
245

Guolin Ke's avatar
Guolin Ke committed
246
    if (local_used_indices.empty()) {
Guolin Ke's avatar
Guolin Ke committed
247
248
249
250
      // generate fast index
      GetFastIndex();
    } else {
      std::vector<std::pair<data_size_t, VAL_T>> tmp_pair;
251
252
      data_size_t cur_pos = 0;
      data_size_t j = -1;
Guolin Ke's avatar
Guolin Ke committed
253
254
      for (data_size_t i = 0; i < static_cast<data_size_t>(local_used_indices.size()); ++i) {
        const data_size_t idx = local_used_indices[i];
255
256
        while (cur_pos < idx && j < num_vals_) {
          NextNonzero(&j, &cur_pos);
Guolin Ke's avatar
Guolin Ke committed
257
        }
258
        if (cur_pos == idx && j < num_vals_) {
Guolin Ke's avatar
Guolin Ke committed
259
          // new row index is i
260
          tmp_pair.emplace_back(i, vals_[j]);
Guolin Ke's avatar
Guolin Ke committed
261
262
263
264
        }
      }
      LoadFromPair(tmp_pair);
    }
265
  }
Guolin Ke's avatar
Guolin Ke committed
266

267
268
269
270
271
272
  void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
    auto other_bin = reinterpret_cast<const SparseBin<VAL_T>*>(full_bin);
    SparseBinIterator<VAL_T> iterator(other_bin, used_indices[0]);
    std::vector<std::pair<data_size_t, VAL_T>> tmp_pair;
    for (data_size_t i = 0; i < num_used_indices; ++i) {
      VAL_T bin = iterator.InnerGet(used_indices[i]);
273
      if (bin != default_bin_) {
274
275
276
277
        tmp_pair.emplace_back(i, bin);
      }
    }
    LoadFromPair(tmp_pair);
Guolin Ke's avatar
Guolin Ke committed
278
279
  }

Guolin Ke's avatar
Guolin Ke committed
280
protected:
Guolin Ke's avatar
Guolin Ke committed
281
  data_size_t num_data_;
282
  std::vector<uint8_t> deltas_;
Guolin Ke's avatar
Guolin Ke committed
283
284
285
286
287
288
  std::vector<VAL_T> vals_;
  data_size_t num_vals_;
  int num_threads_;
  std::vector<std::vector<std::pair<data_size_t, VAL_T>>> push_buffers_;
  std::vector<std::pair<data_size_t, data_size_t>> fast_index_;
  data_size_t fast_index_shift_;
Guolin Ke's avatar
Guolin Ke committed
289
  VAL_T default_bin_;
Guolin Ke's avatar
Guolin Ke committed
290
291
292
};

template <typename VAL_T>
293
294
295
inline VAL_T SparseBinIterator<VAL_T>::InnerGet(data_size_t idx) {
  while (cur_pos_ < idx && i_delta_ < bin_data_->num_vals_) {
    bin_data_->NextNonzero(&i_delta_, &cur_pos_);
Guolin Ke's avatar
Guolin Ke committed
296
  }
297
  if (cur_pos_ == idx && i_delta_ < bin_data_->num_vals_ && i_delta_ >= 0) {
298
299
    return bin_data_->vals_[i_delta_];
  } else {
300
    return bin_data_->default_bin_;
Guolin Ke's avatar
Guolin Ke committed
301
  }
302
}
Guolin Ke's avatar
Guolin Ke committed
303

304
305
306
307
308
309
template <typename VAL_T>
inline void SparseBinIterator<VAL_T>::Reset(data_size_t start_idx) {
  const auto fast_pair = bin_data_->fast_index_[start_idx >> bin_data_->fast_index_shift_];
  i_delta_ = fast_pair.first;
  cur_pos_ = fast_pair.second;
}
Guolin Ke's avatar
Guolin Ke committed
310
311
312
313
314
315

template <typename VAL_T>
BinIterator* SparseBin<VAL_T>::GetIterator(data_size_t start_idx) const {
  return new SparseBinIterator<VAL_T>(this, start_idx);
}

316

Guolin Ke's avatar
Guolin Ke committed
317
318
319
template <typename VAL_T>
class SparseCategoricalBin: public SparseBin<VAL_T> {
public:
320
  SparseCategoricalBin(data_size_t num_data, uint32_t default_bin)
Guolin Ke's avatar
Guolin Ke committed
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
    : SparseBin<VAL_T>(num_data, default_bin) {
  }

  virtual data_size_t Split(unsigned int threshold, data_size_t* data_indices, data_size_t num_data,
    data_size_t* lte_indices, data_size_t* gt_indices) const override {
    // not need to split
    if (num_data <= 0) { return 0; }
    SparseBinIterator<VAL_T> iterator(this, data_indices[0]);
    data_size_t lte_count = 0;
    data_size_t gt_count = 0;
    for (data_size_t i = 0; i < num_data; ++i) {
      const data_size_t idx = data_indices[i];
      VAL_T bin = iterator.InnerGet(idx);
      if (bin != threshold) {
        gt_indices[gt_count++] = idx;
      } else {
        lte_indices[lte_count++] = idx;
      }
    }
    return lte_count;
  }
};

344

Guolin Ke's avatar
Guolin Ke committed
345
}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
346
#endif   // LightGBM_IO_SPARSE_BIN_HPP_