// sparse_bin.hpp: sparse (delta-encoded) feature bin storage.
#ifndef LIGHTGBM_IO_SPARSE_BIN_HPP_
#define LIGHTGBM_IO_SPARSE_BIN_HPP_

#include <LightGBM/bin.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/openmp_wrapper.h>

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>
#include <utility>
#include <vector>

namespace LightGBM {

template <typename VAL_T> class SparseBin;

/*!
* \brief Number of buckets in the fast random-access index built over the
*        delta-encoded stream (see SparseBin::GetFastIndex). The actual
*        bucket width is rounded up to the next power of two.
*/
const size_t kNumFastIndex = 64;
template <typename VAL_T>
class SparseBinIterator: public BinIterator {
public:
Guolin Ke's avatar
Guolin Ke committed
24
25
26
27
  SparseBinIterator(const SparseBin<VAL_T>* bin_data,
    uint32_t min_bin, uint32_t max_bin, uint32_t default_bin)
    : bin_data_(bin_data), min_bin_(static_cast<VAL_T>(min_bin)),
    max_bin_(static_cast<VAL_T>(max_bin)),
zhangyafeikimi's avatar
zhangyafeikimi committed
28
    default_bin_(static_cast<VAL_T>(default_bin)) {
Guolin Ke's avatar
Guolin Ke committed
29
30
31
32
33
34
35
    if (default_bin_ == 0) {
      bias_ = 1;
    } else {
      bias_ = 0;
    }
    Reset(0);
  }
36
37
38
39
40
  SparseBinIterator(const SparseBin<VAL_T>* bin_data, data_size_t start_idx)
    : bin_data_(bin_data) {
    Reset(start_idx);
  }

41
42
  inline uint32_t RawGet(data_size_t idx) override;
  inline VAL_T InnerRawGet(data_size_t idx);
43

Guolin Ke's avatar
Guolin Ke committed
44
  inline uint32_t Get( data_size_t idx) override {
Guolin Ke's avatar
Guolin Ke committed
45
    VAL_T ret = InnerRawGet(idx);
Guolin Ke's avatar
Guolin Ke committed
46
47
48
49
50
    if (ret >= min_bin_ && ret <= max_bin_) {
      return ret - min_bin_ + bias_;
    } else {
      return default_bin_;
    }
51
52
  }

Guolin Ke's avatar
Guolin Ke committed
53
  inline void Reset(data_size_t idx) override;
54
55
56
57
private:
  const SparseBin<VAL_T>* bin_data_;
  data_size_t cur_pos_;
  data_size_t i_delta_;
Guolin Ke's avatar
Guolin Ke committed
58
59
60
61
  VAL_T min_bin_;
  VAL_T max_bin_;
  VAL_T default_bin_;
  uint8_t bias_;
62
63
64
65
};

/*! \brief Forward declaration; OrderedSparseBin performs histogram construction for SparseBin */
template <typename VAL_T>
class OrderedSparseBin;

template <typename VAL_T>
68
class SparseBin: public Bin {
Guolin Ke's avatar
Guolin Ke committed
69
70
public:
  friend class SparseBinIterator<VAL_T>;
71
  friend class OrderedSparseBin<VAL_T>;
Guolin Ke's avatar
Guolin Ke committed
72

Guolin Ke's avatar
Guolin Ke committed
73
  SparseBin(data_size_t num_data)
Guolin Ke's avatar
Guolin Ke committed
74
    : num_data_(num_data) {
Guolin Ke's avatar
Guolin Ke committed
75
    int num_threads = 1;
76
77
#pragma omp parallel
#pragma omp master
Guolin Ke's avatar
Guolin Ke committed
78
    {
Guolin Ke's avatar
Guolin Ke committed
79
      num_threads = omp_get_num_threads();
Guolin Ke's avatar
Guolin Ke committed
80
    }
Guolin Ke's avatar
Guolin Ke committed
81
    push_buffers_.resize(num_threads);
Guolin Ke's avatar
Guolin Ke committed
82
83
84
  }

  ~SparseBin() {
Guolin Ke's avatar
Guolin Ke committed
85
86
87
88
89

  }

  void ReSize(data_size_t num_data) override {
    num_data_ = num_data;
Guolin Ke's avatar
Guolin Ke committed
90
91
92
  }

  void Push(int tid, data_size_t idx, uint32_t value) override {
93
    auto cur_bin = static_cast<VAL_T>(value);
Guolin Ke's avatar
Guolin Ke committed
94
    if (cur_bin != 0) {
95
96
      push_buffers_[tid].emplace_back(idx, cur_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
97
98
  }

Guolin Ke's avatar
Guolin Ke committed
99
  BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const override;
Guolin Ke's avatar
Guolin Ke committed
100

101
102
  void ConstructHistogram(const data_size_t*, data_size_t, const score_t*,
    const score_t*, HistogramBinEntry*) const override {
Guolin Ke's avatar
Guolin Ke committed
103
    // Will use OrderedSparseBin->ConstructHistogram() instead
Guolin Ke's avatar
Guolin Ke committed
104
    Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
Guolin Ke's avatar
Guolin Ke committed
105
106
  }

107
108
109
110
111
112
  void ConstructHistogram(data_size_t, const score_t*,
                          const score_t*, HistogramBinEntry*) const override {
    // Will use OrderedSparseBin->ConstructHistogram() instead
    Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
  }

113
  void ConstructHistogram(const data_size_t*, data_size_t, const score_t*,
114
115
                          HistogramBinEntry*) const override {
    // Will use OrderedSparseBin->ConstructHistogram() instead
116
117
118
119
120
121
    Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
  }

  void ConstructHistogram(data_size_t, const score_t*,
                          HistogramBinEntry*) const override {
    // Will use OrderedSparseBin->ConstructHistogram() instead
122
123
124
    Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
  }

125
  inline bool NextNonzero(data_size_t* i_delta,
Guolin Ke's avatar
Guolin Ke committed
126
                          data_size_t* cur_pos) const {
127
    ++(*i_delta);
128
129
    data_size_t shift = 0;
    data_size_t delta = deltas_[*i_delta];
Guolin Ke's avatar
Guolin Ke committed
130
    while (*i_delta < num_vals_ && vals_[*i_delta] == 0) {
131
      ++(*i_delta);
132
      shift += 8;
Guolin Ke's avatar
Guolin Ke committed
133
      delta |= static_cast<data_size_t>(deltas_[*i_delta]) << shift;
134
    }
135
136
    *cur_pos += delta;
    if (*i_delta < num_vals_) {
137
138
      return true;
    } else {
139
      *cur_pos = num_data_;
140
141
142
143
      return false;
    }
  }

Guolin Ke's avatar
Guolin Ke committed
144
  virtual data_size_t Split(
Guolin Ke's avatar
Guolin Ke committed
145
    uint32_t min_bin, uint32_t max_bin, uint32_t default_bin, uint32_t default_bin_for_zero,
Guolin Ke's avatar
Guolin Ke committed
146
    uint32_t threshold, data_size_t* data_indices, data_size_t num_data,
147
    data_size_t* lte_indices, data_size_t* gt_indices, BinType bin_type) const override {
148
149
    // not need to split
    if (num_data <= 0) { return 0; }
Guolin Ke's avatar
Guolin Ke committed
150
151
152
    VAL_T th = static_cast<VAL_T>(threshold + min_bin);
    VAL_T minb = static_cast<VAL_T>(min_bin);
    VAL_T maxb = static_cast<VAL_T>(max_bin);
Guolin Ke's avatar
Guolin Ke committed
153
    VAL_T t_default_bin = static_cast<VAL_T>(min_bin + default_bin);
Guolin Ke's avatar
Guolin Ke committed
154
155
    if (default_bin == 0) {
      th -= 1;
Guolin Ke's avatar
Guolin Ke committed
156
      t_default_bin -= 1;
Guolin Ke's avatar
Guolin Ke committed
157
    }
158
    SparseBinIterator<VAL_T> iterator(this, data_indices[0]);
Guolin Ke's avatar
Guolin Ke committed
159
160
    data_size_t lte_count = 0;
    data_size_t gt_count = 0;
Guolin Ke's avatar
Guolin Ke committed
161
162
    data_size_t* default_indices = gt_indices;
    data_size_t* default_count = &gt_count;
163
    if (bin_type == BinType::NumericalBin) {
Guolin Ke's avatar
Guolin Ke committed
164
      if (default_bin_for_zero <= threshold) {
165
166
167
168
169
        default_indices = lte_indices;
        default_count = &lte_count;
      }
      for (data_size_t i = 0; i < num_data; ++i) {
        const data_size_t idx = data_indices[i];
170
        VAL_T bin = iterator.InnerRawGet(idx);
Guolin Ke's avatar
Guolin Ke committed
171
        if (bin < minb || bin > maxb || t_default_bin == bin) {
172
173
174
175
176
177
178
179
          default_indices[(*default_count)++] = idx;
        } else if (bin > th) {
          gt_indices[gt_count++] = idx;
        } else {
          lte_indices[lte_count++] = idx;
        }
      }
    } else {
Guolin Ke's avatar
Guolin Ke committed
180
      if (default_bin_for_zero == threshold) {
181
182
183
184
185
        default_indices = lte_indices;
        default_count = &lte_count;
      }
      for (data_size_t i = 0; i < num_data; ++i) {
        const data_size_t idx = data_indices[i];
186
        VAL_T bin = iterator.InnerRawGet(idx);
Guolin Ke's avatar
Guolin Ke committed
187
        if (bin < minb || bin > maxb || t_default_bin == bin) {
188
189
190
191
192
193
          default_indices[(*default_count)++] = idx;
        } else if (bin != th) {
          gt_indices[gt_count++] = idx;
        } else {
          lte_indices[lte_count++] = idx;
        }
Guolin Ke's avatar
Guolin Ke committed
194
195
196
197
198
199
200
      }
    }
    return lte_count;
  }

  data_size_t num_data() const override { return num_data_; }

201
  OrderedBin* CreateOrderedBin() const override;
Guolin Ke's avatar
Guolin Ke committed
202
203
204

  void FinishLoad() override {
    // get total non zero size
205
    size_t pair_cnt = 0;
206
    for (size_t i = 0; i < push_buffers_.size(); ++i) {
207
      pair_cnt += push_buffers_[i].size();
Guolin Ke's avatar
Guolin Ke committed
208
    }
Guolin Ke's avatar
Guolin Ke committed
209
    std::vector<std::pair<data_size_t, VAL_T>>& idx_val_pairs = push_buffers_[0];
210
    idx_val_pairs.reserve(pair_cnt);
Guolin Ke's avatar
Guolin Ke committed
211
212

    for (size_t i = 1; i < push_buffers_.size(); ++i) {
213
      idx_val_pairs.insert(idx_val_pairs.end(), push_buffers_[i].begin(), push_buffers_[i].end());
Guolin Ke's avatar
Guolin Ke committed
214
215
216
217
      push_buffers_[i].clear();
      push_buffers_[i].shrink_to_fit();
    }
    // sort by data index
218
    std::sort(idx_val_pairs.begin(), idx_val_pairs.end(),
Guolin Ke's avatar
Guolin Ke committed
219
220
221
      [](const std::pair<data_size_t, VAL_T>& a, const std::pair<data_size_t, VAL_T>& b) {
      return a.first < b.first;
    });
zhangyafeikimi's avatar
zhangyafeikimi committed
222
    // load delta array
223
    LoadFromPair(idx_val_pairs);
Guolin Ke's avatar
Guolin Ke committed
224
225
  }

226
  void LoadFromPair(const std::vector<std::pair<data_size_t, VAL_T>>& idx_val_pairs) {
227
    deltas_.clear();
Guolin Ke's avatar
Guolin Ke committed
228
229
230
    vals_.clear();
    // transform to delta array
    data_size_t last_idx = 0;
231
232
233
    for (size_t i = 0; i < idx_val_pairs.size(); ++i) {
      const data_size_t cur_idx = idx_val_pairs[i].first;
      const VAL_T bin = idx_val_pairs[i].second;
Guolin Ke's avatar
Guolin Ke committed
234
      data_size_t cur_delta = cur_idx - last_idx;
Guolin Ke's avatar
Guolin Ke committed
235
      if (i > 0 && cur_delta == 0) { continue; }
236
237
      while (cur_delta >= 256) {
        deltas_.push_back(cur_delta & 0xff);
Guolin Ke's avatar
Guolin Ke committed
238
        vals_.push_back(0);
239
        cur_delta >>= 8;
Guolin Ke's avatar
Guolin Ke committed
240
      }
241
      deltas_.push_back(static_cast<uint8_t>(cur_delta));
Guolin Ke's avatar
Guolin Ke committed
242
243
244
245
      vals_.push_back(bin);
      last_idx = cur_idx;
    }
    // avoid out of range
246
    deltas_.push_back(0);
Guolin Ke's avatar
Guolin Ke committed
247
248
249
    num_vals_ = static_cast<data_size_t>(vals_.size());

    // reduce memory cost
250
    deltas_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
    vals_.shrink_to_fit();

    // generate fast index
    GetFastIndex();
  }

  void GetFastIndex() {
    fast_index_.clear();
    // get shift cnt
    data_size_t mod_size = (num_data_ + kNumFastIndex - 1) / kNumFastIndex;
    data_size_t pow2_mod_size = 1;
    fast_index_shift_ = 0;
    while (pow2_mod_size < mod_size) {
      pow2_mod_size <<= 1;
      ++fast_index_shift_;
    }
    // build fast index
268
    data_size_t i_delta = -1;
Guolin Ke's avatar
Guolin Ke committed
269
    data_size_t cur_pos = 0;
270
271
    data_size_t next_threshold = 0;
    while (NextNonzero(&i_delta, &cur_pos)) {
Guolin Ke's avatar
Guolin Ke committed
272
      while (next_threshold <= cur_pos) {
273
274
        fast_index_.emplace_back(i_delta, cur_pos);
        next_threshold += pow2_mod_size;
Guolin Ke's avatar
Guolin Ke committed
275
276
277
      }
    }
    // avoid out of range
278
    while (next_threshold < num_data_) {
Guolin Ke's avatar
Guolin Ke committed
279
      fast_index_.emplace_back(num_vals_ - 1, cur_pos);
280
      next_threshold += pow2_mod_size;
Guolin Ke's avatar
Guolin Ke committed
281
282
283
284
285
286
    }
    fast_index_.shrink_to_fit();
  }

  void SaveBinaryToFile(FILE* file) const override {
    fwrite(&num_vals_, sizeof(num_vals_), 1, file);
287
    fwrite(deltas_.data(), sizeof(uint8_t), num_vals_ + 1, file);
Guolin Ke's avatar
Guolin Ke committed
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
    fwrite(vals_.data(), sizeof(VAL_T), num_vals_, file);
  }

  size_t SizesInByte() const override {
    return sizeof(num_vals_) + sizeof(uint8_t) * (num_vals_ + 1)
      + sizeof(VAL_T) * num_vals_;
  }

  void LoadFromMemory(const void* memory, const std::vector<data_size_t>& local_used_indices) override {
    const char* mem_ptr = reinterpret_cast<const char*>(memory);
    data_size_t tmp_num_vals = *(reinterpret_cast<const data_size_t*>(mem_ptr));
    mem_ptr += sizeof(tmp_num_vals);
    const uint8_t* tmp_delta = reinterpret_cast<const uint8_t*>(mem_ptr);
    mem_ptr += sizeof(uint8_t) * (tmp_num_vals + 1);
    const VAL_T* tmp_vals = reinterpret_cast<const VAL_T*>(mem_ptr);

304
305
306
307
308
309
310
311
312
313
314
    deltas_.clear();
    vals_.clear();
    num_vals_ = tmp_num_vals;
    for (data_size_t i = 0; i < num_vals_; ++i) {
      deltas_.push_back(tmp_delta[i]);
      vals_.push_back(tmp_vals[i]);
    }
    deltas_.push_back(0);
    // reduce memory cost
    deltas_.shrink_to_fit();
    vals_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
315

Guolin Ke's avatar
Guolin Ke committed
316
    if (local_used_indices.empty()) {
Guolin Ke's avatar
Guolin Ke committed
317
318
319
320
      // generate fast index
      GetFastIndex();
    } else {
      std::vector<std::pair<data_size_t, VAL_T>> tmp_pair;
321
322
      data_size_t cur_pos = 0;
      data_size_t j = -1;
Guolin Ke's avatar
Guolin Ke committed
323
324
      for (data_size_t i = 0; i < static_cast<data_size_t>(local_used_indices.size()); ++i) {
        const data_size_t idx = local_used_indices[i];
325
326
        while (cur_pos < idx && j < num_vals_) {
          NextNonzero(&j, &cur_pos);
Guolin Ke's avatar
Guolin Ke committed
327
        }
328
        if (cur_pos == idx && j < num_vals_) {
Guolin Ke's avatar
Guolin Ke committed
329
          // new row index is i
330
          tmp_pair.emplace_back(i, vals_[j]);
Guolin Ke's avatar
Guolin Ke committed
331
332
333
334
        }
      }
      LoadFromPair(tmp_pair);
    }
335
  }
Guolin Ke's avatar
Guolin Ke committed
336

337
  void CopySubset(const Bin* full_bin, const data_size_t* used_indices, data_size_t num_used_indices) override {
Guolin Ke's avatar
Guolin Ke committed
338
    auto other_bin = dynamic_cast<const SparseBin<VAL_T>*>(full_bin);
Guolin Ke's avatar
Guolin Ke committed
339
340
    deltas_.clear();
    vals_.clear();
Guolin Ke's avatar
Guolin Ke committed
341
342
343
344
345
    data_size_t start = 0;
    if (num_used_indices > 0) {
      start = used_indices[0];
    }
    SparseBinIterator<VAL_T> iterator(other_bin, start);
346
347
    // transform to delta array
    data_size_t last_idx = 0;
348
    for (data_size_t i = 0; i < num_used_indices; ++i) {
349
      VAL_T bin = iterator.InnerRawGet(used_indices[i]);
Guolin Ke's avatar
Guolin Ke committed
350
      if (bin > 0) {
351
352
353
354
355
356
357
358
359
        data_size_t cur_delta = i - last_idx;
        while (cur_delta >= 256) {
          deltas_.push_back(cur_delta & 0xff);
          vals_.push_back(0);
          cur_delta >>= 8;
        }
        deltas_.push_back(static_cast<uint8_t>(cur_delta));
        vals_.push_back(bin);
        last_idx = i;
360
361
      }
    }
362
363
364
365
366
367
368
369
370
371
    // avoid out of range
    deltas_.push_back(0);
    num_vals_ = static_cast<data_size_t>(vals_.size());

    // reduce memory cost
    deltas_.shrink_to_fit();
    vals_.shrink_to_fit();

    // generate fast index
    GetFastIndex();
Guolin Ke's avatar
Guolin Ke committed
372
373
  }

Guolin Ke's avatar
Guolin Ke committed
374
protected:
Guolin Ke's avatar
Guolin Ke committed
375
  data_size_t num_data_;
376
  std::vector<uint8_t> deltas_;
Guolin Ke's avatar
Guolin Ke committed
377
378
379
380
381
382
383
384
  std::vector<VAL_T> vals_;
  data_size_t num_vals_;
  std::vector<std::vector<std::pair<data_size_t, VAL_T>>> push_buffers_;
  std::vector<std::pair<data_size_t, data_size_t>> fast_index_;
  data_size_t fast_index_shift_;
};

template <typename VAL_T>
385
386
387
388
389
390
inline uint32_t SparseBinIterator<VAL_T>::RawGet(data_size_t idx) {
  return InnerRawGet(idx);
}

/*!
* \brief Raw bin value at idx; 0 when no value is stored there.
*        Advances forward only — callers must query non-decreasing indices
*        between Resets.
*/
template <typename VAL_T>
inline VAL_T SparseBinIterator<VAL_T>::InnerRawGet(data_size_t idx) {
  while (cur_pos_ < idx) {
    bin_data_->NextNonzero(&i_delta_, &cur_pos_);
  }
  if (cur_pos_ == idx) {
    return bin_data_->vals_[i_delta_];
  } else {
    // iterator skipped past idx: no entry stored at that position
    return 0;
  }
}
Guolin Ke's avatar
Guolin Ke committed
400

401
402
template <typename VAL_T>
inline void SparseBinIterator<VAL_T>::Reset(data_size_t start_idx) {
Guolin Ke's avatar
Guolin Ke committed
403
404
405
406
407
408
409
410
411
  auto idx = start_idx >> bin_data_->fast_index_shift_;
  if (static_cast<size_t>(idx) < bin_data_->fast_index_.size()) {
    const auto fast_pair = bin_data_->fast_index_[start_idx >> bin_data_->fast_index_shift_];
    i_delta_ = fast_pair.first;
    cur_pos_ = fast_pair.second;
  } else {
    i_delta_ = -1;
    cur_pos_ = 0;
  }
412
}
Guolin Ke's avatar
Guolin Ke committed
413
414

template <typename VAL_T>
Guolin Ke's avatar
Guolin Ke committed
415
416
BinIterator* SparseBin<VAL_T>::GetIterator(uint32_t min_bin, uint32_t max_bin, uint32_t default_bin) const {
  return new SparseBinIterator<VAL_T>(this, min_bin, max_bin, default_bin);
Guolin Ke's avatar
Guolin Ke committed
417
418
419
}

}  // namespace LightGBM
#endif   // LIGHTGBM_IO_SPARSE_BIN_HPP_