sparse_bin.hpp 9.7 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
#ifndef LIGHTGBM_IO_SPARSE_BIN_HPP_
#define LIGHTGBM_IO_SPARSE_BIN_HPP_

#include <LightGBM/utils/log.h>

#include <LightGBM/bin.h>

8
#include <LightGBM/utils/openmp_wrapper.h>
Guolin Ke's avatar
Guolin Ke committed
9
10
11
12
13
14
15
16

#include <cstring>
#include <cstdint>

#include <vector>

namespace LightGBM {

17
18
19
template <typename VAL_T>
class SparseBin;

Guolin Ke's avatar
Guolin Ke committed
20
const size_t kNumFastIndex = 64;
21
const uint8_t kMaxDelta = 255;
Guolin Ke's avatar
Guolin Ke committed
22

23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
template <typename VAL_T>
class SparseBinIterator: public BinIterator {
public:
  SparseBinIterator(const SparseBin<VAL_T>* bin_data, data_size_t start_idx)
    : bin_data_(bin_data) {
    Reset(start_idx);
  }

  inline VAL_T InnerGet(data_size_t idx);

  inline uint32_t Get(data_size_t idx) override {
    return InnerGet(idx);
  }

  inline void Reset(data_size_t idx);
private:
  const SparseBin<VAL_T>* bin_data_;
  data_size_t cur_pos_;
  data_size_t i_delta_;
};

template <typename VAL_T>
class OrderedSparseBin;
Guolin Ke's avatar
Guolin Ke committed
46
47

template <typename VAL_T>
48
class SparseBin: public Bin {
Guolin Ke's avatar
Guolin Ke committed
49
50
public:
  friend class SparseBinIterator<VAL_T>;
51
  friend class OrderedSparseBin<VAL_T>;
Guolin Ke's avatar
Guolin Ke committed
52

53
  SparseBin(data_size_t num_data, int default_bin)
Guolin Ke's avatar
Guolin Ke committed
54
    : num_data_(num_data) {
Guolin Ke's avatar
Guolin Ke committed
55
56
    default_bin_ = static_cast<VAL_T>(default_bin);
    if (default_bin_ != 0) {
57
      Log::Info("Warning: sparse feature with negative values, treating negative values as zero");
Guolin Ke's avatar
Guolin Ke committed
58
    }
59
60
#pragma omp parallel
#pragma omp master
Guolin Ke's avatar
Guolin Ke committed
61
62
63
64
65
66
67
68
69
    {
      num_threads_ = omp_get_num_threads();
    }
    for (int i = 0; i < num_threads_; ++i) {
      push_buffers_.emplace_back();
    }
  }

  ~SparseBin() {
Guolin Ke's avatar
Guolin Ke committed
70
71
72
73
74

  }

  void ReSize(data_size_t num_data) override {
    num_data_ = num_data;
Guolin Ke's avatar
Guolin Ke committed
75
76
77
78
  }

  void Push(int tid, data_size_t idx, uint32_t value) override {
    // not store zero data
Guolin Ke's avatar
Guolin Ke committed
79
    if (value <= default_bin_) { return; }
Guolin Ke's avatar
Guolin Ke committed
80
81
82
83
84
    push_buffers_[tid].emplace_back(idx, static_cast<VAL_T>(value));
  }

  BinIterator* GetIterator(data_size_t start_idx) const override;

85
86
  void ConstructHistogram(const data_size_t*, data_size_t, const score_t*,
    const score_t*, HistogramBinEntry*) const override {
Guolin Ke's avatar
Guolin Ke committed
87
    // Will use OrderedSparseBin->ConstructHistogram() instead
Guolin Ke's avatar
Guolin Ke committed
88
    Log::Fatal("Using OrderedSparseBin->ConstructHistogram() instead");
Guolin Ke's avatar
Guolin Ke committed
89
90
  }

91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
  inline bool NextNonzero(data_size_t* i_delta,
    data_size_t* cur_pos) const {
    ++(*i_delta);
    *cur_pos += deltas_[*i_delta];
    data_size_t factor = 1;
    while (*i_delta < num_vals_ && vals_[*i_delta] == 0) {
      ++(*i_delta);
      factor *= kMaxDelta;
      *cur_pos += deltas_[*i_delta] * factor;
    }
    if (*i_delta >= 0 && *i_delta < num_vals_) {
      return true;
    } else {
      return false;
    }
  }

Guolin Ke's avatar
Guolin Ke committed
108
  virtual data_size_t Split(unsigned int threshold, data_size_t* data_indices, data_size_t num_data,
109
    data_size_t* lte_indices, data_size_t* gt_indices) const override {
110
111
    // not need to split
    if (num_data <= 0) { return 0; }
112
    SparseBinIterator<VAL_T> iterator(this, data_indices[0]);
Guolin Ke's avatar
Guolin Ke committed
113
114
    data_size_t lte_count = 0;
    data_size_t gt_count = 0;
115
    for (data_size_t i = 0; i < num_data; ++i) {
Guolin Ke's avatar
Guolin Ke committed
116
      const data_size_t idx = data_indices[i];
117
      VAL_T bin = iterator.InnerGet(idx);
Guolin Ke's avatar
Guolin Ke committed
118
119
120
121
122
123
124
125
126
127
128
      if (bin > threshold) {
        gt_indices[gt_count++] = idx;
      } else {
        lte_indices[lte_count++] = idx;
      }
    }
    return lte_count;
  }

  data_size_t num_data() const override { return num_data_; }

129
  OrderedBin* CreateOrderedBin() const override;
Guolin Ke's avatar
Guolin Ke committed
130
131
132
133

  void FinishLoad() override {
    // get total non zero size
    size_t non_zero_size = 0;
134
    for (size_t i = 0; i < push_buffers_.size(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
135
136
137
138
      non_zero_size += push_buffers_[i].size();
    }
    // merge
    non_zero_pair_.reserve(non_zero_size);
139
    for (size_t i = 0; i < push_buffers_.size(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
      non_zero_pair_.insert(non_zero_pair_.end(), push_buffers_[i].begin(), push_buffers_[i].end());
      push_buffers_[i].clear();
      push_buffers_[i].shrink_to_fit();
    }
    push_buffers_.clear();
    push_buffers_.shrink_to_fit();
    // sort by data index
    std::sort(non_zero_pair_.begin(), non_zero_pair_.end(),
      [](const std::pair<data_size_t, VAL_T>& a, const std::pair<data_size_t, VAL_T>& b) {
      return a.first < b.first;
    });
    // load detla array
    LoadFromPair(non_zero_pair_);
    // free memory
    non_zero_pair_.clear();
    non_zero_pair_.shrink_to_fit();
  }

  void LoadFromPair(const std::vector<std::pair<data_size_t, VAL_T>>& non_zero_pair) {
159
    deltas_.clear();
Guolin Ke's avatar
Guolin Ke committed
160
161
162
    vals_.clear();
    // transform to delta array
    data_size_t last_idx = 0;
163
    for (size_t i = 0; i < non_zero_pair.size(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
164
165
166
167
      const data_size_t cur_idx = non_zero_pair[i].first;
      const VAL_T bin = non_zero_pair[i].second;
      data_size_t cur_delta = cur_idx - last_idx;
      while (cur_delta > kMaxDelta) {
168
        deltas_.push_back(cur_delta %  kMaxDelta);
Guolin Ke's avatar
Guolin Ke committed
169
        vals_.push_back(0);
170
        cur_delta /= kMaxDelta;
Guolin Ke's avatar
Guolin Ke committed
171
      }
172
      deltas_.push_back(static_cast<uint8_t>(cur_delta));
Guolin Ke's avatar
Guolin Ke committed
173
174
175
176
      vals_.push_back(bin);
      last_idx = cur_idx;
    }
    // avoid out of range
177
    deltas_.push_back(0);
Guolin Ke's avatar
Guolin Ke committed
178
179
180
    num_vals_ = static_cast<data_size_t>(vals_.size());

    // reduce memory cost
181
    deltas_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
182
183
184
185
186
187
188
    vals_.shrink_to_fit();

    // generate fast index
    GetFastIndex();
  }

  void GetFastIndex() {
Guolin Ke's avatar
Guolin Ke committed
189

Guolin Ke's avatar
Guolin Ke committed
190
191
192
193
194
195
196
197
198
199
    fast_index_.clear();
    // get shift cnt
    data_size_t mod_size = (num_data_ + kNumFastIndex - 1) / kNumFastIndex;
    data_size_t pow2_mod_size = 1;
    fast_index_shift_ = 0;
    while (pow2_mod_size < mod_size) {
      pow2_mod_size <<= 1;
      ++fast_index_shift_;
    }
    // build fast index
200
    data_size_t i_delta = -1;
Guolin Ke's avatar
Guolin Ke committed
201
    data_size_t cur_pos = 0;
202
203
    data_size_t next_threshold = 0;
    while (NextNonzero(&i_delta, &cur_pos)) {
Guolin Ke's avatar
Guolin Ke committed
204
      while (next_threshold <= cur_pos) {
205
206
        fast_index_.emplace_back(i_delta, cur_pos);
        next_threshold += pow2_mod_size;
Guolin Ke's avatar
Guolin Ke committed
207
208
209
      }
    }
    // avoid out of range
210
    while (next_threshold < num_data_) {
Guolin Ke's avatar
Guolin Ke committed
211
      fast_index_.emplace_back(num_vals_ - 1, cur_pos);
212
      next_threshold += pow2_mod_size;
Guolin Ke's avatar
Guolin Ke committed
213
214
215
216
217
218
    }
    fast_index_.shrink_to_fit();
  }

  void SaveBinaryToFile(FILE* file) const override {
    fwrite(&num_vals_, sizeof(num_vals_), 1, file);
219
    fwrite(deltas_.data(), sizeof(uint8_t), num_vals_ + 1, file);
Guolin Ke's avatar
Guolin Ke committed
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
    fwrite(vals_.data(), sizeof(VAL_T), num_vals_, file);
  }

  size_t SizesInByte() const override {
    return sizeof(num_vals_) + sizeof(uint8_t) * (num_vals_ + 1)
      + sizeof(VAL_T) * num_vals_;
  }

  void LoadFromMemory(const void* memory, const std::vector<data_size_t>& local_used_indices) override {
    const char* mem_ptr = reinterpret_cast<const char*>(memory);
    data_size_t tmp_num_vals = *(reinterpret_cast<const data_size_t*>(mem_ptr));
    mem_ptr += sizeof(tmp_num_vals);
    const uint8_t* tmp_delta = reinterpret_cast<const uint8_t*>(mem_ptr);
    mem_ptr += sizeof(uint8_t) * (tmp_num_vals + 1);
    const VAL_T* tmp_vals = reinterpret_cast<const VAL_T*>(mem_ptr);

236
237
238
239
240
241
242
243
244
245
246
    deltas_.clear();
    vals_.clear();
    num_vals_ = tmp_num_vals;
    for (data_size_t i = 0; i < num_vals_; ++i) {
      deltas_.push_back(tmp_delta[i]);
      vals_.push_back(tmp_vals[i]);
    }
    deltas_.push_back(0);
    // reduce memory cost
    deltas_.shrink_to_fit();
    vals_.shrink_to_fit();
Guolin Ke's avatar
Guolin Ke committed
247

Guolin Ke's avatar
Guolin Ke committed
248
    if (local_used_indices.empty()) {
Guolin Ke's avatar
Guolin Ke committed
249
250
251
252
      // generate fast index
      GetFastIndex();
    } else {
      std::vector<std::pair<data_size_t, VAL_T>> tmp_pair;
253
254
      data_size_t cur_pos = 0;
      data_size_t j = -1;
Guolin Ke's avatar
Guolin Ke committed
255
256
      for (data_size_t i = 0; i < static_cast<data_size_t>(local_used_indices.size()); ++i) {
        const data_size_t idx = local_used_indices[i];
257
258
        while (cur_pos < idx && j < num_vals_) {
          NextNonzero(&j, &cur_pos);
Guolin Ke's avatar
Guolin Ke committed
259
        }
260
        if (cur_pos == idx && j < num_vals_) {
Guolin Ke's avatar
Guolin Ke committed
261
          // new row index is i
262
          tmp_pair.emplace_back(i, vals_[j]);
Guolin Ke's avatar
Guolin Ke committed
263
264
265
266
267
268
269
        }
      }
      LoadFromPair(tmp_pair);
    }

  }

Guolin Ke's avatar
Guolin Ke committed
270
protected:
Guolin Ke's avatar
Guolin Ke committed
271
272
  data_size_t num_data_;
  std::vector<std::pair<data_size_t, VAL_T>> non_zero_pair_;
273
  std::vector<uint8_t> deltas_;
Guolin Ke's avatar
Guolin Ke committed
274
275
276
277
278
279
  std::vector<VAL_T> vals_;
  data_size_t num_vals_;
  int num_threads_;
  std::vector<std::vector<std::pair<data_size_t, VAL_T>>> push_buffers_;
  std::vector<std::pair<data_size_t, data_size_t>> fast_index_;
  data_size_t fast_index_shift_;
Guolin Ke's avatar
Guolin Ke committed
280
  VAL_T default_bin_;
Guolin Ke's avatar
Guolin Ke committed
281
282
283
};

template <typename VAL_T>
284
285
286
inline VAL_T SparseBinIterator<VAL_T>::InnerGet(data_size_t idx) {
  while (cur_pos_ < idx && i_delta_ < bin_data_->num_vals_) {
    bin_data_->NextNonzero(&i_delta_, &cur_pos_);
Guolin Ke's avatar
Guolin Ke committed
287
  }
288
  if (cur_pos_ == idx && i_delta_ < bin_data_->num_vals_ && i_delta_ >= 0) {
289
290
291
    return bin_data_->vals_[i_delta_];
  } else {
    return 0;
Guolin Ke's avatar
Guolin Ke committed
292
  }
293
}
Guolin Ke's avatar
Guolin Ke committed
294

295
296
297
298
299
300
template <typename VAL_T>
inline void SparseBinIterator<VAL_T>::Reset(data_size_t start_idx) {
  const auto fast_pair = bin_data_->fast_index_[start_idx >> bin_data_->fast_index_shift_];
  i_delta_ = fast_pair.first;
  cur_pos_ = fast_pair.second;
}
Guolin Ke's avatar
Guolin Ke committed
301
302
303
304
305
306

template <typename VAL_T>
BinIterator* SparseBin<VAL_T>::GetIterator(data_size_t start_idx) const {
  return new SparseBinIterator<VAL_T>(this, start_idx);
}

307

Guolin Ke's avatar
Guolin Ke committed
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
template <typename VAL_T>
class SparseCategoricalBin: public SparseBin<VAL_T> {
public:
  SparseCategoricalBin(data_size_t num_data, int default_bin)
    : SparseBin<VAL_T>(num_data, default_bin) {
  }

  virtual data_size_t Split(unsigned int threshold, data_size_t* data_indices, data_size_t num_data,
    data_size_t* lte_indices, data_size_t* gt_indices) const override {
    // not need to split
    if (num_data <= 0) { return 0; }
    SparseBinIterator<VAL_T> iterator(this, data_indices[0]);
    data_size_t lte_count = 0;
    data_size_t gt_count = 0;
    for (data_size_t i = 0; i < num_data; ++i) {
      const data_size_t idx = data_indices[i];
      VAL_T bin = iterator.InnerGet(idx);
      if (bin != threshold) {
        gt_indices[gt_count++] = idx;
      } else {
        lte_indices[lte_count++] = idx;
      }
    }
    return lte_count;
  }
};

335

Guolin Ke's avatar
Guolin Ke committed
336
}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
337
#endif   // LightGBM_IO_SPARSE_BIN_HPP_