bin.cpp 12.5 KB
Newer Older
1
#include <LightGBM/utils/common.h>
Guolin Ke's avatar
Guolin Ke committed
2
3
4
#include <LightGBM/bin.h>

#include "dense_bin.hpp"
Guolin Ke's avatar
Guolin Ke committed
5
#include "dense_nbits_bin.hpp"
Guolin Ke's avatar
Guolin Ke committed
6
#include "sparse_bin.hpp"
7
#include "ordered_sparse_bin.hpp"
Guolin Ke's avatar
Guolin Ke committed
8
9
10
11
12
13
14
15
16
17
18

#include <cmath>
#include <cstring>
#include <cstdint>

#include <limits>
#include <vector>
#include <algorithm>

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
19
BinMapper::BinMapper() {
Guolin Ke's avatar
Guolin Ke committed
20
21
22
}

// deep copy function for BinMapper
Guolin Ke's avatar
Guolin Ke committed
23
BinMapper::BinMapper(const BinMapper& other) {
Guolin Ke's avatar
Guolin Ke committed
24
25
26
  num_bin_ = other.num_bin_;
  is_trival_ = other.is_trival_;
  sparse_rate_ = other.sparse_rate_;
27
28
29
30
31
32
33
  bin_type_ = other.bin_type_;
  if (bin_type_ == BinType::NumericalBin) {
    bin_upper_bound_ = other.bin_upper_bound_;
  } else {
    bin_2_categorical_ = other.bin_2_categorical_;
    categorical_2_bin_ = other.categorical_2_bin_;
  }
34
35
  min_val_ = other.min_val_;
  max_val_ = other.max_val_;
Guolin Ke's avatar
Guolin Ke committed
36
  default_bin_ = other.default_bin_;
Guolin Ke's avatar
Guolin Ke committed
37
38
}

Guolin Ke's avatar
Guolin Ke committed
39
BinMapper::BinMapper(const void* memory) {
Guolin Ke's avatar
Guolin Ke committed
40
41
42
43
  CopyFrom(reinterpret_cast<const char*>(memory));
}

BinMapper::~BinMapper() {
Guolin Ke's avatar
Guolin Ke committed
44

Guolin Ke's avatar
Guolin Ke committed
45
46
}

47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, BinType bin_type) {
  if (bin_type == BinType::NumericalBin) {
    int sum_left = 0;
    for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
      sum_left += cnt_in_bin[i];
      if (sum_left >= filter_cnt) {
        return false;
      } else if (total_cnt - sum_left >= filter_cnt) {
        return false;
      }
    }
  } else {
    for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
      int sum_left = cnt_in_bin[i];
      if (sum_left >= filter_cnt) {
        return false;
      } else if (total_cnt - sum_left >= filter_cnt) {
        return false;
      }
Guolin Ke's avatar
Guolin Ke committed
66
67
68
69
70
71
    }
  }
  return true;
}

void BinMapper::FindBin(std::vector<double>& values, size_t total_sample_cnt,
72
73
74
  int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type) {
  bin_type_ = bin_type;
  default_bin_ = 0;
Guolin Ke's avatar
Guolin Ke committed
75
76
  std::vector<double>& raw_values = values;
  int zero_cnt = static_cast<int>(total_sample_cnt - raw_values.size());
Guolin Ke's avatar
Guolin Ke committed
77
  // find distinct_values first
78
  std::vector<double> distinct_values;
79
  std::vector<int> counts;
Guolin Ke's avatar
Guolin Ke committed
80

Guolin Ke's avatar
Guolin Ke committed
81
  std::sort(raw_values.begin(), raw_values.end());
Guolin Ke's avatar
Guolin Ke committed
82
83

  // push zero in the front
Guolin Ke's avatar
Guolin Ke committed
84
85
  if (raw_values.empty() || (raw_values[0] > 0.0f && zero_cnt > 0)) {
    distinct_values.push_back(0.0f);
Guolin Ke's avatar
Guolin Ke committed
86
    counts.push_back(zero_cnt);
Guolin Ke's avatar
Guolin Ke committed
87
  }
Guolin Ke's avatar
Guolin Ke committed
88

Guolin Ke's avatar
Guolin Ke committed
89
90
  if (!raw_values.empty()) {
    distinct_values.push_back(raw_values[0]);
Guolin Ke's avatar
Guolin Ke committed
91
92
    counts.push_back(1);
  }
Guolin Ke's avatar
Guolin Ke committed
93

Guolin Ke's avatar
Guolin Ke committed
94
95
96
97
  for (size_t i = 1; i < raw_values.size(); ++i) {
    if (raw_values[i] != raw_values[i - 1]) {
      if (raw_values[i - 1] < 0.0f && raw_values[i] > 0.0f) {
        distinct_values.push_back(0.0f);
Guolin Ke's avatar
Guolin Ke committed
98
99
        counts.push_back(zero_cnt);
      }
Guolin Ke's avatar
Guolin Ke committed
100
      distinct_values.push_back(raw_values[i]);
101
      counts.push_back(1);
Guolin Ke's avatar
Guolin Ke committed
102
    } else {
103
      ++counts.back();
Guolin Ke's avatar
Guolin Ke committed
104
105
    }
  }
Guolin Ke's avatar
Guolin Ke committed
106
107

  // push zero in the back
Guolin Ke's avatar
Guolin Ke committed
108
109
  if (!raw_values.empty() && raw_values.back() < 0.0f && zero_cnt > 0) {
    distinct_values.push_back(0.0f);
Guolin Ke's avatar
Guolin Ke committed
110
111
    counts.push_back(zero_cnt);
  }
112
113
  min_val_ = distinct_values.front();
  max_val_ = distinct_values.back();
114
  std::vector<int> cnt_in_bin;
115
  int num_values = static_cast<int>(distinct_values.size());
116
117
118
119
120
121
122
123
124
125
126
127
  if (bin_type_ == BinType::NumericalBin) {
    if (num_values <= max_bin) {
      // use distinct value is enough
      bin_upper_bound_.clear();
      int cur_cnt_inbin = 0;
      for (int i = 0; i < num_values - 1; ++i) {
        cur_cnt_inbin += counts[i];
        if (cur_cnt_inbin >= min_data_in_bin) {
          bin_upper_bound_.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
          cnt_in_bin.push_back(cur_cnt_inbin);
          cur_cnt_inbin = 0;
        }
Guolin Ke's avatar
Guolin Ke committed
128
      }
129
130
131
132
133
134
135
136
      cur_cnt_inbin += counts.back();
      cnt_in_bin.push_back(cur_cnt_inbin);
      bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
      num_bin_ = static_cast<int>(bin_upper_bound_.size());
    } else {
      if (min_data_in_bin > 0) {
        max_bin = std::min(max_bin, static_cast<int>(total_sample_cnt / min_data_in_bin));
        max_bin = std::max(max_bin, 1);
Guolin Ke's avatar
Guolin Ke committed
137
      }
138
139
140
141
      double mean_bin_size = static_cast<double>(total_sample_cnt) / max_bin;
      if (zero_cnt > mean_bin_size) {
        int non_zero_cnt = static_cast<int>(raw_values.size());
        max_bin = std::min(max_bin, 1 + static_cast<int>(non_zero_cnt / min_data_in_bin));
Guolin Ke's avatar
Guolin Ke committed
142
      }
143
144
145
146
147
148
149
      // mean size for one bin
      int rest_bin_cnt = max_bin;
      int rest_sample_cnt = static_cast<int>(total_sample_cnt);
      std::vector<bool> is_big_count_value(num_values, false);
      for (int i = 0; i < num_values; ++i) {
        if (counts[i] >= mean_bin_size) {
          is_big_count_value[i] = true;
Guolin Ke's avatar
Guolin Ke committed
150
          --rest_bin_cnt;
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
          rest_sample_cnt -= counts[i];
        }
      }
      mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
      std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
      std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());

      int bin_cnt = 0;
      lower_bounds[bin_cnt] = distinct_values[0];
      int cur_cnt_inbin = 0;
      for (int i = 0; i < num_values - 1; ++i) {
        if (!is_big_count_value[i]) {
          rest_sample_cnt -= counts[i];
        }
        cur_cnt_inbin += counts[i];
        // need a new bin
        if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
          (is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
          upper_bounds[bin_cnt] = distinct_values[i];
          cnt_in_bin.push_back(cur_cnt_inbin);
          ++bin_cnt;
          lower_bounds[bin_cnt] = distinct_values[i + 1];
          if (bin_cnt >= max_bin - 1) { break; }
          cur_cnt_inbin = 0;
          if (!is_big_count_value[i]) {
            --rest_bin_cnt;
            mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
          }
Guolin Ke's avatar
Guolin Ke committed
179
        }
Guolin Ke's avatar
Guolin Ke committed
180
      }
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
      cur_cnt_inbin += counts.back();
      cnt_in_bin.push_back(cur_cnt_inbin);
      ++bin_cnt;
      // update bin upper bound
      bin_upper_bound_ = std::vector<double>(bin_cnt);
      num_bin_ = bin_cnt;
      for (int i = 0; i < bin_cnt - 1; ++i) {
        bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
      }
      // last bin upper bound
      bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
    }
    CHECK(num_bin_ <= max_bin);
  } else {
    // convert to int type first
    std::vector<int> distinct_values_int;
    std::vector<int> counts_int;
    distinct_values_int.push_back(static_cast<int>(distinct_values[0]));
    counts_int.push_back(counts[0]);
    for (size_t i = 1; i < distinct_values.size(); ++i) {
      if (static_cast<int>(distinct_values[i]) != distinct_values_int.back()) {
        distinct_values_int.push_back(static_cast<int>(distinct_values[i]));
        counts_int.push_back(counts[i]);
      } else {
        counts_int.back() += counts[i];
      }
Guolin Ke's avatar
Guolin Ke committed
207
    }
208
209
210
211
212
213
214
215
216
217
218
219
220
221
    // sort by counts
    Common::SortForPair<int, int>(counts_int, distinct_values_int, 0, true);
    // will ingore the categorical of small counts
    const int cut_cnt = static_cast<int>(total_sample_cnt * 0.98f);
    categorical_2_bin_.clear();
    bin_2_categorical_.clear();
    num_bin_ = 0;
    int used_cnt = 0;
    max_bin = std::min(static_cast<int>(distinct_values_int.size()), max_bin);
    while (used_cnt < cut_cnt || num_bin_ < max_bin) {
      bin_2_categorical_.push_back(distinct_values_int[num_bin_]);
      categorical_2_bin_[distinct_values_int[num_bin_]] = static_cast<unsigned int>(num_bin_);
      used_cnt += counts_int[num_bin_];
      ++num_bin_;
Guolin Ke's avatar
Guolin Ke committed
222
    }
223
224
225
    cnt_in_bin = counts_int;
    counts_int.resize(num_bin_);
    counts_int.back() += static_cast<int>(total_sample_cnt - used_cnt);
Guolin Ke's avatar
Guolin Ke committed
226
  }
Guolin Ke's avatar
Guolin Ke committed
227

Guolin Ke's avatar
Guolin Ke committed
228
229
230
231
232
  // check trival(num_bin_ == 1) feature
  if (num_bin_ <= 1) {
    is_trival_ = true;
  } else {
    is_trival_ = false;
Guolin Ke's avatar
Guolin Ke committed
233
  }
234
235
  // check useless bin
  if (!is_trival_ && NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data, bin_type_)) {
Guolin Ke's avatar
Guolin Ke committed
236
    is_trival_ = true;
Guolin Ke's avatar
Guolin Ke committed
237
  }
238
239
240
241

  if (!is_trival_) {
    default_bin_ = ValueToBin(0);
  }
Guolin Ke's avatar
Guolin Ke committed
242
  // calculate sparse rate
243
  sparse_rate_ = static_cast<double>(cnt_in_bin[default_bin_]) / static_cast<double>(total_sample_cnt);
Guolin Ke's avatar
Guolin Ke committed
244
245
246
247
248
249
250
}


int BinMapper::SizeForSpecificBin(int bin) {
  int size = 0;
  size += sizeof(int);
  size += sizeof(bool);
251
  size += sizeof(double);
252
  size += sizeof(BinType);
Guolin Ke's avatar
Guolin Ke committed
253
  size += 2 * sizeof(double);
254
  size += bin * sizeof(double);
Guolin Ke's avatar
Guolin Ke committed
255
  size += sizeof(uint32_t);
Guolin Ke's avatar
Guolin Ke committed
256
257
258
259
260
261
262
263
264
265
  return size;
}

void BinMapper::CopyTo(char * buffer) {
  std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(buffer, &is_trival_, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
266
267
  std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
  buffer += sizeof(bin_type_);
268
269
270
271
  std::memcpy(&min_val_, buffer, sizeof(min_val_));
  buffer += sizeof(min_val_);
  std::memcpy(&max_val_, buffer, sizeof(max_val_));
  buffer += sizeof(max_val_);
Guolin Ke's avatar
Guolin Ke committed
272
273
  std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
  buffer += sizeof(default_bin_);
274
275
276
277
278
  if (bin_type_ == BinType::NumericalBin) {
    std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
  } else {
    std::memcpy(buffer, bin_2_categorical_.data(), num_bin_ * sizeof(int));
  }
Guolin Ke's avatar
Guolin Ke committed
279
280
281
282
283
284
285
286
287
}

void BinMapper::CopyFrom(const char * buffer) {
  std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(&is_trival_, buffer, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
288
289
  std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
  buffer += sizeof(bin_type_);
290
291
292
293
  std::memcpy(&min_val_, buffer, sizeof(min_val_));
  buffer += sizeof(min_val_);
  std::memcpy(&max_val_, buffer, sizeof(max_val_));
  buffer += sizeof(max_val_);
Guolin Ke's avatar
Guolin Ke committed
294
295
  std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
  buffer += sizeof(default_bin_);
296
297
298
299
300
301
302
303
304
305
306
  if (bin_type_ == BinType::NumericalBin) {
    bin_upper_bound_ = std::vector<double>(num_bin_);
    std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
  } else {
    bin_2_categorical_ = std::vector<int>(num_bin_);
    std::memcpy(bin_2_categorical_.data(), buffer, num_bin_ * sizeof(int));
    categorical_2_bin_.clear();
    for (int i = 0; i < num_bin_; ++i) {
      categorical_2_bin_[bin_2_categorical_[i]] = static_cast<unsigned int>(i);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
307
308
309
310
311
312
}

void BinMapper::SaveBinaryToFile(FILE* file) const {
  fwrite(&num_bin_, sizeof(num_bin_), 1, file);
  fwrite(&is_trival_, sizeof(is_trival_), 1, file);
  fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
313
  fwrite(&bin_type_, sizeof(bin_type_), 1, file);
314
315
  fwrite(&min_val_, sizeof(min_val_), 1, file);
  fwrite(&max_val_, sizeof(max_val_), 1, file);
Guolin Ke's avatar
Guolin Ke committed
316
  fwrite(&default_bin_, sizeof(default_bin_), 1, file);
317
318
319
320
321
  if (bin_type_ == BinType::NumericalBin) {
    fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
  } else {
    fwrite(bin_2_categorical_.data(), sizeof(int), num_bin_, file);
  }
Guolin Ke's avatar
Guolin Ke committed
322
323
324
}

size_t BinMapper::SizesInByte() const {
Guolin Ke's avatar
Guolin Ke committed
325
  size_t ret = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_)
326
327
328
329
330
331
    + sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_);
  if (bin_type_ == BinType::NumericalBin) {
    ret += sizeof(double) *  num_bin_;
  } else {
    ret += sizeof(int) * num_bin_;
  }
Guolin Ke's avatar
Guolin Ke committed
332
  return ret;
Guolin Ke's avatar
Guolin Ke committed
333
334
335
336
337
338
339
340
341
342
343
344
345
346
}

template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;

template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;

template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
347
double BinMapper::kSparseThreshold = 0.8f;
Guolin Ke's avatar
Guolin Ke committed
348

Guolin Ke's avatar
Guolin Ke committed
349
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate, 
Guolin Ke's avatar
Guolin Ke committed
350
  bool is_enable_sparse, bool* is_sparse) {
Guolin Ke's avatar
Guolin Ke committed
351
  // sparse threshold
Guolin Ke's avatar
Guolin Ke committed
352
  if (sparse_rate >= BinMapper::kSparseThreshold && is_enable_sparse) {
Guolin Ke's avatar
Guolin Ke committed
353
    *is_sparse = true;
Guolin Ke's avatar
Guolin Ke committed
354
    return CreateSparseBin(num_data, num_bin);
Guolin Ke's avatar
Guolin Ke committed
355
356
  } else {
    *is_sparse = false;
Guolin Ke's avatar
Guolin Ke committed
357
    return CreateDenseBin(num_data, num_bin);
Guolin Ke's avatar
Guolin Ke committed
358
359
360
  }
}

Guolin Ke's avatar
Guolin Ke committed
361
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
Guolin Ke's avatar
Guolin Ke committed
362
363
364
  if (num_bin <= 16) {
    return new Dense4bitsBin(num_data);
  } else if (num_bin <= 256) {
Guolin Ke's avatar
Guolin Ke committed
365
366
367
    return new DenseBin<uint8_t>(num_data);
  } else if (num_bin <= 65536) {
    return new DenseBin<uint16_t>(num_data);
Guolin Ke's avatar
Guolin Ke committed
368
  } else {
Guolin Ke's avatar
Guolin Ke committed
369
    return new DenseBin<uint32_t>(num_data);
Guolin Ke's avatar
Guolin Ke committed
370
371
372
  }
}

Guolin Ke's avatar
Guolin Ke committed
373
374
375
376
377
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) {
  if (num_bin <= 256) {
    return new SparseBin<uint8_t>(num_data);
  } else if (num_bin <= 65536) {
    return new SparseBin<uint16_t>(num_data);
Guolin Ke's avatar
Guolin Ke committed
378
  } else {
Guolin Ke's avatar
Guolin Ke committed
379
    return new SparseBin<uint32_t>(num_data);
Guolin Ke's avatar
Guolin Ke committed
380
381
382
383
  }
}

}  // namespace LightGBM