bin.cpp 14 KB
Newer Older
1
#include <LightGBM/utils/common.h>
Guolin Ke's avatar
Guolin Ke committed
2
3
4
#include <LightGBM/bin.h>

#include "dense_bin.hpp"
Guolin Ke's avatar
Guolin Ke committed
5
#include "dense_nbits_bin.hpp"
Guolin Ke's avatar
Guolin Ke committed
6
#include "sparse_bin.hpp"
7
#include "ordered_sparse_bin.hpp"
Guolin Ke's avatar
Guolin Ke committed
8
9
10
11
12
13
14
15
16
17
18

#include <cmath>
#include <cstring>
#include <cstdint>

#include <limits>
#include <vector>
#include <algorithm>

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
19
BinMapper::BinMapper() {
Guolin Ke's avatar
Guolin Ke committed
20
21
22
}

// deep copy function for BinMapper
Guolin Ke's avatar
Guolin Ke committed
23
BinMapper::BinMapper(const BinMapper& other) {
Guolin Ke's avatar
Guolin Ke committed
24
25
26
  num_bin_ = other.num_bin_;
  is_trival_ = other.is_trival_;
  sparse_rate_ = other.sparse_rate_;
27
28
29
30
31
32
33
  bin_type_ = other.bin_type_;
  if (bin_type_ == BinType::NumericalBin) {
    bin_upper_bound_ = other.bin_upper_bound_;
  } else {
    bin_2_categorical_ = other.bin_2_categorical_;
    categorical_2_bin_ = other.categorical_2_bin_;
  }
34
35
  min_val_ = other.min_val_;
  max_val_ = other.max_val_;
Guolin Ke's avatar
Guolin Ke committed
36
  default_bin_ = other.default_bin_;
Guolin Ke's avatar
Guolin Ke committed
37
38
}

Guolin Ke's avatar
Guolin Ke committed
39
BinMapper::BinMapper(const void* memory) {
Guolin Ke's avatar
Guolin Ke committed
40
41
42
43
  CopyFrom(reinterpret_cast<const char*>(memory));
}

BinMapper::~BinMapper() {
Guolin Ke's avatar
Guolin Ke committed
44

Guolin Ke's avatar
Guolin Ke committed
45
46
}

47
48
49
50
51
bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt, BinType bin_type) {
  if (bin_type == BinType::NumericalBin) {
    int sum_left = 0;
    for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
      sum_left += cnt_in_bin[i];
Guolin Ke's avatar
Guolin Ke committed
52
      if (sum_left >= filter_cnt && total_cnt - sum_left >= filter_cnt) {
53
54
55
56
57
58
        return false;
      }
    }
  } else {
    for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
      int sum_left = cnt_in_bin[i];
Guolin Ke's avatar
Guolin Ke committed
59
      if (sum_left >= filter_cnt && total_cnt - sum_left >= filter_cnt) {
60
61
        return false;
      }
Guolin Ke's avatar
Guolin Ke committed
62
63
64
65
    }
  }
  return true;
}
Guolin Ke's avatar
Guolin Ke committed
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts, 
                                  int num_distinct_values, int max_bin, int total_cnt, int min_data_in_bin) {
  std::vector<double> bin_upper_bound;
  if (num_distinct_values <= max_bin) {
    bin_upper_bound.clear();
    int cur_cnt_inbin = 0;
    for (int i = 0; i < num_distinct_values - 1; ++i) {
      cur_cnt_inbin += counts[i];
      if (cur_cnt_inbin >= min_data_in_bin) {
        bin_upper_bound.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
        cur_cnt_inbin = 0;
      }
    }
    cur_cnt_inbin += counts[num_distinct_values - 1];
    bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
  } else {
    if (min_data_in_bin > 0) {
      max_bin = std::min(max_bin, static_cast<int>(total_cnt / min_data_in_bin));
      max_bin = std::max(max_bin, 1);
    }
    double mean_bin_size = static_cast<double>(total_cnt) / max_bin;

    // mean size for one bin
    int rest_bin_cnt = max_bin;
    int rest_sample_cnt = static_cast<int>(total_cnt);
    std::vector<bool> is_big_count_value(num_distinct_values, false);
    for (int i = 0; i < num_distinct_values; ++i) {
      if (counts[i] >= mean_bin_size) {
        is_big_count_value[i] = true;
        --rest_bin_cnt;
        rest_sample_cnt -= counts[i];
      }
    }
    mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
    std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
    std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());

    int bin_cnt = 0;
    lower_bounds[bin_cnt] = distinct_values[0];
    int cur_cnt_inbin = 0;
    for (int i = 0; i < num_distinct_values - 1; ++i) {
      if (!is_big_count_value[i]) {
        rest_sample_cnt -= counts[i];
      }
      cur_cnt_inbin += counts[i];
      // need a new bin
      if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
        (is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
        upper_bounds[bin_cnt] = distinct_values[i];
        ++bin_cnt;
        lower_bounds[bin_cnt] = distinct_values[i + 1];
        if (bin_cnt >= max_bin - 1) { break; }
        cur_cnt_inbin = 0;
        if (!is_big_count_value[i]) {
          --rest_bin_cnt;
          mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
        }
      }
    }
    ++bin_cnt;
    // update bin upper bound
    bin_upper_bound.resize(bin_cnt);
    for (int i = 0; i < bin_cnt - 1; ++i) {
      bin_upper_bound[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
    }
    // last bin upper bound
    bin_upper_bound[bin_cnt - 1] = std::numeric_limits<double>::infinity();
  }
  return bin_upper_bound;
}
Guolin Ke's avatar
Guolin Ke committed
136

137
void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
138
139
140
  int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type) {
  bin_type_ = bin_type;
  default_bin_ = 0;
141
  int zero_cnt = static_cast<int>(total_sample_cnt - num_sample_values);
Guolin Ke's avatar
Guolin Ke committed
142
  // find distinct_values first
143
  std::vector<double> distinct_values;
144
  std::vector<int> counts;
Guolin Ke's avatar
Guolin Ke committed
145

146
  std::sort(values, values + num_sample_values);
Guolin Ke's avatar
Guolin Ke committed
147
148

  // push zero in the front
Guolin Ke's avatar
Guolin Ke committed
149
  if (num_sample_values == 0 || (values[0] > 0.0f && zero_cnt > 0)) {
Guolin Ke's avatar
Guolin Ke committed
150
    distinct_values.push_back(0.0f);
Guolin Ke's avatar
Guolin Ke committed
151
    counts.push_back(zero_cnt);
Guolin Ke's avatar
Guolin Ke committed
152
  }
Guolin Ke's avatar
Guolin Ke committed
153

154
155
  if (num_sample_values > 0) {
    distinct_values.push_back(values[0]);
Guolin Ke's avatar
Guolin Ke committed
156
157
    counts.push_back(1);
  }
Guolin Ke's avatar
Guolin Ke committed
158

159
160
161
  for (int i = 1; i < num_sample_values; ++i) {
    if (values[i] != values[i - 1]) {
      if (values[i - 1] < 0.0f && values[i] > 0.0f) {
Guolin Ke's avatar
Guolin Ke committed
162
        distinct_values.push_back(0.0f);
Guolin Ke's avatar
Guolin Ke committed
163
164
        counts.push_back(zero_cnt);
      }
165
      distinct_values.push_back(values[i]);
166
      counts.push_back(1);
Guolin Ke's avatar
Guolin Ke committed
167
    } else {
168
      ++counts.back();
Guolin Ke's avatar
Guolin Ke committed
169
170
    }
  }
Guolin Ke's avatar
Guolin Ke committed
171
172

  // push zero in the back
173
  if (num_sample_values > 0 && values[num_sample_values - 1] < 0.0f && zero_cnt > 0) {
Guolin Ke's avatar
Guolin Ke committed
174
    distinct_values.push_back(0.0f);
Guolin Ke's avatar
Guolin Ke committed
175
176
    counts.push_back(zero_cnt);
  }
177
178
  min_val_ = distinct_values.front();
  max_val_ = distinct_values.back();
179
  std::vector<int> cnt_in_bin;
180
  int num_distinct_values = static_cast<int>(distinct_values.size());
181
  if (bin_type_ == BinType::NumericalBin) {
Guolin Ke's avatar
Guolin Ke committed
182
183
184
185
186
187
188
189
190
191
192
    bin_upper_bound_.clear();
    int left_cnt_data = 0;
    int missing_cnt_data = 0;
    int right_cnt_data = 0;
    for (int i = 0; i < num_distinct_values; ++i) {
      if (distinct_values[i] <= -kMissingValueRange) {
        left_cnt_data += counts[i];
      } else if (distinct_values[i] > kMissingValueRange) {
        right_cnt_data += counts[i];
      } else {
        missing_cnt_data += counts[i];
Guolin Ke's avatar
Guolin Ke committed
193
      }
Guolin Ke's avatar
Guolin Ke committed
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
    }

    int left_cnt = 0;
    for (int i = 0; i < num_distinct_values; ++i) {
      if (distinct_values[i] > -kMissingValueRange) {
        left_cnt = i;
        break;
      } 
    }

    if (left_cnt > 0) {
      int left_max_bin = static_cast<int>(static_cast<double>(left_cnt_data) / (total_sample_cnt - missing_cnt_data) * (max_bin - 1));
      bin_upper_bound_ = GreedyFindBin(distinct_values.data(), counts.data(), left_cnt, left_max_bin, left_cnt_data, min_data_in_bin);
      bin_upper_bound_.back() = -kMissingValueRange;
    }

    int right_start = -1;
    for (int i = left_cnt; i < num_distinct_values; ++i) {
      if (distinct_values[i] > kMissingValueRange) {
        right_start = i;
        break;
Guolin Ke's avatar
Guolin Ke committed
215
      }
Guolin Ke's avatar
Guolin Ke committed
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
    }

    if (right_start >= 0) {
      int right_max_bin = max_bin - 1 - static_cast<int>(bin_upper_bound_.size());
      auto right_bounds = GreedyFindBin(distinct_values.data() + right_start, counts.data() + right_start, 
                                   num_distinct_values - right_start, right_max_bin, right_cnt_data, min_data_in_bin);
      bin_upper_bound_.push_back(kMissingValueRange);
      bin_upper_bound_.insert(bin_upper_bound_.end(), right_bounds.begin(), right_bounds.end());
    } else {
      bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
    }

    num_bin_ = static_cast<int>(bin_upper_bound_.size());
    {
      cnt_in_bin.resize(num_bin_, 0);
      int i_bin = 0;
232
      for (int i = 0; i < num_distinct_values; ++i) {
Guolin Ke's avatar
Guolin Ke committed
233
234
235
236
        if (distinct_values[i] > bin_upper_bound_[i_bin]) {
          ++i_bin;
        } 
        cnt_in_bin[i_bin] += counts[i];
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
      }
    }
    CHECK(num_bin_ <= max_bin);
  } else {
    // convert to int type first
    std::vector<int> distinct_values_int;
    std::vector<int> counts_int;
    distinct_values_int.push_back(static_cast<int>(distinct_values[0]));
    counts_int.push_back(counts[0]);
    for (size_t i = 1; i < distinct_values.size(); ++i) {
      if (static_cast<int>(distinct_values[i]) != distinct_values_int.back()) {
        distinct_values_int.push_back(static_cast<int>(distinct_values[i]));
        counts_int.push_back(counts[i]);
      } else {
        counts_int.back() += counts[i];
      }
Guolin Ke's avatar
Guolin Ke committed
253
    }
254
255
    // sort by counts
    Common::SortForPair<int, int>(counts_int, distinct_values_int, 0, true);
256
    // will ignore the categorical of small counts
257
258
259
260
261
262
263
264
265
266
267
    const int cut_cnt = static_cast<int>(total_sample_cnt * 0.98f);
    categorical_2_bin_.clear();
    bin_2_categorical_.clear();
    num_bin_ = 0;
    int used_cnt = 0;
    max_bin = std::min(static_cast<int>(distinct_values_int.size()), max_bin);
    while (used_cnt < cut_cnt || num_bin_ < max_bin) {
      bin_2_categorical_.push_back(distinct_values_int[num_bin_]);
      categorical_2_bin_[distinct_values_int[num_bin_]] = static_cast<unsigned int>(num_bin_);
      used_cnt += counts_int[num_bin_];
      ++num_bin_;
Guolin Ke's avatar
Guolin Ke committed
268
    }
269
270
271
    cnt_in_bin = counts_int;
    counts_int.resize(num_bin_);
    counts_int.back() += static_cast<int>(total_sample_cnt - used_cnt);
Guolin Ke's avatar
Guolin Ke committed
272
  }
Guolin Ke's avatar
Guolin Ke committed
273

Guolin Ke's avatar
Guolin Ke committed
274
275
276
277
278
  // check trival(num_bin_ == 1) feature
  if (num_bin_ <= 1) {
    is_trival_ = true;
  } else {
    is_trival_ = false;
Guolin Ke's avatar
Guolin Ke committed
279
  }
280
281
  // check useless bin
  if (!is_trival_ && NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data, bin_type_)) {
Guolin Ke's avatar
Guolin Ke committed
282
    is_trival_ = true;
Guolin Ke's avatar
Guolin Ke committed
283
  }
284
285
286
287

  if (!is_trival_) {
    default_bin_ = ValueToBin(0);
  }
Guolin Ke's avatar
Guolin Ke committed
288
  // calculate sparse rate
289
  sparse_rate_ = static_cast<double>(cnt_in_bin[default_bin_]) / static_cast<double>(total_sample_cnt);
Guolin Ke's avatar
Guolin Ke committed
290
291
292
293
294
295
296
}


int BinMapper::SizeForSpecificBin(int bin) {
  int size = 0;
  size += sizeof(int);
  size += sizeof(bool);
297
  size += sizeof(double);
298
  size += sizeof(BinType);
Guolin Ke's avatar
Guolin Ke committed
299
  size += 2 * sizeof(double);
300
  size += bin * sizeof(double);
Guolin Ke's avatar
Guolin Ke committed
301
  size += sizeof(uint32_t);
Guolin Ke's avatar
Guolin Ke committed
302
303
304
305
306
307
308
309
310
311
  return size;
}

void BinMapper::CopyTo(char * buffer) {
  std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(buffer, &is_trival_, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
312
313
  std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
  buffer += sizeof(bin_type_);
Guolin Ke's avatar
Guolin Ke committed
314
  std::memcpy(buffer, &min_val_, sizeof(min_val_));
315
  buffer += sizeof(min_val_);
Guolin Ke's avatar
Guolin Ke committed
316
  std::memcpy(buffer, &max_val_, sizeof(max_val_));
317
  buffer += sizeof(max_val_);
Guolin Ke's avatar
Guolin Ke committed
318
  std::memcpy(buffer, &default_bin_, sizeof(default_bin_));
Guolin Ke's avatar
Guolin Ke committed
319
  buffer += sizeof(default_bin_);
320
321
322
323
324
  if (bin_type_ == BinType::NumericalBin) {
    std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
  } else {
    std::memcpy(buffer, bin_2_categorical_.data(), num_bin_ * sizeof(int));
  }
Guolin Ke's avatar
Guolin Ke committed
325
326
327
328
329
330
331
332
333
}

void BinMapper::CopyFrom(const char * buffer) {
  std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(&is_trival_, buffer, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
334
335
  std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
  buffer += sizeof(bin_type_);
336
337
338
339
  std::memcpy(&min_val_, buffer, sizeof(min_val_));
  buffer += sizeof(min_val_);
  std::memcpy(&max_val_, buffer, sizeof(max_val_));
  buffer += sizeof(max_val_);
Guolin Ke's avatar
Guolin Ke committed
340
341
  std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
  buffer += sizeof(default_bin_);
342
343
344
345
346
347
348
349
350
351
352
  if (bin_type_ == BinType::NumericalBin) {
    bin_upper_bound_ = std::vector<double>(num_bin_);
    std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
  } else {
    bin_2_categorical_ = std::vector<int>(num_bin_);
    std::memcpy(bin_2_categorical_.data(), buffer, num_bin_ * sizeof(int));
    categorical_2_bin_.clear();
    for (int i = 0; i < num_bin_; ++i) {
      categorical_2_bin_[bin_2_categorical_[i]] = static_cast<unsigned int>(i);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
353
354
355
356
357
358
}

void BinMapper::SaveBinaryToFile(FILE* file) const {
  fwrite(&num_bin_, sizeof(num_bin_), 1, file);
  fwrite(&is_trival_, sizeof(is_trival_), 1, file);
  fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
359
  fwrite(&bin_type_, sizeof(bin_type_), 1, file);
360
361
  fwrite(&min_val_, sizeof(min_val_), 1, file);
  fwrite(&max_val_, sizeof(max_val_), 1, file);
Guolin Ke's avatar
Guolin Ke committed
362
  fwrite(&default_bin_, sizeof(default_bin_), 1, file);
363
364
365
366
367
  if (bin_type_ == BinType::NumericalBin) {
    fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
  } else {
    fwrite(bin_2_categorical_.data(), sizeof(int), num_bin_, file);
  }
Guolin Ke's avatar
Guolin Ke committed
368
369
370
}

size_t BinMapper::SizesInByte() const {
Guolin Ke's avatar
Guolin Ke committed
371
  size_t ret = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_)
372
373
374
375
376
377
    + sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_);
  if (bin_type_ == BinType::NumericalBin) {
    ret += sizeof(double) *  num_bin_;
  } else {
    ret += sizeof(int) * num_bin_;
  }
Guolin Ke's avatar
Guolin Ke committed
378
  return ret;
Guolin Ke's avatar
Guolin Ke committed
379
380
381
382
383
384
385
386
387
388
389
390
391
392
}

template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;

template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;

template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
393
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate, 
394
  bool is_enable_sparse, double sparse_threshold, bool* is_sparse) {
Guolin Ke's avatar
Guolin Ke committed
395
  // sparse threshold
396
  if (sparse_rate >= sparse_threshold && is_enable_sparse) {
Guolin Ke's avatar
Guolin Ke committed
397
    *is_sparse = true;
Guolin Ke's avatar
Guolin Ke committed
398
    return CreateSparseBin(num_data, num_bin);
Guolin Ke's avatar
Guolin Ke committed
399
400
  } else {
    *is_sparse = false;
Guolin Ke's avatar
Guolin Ke committed
401
    return CreateDenseBin(num_data, num_bin);
Guolin Ke's avatar
Guolin Ke committed
402
403
404
  }
}

Guolin Ke's avatar
Guolin Ke committed
405
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
Guolin Ke's avatar
Guolin Ke committed
406
407
408
  if (num_bin <= 16) {
    return new Dense4bitsBin(num_data);
  } else if (num_bin <= 256) {
Guolin Ke's avatar
Guolin Ke committed
409
410
411
    return new DenseBin<uint8_t>(num_data);
  } else if (num_bin <= 65536) {
    return new DenseBin<uint16_t>(num_data);
Guolin Ke's avatar
Guolin Ke committed
412
  } else {
Guolin Ke's avatar
Guolin Ke committed
413
    return new DenseBin<uint32_t>(num_data);
Guolin Ke's avatar
Guolin Ke committed
414
415
416
  }
}

Guolin Ke's avatar
Guolin Ke committed
417
418
419
420
421
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) {
  if (num_bin <= 256) {
    return new SparseBin<uint8_t>(num_data);
  } else if (num_bin <= 65536) {
    return new SparseBin<uint16_t>(num_data);
Guolin Ke's avatar
Guolin Ke committed
422
  } else {
Guolin Ke's avatar
Guolin Ke committed
423
    return new SparseBin<uint32_t>(num_data);
Guolin Ke's avatar
Guolin Ke committed
424
425
426
427
  }
}

}  // namespace LightGBM