bin.cpp 11.8 KB
Newer Older
1
#include <LightGBM/utils/common.h>
Guolin Ke's avatar
Guolin Ke committed
2
3
4
5
#include <LightGBM/bin.h>

#include "dense_bin.hpp"
#include "sparse_bin.hpp"
6
#include "ordered_sparse_bin.hpp"
Guolin Ke's avatar
Guolin Ke committed
7
8
9
10
11
12
13
14
15
16
17

#include <cmath>
#include <cstring>
#include <cstdint>

#include <limits>
#include <vector>
#include <algorithm>

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
18
BinMapper::BinMapper() {
Guolin Ke's avatar
Guolin Ke committed
19
20
21
}

// deep copy function for BinMapper
Guolin Ke's avatar
Guolin Ke committed
22
BinMapper::BinMapper(const BinMapper& other) {
Guolin Ke's avatar
Guolin Ke committed
23
24
25
  num_bin_ = other.num_bin_;
  is_trival_ = other.is_trival_;
  sparse_rate_ = other.sparse_rate_;
Guolin Ke's avatar
Guolin Ke committed
26
27
28
29
30
31
  bin_type_ = other.bin_type_;
  if (bin_type_ == BinType::NumericalBin) {
    bin_upper_bound_ = other.bin_upper_bound_;
  } else {
    bin_2_categorical_ = other.bin_2_categorical_;
    categorical_2_bin_ = other.categorical_2_bin_;
Guolin Ke's avatar
Guolin Ke committed
32
  }
33
34
  min_val_ = other.min_val_;
  max_val_ = other.max_val_;
Guolin Ke's avatar
Guolin Ke committed
35

Guolin Ke's avatar
Guolin Ke committed
36
37
}

Guolin Ke's avatar
Guolin Ke committed
38
BinMapper::BinMapper(const void* memory) {
Guolin Ke's avatar
Guolin Ke committed
39
40
41
42
  CopyFrom(reinterpret_cast<const char*>(memory));
}

BinMapper::~BinMapper() {
Guolin Ke's avatar
Guolin Ke committed
43

Guolin Ke's avatar
Guolin Ke committed
44
45
}

Guolin Ke's avatar
Guolin Ke committed
46
void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, int max_bin, BinType bin_type) {
Guolin Ke's avatar
Guolin Ke committed
47
  bin_type_ = bin_type;
48
  std::vector<double>& ref_values = (*values);
Guolin Ke's avatar
Guolin Ke committed
49
  size_t sample_size = total_sample_cnt;
Guolin Ke's avatar
Guolin Ke committed
50
  int zero_cnt = static_cast<int>(total_sample_cnt - ref_values.size());
Guolin Ke's avatar
Guolin Ke committed
51
  // find distinct_values first
52
  std::vector<double> distinct_values;
53
  std::vector<int> counts;
Guolin Ke's avatar
Guolin Ke committed
54

55
  std::sort(ref_values.begin(), ref_values.end());
Guolin Ke's avatar
Guolin Ke committed
56
57

  // push zero in the front
Guolin Ke's avatar
Guolin Ke committed
58
  if (ref_values.empty() || (ref_values[0] > 0.0f && zero_cnt > 0)) {
Guolin Ke's avatar
Guolin Ke committed
59
60
    distinct_values.push_back(0);
    counts.push_back(zero_cnt);
Guolin Ke's avatar
Guolin Ke committed
61
  }
Guolin Ke's avatar
Guolin Ke committed
62

Guolin Ke's avatar
Guolin Ke committed
63
  if (!ref_values.empty()) {
Guolin Ke's avatar
Guolin Ke committed
64
65
66
    distinct_values.push_back(ref_values[0]);
    counts.push_back(1);
  }
Guolin Ke's avatar
Guolin Ke committed
67

68
69
  for (size_t i = 1; i < ref_values.size(); ++i) {
    if (ref_values[i] != ref_values[i - 1]) {
Guolin Ke's avatar
Guolin Ke committed
70
71
72
73
74
75
      if (ref_values[i - 1] == 0.0f) {
        counts.back() += zero_cnt;
      } else if (ref_values[i - 1] < 0.0f && ref_values[i] > 0.0f) {
        distinct_values.push_back(0);
        counts.push_back(zero_cnt);
      }
76
77
      distinct_values.push_back(ref_values[i]);
      counts.push_back(1);
Guolin Ke's avatar
Guolin Ke committed
78
    } else {
79
      ++counts.back();
Guolin Ke's avatar
Guolin Ke committed
80
81
    }
  }
Guolin Ke's avatar
Guolin Ke committed
82
83

  // push zero in the back
Guolin Ke's avatar
Guolin Ke committed
84
  if (!ref_values.empty() && ref_values.back() < 0.0f && zero_cnt > 0) {
Guolin Ke's avatar
Guolin Ke committed
85
86
87
    distinct_values.push_back(0);
    counts.push_back(zero_cnt);
  }
88
89
  min_val_ = distinct_values.front();
  max_val_ = distinct_values.back();
90
  std::vector<int> cnt_in_bin;
91
  int num_values = static_cast<int>(distinct_values.size());
Guolin Ke's avatar
Guolin Ke committed
92
93
94
95
96
97
98
99
  if (bin_type_ == BinType::NumericalBin) {
    if (num_values <= max_bin) {
      std::sort(distinct_values.begin(), distinct_values.end());
      // use distinct value is enough
      num_bin_ = num_values;
      bin_upper_bound_ = std::vector<double>(num_values);
      for (int i = 0; i < num_values - 1; ++i) {
        bin_upper_bound_[i] = (distinct_values[i] + distinct_values[i + 1]) / 2;
wxchan's avatar
wxchan committed
100
      }
101
      cnt_in_bin = counts;
Guolin Ke's avatar
Guolin Ke committed
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
      bin_upper_bound_[num_values - 1] = std::numeric_limits<double>::infinity();
    } else {
      // mean size for one bin
      double mean_bin_size = sample_size / static_cast<double>(max_bin);
      int rest_bin_cnt = max_bin;
      int rest_sample_cnt = static_cast<int>(sample_size);
      std::vector<bool> is_big_count_value(num_values, false);
      for (int i = 0; i < num_values; ++i) {
        if (counts[i] >= mean_bin_size) {
          is_big_count_value[i] = true;
          --rest_bin_cnt;
          rest_sample_cnt -= counts[i];
        }
      }
      mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
wxchan's avatar
wxchan committed
117

Guolin Ke's avatar
Guolin Ke committed
118
119
      std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
      std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
Guolin Ke's avatar
Guolin Ke committed
120

Guolin Ke's avatar
Guolin Ke committed
121
122
123
124
      int bin_cnt = 0;
      lower_bounds[bin_cnt] = distinct_values[0];
      int cur_cnt_inbin = 0;
      for (int i = 0; i < num_values - 1; ++i) {
wxchan's avatar
wxchan committed
125
        if (!is_big_count_value[i]) {
Guolin Ke's avatar
Guolin Ke committed
126
127
128
129
130
131
132
          rest_sample_cnt -= counts[i];
        }
        cur_cnt_inbin += counts[i];
        // need a new bin
        if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
          (is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
          upper_bounds[bin_cnt] = distinct_values[i];
133
          cnt_in_bin.push_back(cur_cnt_inbin);
Guolin Ke's avatar
Guolin Ke committed
134
135
136
137
138
139
140
141
          ++bin_cnt;
          lower_bounds[bin_cnt] = distinct_values[i + 1];
          if (bin_cnt >= max_bin - 1) { break; }
          cur_cnt_inbin = 0;
          if (!is_big_count_value[i]) {
            --rest_bin_cnt;
            mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
          }
wxchan's avatar
wxchan committed
142
        }
Guolin Ke's avatar
Guolin Ke committed
143
      }
Guolin Ke's avatar
Guolin Ke committed
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
      ++bin_cnt;
      // update bin upper bound
      bin_upper_bound_ = std::vector<double>(bin_cnt);
      num_bin_ = bin_cnt;
      for (int i = 0; i < bin_cnt - 1; ++i) {
        bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
      }
      // last bin upper bound
      bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
    }

  } else {
    // convert to int type first
    std::vector<int> distinct_values_int;
    std::vector<int> counts_int;
    distinct_values_int.push_back(static_cast<int>(distinct_values[0]));
    counts_int.push_back(counts[0]);
    for (size_t i = 1; i < distinct_values.size(); ++i) {
      if (static_cast<int>(distinct_values[i]) != distinct_values_int.back()) {
        distinct_values_int.push_back(static_cast<int>(distinct_values[i]));
        counts_int.push_back(counts[i]);
      } else {
        counts_int.back() += counts[i];
      }
Guolin Ke's avatar
Guolin Ke committed
168
    }
Guolin Ke's avatar
Guolin Ke committed
169
170
171
    // sort by counts
    Common::SortForPair<int, int>(counts_int, distinct_values_int, 0, true);
    // will ingore the categorical of small counts
172
    const int cut_cnt = static_cast<int>(sample_size * 0.95f);
Guolin Ke's avatar
Guolin Ke committed
173
    categorical_2_bin_.clear();
174
175
    bin_2_categorical_.clear();
    num_bin_ = 0;
Guolin Ke's avatar
Guolin Ke committed
176
    int used_cnt = 0;
177
178
179
180
181
182
    max_bin = std::min(static_cast<int>(distinct_values_int.size()), max_bin);
    while (used_cnt < cut_cnt || num_bin_ < max_bin ) {
      bin_2_categorical_.push_back(distinct_values_int[num_bin_]);
      categorical_2_bin_[distinct_values_int[num_bin_]] = static_cast<unsigned int>(num_bin_);
      used_cnt += counts_int[num_bin_];
      ++num_bin_;
Guolin Ke's avatar
Guolin Ke committed
183
    }
184
185
    cnt_in_bin = counts_int;
    cnt_in_bin[0] += static_cast<int>(sample_size) - used_cnt;
Guolin Ke's avatar
Guolin Ke committed
186
  }
Guolin Ke's avatar
Guolin Ke committed
187

Guolin Ke's avatar
Guolin Ke committed
188
189
190
191
192
193
194
  // check trival(num_bin_ == 1) feature
  if (num_bin_ <= 1) {
    is_trival_ = true;
  } else {
    is_trival_ = false;
  }
  // calculate sparse rate
195
196
  CHECK(num_bin_ <= max_bin);
  sparse_rate_ = static_cast<double>(cnt_in_bin[GetDefaultBin()]) / static_cast<double>(sample_size);
Guolin Ke's avatar
Guolin Ke committed
197
198
199
200
201
202
203
}


int BinMapper::SizeForSpecificBin(int bin) {
  int size = 0;
  size += sizeof(int);
  size += sizeof(bool);
204
  size += sizeof(double);
Guolin Ke's avatar
Guolin Ke committed
205
  size += sizeof(BinType);
206
  size += bin * sizeof(double);
Guolin Ke's avatar
Guolin Ke committed
207
208
209
210
211
212
213
214
215
216
  return size;
}

void BinMapper::CopyTo(char * buffer) {
  std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(buffer, &is_trival_, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
Guolin Ke's avatar
Guolin Ke committed
217
218
  std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
  buffer += sizeof(bin_type_);
219
220
221
222
223
  std::memcpy(&min_val_, buffer, sizeof(min_val_));
  buffer += sizeof(min_val_);
  std::memcpy(&max_val_, buffer, sizeof(max_val_));
  buffer += sizeof(max_val_);

Guolin Ke's avatar
Guolin Ke committed
224
225
226
227
228
  if (bin_type_ == BinType::NumericalBin) {
    std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
  } else {
    std::memcpy(buffer, bin_2_categorical_.data(), num_bin_ * sizeof(int));
  }
Guolin Ke's avatar
Guolin Ke committed
229
230
231
232
233
234
235
236
237
}

void BinMapper::CopyFrom(const char * buffer) {
  std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(&is_trival_, buffer, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
Guolin Ke's avatar
Guolin Ke committed
238
239
  std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
  buffer += sizeof(bin_type_);
240
241
242
243
244
  std::memcpy(&min_val_, buffer, sizeof(min_val_));
  buffer += sizeof(min_val_);
  std::memcpy(&max_val_, buffer, sizeof(max_val_));
  buffer += sizeof(max_val_);

Guolin Ke's avatar
Guolin Ke committed
245
246
247
248
249
250
251
252
253
254
255
  if (bin_type_ == BinType::NumericalBin) {
    bin_upper_bound_ = std::vector<double>(num_bin_);
    std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
  } else {
    bin_2_categorical_ = std::vector<int>(num_bin_);
    std::memcpy(bin_2_categorical_.data(), buffer, num_bin_ * sizeof(int));
    categorical_2_bin_.clear();
    for (int i = 0; i < num_bin_; ++i) {
      categorical_2_bin_[bin_2_categorical_[i]] = static_cast<unsigned int>(i);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
256
257
258
259
260
261
}

void BinMapper::SaveBinaryToFile(FILE* file) const {
  fwrite(&num_bin_, sizeof(num_bin_), 1, file);
  fwrite(&is_trival_, sizeof(is_trival_), 1, file);
  fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
Guolin Ke's avatar
Guolin Ke committed
262
  fwrite(&bin_type_, sizeof(bin_type_), 1, file);
263
264
  fwrite(&min_val_, sizeof(min_val_), 1, file);
  fwrite(&max_val_, sizeof(max_val_), 1, file);
Guolin Ke's avatar
Guolin Ke committed
265
266
267
268
269
  if (bin_type_ == BinType::NumericalBin) {
    fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
  } else {
    fwrite(bin_2_categorical_.data(), sizeof(int), num_bin_, file);
  }
Guolin Ke's avatar
Guolin Ke committed
270
271
272
}

size_t BinMapper::SizesInByte() const {
Guolin Ke's avatar
Guolin Ke committed
273
  size_t ret = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_)
274
    + sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_);
Guolin Ke's avatar
Guolin Ke committed
275
276
277
278
279
280
  if (bin_type_ == BinType::NumericalBin) {
    ret += sizeof(double) *  num_bin_;
  } else {
    ret += sizeof(int) * num_bin_;
  }
  return ret;
Guolin Ke's avatar
Guolin Ke committed
281
282
283
284
285
286
}

template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
287
288
289
290
template class DenseCategoricalBin<uint8_t>;
template class DenseCategoricalBin<uint16_t>;
template class DenseCategoricalBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
291
292
293
294
template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
295
296
297
298
template class SparseCategoricalBin<uint8_t>;
template class SparseCategoricalBin<uint16_t>;
template class SparseCategoricalBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
299
300
301
302
303
template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;


Guolin Ke's avatar
Guolin Ke committed
304
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate, 
305
  bool is_enable_sparse, bool* is_sparse, uint32_t default_bin, BinType bin_type) {
Guolin Ke's avatar
Guolin Ke committed
306
  // sparse threshold
307
  const double kSparseThreshold = 0.8f;
Guolin Ke's avatar
Guolin Ke committed
308
309
  if (sparse_rate >= kSparseThreshold && is_enable_sparse) {
    *is_sparse = true;
Guolin Ke's avatar
Guolin Ke committed
310
    return CreateSparseBin(num_data, num_bin, default_bin, bin_type);
Guolin Ke's avatar
Guolin Ke committed
311
312
  } else {
    *is_sparse = false;
Guolin Ke's avatar
Guolin Ke committed
313
    return CreateDenseBin(num_data, num_bin, default_bin, bin_type);
Guolin Ke's avatar
Guolin Ke committed
314
315
316
  }
}

317
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin, uint32_t default_bin, BinType bin_type) {
Guolin Ke's avatar
Guolin Ke committed
318
  if (bin_type == BinType::NumericalBin) {
319
    if (num_bin <= 255) {
Guolin Ke's avatar
Guolin Ke committed
320
      return new DenseBin<uint8_t>(num_data, default_bin);
321
    } else if (num_bin <= 65535) {
Guolin Ke's avatar
Guolin Ke committed
322
323
324
325
      return new DenseBin<uint16_t>(num_data, default_bin);
    } else {
      return new DenseBin<uint32_t>(num_data, default_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
326
  } else {
327
    if (num_bin <= 255) {
Guolin Ke's avatar
Guolin Ke committed
328
      return new DenseCategoricalBin<uint8_t>(num_data, default_bin);
329
    } else if (num_bin <= 65535) {
Guolin Ke's avatar
Guolin Ke committed
330
331
332
333
      return new DenseCategoricalBin<uint16_t>(num_data, default_bin);
    } else {
      return new DenseCategoricalBin<uint32_t>(num_data, default_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
334
335
336
  }
}

337
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin, uint32_t default_bin, BinType bin_type) {
Guolin Ke's avatar
Guolin Ke committed
338
  if (bin_type == BinType::NumericalBin) {
339
    if (num_bin <= 255) {
Guolin Ke's avatar
Guolin Ke committed
340
      return new SparseBin<uint8_t>(num_data, default_bin);
341
    } else if (num_bin <= 65535) {
Guolin Ke's avatar
Guolin Ke committed
342
343
344
345
      return new SparseBin<uint16_t>(num_data, default_bin);
    } else {
      return new SparseBin<uint32_t>(num_data, default_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
346
  } else {
347
    if (num_bin <= 255) {
Guolin Ke's avatar
Guolin Ke committed
348
      return new SparseCategoricalBin<uint8_t>(num_data, default_bin);
349
    } else if (num_bin <= 65535) {
Guolin Ke's avatar
Guolin Ke committed
350
351
352
353
      return new SparseCategoricalBin<uint16_t>(num_data, default_bin);
    } else {
      return new SparseCategoricalBin<uint32_t>(num_data, default_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
354
355
356
357
  }
}

}  // namespace LightGBM