bin.cpp 11.8 KB
Newer Older
1
#include <LightGBM/utils/common.h>
Guolin Ke's avatar
Guolin Ke committed
2
3
4
5
#include <LightGBM/bin.h>

#include "dense_bin.hpp"
#include "sparse_bin.hpp"
6
#include "ordered_sparse_bin.hpp"
Guolin Ke's avatar
Guolin Ke committed
7
8
9
10
11
12
13
14
15
16
17

#include <cmath>
#include <cstring>
#include <cstdint>

#include <limits>
#include <vector>
#include <algorithm>

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
18
BinMapper::BinMapper() {
Guolin Ke's avatar
Guolin Ke committed
19
20
21
}

// deep copy function for BinMapper
Guolin Ke's avatar
Guolin Ke committed
22
BinMapper::BinMapper(const BinMapper& other) {
Guolin Ke's avatar
Guolin Ke committed
23
24
25
  num_bin_ = other.num_bin_;
  is_trival_ = other.is_trival_;
  sparse_rate_ = other.sparse_rate_;
Guolin Ke's avatar
Guolin Ke committed
26
27
28
29
30
31
  bin_type_ = other.bin_type_;
  if (bin_type_ == BinType::NumericalBin) {
    bin_upper_bound_ = other.bin_upper_bound_;
  } else {
    bin_2_categorical_ = other.bin_2_categorical_;
    categorical_2_bin_ = other.categorical_2_bin_;
Guolin Ke's avatar
Guolin Ke committed
32
  }
33
34
  min_val_ = other.min_val_;
  max_val_ = other.max_val_;
Guolin Ke's avatar
Guolin Ke committed
35

Guolin Ke's avatar
Guolin Ke committed
36
37
}

Guolin Ke's avatar
Guolin Ke committed
38
BinMapper::BinMapper(const void* memory) {
Guolin Ke's avatar
Guolin Ke committed
39
40
41
42
  CopyFrom(reinterpret_cast<const char*>(memory));
}

BinMapper::~BinMapper() {
Guolin Ke's avatar
Guolin Ke committed
43

Guolin Ke's avatar
Guolin Ke committed
44
45
}

46
void BinMapper::FindBin(const std::string& column_name, std::vector<double>* values, size_t total_sample_cnt, int max_bin, BinType bin_type) {
Guolin Ke's avatar
Guolin Ke committed
47
  bin_type_ = bin_type;
48
  std::vector<double>& ref_values = (*values);
Guolin Ke's avatar
Guolin Ke committed
49
  size_t sample_size = total_sample_cnt;
Guolin Ke's avatar
Guolin Ke committed
50
  int zero_cnt = static_cast<int>(total_sample_cnt - ref_values.size());
Guolin Ke's avatar
Guolin Ke committed
51
  // find distinct_values first
52
  std::vector<double> distinct_values;
53
  std::vector<int> counts;
Guolin Ke's avatar
Guolin Ke committed
54

55
  std::sort(ref_values.begin(), ref_values.end());
Guolin Ke's avatar
Guolin Ke committed
56
57

  // push zero in the front
Guolin Ke's avatar
Guolin Ke committed
58
  if (ref_values.empty() || (ref_values[0] > 0.0f && zero_cnt > 0)) {
Guolin Ke's avatar
Guolin Ke committed
59
60
    distinct_values.push_back(0);
    counts.push_back(zero_cnt);
Guolin Ke's avatar
Guolin Ke committed
61
  }
Guolin Ke's avatar
Guolin Ke committed
62

Guolin Ke's avatar
Guolin Ke committed
63
  if (!ref_values.empty()) {
Guolin Ke's avatar
Guolin Ke committed
64
65
66
    distinct_values.push_back(ref_values[0]);
    counts.push_back(1);
  }
Guolin Ke's avatar
Guolin Ke committed
67

68
69
  for (size_t i = 1; i < ref_values.size(); ++i) {
    if (ref_values[i] != ref_values[i - 1]) {
Guolin Ke's avatar
Guolin Ke committed
70
71
72
73
74
75
      if (ref_values[i - 1] == 0.0f) {
        counts.back() += zero_cnt;
      } else if (ref_values[i - 1] < 0.0f && ref_values[i] > 0.0f) {
        distinct_values.push_back(0);
        counts.push_back(zero_cnt);
      }
76
77
      distinct_values.push_back(ref_values[i]);
      counts.push_back(1);
Guolin Ke's avatar
Guolin Ke committed
78
    } else {
79
      ++counts.back();
Guolin Ke's avatar
Guolin Ke committed
80
81
    }
  }
Guolin Ke's avatar
Guolin Ke committed
82
83

  // push zero in the back
Guolin Ke's avatar
Guolin Ke committed
84
  if (!ref_values.empty() && ref_values.back() < 0.0f && zero_cnt > 0) {
Guolin Ke's avatar
Guolin Ke committed
85
86
87
    distinct_values.push_back(0);
    counts.push_back(zero_cnt);
  }
88
89
  min_val_ = distinct_values.front();
  max_val_ = distinct_values.back();
90
  std::vector<int> cnt_in_bin;
91
  int num_values = static_cast<int>(distinct_values.size());
Guolin Ke's avatar
Guolin Ke committed
92
  int cnt_in_bin0 = 0;
Guolin Ke's avatar
Guolin Ke committed
93
94
95
96
97
98
99
100
  if (bin_type_ == BinType::NumericalBin) {
    if (num_values <= max_bin) {
      std::sort(distinct_values.begin(), distinct_values.end());
      // use distinct value is enough
      num_bin_ = num_values;
      bin_upper_bound_ = std::vector<double>(num_values);
      for (int i = 0; i < num_values - 1; ++i) {
        bin_upper_bound_[i] = (distinct_values[i] + distinct_values[i + 1]) / 2;
wxchan's avatar
wxchan committed
101
      }
102
      cnt_in_bin = counts;
Guolin Ke's avatar
Guolin Ke committed
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
      bin_upper_bound_[num_values - 1] = std::numeric_limits<double>::infinity();
    } else {
      // mean size for one bin
      double mean_bin_size = sample_size / static_cast<double>(max_bin);
      int rest_bin_cnt = max_bin;
      int rest_sample_cnt = static_cast<int>(sample_size);
      std::vector<bool> is_big_count_value(num_values, false);
      for (int i = 0; i < num_values; ++i) {
        if (counts[i] >= mean_bin_size) {
          is_big_count_value[i] = true;
          --rest_bin_cnt;
          rest_sample_cnt -= counts[i];
        }
      }
      mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
wxchan's avatar
wxchan committed
118

Guolin Ke's avatar
Guolin Ke committed
119
120
      std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
      std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
Guolin Ke's avatar
Guolin Ke committed
121

Guolin Ke's avatar
Guolin Ke committed
122
123
124
125
      int bin_cnt = 0;
      lower_bounds[bin_cnt] = distinct_values[0];
      int cur_cnt_inbin = 0;
      for (int i = 0; i < num_values - 1; ++i) {
wxchan's avatar
wxchan committed
126
        if (!is_big_count_value[i]) {
Guolin Ke's avatar
Guolin Ke committed
127
128
129
130
131
132
133
          rest_sample_cnt -= counts[i];
        }
        cur_cnt_inbin += counts[i];
        // need a new bin
        if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
          (is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
          upper_bounds[bin_cnt] = distinct_values[i];
134
          cnt_in_bin.push_back(cur_cnt_inbin);
Guolin Ke's avatar
Guolin Ke committed
135
136
137
138
139
140
141
142
          ++bin_cnt;
          lower_bounds[bin_cnt] = distinct_values[i + 1];
          if (bin_cnt >= max_bin - 1) { break; }
          cur_cnt_inbin = 0;
          if (!is_big_count_value[i]) {
            --rest_bin_cnt;
            mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
          }
wxchan's avatar
wxchan committed
143
        }
Guolin Ke's avatar
Guolin Ke committed
144
      }
Guolin Ke's avatar
Guolin Ke committed
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
      //
      ++bin_cnt;
      // update bin upper bound
      bin_upper_bound_ = std::vector<double>(bin_cnt);
      num_bin_ = bin_cnt;
      for (int i = 0; i < bin_cnt - 1; ++i) {
        bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
      }
      // last bin upper bound
      bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
    }

  } else {
    // convert to int type first
    std::vector<int> distinct_values_int;
    std::vector<int> counts_int;
    distinct_values_int.push_back(static_cast<int>(distinct_values[0]));
    counts_int.push_back(counts[0]);
    for (size_t i = 1; i < distinct_values.size(); ++i) {
      if (static_cast<int>(distinct_values[i]) != distinct_values_int.back()) {
        distinct_values_int.push_back(static_cast<int>(distinct_values[i]));
        counts_int.push_back(counts[i]);
      } else {
        counts_int.back() += counts[i];
      }
Guolin Ke's avatar
Guolin Ke committed
170
    }
Guolin Ke's avatar
Guolin Ke committed
171
172
173
    // sort by counts
    Common::SortForPair<int, int>(counts_int, distinct_values_int, 0, true);
    // will ingore the categorical of small counts
174
    const int cut_cnt = static_cast<int>(sample_size * 0.95f);
Guolin Ke's avatar
Guolin Ke committed
175
    categorical_2_bin_.clear();
176
177
    bin_2_categorical_.clear();
    num_bin_ = 0;
Guolin Ke's avatar
Guolin Ke committed
178
    int used_cnt = 0;
179
180
181
182
183
184
    max_bin = std::min(static_cast<int>(distinct_values_int.size()), max_bin);
    while (used_cnt < cut_cnt || num_bin_ < max_bin ) {
      bin_2_categorical_.push_back(distinct_values_int[num_bin_]);
      categorical_2_bin_[distinct_values_int[num_bin_]] = static_cast<unsigned int>(num_bin_);
      used_cnt += counts_int[num_bin_];
      ++num_bin_;
Guolin Ke's avatar
Guolin Ke committed
185
    }
186
187
    cnt_in_bin = counts_int;
    cnt_in_bin[0] += static_cast<int>(sample_size) - used_cnt;
Guolin Ke's avatar
Guolin Ke committed
188
  }
Guolin Ke's avatar
Guolin Ke committed
189

Guolin Ke's avatar
Guolin Ke committed
190
191
192
193
194
195
196
  // check trival(num_bin_ == 1) feature
  if (num_bin_ <= 1) {
    is_trival_ = true;
  } else {
    is_trival_ = false;
  }
  // calculate sparse rate
197
198
  CHECK(num_bin_ <= max_bin);
  sparse_rate_ = static_cast<double>(cnt_in_bin[GetDefaultBin()]) / static_cast<double>(sample_size);
Guolin Ke's avatar
Guolin Ke committed
199
200
201
202
203
204
205
}


int BinMapper::SizeForSpecificBin(int bin) {
  int size = 0;
  size += sizeof(int);
  size += sizeof(bool);
206
  size += sizeof(double);
Guolin Ke's avatar
Guolin Ke committed
207
  size += sizeof(BinType);
208
  size += bin * sizeof(double);
Guolin Ke's avatar
Guolin Ke committed
209
210
211
212
213
214
215
216
217
218
  return size;
}

void BinMapper::CopyTo(char * buffer) {
  std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(buffer, &is_trival_, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
Guolin Ke's avatar
Guolin Ke committed
219
220
  std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
  buffer += sizeof(bin_type_);
221
222
223
224
225
  std::memcpy(&min_val_, buffer, sizeof(min_val_));
  buffer += sizeof(min_val_);
  std::memcpy(&max_val_, buffer, sizeof(max_val_));
  buffer += sizeof(max_val_);

Guolin Ke's avatar
Guolin Ke committed
226
227
228
229
230
  if (bin_type_ == BinType::NumericalBin) {
    std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
  } else {
    std::memcpy(buffer, bin_2_categorical_.data(), num_bin_ * sizeof(int));
  }
Guolin Ke's avatar
Guolin Ke committed
231
232
233
234
235
236
237
238
239
}

void BinMapper::CopyFrom(const char * buffer) {
  std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(&is_trival_, buffer, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
Guolin Ke's avatar
Guolin Ke committed
240
241
  std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
  buffer += sizeof(bin_type_);
242
243
244
245
246
  std::memcpy(&min_val_, buffer, sizeof(min_val_));
  buffer += sizeof(min_val_);
  std::memcpy(&max_val_, buffer, sizeof(max_val_));
  buffer += sizeof(max_val_);

Guolin Ke's avatar
Guolin Ke committed
247
248
249
250
251
252
253
254
255
256
257
  if (bin_type_ == BinType::NumericalBin) {
    bin_upper_bound_ = std::vector<double>(num_bin_);
    std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
  } else {
    bin_2_categorical_ = std::vector<int>(num_bin_);
    std::memcpy(bin_2_categorical_.data(), buffer, num_bin_ * sizeof(int));
    categorical_2_bin_.clear();
    for (int i = 0; i < num_bin_; ++i) {
      categorical_2_bin_[bin_2_categorical_[i]] = static_cast<unsigned int>(i);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
258
259
260
261
262
263
}

void BinMapper::SaveBinaryToFile(FILE* file) const {
  fwrite(&num_bin_, sizeof(num_bin_), 1, file);
  fwrite(&is_trival_, sizeof(is_trival_), 1, file);
  fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
Guolin Ke's avatar
Guolin Ke committed
264
  fwrite(&bin_type_, sizeof(bin_type_), 1, file);
265
266
  fwrite(&min_val_, sizeof(min_val_), 1, file);
  fwrite(&max_val_, sizeof(max_val_), 1, file);
Guolin Ke's avatar
Guolin Ke committed
267
268
269
270
271
  if (bin_type_ == BinType::NumericalBin) {
    fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
  } else {
    fwrite(bin_2_categorical_.data(), sizeof(int), num_bin_, file);
  }
Guolin Ke's avatar
Guolin Ke committed
272
273
274
}

size_t BinMapper::SizesInByte() const {
Guolin Ke's avatar
Guolin Ke committed
275
  size_t ret = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_)
276
    + sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_);
Guolin Ke's avatar
Guolin Ke committed
277
278
279
280
281
282
  if (bin_type_ == BinType::NumericalBin) {
    ret += sizeof(double) *  num_bin_;
  } else {
    ret += sizeof(int) * num_bin_;
  }
  return ret;
Guolin Ke's avatar
Guolin Ke committed
283
284
285
286
287
288
}

template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
289
290
291
292
template class DenseCategoricalBin<uint8_t>;
template class DenseCategoricalBin<uint16_t>;
template class DenseCategoricalBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
293
294
295
296
template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
297
298
299
300
template class SparseCategoricalBin<uint8_t>;
template class SparseCategoricalBin<uint16_t>;
template class SparseCategoricalBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
301
302
303
304
305
template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;


Guolin Ke's avatar
Guolin Ke committed
306
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate, 
307
  bool is_enable_sparse, bool* is_sparse, uint32_t default_bin, BinType bin_type) {
Guolin Ke's avatar
Guolin Ke committed
308
  // sparse threshold
309
  const double kSparseThreshold = 0.8f;
Guolin Ke's avatar
Guolin Ke committed
310
311
  if (sparse_rate >= kSparseThreshold && is_enable_sparse) {
    *is_sparse = true;
Guolin Ke's avatar
Guolin Ke committed
312
    return CreateSparseBin(num_data, num_bin, default_bin, bin_type);
Guolin Ke's avatar
Guolin Ke committed
313
314
  } else {
    *is_sparse = false;
Guolin Ke's avatar
Guolin Ke committed
315
    return CreateDenseBin(num_data, num_bin, default_bin, bin_type);
Guolin Ke's avatar
Guolin Ke committed
316
317
318
  }
}

319
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin, uint32_t default_bin, BinType bin_type) {
Guolin Ke's avatar
Guolin Ke committed
320
  if (bin_type == BinType::NumericalBin) {
321
    if (num_bin <= 255) {
Guolin Ke's avatar
Guolin Ke committed
322
      return new DenseBin<uint8_t>(num_data, default_bin);
323
    } else if (num_bin <= 65535) {
Guolin Ke's avatar
Guolin Ke committed
324
325
326
327
      return new DenseBin<uint16_t>(num_data, default_bin);
    } else {
      return new DenseBin<uint32_t>(num_data, default_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
328
  } else {
329
    if (num_bin <= 255) {
Guolin Ke's avatar
Guolin Ke committed
330
      return new DenseCategoricalBin<uint8_t>(num_data, default_bin);
331
    } else if (num_bin <= 65535) {
Guolin Ke's avatar
Guolin Ke committed
332
333
334
335
      return new DenseCategoricalBin<uint16_t>(num_data, default_bin);
    } else {
      return new DenseCategoricalBin<uint32_t>(num_data, default_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
336
337
338
  }
}

339
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin, uint32_t default_bin, BinType bin_type) {
Guolin Ke's avatar
Guolin Ke committed
340
  if (bin_type == BinType::NumericalBin) {
341
    if (num_bin <= 255) {
Guolin Ke's avatar
Guolin Ke committed
342
      return new SparseBin<uint8_t>(num_data, default_bin);
343
    } else if (num_bin <= 65535) {
Guolin Ke's avatar
Guolin Ke committed
344
345
346
347
      return new SparseBin<uint16_t>(num_data, default_bin);
    } else {
      return new SparseBin<uint32_t>(num_data, default_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
348
  } else {
349
    if (num_bin <= 255) {
Guolin Ke's avatar
Guolin Ke committed
350
      return new SparseCategoricalBin<uint8_t>(num_data, default_bin);
351
    } else if (num_bin <= 65535) {
Guolin Ke's avatar
Guolin Ke committed
352
353
354
355
      return new SparseCategoricalBin<uint16_t>(num_data, default_bin);
    } else {
      return new SparseCategoricalBin<uint32_t>(num_data, default_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
356
357
358
359
  }
}

}  // namespace LightGBM