bin.cpp 11.9 KB
Newer Older
1
#include <LightGBM/utils/common.h>
Guolin Ke's avatar
Guolin Ke committed
2
3
4
5
#include <LightGBM/bin.h>

#include "dense_bin.hpp"
#include "sparse_bin.hpp"
6
#include "ordered_sparse_bin.hpp"
Guolin Ke's avatar
Guolin Ke committed
7
8
9
10
11
12
13
14
15
16
17

#include <cmath>
#include <cstring>
#include <cstdint>

#include <limits>
#include <vector>
#include <algorithm>

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
18
BinMapper::BinMapper() {
Guolin Ke's avatar
Guolin Ke committed
19
20
21
}

// deep copy function for BinMapper
Guolin Ke's avatar
Guolin Ke committed
22
BinMapper::BinMapper(const BinMapper& other) {
Guolin Ke's avatar
Guolin Ke committed
23
24
25
  num_bin_ = other.num_bin_;
  is_trival_ = other.is_trival_;
  sparse_rate_ = other.sparse_rate_;
Guolin Ke's avatar
Guolin Ke committed
26
27
28
29
30
31
  bin_type_ = other.bin_type_;
  if (bin_type_ == BinType::NumericalBin) {
    bin_upper_bound_ = other.bin_upper_bound_;
  } else {
    bin_2_categorical_ = other.bin_2_categorical_;
    categorical_2_bin_ = other.categorical_2_bin_;
Guolin Ke's avatar
Guolin Ke committed
32
  }
33
34
  min_val_ = other.min_val_;
  max_val_ = other.max_val_;
Guolin Ke's avatar
Guolin Ke committed
35

Guolin Ke's avatar
Guolin Ke committed
36
37
}

Guolin Ke's avatar
Guolin Ke committed
38
BinMapper::BinMapper(const void* memory) {
Guolin Ke's avatar
Guolin Ke committed
39
40
41
42
  CopyFrom(reinterpret_cast<const char*>(memory));
}

BinMapper::~BinMapper() {
Guolin Ke's avatar
Guolin Ke committed
43

Guolin Ke's avatar
Guolin Ke committed
44
45
}

46
void BinMapper::FindBin(const std::string& column_name, std::vector<double>* values, size_t total_sample_cnt, int max_bin, BinType bin_type) {
Guolin Ke's avatar
Guolin Ke committed
47
  bin_type_ = bin_type;
48
  std::vector<double>& ref_values = (*values);
Guolin Ke's avatar
Guolin Ke committed
49
  size_t sample_size = total_sample_cnt;
Guolin Ke's avatar
Guolin Ke committed
50
  int zero_cnt = static_cast<int>(total_sample_cnt - ref_values.size());
Guolin Ke's avatar
Guolin Ke committed
51
  // find distinct_values first
52
  std::vector<double> distinct_values;
53
  std::vector<int> counts;
Guolin Ke's avatar
Guolin Ke committed
54

55
  std::sort(ref_values.begin(), ref_values.end());
Guolin Ke's avatar
Guolin Ke committed
56
57

  // push zero in the front
Guolin Ke's avatar
Guolin Ke committed
58
  if (ref_values.empty() || (ref_values[0] > 0.0f && zero_cnt > 0)) {
Guolin Ke's avatar
Guolin Ke committed
59
60
    distinct_values.push_back(0);
    counts.push_back(zero_cnt);
Guolin Ke's avatar
Guolin Ke committed
61
  }
Guolin Ke's avatar
Guolin Ke committed
62

Guolin Ke's avatar
Guolin Ke committed
63
  if (!ref_values.empty()) {
Guolin Ke's avatar
Guolin Ke committed
64
65
66
    distinct_values.push_back(ref_values[0]);
    counts.push_back(1);
  }
Guolin Ke's avatar
Guolin Ke committed
67

68
69
  for (size_t i = 1; i < ref_values.size(); ++i) {
    if (ref_values[i] != ref_values[i - 1]) {
Guolin Ke's avatar
Guolin Ke committed
70
71
72
73
74
75
      if (ref_values[i - 1] == 0.0f) {
        counts.back() += zero_cnt;
      } else if (ref_values[i - 1] < 0.0f && ref_values[i] > 0.0f) {
        distinct_values.push_back(0);
        counts.push_back(zero_cnt);
      }
76
77
      distinct_values.push_back(ref_values[i]);
      counts.push_back(1);
Guolin Ke's avatar
Guolin Ke committed
78
    } else {
79
      ++counts.back();
Guolin Ke's avatar
Guolin Ke committed
80
81
    }
  }
Guolin Ke's avatar
Guolin Ke committed
82
83

  // push zero in the back
Guolin Ke's avatar
Guolin Ke committed
84
  if (!ref_values.empty() && ref_values.back() < 0.0f && zero_cnt > 0) {
Guolin Ke's avatar
Guolin Ke committed
85
86
87
    distinct_values.push_back(0);
    counts.push_back(zero_cnt);
  }
88
89
  min_val_ = distinct_values.front();
  max_val_ = distinct_values.back();
90
  std::vector<int> cnt_in_bin;
91
  int num_values = static_cast<int>(distinct_values.size());
Guolin Ke's avatar
Guolin Ke committed
92
  int cnt_in_bin0 = 0;
Guolin Ke's avatar
Guolin Ke committed
93
94
95
96
97
98
99
100
  if (bin_type_ == BinType::NumericalBin) {
    if (num_values <= max_bin) {
      std::sort(distinct_values.begin(), distinct_values.end());
      // use distinct value is enough
      num_bin_ = num_values;
      bin_upper_bound_ = std::vector<double>(num_values);
      for (int i = 0; i < num_values - 1; ++i) {
        bin_upper_bound_[i] = (distinct_values[i] + distinct_values[i + 1]) / 2;
wxchan's avatar
wxchan committed
101
      }
102
      cnt_in_bin = counts;
Guolin Ke's avatar
Guolin Ke committed
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
      bin_upper_bound_[num_values - 1] = std::numeric_limits<double>::infinity();
    } else {
      // mean size for one bin
      double mean_bin_size = sample_size / static_cast<double>(max_bin);
      int rest_bin_cnt = max_bin;
      int rest_sample_cnt = static_cast<int>(sample_size);
      std::vector<bool> is_big_count_value(num_values, false);
      for (int i = 0; i < num_values; ++i) {
        if (counts[i] >= mean_bin_size) {
          is_big_count_value[i] = true;
          --rest_bin_cnt;
          rest_sample_cnt -= counts[i];
        }
      }
      mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
wxchan's avatar
wxchan committed
118

Guolin Ke's avatar
Guolin Ke committed
119
120
      std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
      std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
Guolin Ke's avatar
Guolin Ke committed
121

Guolin Ke's avatar
Guolin Ke committed
122
123
124
125
      int bin_cnt = 0;
      lower_bounds[bin_cnt] = distinct_values[0];
      int cur_cnt_inbin = 0;
      for (int i = 0; i < num_values - 1; ++i) {
wxchan's avatar
wxchan committed
126
        if (!is_big_count_value[i]) {
Guolin Ke's avatar
Guolin Ke committed
127
128
129
130
131
132
133
          rest_sample_cnt -= counts[i];
        }
        cur_cnt_inbin += counts[i];
        // need a new bin
        if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
          (is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
          upper_bounds[bin_cnt] = distinct_values[i];
134
          cnt_in_bin.push_back(cur_cnt_inbin);
Guolin Ke's avatar
Guolin Ke committed
135
136
137
138
139
140
141
142
          ++bin_cnt;
          lower_bounds[bin_cnt] = distinct_values[i + 1];
          if (bin_cnt >= max_bin - 1) { break; }
          cur_cnt_inbin = 0;
          if (!is_big_count_value[i]) {
            --rest_bin_cnt;
            mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
          }
wxchan's avatar
wxchan committed
143
        }
Guolin Ke's avatar
Guolin Ke committed
144
      }
Guolin Ke's avatar
Guolin Ke committed
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
      //
      ++bin_cnt;
      // update bin upper bound
      bin_upper_bound_ = std::vector<double>(bin_cnt);
      num_bin_ = bin_cnt;
      for (int i = 0; i < bin_cnt - 1; ++i) {
        bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
      }
      // last bin upper bound
      bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
    }

  } else {
    // convert to int type first
    std::vector<int> distinct_values_int;
    std::vector<int> counts_int;
    distinct_values_int.push_back(static_cast<int>(distinct_values[0]));
    counts_int.push_back(counts[0]);
    for (size_t i = 1; i < distinct_values.size(); ++i) {
      if (static_cast<int>(distinct_values[i]) != distinct_values_int.back()) {
        distinct_values_int.push_back(static_cast<int>(distinct_values[i]));
        counts_int.push_back(counts[i]);
      } else {
        counts_int.back() += counts[i];
      }
Guolin Ke's avatar
Guolin Ke committed
170
    }
Guolin Ke's avatar
Guolin Ke committed
171
172
173
174
175
176
177
178
179
180
181
    // sort by counts
    Common::SortForPair<int, int>(counts_int, distinct_values_int, 0, true);
    // will ingore the categorical of small counts
    num_bin_ = std::min(max_bin, static_cast<int>(counts_int.size()));
    categorical_2_bin_.clear();
    bin_2_categorical_ = std::vector<int>(num_bin_);
    int used_cnt = 0;
    for (int i = 0; i < num_bin_; ++i) {
      bin_2_categorical_[i] = distinct_values_int[i];
      categorical_2_bin_[distinct_values_int[i]] = static_cast<unsigned int>(i);
      used_cnt += counts_int[i];
Guolin Ke's avatar
Guolin Ke committed
182
    }
Guolin Ke's avatar
Guolin Ke committed
183
184
    if (used_cnt / static_cast<double>(sample_size) < 0.95f) {
      Log::Warning("Too many categoricals are ignored, \
185
                   please use bigger max_bin or partition column \"%s\" ", column_name.c_str());
Guolin Ke's avatar
Guolin Ke committed
186
    }
187
188
    cnt_in_bin = counts_int;
    cnt_in_bin[0] += static_cast<int>(sample_size) - used_cnt;
Guolin Ke's avatar
Guolin Ke committed
189
  }
Guolin Ke's avatar
Guolin Ke committed
190

Guolin Ke's avatar
Guolin Ke committed
191
192
193
194
195
196
197
  // check trival(num_bin_ == 1) feature
  if (num_bin_ <= 1) {
    is_trival_ = true;
  } else {
    is_trival_ = false;
  }
  // calculate sparse rate
198
199
  CHECK(num_bin_ <= max_bin);
  sparse_rate_ = static_cast<double>(cnt_in_bin[GetDefaultBin()]) / static_cast<double>(sample_size);
Guolin Ke's avatar
Guolin Ke committed
200
201
202
203
204
205
206
}


int BinMapper::SizeForSpecificBin(int bin) {
  int size = 0;
  size += sizeof(int);
  size += sizeof(bool);
207
  size += sizeof(double);
Guolin Ke's avatar
Guolin Ke committed
208
  size += sizeof(BinType);
209
  size += bin * sizeof(double);
Guolin Ke's avatar
Guolin Ke committed
210
211
212
213
214
215
216
217
218
219
  return size;
}

void BinMapper::CopyTo(char * buffer) {
  std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(buffer, &is_trival_, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
Guolin Ke's avatar
Guolin Ke committed
220
221
  std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
  buffer += sizeof(bin_type_);
222
223
224
225
226
  std::memcpy(&min_val_, buffer, sizeof(min_val_));
  buffer += sizeof(min_val_);
  std::memcpy(&max_val_, buffer, sizeof(max_val_));
  buffer += sizeof(max_val_);

Guolin Ke's avatar
Guolin Ke committed
227
228
229
230
231
  if (bin_type_ == BinType::NumericalBin) {
    std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
  } else {
    std::memcpy(buffer, bin_2_categorical_.data(), num_bin_ * sizeof(int));
  }
Guolin Ke's avatar
Guolin Ke committed
232
233
234
235
236
237
238
239
240
}

void BinMapper::CopyFrom(const char * buffer) {
  std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(&is_trival_, buffer, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
Guolin Ke's avatar
Guolin Ke committed
241
242
  std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
  buffer += sizeof(bin_type_);
243
244
245
246
247
  std::memcpy(&min_val_, buffer, sizeof(min_val_));
  buffer += sizeof(min_val_);
  std::memcpy(&max_val_, buffer, sizeof(max_val_));
  buffer += sizeof(max_val_);

Guolin Ke's avatar
Guolin Ke committed
248
249
250
251
252
253
254
255
256
257
258
  if (bin_type_ == BinType::NumericalBin) {
    bin_upper_bound_ = std::vector<double>(num_bin_);
    std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
  } else {
    bin_2_categorical_ = std::vector<int>(num_bin_);
    std::memcpy(bin_2_categorical_.data(), buffer, num_bin_ * sizeof(int));
    categorical_2_bin_.clear();
    for (int i = 0; i < num_bin_; ++i) {
      categorical_2_bin_[bin_2_categorical_[i]] = static_cast<unsigned int>(i);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
259
260
261
262
263
264
}

void BinMapper::SaveBinaryToFile(FILE* file) const {
  fwrite(&num_bin_, sizeof(num_bin_), 1, file);
  fwrite(&is_trival_, sizeof(is_trival_), 1, file);
  fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
Guolin Ke's avatar
Guolin Ke committed
265
  fwrite(&bin_type_, sizeof(bin_type_), 1, file);
266
267
  fwrite(&min_val_, sizeof(min_val_), 1, file);
  fwrite(&max_val_, sizeof(max_val_), 1, file);
Guolin Ke's avatar
Guolin Ke committed
268
269
270
271
272
  if (bin_type_ == BinType::NumericalBin) {
    fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
  } else {
    fwrite(bin_2_categorical_.data(), sizeof(int), num_bin_, file);
  }
Guolin Ke's avatar
Guolin Ke committed
273
274
275
}

size_t BinMapper::SizesInByte() const {
Guolin Ke's avatar
Guolin Ke committed
276
  size_t ret = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_)
277
    + sizeof(bin_type_) + sizeof(min_val_) + sizeof(max_val_);
Guolin Ke's avatar
Guolin Ke committed
278
279
280
281
282
283
  if (bin_type_ == BinType::NumericalBin) {
    ret += sizeof(double) *  num_bin_;
  } else {
    ret += sizeof(int) * num_bin_;
  }
  return ret;
Guolin Ke's avatar
Guolin Ke committed
284
285
286
287
288
289
}

template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
290
291
292
293
template class DenseCategoricalBin<uint8_t>;
template class DenseCategoricalBin<uint16_t>;
template class DenseCategoricalBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
294
295
296
297
template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
298
299
300
301
template class SparseCategoricalBin<uint8_t>;
template class SparseCategoricalBin<uint16_t>;
template class SparseCategoricalBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
302
303
304
305
306
template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;


Guolin Ke's avatar
Guolin Ke committed
307
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate, 
308
  bool is_enable_sparse, bool* is_sparse, uint32_t default_bin, BinType bin_type) {
Guolin Ke's avatar
Guolin Ke committed
309
  // sparse threshold
310
  const double kSparseThreshold = 0.8f;
Guolin Ke's avatar
Guolin Ke committed
311
312
  if (sparse_rate >= kSparseThreshold && is_enable_sparse) {
    *is_sparse = true;
Guolin Ke's avatar
Guolin Ke committed
313
    return CreateSparseBin(num_data, num_bin, default_bin, bin_type);
Guolin Ke's avatar
Guolin Ke committed
314
315
  } else {
    *is_sparse = false;
Guolin Ke's avatar
Guolin Ke committed
316
    return CreateDenseBin(num_data, num_bin, default_bin, bin_type);
Guolin Ke's avatar
Guolin Ke committed
317
318
319
  }
}

320
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin, uint32_t default_bin, BinType bin_type) {
Guolin Ke's avatar
Guolin Ke committed
321
  if (bin_type == BinType::NumericalBin) {
322
    if (num_bin <= 255) {
Guolin Ke's avatar
Guolin Ke committed
323
      return new DenseBin<uint8_t>(num_data, default_bin);
324
    } else if (num_bin <= 65535) {
Guolin Ke's avatar
Guolin Ke committed
325
326
327
328
      return new DenseBin<uint16_t>(num_data, default_bin);
    } else {
      return new DenseBin<uint32_t>(num_data, default_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
329
  } else {
330
    if (num_bin <= 255) {
Guolin Ke's avatar
Guolin Ke committed
331
      return new DenseCategoricalBin<uint8_t>(num_data, default_bin);
332
    } else if (num_bin <= 65535) {
Guolin Ke's avatar
Guolin Ke committed
333
334
335
336
      return new DenseCategoricalBin<uint16_t>(num_data, default_bin);
    } else {
      return new DenseCategoricalBin<uint32_t>(num_data, default_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
337
338
339
  }
}

340
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin, uint32_t default_bin, BinType bin_type) {
Guolin Ke's avatar
Guolin Ke committed
341
  if (bin_type == BinType::NumericalBin) {
342
    if (num_bin <= 255) {
Guolin Ke's avatar
Guolin Ke committed
343
      return new SparseBin<uint8_t>(num_data, default_bin);
344
    } else if (num_bin <= 65535) {
Guolin Ke's avatar
Guolin Ke committed
345
346
347
348
      return new SparseBin<uint16_t>(num_data, default_bin);
    } else {
      return new SparseBin<uint32_t>(num_data, default_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
349
  } else {
350
    if (num_bin <= 255) {
Guolin Ke's avatar
Guolin Ke committed
351
      return new SparseCategoricalBin<uint8_t>(num_data, default_bin);
352
    } else if (num_bin <= 65535) {
Guolin Ke's avatar
Guolin Ke committed
353
354
355
356
      return new SparseCategoricalBin<uint16_t>(num_data, default_bin);
    } else {
      return new SparseCategoricalBin<uint32_t>(num_data, default_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
357
358
359
360
  }
}

}  // namespace LightGBM