bin.cpp 11.3 KB
Newer Older
1
#include <LightGBM/utils/common.h>
Guolin Ke's avatar
Guolin Ke committed
2
3
4
5
#include <LightGBM/bin.h>

#include "dense_bin.hpp"
#include "sparse_bin.hpp"
6
#include "ordered_sparse_bin.hpp"
Guolin Ke's avatar
Guolin Ke committed
7
8
9
10
11
12
13
14
15
16
17

#include <cmath>
#include <cstring>
#include <cstdint>

#include <limits>
#include <vector>
#include <algorithm>

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
18
BinMapper::BinMapper() {
Guolin Ke's avatar
Guolin Ke committed
19
20
21
}

// deep copy function for BinMapper
Guolin Ke's avatar
Guolin Ke committed
22
BinMapper::BinMapper(const BinMapper& other) {
Guolin Ke's avatar
Guolin Ke committed
23
24
25
  num_bin_ = other.num_bin_;
  is_trival_ = other.is_trival_;
  sparse_rate_ = other.sparse_rate_;
Guolin Ke's avatar
Guolin Ke committed
26
27
28
29
30
31
  bin_type_ = other.bin_type_;
  if (bin_type_ == BinType::NumericalBin) {
    bin_upper_bound_ = other.bin_upper_bound_;
  } else {
    bin_2_categorical_ = other.bin_2_categorical_;
    categorical_2_bin_ = other.categorical_2_bin_;
Guolin Ke's avatar
Guolin Ke committed
32
  }
Guolin Ke's avatar
Guolin Ke committed
33

Guolin Ke's avatar
Guolin Ke committed
34
35
}

Guolin Ke's avatar
Guolin Ke committed
36
BinMapper::BinMapper(const void* memory) {
Guolin Ke's avatar
Guolin Ke committed
37
38
39
40
  CopyFrom(reinterpret_cast<const char*>(memory));
}

BinMapper::~BinMapper() {
Guolin Ke's avatar
Guolin Ke committed
41

Guolin Ke's avatar
Guolin Ke committed
42
43
}

Guolin Ke's avatar
Guolin Ke committed
44
45
void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, int max_bin, BinType bin_type) {
  bin_type_ = bin_type;
46
  std::vector<double>& ref_values = (*values);
Guolin Ke's avatar
Guolin Ke committed
47
  size_t sample_size = total_sample_cnt;
Guolin Ke's avatar
Guolin Ke committed
48
  int zero_cnt = static_cast<int>(total_sample_cnt - ref_values.size());
Guolin Ke's avatar
Guolin Ke committed
49
  // find distinct_values first
50
  std::vector<double> distinct_values;
51
  std::vector<int> counts;
Guolin Ke's avatar
Guolin Ke committed
52

53
  std::sort(ref_values.begin(), ref_values.end());
Guolin Ke's avatar
Guolin Ke committed
54
55

  // push zero in the front
Guolin Ke's avatar
Guolin Ke committed
56
  if (ref_values.empty() || (ref_values[0] > 0.0f && zero_cnt > 0)) {
Guolin Ke's avatar
Guolin Ke committed
57
58
    distinct_values.push_back(0);
    counts.push_back(zero_cnt);
Guolin Ke's avatar
Guolin Ke committed
59
  }
Guolin Ke's avatar
Guolin Ke committed
60

Guolin Ke's avatar
Guolin Ke committed
61
  if (!ref_values.empty()) {
Guolin Ke's avatar
Guolin Ke committed
62
63
64
    distinct_values.push_back(ref_values[0]);
    counts.push_back(1);
  }
Guolin Ke's avatar
Guolin Ke committed
65

66
67
  for (size_t i = 1; i < ref_values.size(); ++i) {
    if (ref_values[i] != ref_values[i - 1]) {
Guolin Ke's avatar
Guolin Ke committed
68
69
70
71
72
73
      if (ref_values[i - 1] == 0.0f) {
        counts.back() += zero_cnt;
      } else if (ref_values[i - 1] < 0.0f && ref_values[i] > 0.0f) {
        distinct_values.push_back(0);
        counts.push_back(zero_cnt);
      }
74
75
      distinct_values.push_back(ref_values[i]);
      counts.push_back(1);
Guolin Ke's avatar
Guolin Ke committed
76
    } else {
77
      ++counts.back();
Guolin Ke's avatar
Guolin Ke committed
78
79
    }
  }
Guolin Ke's avatar
Guolin Ke committed
80
81

  // push zero in the back
Guolin Ke's avatar
Guolin Ke committed
82
  if (!ref_values.empty() && ref_values.back() < 0.0f && zero_cnt > 0) {
Guolin Ke's avatar
Guolin Ke committed
83
84
85
    distinct_values.push_back(0);
    counts.push_back(zero_cnt);
  }
86
87
  min_val_ = distinct_values.front();
  max_val_ = distinct_values.back();
88
  int num_values = static_cast<int>(distinct_values.size());
Guolin Ke's avatar
Guolin Ke committed
89
  int cnt_in_bin0 = 0;
Guolin Ke's avatar
Guolin Ke committed
90
91
92
93
94
95
96
97
  if (bin_type_ == BinType::NumericalBin) {
    if (num_values <= max_bin) {
      std::sort(distinct_values.begin(), distinct_values.end());
      // use distinct value is enough
      num_bin_ = num_values;
      bin_upper_bound_ = std::vector<double>(num_values);
      for (int i = 0; i < num_values - 1; ++i) {
        bin_upper_bound_[i] = (distinct_values[i] + distinct_values[i + 1]) / 2;
wxchan's avatar
wxchan committed
98
      }
Guolin Ke's avatar
Guolin Ke committed
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
      cnt_in_bin0 = counts[0];
      bin_upper_bound_[num_values - 1] = std::numeric_limits<double>::infinity();
    } else {
      // mean size for one bin
      double mean_bin_size = sample_size / static_cast<double>(max_bin);
      int rest_bin_cnt = max_bin;
      int rest_sample_cnt = static_cast<int>(sample_size);
      std::vector<bool> is_big_count_value(num_values, false);
      for (int i = 0; i < num_values; ++i) {
        if (counts[i] >= mean_bin_size) {
          is_big_count_value[i] = true;
          --rest_bin_cnt;
          rest_sample_cnt -= counts[i];
        }
      }
      mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
wxchan's avatar
wxchan committed
115

Guolin Ke's avatar
Guolin Ke committed
116
117
      std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
      std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
Guolin Ke's avatar
Guolin Ke committed
118

Guolin Ke's avatar
Guolin Ke committed
119
120
121
122
      int bin_cnt = 0;
      lower_bounds[bin_cnt] = distinct_values[0];
      int cur_cnt_inbin = 0;
      for (int i = 0; i < num_values - 1; ++i) {
wxchan's avatar
wxchan committed
123
        if (!is_big_count_value[i]) {
Guolin Ke's avatar
Guolin Ke committed
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
          rest_sample_cnt -= counts[i];
        }
        cur_cnt_inbin += counts[i];
        // need a new bin
        if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
          (is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
          upper_bounds[bin_cnt] = distinct_values[i];
          if (bin_cnt == 0) {
            cnt_in_bin0 = cur_cnt_inbin;
          }
          ++bin_cnt;
          lower_bounds[bin_cnt] = distinct_values[i + 1];
          if (bin_cnt >= max_bin - 1) { break; }
          cur_cnt_inbin = 0;
          if (!is_big_count_value[i]) {
            --rest_bin_cnt;
            mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
          }
wxchan's avatar
wxchan committed
142
        }
Guolin Ke's avatar
Guolin Ke committed
143
      }
Guolin Ke's avatar
Guolin Ke committed
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
      //
      ++bin_cnt;
      // update bin upper bound
      bin_upper_bound_ = std::vector<double>(bin_cnt);
      num_bin_ = bin_cnt;
      for (int i = 0; i < bin_cnt - 1; ++i) {
        bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
      }
      // last bin upper bound
      bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
    }

  } else {
    // convert to int type first
    std::vector<int> distinct_values_int;
    std::vector<int> counts_int;
    distinct_values_int.push_back(static_cast<int>(distinct_values[0]));
    counts_int.push_back(counts[0]);
    for (size_t i = 1; i < distinct_values.size(); ++i) {
      if (static_cast<int>(distinct_values[i]) != distinct_values_int.back()) {
        distinct_values_int.push_back(static_cast<int>(distinct_values[i]));
        counts_int.push_back(counts[i]);
      } else {
        counts_int.back() += counts[i];
      }
Guolin Ke's avatar
Guolin Ke committed
169
    }
Guolin Ke's avatar
Guolin Ke committed
170
171
172
173
174
175
176
177
178
179
180
    // sort by counts
    Common::SortForPair<int, int>(counts_int, distinct_values_int, 0, true);
    // will ingore the categorical of small counts
    num_bin_ = std::min(max_bin, static_cast<int>(counts_int.size()));
    categorical_2_bin_.clear();
    bin_2_categorical_ = std::vector<int>(num_bin_);
    int used_cnt = 0;
    for (int i = 0; i < num_bin_; ++i) {
      bin_2_categorical_[i] = distinct_values_int[i];
      categorical_2_bin_[distinct_values_int[i]] = static_cast<unsigned int>(i);
      used_cnt += counts_int[i];
Guolin Ke's avatar
Guolin Ke committed
181
    }
Guolin Ke's avatar
Guolin Ke committed
182
183
184
185
186
    if (used_cnt / static_cast<double>(sample_size) < 0.95f) {
      Log::Warning("Too many categoricals are ignored, \
                   please use bigger max_bin or partition this column ");
    }
    cnt_in_bin0 = static_cast<int>(sample_size) - used_cnt + counts_int[0];
Guolin Ke's avatar
Guolin Ke committed
187
  }
Guolin Ke's avatar
Guolin Ke committed
188

Guolin Ke's avatar
Guolin Ke committed
189
190
191
192
193
194
195
  // check trival(num_bin_ == 1) feature
  if (num_bin_ <= 1) {
    is_trival_ = true;
  } else {
    is_trival_ = false;
  }
  // calculate sparse rate
196
  sparse_rate_ = static_cast<double>(cnt_in_bin0) / static_cast<double>(sample_size);
Guolin Ke's avatar
Guolin Ke committed
197
198
199
200
201
202
203
}


int BinMapper::SizeForSpecificBin(int bin) {
  int size = 0;
  size += sizeof(int);
  size += sizeof(bool);
204
  size += sizeof(double);
Guolin Ke's avatar
Guolin Ke committed
205
  size += sizeof(BinType);
206
  size += bin * sizeof(double);
Guolin Ke's avatar
Guolin Ke committed
207
208
209
210
211
212
213
214
215
216
  return size;
}

void BinMapper::CopyTo(char * buffer) {
  std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(buffer, &is_trival_, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
Guolin Ke's avatar
Guolin Ke committed
217
218
219
220
221
222
223
  std::memcpy(buffer, &bin_type_, sizeof(bin_type_));
  buffer += sizeof(bin_type_);
  if (bin_type_ == BinType::NumericalBin) {
    std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
  } else {
    std::memcpy(buffer, bin_2_categorical_.data(), num_bin_ * sizeof(int));
  }
Guolin Ke's avatar
Guolin Ke committed
224
225
226
227
228
229
230
231
232
}

void BinMapper::CopyFrom(const char * buffer) {
  std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(&is_trival_, buffer, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
Guolin Ke's avatar
Guolin Ke committed
233
234
235
236
237
238
239
240
241
242
243
244
245
  std::memcpy(&bin_type_, buffer, sizeof(bin_type_));
  buffer += sizeof(bin_type_);
  if (bin_type_ == BinType::NumericalBin) {
    bin_upper_bound_ = std::vector<double>(num_bin_);
    std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
  } else {
    bin_2_categorical_ = std::vector<int>(num_bin_);
    std::memcpy(bin_2_categorical_.data(), buffer, num_bin_ * sizeof(int));
    categorical_2_bin_.clear();
    for (int i = 0; i < num_bin_; ++i) {
      categorical_2_bin_[bin_2_categorical_[i]] = static_cast<unsigned int>(i);
    }
  }
Guolin Ke's avatar
Guolin Ke committed
246
247
248
249
250
251
}

void BinMapper::SaveBinaryToFile(FILE* file) const {
  fwrite(&num_bin_, sizeof(num_bin_), 1, file);
  fwrite(&is_trival_, sizeof(is_trival_), 1, file);
  fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
Guolin Ke's avatar
Guolin Ke committed
252
253
254
255
256
257
  fwrite(&bin_type_, sizeof(bin_type_), 1, file);
  if (bin_type_ == BinType::NumericalBin) {
    fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
  } else {
    fwrite(bin_2_categorical_.data(), sizeof(int), num_bin_, file);
  }
Guolin Ke's avatar
Guolin Ke committed
258
259
260
}

size_t BinMapper::SizesInByte() const {
Guolin Ke's avatar
Guolin Ke committed
261
262
263
264
265
266
267
268
  size_t ret = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_)
    + sizeof(bin_type_);
  if (bin_type_ == BinType::NumericalBin) {
    ret += sizeof(double) *  num_bin_;
  } else {
    ret += sizeof(int) * num_bin_;
  }
  return ret;
Guolin Ke's avatar
Guolin Ke committed
269
270
271
272
273
274
}

template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
275
276
277
278
template class DenseCategoricalBin<uint8_t>;
template class DenseCategoricalBin<uint16_t>;
template class DenseCategoricalBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
279
280
281
282
template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
283
284
285
286
template class SparseCategoricalBin<uint8_t>;
template class SparseCategoricalBin<uint16_t>;
template class SparseCategoricalBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
287
288
289
290
291
template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;


Guolin Ke's avatar
Guolin Ke committed
292
293
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate, 
  bool is_enable_sparse, bool* is_sparse, int default_bin, BinType bin_type) {
Guolin Ke's avatar
Guolin Ke committed
294
  // sparse threshold
295
  const double kSparseThreshold = 0.8f;
Guolin Ke's avatar
Guolin Ke committed
296
297
  if (sparse_rate >= kSparseThreshold && is_enable_sparse) {
    *is_sparse = true;
Guolin Ke's avatar
Guolin Ke committed
298
    return CreateSparseBin(num_data, num_bin, default_bin, bin_type);
Guolin Ke's avatar
Guolin Ke committed
299
300
  } else {
    *is_sparse = false;
Guolin Ke's avatar
Guolin Ke committed
301
    return CreateDenseBin(num_data, num_bin, default_bin, bin_type);
Guolin Ke's avatar
Guolin Ke committed
302
303
304
  }
}

Guolin Ke's avatar
Guolin Ke committed
305
306
307
308
309
310
311
312
313
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin, int default_bin, BinType bin_type) {
  if (bin_type == BinType::NumericalBin) {
    if (num_bin <= 256) {
      return new DenseBin<uint8_t>(num_data, default_bin);
    } else if (num_bin <= 65536) {
      return new DenseBin<uint16_t>(num_data, default_bin);
    } else {
      return new DenseBin<uint32_t>(num_data, default_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
314
  } else {
Guolin Ke's avatar
Guolin Ke committed
315
316
317
318
319
320
321
    if (num_bin <= 256) {
      return new DenseCategoricalBin<uint8_t>(num_data, default_bin);
    } else if (num_bin <= 65536) {
      return new DenseCategoricalBin<uint16_t>(num_data, default_bin);
    } else {
      return new DenseCategoricalBin<uint32_t>(num_data, default_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
322
323
324
  }
}

Guolin Ke's avatar
Guolin Ke committed
325
326
327
328
329
330
331
332
333
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin, int default_bin, BinType bin_type) {
  if (bin_type == BinType::NumericalBin) {
    if (num_bin <= 256) {
      return new SparseBin<uint8_t>(num_data, default_bin);
    } else if (num_bin <= 65536) {
      return new SparseBin<uint16_t>(num_data, default_bin);
    } else {
      return new SparseBin<uint32_t>(num_data, default_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
334
  } else {
Guolin Ke's avatar
Guolin Ke committed
335
336
337
338
339
340
341
    if (num_bin <= 256) {
      return new SparseCategoricalBin<uint8_t>(num_data, default_bin);
    } else if (num_bin <= 65536) {
      return new SparseCategoricalBin<uint16_t>(num_data, default_bin);
    } else {
      return new SparseCategoricalBin<uint32_t>(num_data, default_bin);
    }
Guolin Ke's avatar
Guolin Ke committed
342
343
344
345
  }
}

}  // namespace LightGBM