"vscode:/vscode.git/clone" did not exist on "28e9875e1851b4d11590e7947e40fbc4b6d90cbe"
bin.cpp 11.2 KB
Newer Older
1
#include <LightGBM/utils/common.h>
Guolin Ke's avatar
Guolin Ke committed
2
3
4
5
#include <LightGBM/bin.h>

#include "dense_bin.hpp"
#include "sparse_bin.hpp"
6
#include "ordered_sparse_bin.hpp"
Guolin Ke's avatar
Guolin Ke committed
7
8
9
10
11
12
13
14
15
16
17

#include <cmath>
#include <cstring>
#include <cstdint>

#include <limits>
#include <vector>
#include <algorithm>

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
18
// default constructor: the body performs no explicit member initialization
BinMapper::BinMapper() {}

// deep copy function for BinMapper
Guolin Ke's avatar
Guolin Ke committed
22
BinMapper::BinMapper(const BinMapper& other) {
Guolin Ke's avatar
Guolin Ke committed
23
24
25
  num_bin_ = other.num_bin_;
  is_trival_ = other.is_trival_;
  sparse_rate_ = other.sparse_rate_;
Guolin Ke's avatar
Guolin Ke committed
26
27
28
29
30
31
  bin_type_ = other.bin_type_;
  if (bin_type_ == BinType::NumericalBin) {
    bin_upper_bound_ = other.bin_upper_bound_;
  } else {
    bin_2_categorical_ = other.bin_2_categorical_;
    categorical_2_bin_ = other.categorical_2_bin_;
Guolin Ke's avatar
Guolin Ke committed
32
  }
Guolin Ke's avatar
Guolin Ke committed
33

Guolin Ke's avatar
Guolin Ke committed
34
35
}

Guolin Ke's avatar
Guolin Ke committed
36
// construct a mapper from a raw memory buffer; all decoding is delegated to CopyFrom
BinMapper::BinMapper(const void* memory) {
  const char* bytes = static_cast<const char*>(memory);
  CopyFrom(bytes);
}

// destructor: nothing is released explicitly here
BinMapper::~BinMapper() {}

Guolin Ke's avatar
Guolin Ke committed
44
45
void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, int max_bin, BinType bin_type) {
  bin_type_ = bin_type;
46
  std::vector<double>& ref_values = (*values);
Guolin Ke's avatar
Guolin Ke committed
47
  size_t sample_size = total_sample_cnt;
Guolin Ke's avatar
Guolin Ke committed
48
  int zero_cnt = static_cast<int>(total_sample_cnt - ref_values.size());
Guolin Ke's avatar
Guolin Ke committed
49
  // find distinct_values first
50
  std::vector<double> distinct_values;
51
  std::vector<int> counts;
Guolin Ke's avatar
Guolin Ke committed
52

53
  std::sort(ref_values.begin(), ref_values.end());
Guolin Ke's avatar
Guolin Ke committed
54
55

  // push zero in the front
Guolin Ke's avatar
Guolin Ke committed
56
  if (ref_values.empty() || (ref_values[0] > 0.0f && zero_cnt > 0)) {
Guolin Ke's avatar
Guolin Ke committed
57
58
    distinct_values.push_back(0);
    counts.push_back(zero_cnt);
Guolin Ke's avatar
Guolin Ke committed
59
  }
Guolin Ke's avatar
Guolin Ke committed
60

Guolin Ke's avatar
Guolin Ke committed
61
  if (!ref_values.empty()) {
Guolin Ke's avatar
Guolin Ke committed
62
63
64
    distinct_values.push_back(ref_values[0]);
    counts.push_back(1);
  }
Guolin Ke's avatar
Guolin Ke committed
65

66
67
  for (size_t i = 1; i < ref_values.size(); ++i) {
    if (ref_values[i] != ref_values[i - 1]) {
Guolin Ke's avatar
Guolin Ke committed
68
69
70
71
72
73
      if (ref_values[i - 1] == 0.0f) {
        counts.back() += zero_cnt;
      } else if (ref_values[i - 1] < 0.0f && ref_values[i] > 0.0f) {
        distinct_values.push_back(0);
        counts.push_back(zero_cnt);
      }
74
75
      distinct_values.push_back(ref_values[i]);
      counts.push_back(1);
Guolin Ke's avatar
Guolin Ke committed
76
    } else {
77
      ++counts.back();
Guolin Ke's avatar
Guolin Ke committed
78
79
    }
  }
Guolin Ke's avatar
Guolin Ke committed
80
81

  // push zero in the back
Guolin Ke's avatar
Guolin Ke committed
82
  if (!ref_values.empty() && ref_values.back() < 0.0f && zero_cnt > 0) {
Guolin Ke's avatar
Guolin Ke committed
83
84
85
86
    distinct_values.push_back(0);
    counts.push_back(zero_cnt);
  }

87
  int num_values = static_cast<int>(distinct_values.size());
Guolin Ke's avatar
Guolin Ke committed
88
  int cnt_in_bin0 = 0;
Guolin Ke's avatar
Guolin Ke committed
89
90
91
92
93
94
95
96
  if (bin_type_ == BinType::NumericalBin) {
    if (num_values <= max_bin) {
      std::sort(distinct_values.begin(), distinct_values.end());
      // use distinct value is enough
      num_bin_ = num_values;
      bin_upper_bound_ = std::vector<double>(num_values);
      for (int i = 0; i < num_values - 1; ++i) {
        bin_upper_bound_[i] = (distinct_values[i] + distinct_values[i + 1]) / 2;
wxchan's avatar
wxchan committed
97
      }
Guolin Ke's avatar
Guolin Ke committed
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
      cnt_in_bin0 = counts[0];
      bin_upper_bound_[num_values - 1] = std::numeric_limits<double>::infinity();
    } else {
      // mean size for one bin
      double mean_bin_size = sample_size / static_cast<double>(max_bin);
      int rest_bin_cnt = max_bin;
      int rest_sample_cnt = static_cast<int>(sample_size);
      std::vector<bool> is_big_count_value(num_values, false);
      for (int i = 0; i < num_values; ++i) {
        if (counts[i] >= mean_bin_size) {
          is_big_count_value[i] = true;
          --rest_bin_cnt;
          rest_sample_cnt -= counts[i];
        }
      }
      mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
wxchan's avatar
wxchan committed
114

Guolin Ke's avatar
Guolin Ke committed
115
116
      std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
      std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
Guolin Ke's avatar
Guolin Ke committed
117

Guolin Ke's avatar
Guolin Ke committed
118
119
120
121
      int bin_cnt = 0;
      lower_bounds[bin_cnt] = distinct_values[0];
      int cur_cnt_inbin = 0;
      for (int i = 0; i < num_values - 1; ++i) {
wxchan's avatar
wxchan committed
122
        if (!is_big_count_value[i]) {
Guolin Ke's avatar
Guolin Ke committed
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
          rest_sample_cnt -= counts[i];
        }
        cur_cnt_inbin += counts[i];
        // need a new bin
        if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
          (is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
          upper_bounds[bin_cnt] = distinct_values[i];
          if (bin_cnt == 0) {
            cnt_in_bin0 = cur_cnt_inbin;
          }
          ++bin_cnt;
          lower_bounds[bin_cnt] = distinct_values[i + 1];
          if (bin_cnt >= max_bin - 1) { break; }
          cur_cnt_inbin = 0;
          if (!is_big_count_value[i]) {
            --rest_bin_cnt;
            mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
          }
wxchan's avatar
wxchan committed
141
        }
Guolin Ke's avatar
Guolin Ke committed
142
      }
Guolin Ke's avatar
Guolin Ke committed
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
      //
      ++bin_cnt;
      // update bin upper bound
      bin_upper_bound_ = std::vector<double>(bin_cnt);
      num_bin_ = bin_cnt;
      for (int i = 0; i < bin_cnt - 1; ++i) {
        bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
      }
      // last bin upper bound
      bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
    }

  } else {
    // convert to int type first
    std::vector<int> distinct_values_int;
    std::vector<int> counts_int;
    distinct_values_int.push_back(static_cast<int>(distinct_values[0]));
    counts_int.push_back(counts[0]);
    for (size_t i = 1; i < distinct_values.size(); ++i) {
      if (static_cast<int>(distinct_values[i]) != distinct_values_int.back()) {
        distinct_values_int.push_back(static_cast<int>(distinct_values[i]));
        counts_int.push_back(counts[i]);
      } else {
        counts_int.back() += counts[i];
      }
Guolin Ke's avatar
Guolin Ke committed
168
    }
Guolin Ke's avatar
Guolin Ke committed
169
170
171
172
173
174
175
176
177
178
179
    // sort by counts
    Common::SortForPair<int, int>(counts_int, distinct_values_int, 0, true);
    // will ingore the categorical of small counts
    num_bin_ = std::min(max_bin, static_cast<int>(counts_int.size()));
    categorical_2_bin_.clear();
    bin_2_categorical_ = std::vector<int>(num_bin_);
    int used_cnt = 0;
    for (int i = 0; i < num_bin_; ++i) {
      bin_2_categorical_[i] = distinct_values_int[i];
      categorical_2_bin_[distinct_values_int[i]] = static_cast<unsigned int>(i);
      used_cnt += counts_int[i];
Guolin Ke's avatar
Guolin Ke committed
180
    }
Guolin Ke's avatar
Guolin Ke committed
181
182
183
184
185
    if (used_cnt / static_cast<double>(sample_size) < 0.95f) {
      Log::Warning("Too many categoricals are ignored, \
                   please use bigger max_bin or partition this column ");
    }
    cnt_in_bin0 = static_cast<int>(sample_size) - used_cnt + counts_int[0];
Guolin Ke's avatar
Guolin Ke committed
186
  }
Guolin Ke's avatar
Guolin Ke committed
187

Guolin Ke's avatar
Guolin Ke committed
188
189
190
191
192
193
194
  // check trival(num_bin_ == 1) feature
  if (num_bin_ <= 1) {
    is_trival_ = true;
  } else {
    is_trival_ = false;
  }
  // calculate sparse rate
195
  sparse_rate_ = static_cast<double>(cnt_in_bin0) / static_cast<double>(sample_size);
Guolin Ke's avatar
Guolin Ke committed
196
197
198
199
200
201
202
}


int BinMapper::SizeForSpecificBin(int bin) {
  int size = 0;
  size += sizeof(int);
  size += sizeof(bool);
203
  size += sizeof(double);
Guolin Ke's avatar
Guolin Ke committed
204
  size += sizeof(BinType);
205
  size += bin * sizeof(double);
Guolin Ke's avatar
Guolin Ke committed
206
207
208
209
210
211
212
213
214
215
  return size;
}

// serialize this mapper into `buffer`: num_bin_, is_trival_, sparse_rate_,
// bin_type_, then num_bin_ payload entries (double upper bounds for
// numerical bins, int categories otherwise)
void BinMapper::CopyTo(char * buffer) {
  char* cursor = buffer;
  // copy `bytes` bytes from `src` to the write position and advance it
  auto append = [&cursor](const void* src, size_t bytes) {
    std::memcpy(cursor, src, bytes);
    cursor += bytes;
  };

  append(&num_bin_, sizeof(num_bin_));
  append(&is_trival_, sizeof(is_trival_));
  append(&sparse_rate_, sizeof(sparse_rate_));
  append(&bin_type_, sizeof(bin_type_));

  if (bin_type_ == BinType::NumericalBin) {
    append(bin_upper_bound_.data(), num_bin_ * sizeof(double));
  } else {
    append(bin_2_categorical_.data(), num_bin_ * sizeof(int));
  }
}

// restore this mapper from `buffer`, reading the fields in the order
// num_bin_, is_trival_, sparse_rate_, bin_type_, payload
void BinMapper::CopyFrom(const char * buffer) {
  const char* cursor = buffer;
  // copy `bytes` bytes from the read position into `dst` and advance it
  auto read_into = [&cursor](void* dst, size_t bytes) {
    std::memcpy(dst, cursor, bytes);
    cursor += bytes;
  };

  read_into(&num_bin_, sizeof(num_bin_));
  read_into(&is_trival_, sizeof(is_trival_));
  read_into(&sparse_rate_, sizeof(sparse_rate_));
  read_into(&bin_type_, sizeof(bin_type_));

  if (bin_type_ == BinType::NumericalBin) {
    bin_upper_bound_ = std::vector<double>(num_bin_);
    read_into(bin_upper_bound_.data(), num_bin_ * sizeof(double));
    return;
  }

  bin_2_categorical_ = std::vector<int>(num_bin_);
  read_into(bin_2_categorical_.data(), num_bin_ * sizeof(int));
  // rebuild the category -> bin lookup from the bin -> category table
  categorical_2_bin_.clear();
  for (int bin_idx = 0; bin_idx < num_bin_; ++bin_idx) {
    categorical_2_bin_[bin_2_categorical_[bin_idx]] = static_cast<unsigned int>(bin_idx);
  }
}

// write this mapper to `file` in binary form: num_bin_, is_trival_,
// sparse_rate_, bin_type_, then num_bin_ payload entries
void BinMapper::SaveBinaryToFile(FILE* file) const {
  fwrite(&num_bin_, sizeof(num_bin_), 1, file);
  fwrite(&is_trival_, sizeof(is_trival_), 1, file);
  fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
  fwrite(&bin_type_, sizeof(bin_type_), 1, file);

  // payload: double upper bounds for numerical bins, int categories otherwise
  const bool numerical = (bin_type_ == BinType::NumericalBin);
  const void* payload = numerical
    ? static_cast<const void*>(bin_upper_bound_.data())
    : static_cast<const void*>(bin_2_categorical_.data());
  const size_t entry_size = numerical ? sizeof(double) : sizeof(int);
  fwrite(payload, entry_size, num_bin_, file);
}

size_t BinMapper::SizesInByte() const {
Guolin Ke's avatar
Guolin Ke committed
260
261
262
263
264
265
266
267
  size_t ret = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_)
    + sizeof(bin_type_);
  if (bin_type_ == BinType::NumericalBin) {
    ret += sizeof(double) *  num_bin_;
  } else {
    ret += sizeof(int) * num_bin_;
  }
  return ret;
Guolin Ke's avatar
Guolin Ke committed
268
269
270
271
272
273
}

// explicit instantiations of the bin containers for the three storage widths

// dense, numerical
template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;

// dense, categorical
template class DenseCategoricalBin<uint8_t>;
template class DenseCategoricalBin<uint16_t>;
template class DenseCategoricalBin<uint32_t>;

// sparse, numerical
template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;

// sparse, categorical
template class SparseCategoricalBin<uint8_t>;
template class SparseCategoricalBin<uint16_t>;
template class SparseCategoricalBin<uint32_t>;

// ordered sparse
template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;


Guolin Ke's avatar
Guolin Ke committed
291
292
// Create a bin container and report through *is_sparse which layout was
// chosen: the sparse one when sparse storage is enabled and sparse_rate
// reaches the threshold, the dense one otherwise.
// The returned object is allocated with new by the Create*Bin helpers.
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate,
  bool is_enable_sparse, bool* is_sparse, int default_bin, BinType bin_type) {
  // sparse threshold
  const double kSparseThreshold = 0.8f;
  const bool use_sparse = is_enable_sparse && sparse_rate >= kSparseThreshold;
  *is_sparse = use_sparse;
  if (use_sparse) {
    return CreateSparseBin(num_data, num_bin, default_bin, bin_type);
  }
  return CreateDenseBin(num_data, num_bin, default_bin, bin_type);
}

Guolin Ke's avatar
Guolin Ke committed
304
305
306
307
308
309
310
311
312
// Allocate a dense bin container; the element type is the narrowest of
// uint8_t / uint16_t / uint32_t selected by num_bin (<= 256, <= 65536, else),
// and the class is chosen by bin_type (numerical vs. categorical).
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin, int default_bin, BinType bin_type) {
  const bool fits_in_8_bits = (num_bin <= 256);
  const bool fits_in_16_bits = (num_bin <= 65536);

  if (bin_type == BinType::NumericalBin) {
    if (fits_in_8_bits) { return new DenseBin<uint8_t>(num_data, default_bin); }
    if (fits_in_16_bits) { return new DenseBin<uint16_t>(num_data, default_bin); }
    return new DenseBin<uint32_t>(num_data, default_bin);
  }

  if (fits_in_8_bits) { return new DenseCategoricalBin<uint8_t>(num_data, default_bin); }
  if (fits_in_16_bits) { return new DenseCategoricalBin<uint16_t>(num_data, default_bin); }
  return new DenseCategoricalBin<uint32_t>(num_data, default_bin);
}

Guolin Ke's avatar
Guolin Ke committed
324
325
326
327
328
329
330
331
332
// Allocate a sparse bin container; the element type is the narrowest of
// uint8_t / uint16_t / uint32_t selected by num_bin (<= 256, <= 65536, else),
// and the class is chosen by bin_type (numerical vs. categorical).
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin, int default_bin, BinType bin_type) {
  const bool fits_in_8_bits = (num_bin <= 256);
  const bool fits_in_16_bits = (num_bin <= 65536);

  if (bin_type == BinType::NumericalBin) {
    if (fits_in_8_bits) { return new SparseBin<uint8_t>(num_data, default_bin); }
    if (fits_in_16_bits) { return new SparseBin<uint16_t>(num_data, default_bin); }
    return new SparseBin<uint32_t>(num_data, default_bin);
  }

  if (fits_in_8_bits) { return new SparseCategoricalBin<uint8_t>(num_data, default_bin); }
  if (fits_in_16_bits) { return new SparseCategoricalBin<uint16_t>(num_data, default_bin); }
  return new SparseCategoricalBin<uint32_t>(num_data, default_bin);
}

}  // namespace LightGBM