bin.cpp 9.35 KB
Newer Older
1
#include <LightGBM/utils/common.h>
Guolin Ke's avatar
Guolin Ke committed
2
3
4
5
#include <LightGBM/bin.h>

#include "dense_bin.hpp"
#include "sparse_bin.hpp"
6
#include "ordered_sparse_bin.hpp"
Guolin Ke's avatar
Guolin Ke committed
7
8
9
10
11
12
13
14
15
16
17

#include <cmath>
#include <cstring>
#include <cstdint>

#include <limits>
#include <vector>
#include <algorithm>

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
18
BinMapper::BinMapper() {
Guolin Ke's avatar
Guolin Ke committed
19
20
21
}

// deep copy function for BinMapper
Guolin Ke's avatar
Guolin Ke committed
22
BinMapper::BinMapper(const BinMapper& other) {
Guolin Ke's avatar
Guolin Ke committed
23
24
25
  num_bin_ = other.num_bin_;
  is_trival_ = other.is_trival_;
  sparse_rate_ = other.sparse_rate_;
Guolin Ke's avatar
Guolin Ke committed
26
  bin_upper_bound_ = other.bin_upper_bound_;
27
28
  min_val_ = other.min_val_;
  max_val_ = other.max_val_;
Guolin Ke's avatar
Guolin Ke committed
29
  default_bin_ = other.default_bin_;
Guolin Ke's avatar
Guolin Ke committed
30
31
}

Guolin Ke's avatar
Guolin Ke committed
32
BinMapper::BinMapper(const void* memory) {
Guolin Ke's avatar
Guolin Ke committed
33
34
35
36
  CopyFrom(reinterpret_cast<const char*>(memory));
}

BinMapper::~BinMapper() {
Guolin Ke's avatar
Guolin Ke committed
37

Guolin Ke's avatar
Guolin Ke committed
38
39
}

Guolin Ke's avatar
Guolin Ke committed
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
bool NeedFilter(std::vector<int>& cnt_in_bin, int total_cnt, int filter_cnt) {
  int sum_left = 0;
  for (size_t i = 0; i < cnt_in_bin.size() - 1; ++i) {
    sum_left += cnt_in_bin[i];
    if (sum_left >= filter_cnt) {
      return false;
    } else if (total_cnt - sum_left >= filter_cnt) {
      return false;
    }
  }
  return true;
}

void BinMapper::FindBin(std::vector<double>& values, size_t total_sample_cnt,
  int max_bin, int min_data_in_bin, int min_split_data) {
  // limit max_bin by min_data_in_bin
  std::vector<double>& raw_values = values;
  int zero_cnt = static_cast<int>(total_sample_cnt - raw_values.size());
Guolin Ke's avatar
Guolin Ke committed
58
  // find distinct_values first
59
  std::vector<double> distinct_values;
60
  std::vector<int> counts;
Guolin Ke's avatar
Guolin Ke committed
61

Guolin Ke's avatar
Guolin Ke committed
62
  std::sort(raw_values.begin(), raw_values.end());
Guolin Ke's avatar
Guolin Ke committed
63
64

  // push zero in the front
Guolin Ke's avatar
Guolin Ke committed
65
66
  if (raw_values.empty() || (raw_values[0] > 0.0f && zero_cnt > 0)) {
    distinct_values.push_back(0.0f);
Guolin Ke's avatar
Guolin Ke committed
67
    counts.push_back(zero_cnt);
Guolin Ke's avatar
Guolin Ke committed
68
  }
Guolin Ke's avatar
Guolin Ke committed
69

Guolin Ke's avatar
Guolin Ke committed
70
71
  if (!raw_values.empty()) {
    distinct_values.push_back(raw_values[0]);
Guolin Ke's avatar
Guolin Ke committed
72
73
    counts.push_back(1);
  }
Guolin Ke's avatar
Guolin Ke committed
74

Guolin Ke's avatar
Guolin Ke committed
75
76
77
78
  for (size_t i = 1; i < raw_values.size(); ++i) {
    if (raw_values[i] != raw_values[i - 1]) {
      if (raw_values[i - 1] < 0.0f && raw_values[i] > 0.0f) {
        distinct_values.push_back(0.0f);
Guolin Ke's avatar
Guolin Ke committed
79
80
        counts.push_back(zero_cnt);
      }
Guolin Ke's avatar
Guolin Ke committed
81
      distinct_values.push_back(raw_values[i]);
82
      counts.push_back(1);
Guolin Ke's avatar
Guolin Ke committed
83
    } else {
84
      ++counts.back();
Guolin Ke's avatar
Guolin Ke committed
85
86
    }
  }
Guolin Ke's avatar
Guolin Ke committed
87
88

  // push zero in the back
Guolin Ke's avatar
Guolin Ke committed
89
90
  if (!raw_values.empty() && raw_values.back() < 0.0f && zero_cnt > 0) {
    distinct_values.push_back(0.0f);
Guolin Ke's avatar
Guolin Ke committed
91
92
    counts.push_back(zero_cnt);
  }
93
94
  min_val_ = distinct_values.front();
  max_val_ = distinct_values.back();
95
  std::vector<int> cnt_in_bin;
96
  int num_values = static_cast<int>(distinct_values.size());
Guolin Ke's avatar
Guolin Ke committed
97

Guolin Ke's avatar
Guolin Ke committed
98
99
100
101
102
103
104
105
106
107
  if (num_values <= max_bin) {
    // use distinct value is enough
    bin_upper_bound_.clear();
    int cur_cnt_inbin = 0;
    for (int i = 0; i < num_values - 1; ++i) {
      cur_cnt_inbin += counts[i];
      if (cur_cnt_inbin >= min_data_in_bin) {
        bin_upper_bound_.push_back((distinct_values[i] + distinct_values[i + 1]) / 2);
        cnt_in_bin.push_back(cur_cnt_inbin);
        cur_cnt_inbin = 0;
Guolin Ke's avatar
Guolin Ke committed
108
      }
Guolin Ke's avatar
Guolin Ke committed
109
    }
Guolin Ke's avatar
Guolin Ke committed
110
111
112
113
    cur_cnt_inbin += counts.back();
    cnt_in_bin.push_back(cur_cnt_inbin);
    bin_upper_bound_.push_back(std::numeric_limits<double>::infinity());
    num_bin_ = static_cast<int>(bin_upper_bound_.size());
Guolin Ke's avatar
Guolin Ke committed
114
  } else {
Guolin Ke's avatar
Guolin Ke committed
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
    if (min_data_in_bin > 0) {
      max_bin = std::min(max_bin, static_cast<int>(total_sample_cnt / min_data_in_bin));
      max_bin = std::max(max_bin, 1);
    }
    double mean_bin_size = static_cast<double>(total_sample_cnt) / max_bin;
    if (zero_cnt > mean_bin_size) {
      int non_zero_cnt = static_cast<int>(raw_values.size());
      max_bin = std::min(max_bin, 1 + static_cast<int>(non_zero_cnt / min_data_in_bin));
    }
    // mean size for one bin
    int rest_bin_cnt = max_bin;
    int rest_sample_cnt = static_cast<int>(total_sample_cnt);
    std::vector<bool> is_big_count_value(num_values, false);
    for (int i = 0; i < num_values; ++i) {
      if (counts[i] >= mean_bin_size) {
        is_big_count_value[i] = true;
        --rest_bin_cnt;
        rest_sample_cnt -= counts[i];
      }
    }
    mean_bin_size = static_cast<double>(rest_sample_cnt) / rest_bin_cnt;
    std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
    std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());

    int bin_cnt = 0;
    lower_bounds[bin_cnt] = distinct_values[0];
    int cur_cnt_inbin = 0;
    for (int i = 0; i < num_values - 1; ++i) {
      if (!is_big_count_value[i]) {
        rest_sample_cnt -= counts[i];
      }
      cur_cnt_inbin += counts[i];
      // need a new bin
      if (is_big_count_value[i] || cur_cnt_inbin >= mean_bin_size ||
        (is_big_count_value[i + 1] && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
        upper_bounds[bin_cnt] = distinct_values[i];
        cnt_in_bin.push_back(cur_cnt_inbin);
        ++bin_cnt;
        lower_bounds[bin_cnt] = distinct_values[i + 1];
        if (bin_cnt >= max_bin - 1) { break; }
        cur_cnt_inbin = 0;
        if (!is_big_count_value[i]) {
          --rest_bin_cnt;
          mean_bin_size = rest_sample_cnt / static_cast<double>(rest_bin_cnt);
        }
Guolin Ke's avatar
Guolin Ke committed
160
      }
Guolin Ke's avatar
Guolin Ke committed
161
    }
Guolin Ke's avatar
Guolin Ke committed
162
163
164
165
166
167
168
169
    cur_cnt_inbin += counts.back();
    cnt_in_bin.push_back(cur_cnt_inbin);
    ++bin_cnt;
    // update bin upper bound
    bin_upper_bound_ = std::vector<double>(bin_cnt);
    num_bin_ = bin_cnt;
    for (int i = 0; i < bin_cnt - 1; ++i) {
      bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
Guolin Ke's avatar
Guolin Ke committed
170
    }
Guolin Ke's avatar
Guolin Ke committed
171
172
    // last bin upper bound
    bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
Guolin Ke's avatar
Guolin Ke committed
173
  }
Guolin Ke's avatar
Guolin Ke committed
174

Guolin Ke's avatar
Guolin Ke committed
175
176
177
  // check trival(num_bin_ == 1) feature
  if (num_bin_ <= 1) {
    is_trival_ = true;
Guolin Ke's avatar
Guolin Ke committed
178
    default_bin_ = 0;
Guolin Ke's avatar
Guolin Ke committed
179
180
  } else {
    is_trival_ = false;
Guolin Ke's avatar
Guolin Ke committed
181
182
183
184
    default_bin_ = ValueToBin(0);
  }
  if (NeedFilter(cnt_in_bin, static_cast<int>(total_sample_cnt), min_split_data)) {
    is_trival_ = true;
Guolin Ke's avatar
Guolin Ke committed
185
186
  }
  // calculate sparse rate
187
  CHECK(num_bin_ <= max_bin);
Guolin Ke's avatar
Guolin Ke committed
188
  sparse_rate_ = static_cast<double>(cnt_in_bin[GetDefaultBin()]) / static_cast<double>(total_sample_cnt);
Guolin Ke's avatar
Guolin Ke committed
189
190
191
192
193
194
195
}


int BinMapper::SizeForSpecificBin(int bin) {
  int size = 0;
  size += sizeof(int);
  size += sizeof(bool);
196
  size += sizeof(double);
Guolin Ke's avatar
Guolin Ke committed
197
  size += 2 * sizeof(double);
198
  size += bin * sizeof(double);
Guolin Ke's avatar
Guolin Ke committed
199
  size += sizeof(uint32_t);
Guolin Ke's avatar
Guolin Ke committed
200
201
202
203
204
205
206
207
208
209
  return size;
}

void BinMapper::CopyTo(char * buffer) {
  std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(buffer, &is_trival_, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
210
211
212
213
  std::memcpy(&min_val_, buffer, sizeof(min_val_));
  buffer += sizeof(min_val_);
  std::memcpy(&max_val_, buffer, sizeof(max_val_));
  buffer += sizeof(max_val_);
Guolin Ke's avatar
Guolin Ke committed
214
215
216
  std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
  buffer += sizeof(default_bin_);
  std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
Guolin Ke's avatar
Guolin Ke committed
217
218
219
220
221
222
223
224
225
}

void BinMapper::CopyFrom(const char * buffer) {
  std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(&is_trival_, buffer, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
226
227
228
229
  std::memcpy(&min_val_, buffer, sizeof(min_val_));
  buffer += sizeof(min_val_);
  std::memcpy(&max_val_, buffer, sizeof(max_val_));
  buffer += sizeof(max_val_);
Guolin Ke's avatar
Guolin Ke committed
230
231
232
233
  std::memcpy(&default_bin_, buffer, sizeof(default_bin_));
  buffer += sizeof(default_bin_);
  bin_upper_bound_ = std::vector<double>(num_bin_);
  std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
Guolin Ke's avatar
Guolin Ke committed
234
235
236
237
238
239
}

void BinMapper::SaveBinaryToFile(FILE* file) const {
  fwrite(&num_bin_, sizeof(num_bin_), 1, file);
  fwrite(&is_trival_, sizeof(is_trival_), 1, file);
  fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
240
241
  fwrite(&min_val_, sizeof(min_val_), 1, file);
  fwrite(&max_val_, sizeof(max_val_), 1, file);
Guolin Ke's avatar
Guolin Ke committed
242
243
  fwrite(&default_bin_, sizeof(default_bin_), 1, file);
  fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
Guolin Ke's avatar
Guolin Ke committed
244
245
246
}

size_t BinMapper::SizesInByte() const {
Guolin Ke's avatar
Guolin Ke committed
247
  size_t ret = sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_)
Guolin Ke's avatar
Guolin Ke committed
248
249
    + sizeof(min_val_) + sizeof(max_val_) + sizeof(default_bin_);
  ret += sizeof(double) *  num_bin_;
Guolin Ke's avatar
Guolin Ke committed
250
  return ret;
Guolin Ke's avatar
Guolin Ke committed
251
252
253
254
255
256
257
258
259
260
261
262
263
264
}

template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;

template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;

template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;

Guolin Ke's avatar
Guolin Ke committed
265
double BinMapper::kSparseThreshold = 0.8f;
Guolin Ke's avatar
Guolin Ke committed
266

Guolin Ke's avatar
Guolin Ke committed
267
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate, 
Guolin Ke's avatar
Guolin Ke committed
268
  bool is_enable_sparse, bool* is_sparse) {
Guolin Ke's avatar
Guolin Ke committed
269
  // sparse threshold
Guolin Ke's avatar
Guolin Ke committed
270
  if (sparse_rate >= BinMapper::kSparseThreshold && is_enable_sparse) {
Guolin Ke's avatar
Guolin Ke committed
271
    *is_sparse = true;
Guolin Ke's avatar
Guolin Ke committed
272
    return CreateSparseBin(num_data, num_bin);
Guolin Ke's avatar
Guolin Ke committed
273
274
  } else {
    *is_sparse = false;
Guolin Ke's avatar
Guolin Ke committed
275
    return CreateDenseBin(num_data, num_bin);
Guolin Ke's avatar
Guolin Ke committed
276
277
278
  }
}

Guolin Ke's avatar
Guolin Ke committed
279
280
281
282
283
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin) {
  if (num_bin <= 256) {
    return new DenseBin<uint8_t>(num_data);
  } else if (num_bin <= 65536) {
    return new DenseBin<uint16_t>(num_data);
Guolin Ke's avatar
Guolin Ke committed
284
  } else {
Guolin Ke's avatar
Guolin Ke committed
285
    return new DenseBin<uint32_t>(num_data);
Guolin Ke's avatar
Guolin Ke committed
286
  }
Guolin Ke's avatar
Guolin Ke committed
287

Guolin Ke's avatar
Guolin Ke committed
288
289
}

Guolin Ke's avatar
Guolin Ke committed
290
291
292
293
294
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin) {
  if (num_bin <= 256) {
    return new SparseBin<uint8_t>(num_data);
  } else if (num_bin <= 65536) {
    return new SparseBin<uint16_t>(num_data);
Guolin Ke's avatar
Guolin Ke committed
295
  } else {
Guolin Ke's avatar
Guolin Ke committed
296
    return new SparseBin<uint32_t>(num_data);
Guolin Ke's avatar
Guolin Ke committed
297
298
299
300
  }
}

}  // namespace LightGBM