bin.cpp 7.14 KB
Newer Older
1
#include <LightGBM/utils/common.h>
Guolin Ke's avatar
Guolin Ke committed
2
3
4
5
#include <LightGBM/bin.h>

#include "dense_bin.hpp"
#include "sparse_bin.hpp"
6
#include "ordered_sparse_bin.hpp"
Guolin Ke's avatar
Guolin Ke committed
7
8
9
10
11
12
13
14
15
16
17

#include <cmath>
#include <cstring>
#include <cstdint>

#include <limits>
#include <vector>
#include <algorithm>

namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
18
BinMapper::BinMapper() {
Guolin Ke's avatar
Guolin Ke committed
19
20
21
}

// deep copy function for BinMapper
Guolin Ke's avatar
Guolin Ke committed
22
BinMapper::BinMapper(const BinMapper& other) {
Guolin Ke's avatar
Guolin Ke committed
23
24
25
  num_bin_ = other.num_bin_;
  is_trival_ = other.is_trival_;
  sparse_rate_ = other.sparse_rate_;
Guolin Ke's avatar
Guolin Ke committed
26
  bin_upper_bound_ = std::vector<double>(num_bin_);
Guolin Ke's avatar
Guolin Ke committed
27
28
29
30
31
  for (int i = 0; i < num_bin_; ++i) {
    bin_upper_bound_[i] = other.bin_upper_bound_[i];
  }
}

Guolin Ke's avatar
Guolin Ke committed
32
BinMapper::BinMapper(const void* memory) {
Guolin Ke's avatar
Guolin Ke committed
33
34
35
36
  CopyFrom(reinterpret_cast<const char*>(memory));
}

BinMapper::~BinMapper() {
Guolin Ke's avatar
Guolin Ke committed
37

Guolin Ke's avatar
Guolin Ke committed
38
39
}

Guolin Ke's avatar
Guolin Ke committed
40
void BinMapper::FindBin(std::vector<double>* values, size_t total_sample_cnt, int max_bin) {
41
  std::vector<double>& ref_values = (*values);
Guolin Ke's avatar
Guolin Ke committed
42
  size_t sample_size = total_sample_cnt;
Guolin Ke's avatar
Guolin Ke committed
43
  int zero_cnt = static_cast<int>(total_sample_cnt - ref_values.size());
Guolin Ke's avatar
Guolin Ke committed
44
  // find distinct_values first
45
  std::vector<double> distinct_values;
46
  std::vector<int> counts;
Guolin Ke's avatar
Guolin Ke committed
47

48
  std::sort(ref_values.begin(), ref_values.end());
Guolin Ke's avatar
Guolin Ke committed
49
50
51
52
53

  // push zero in the front
  if (ref_values.size() == 0 || (ref_values[0] > 0.0f && zero_cnt > 0)) {
    distinct_values.push_back(0);
    counts.push_back(zero_cnt);
Guolin Ke's avatar
Guolin Ke committed
54
  }
Guolin Ke's avatar
Guolin Ke committed
55

Guolin Ke's avatar
Guolin Ke committed
56
57
58
59
  if (ref_values.size() > 0) {
    distinct_values.push_back(ref_values[0]);
    counts.push_back(1);
  }
Guolin Ke's avatar
Guolin Ke committed
60

61
62
  for (size_t i = 1; i < ref_values.size(); ++i) {
    if (ref_values[i] != ref_values[i - 1]) {
Guolin Ke's avatar
Guolin Ke committed
63
64
65
66
67
68
      if (ref_values[i - 1] == 0.0f) {
        counts.back() += zero_cnt;
      } else if (ref_values[i - 1] < 0.0f && ref_values[i] > 0.0f) {
        distinct_values.push_back(0);
        counts.push_back(zero_cnt);
      }
69
70
      distinct_values.push_back(ref_values[i]);
      counts.push_back(1);
Guolin Ke's avatar
Guolin Ke committed
71
    } else {
72
      ++counts.back();
Guolin Ke's avatar
Guolin Ke committed
73
74
    }
  }
Guolin Ke's avatar
Guolin Ke committed
75
76
77
78
79
80
81

  // push zero in the back
  if (ref_values.size() > 0 && ref_values.back() < 0.0f && zero_cnt > 0) {
    distinct_values.push_back(0);
    counts.push_back(zero_cnt);
  }

82
  int num_values = static_cast<int>(distinct_values.size());
Guolin Ke's avatar
Guolin Ke committed
83
84
  int cnt_in_bin0 = 0;
  if (num_values <= max_bin) {
Guolin Ke's avatar
Guolin Ke committed
85
    std::sort(distinct_values.begin(), distinct_values.end());
Guolin Ke's avatar
Guolin Ke committed
86
87
    // use distinct value is enough
    num_bin_ = num_values;
Guolin Ke's avatar
Guolin Ke committed
88
    bin_upper_bound_ = std::vector<double>(num_values);
Guolin Ke's avatar
Guolin Ke committed
89
90
91
92
    for (int i = 0; i < num_values - 1; ++i) {
      bin_upper_bound_[i] = (distinct_values[i] + distinct_values[i + 1]) / 2;
    }
    cnt_in_bin0 = counts[0];
93
    bin_upper_bound_[num_values - 1] = std::numeric_limits<double>::infinity();
Guolin Ke's avatar
Guolin Ke committed
94
95
  } else {
    // mean size for one bin
96
    double mean_bin_size = sample_size / static_cast<double>(max_bin);
Guolin Ke's avatar
Guolin Ke committed
97
    double static_mean_bin_size = mean_bin_size;
98
99
    std::vector<double> upper_bounds(max_bin, std::numeric_limits<double>::infinity());
    std::vector<double> lower_bounds(max_bin, std::numeric_limits<double>::infinity());
Guolin Ke's avatar
Guolin Ke committed
100
101
102
103
104
105
106
107
108

    int rest_sample_cnt = static_cast<int>(sample_size);
    int bin_cnt = 0;
    lower_bounds[bin_cnt] = distinct_values[0];
    int cur_cnt_inbin = 0;
    for (int i = 0; i < num_values - 1; ++i) {
      rest_sample_cnt -= counts[i];
      cur_cnt_inbin += counts[i];
      // need a new bin
Guolin Ke's avatar
Guolin Ke committed
109
110
      if (counts[i] >= static_mean_bin_size || cur_cnt_inbin >= mean_bin_size ||
        (counts[i + 1] >= static_mean_bin_size && cur_cnt_inbin >= std::max(1.0, mean_bin_size * 0.5f))) {
Guolin Ke's avatar
Guolin Ke committed
111
112
113
        upper_bounds[bin_cnt] = distinct_values[i];
        if (bin_cnt == 0) {
          cnt_in_bin0 = cur_cnt_inbin;
114
        }
Guolin Ke's avatar
Guolin Ke committed
115
116
117
118
119
        ++bin_cnt;
        lower_bounds[bin_cnt] = distinct_values[i + 1];
        if (bin_cnt >= max_bin - 1) { break; }
        cur_cnt_inbin = 0;
        mean_bin_size = rest_sample_cnt / static_cast<double>(max_bin - bin_cnt);
Guolin Ke's avatar
Guolin Ke committed
120
121
      }
    }
Guolin Ke's avatar
Guolin Ke committed
122
123
    //
    ++bin_cnt;
Guolin Ke's avatar
Guolin Ke committed
124
    // update bin upper bound
Guolin Ke's avatar
Guolin Ke committed
125
    bin_upper_bound_ = std::vector<double>(bin_cnt);
126
    num_bin_ = bin_cnt;
127
128
    for (int i = 0; i < bin_cnt - 1; ++i) {
      bin_upper_bound_[i] = (upper_bounds[i] + lower_bounds[i + 1]) / 2.0f;
Guolin Ke's avatar
Guolin Ke committed
129
130
    }
    // last bin upper bound
131
    bin_upper_bound_[bin_cnt - 1] = std::numeric_limits<double>::infinity();
Guolin Ke's avatar
Guolin Ke committed
132
133
134
135
136
137
138
139
  }
  // check trival(num_bin_ == 1) feature
  if (num_bin_ <= 1) {
    is_trival_ = true;
  } else {
    is_trival_ = false;
  }
  // calculate sparse rate
140
  sparse_rate_ = static_cast<double>(cnt_in_bin0) / static_cast<double>(sample_size);
Guolin Ke's avatar
Guolin Ke committed
141
142
143
144
145
146
147
}


int BinMapper::SizeForSpecificBin(int bin) {
  int size = 0;
  size += sizeof(int);
  size += sizeof(bool);
148
149
  size += sizeof(double);
  size += bin * sizeof(double);
Guolin Ke's avatar
Guolin Ke committed
150
151
152
153
154
155
156
157
158
159
  return size;
}

void BinMapper::CopyTo(char * buffer) {
  std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(buffer, &is_trival_, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
Guolin Ke's avatar
Guolin Ke committed
160
  std::memcpy(buffer, bin_upper_bound_.data(), num_bin_ * sizeof(double));
Guolin Ke's avatar
Guolin Ke committed
161
162
163
164
165
166
167
168
169
}

void BinMapper::CopyFrom(const char * buffer) {
  std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(&is_trival_, buffer, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
Guolin Ke's avatar
Guolin Ke committed
170
171
  bin_upper_bound_ = std::vector<double>(num_bin_);
  std::memcpy(bin_upper_bound_.data(), buffer, num_bin_ * sizeof(double));
Guolin Ke's avatar
Guolin Ke committed
172
173
174
175
176
177
}

void BinMapper::SaveBinaryToFile(FILE* file) const {
  fwrite(&num_bin_, sizeof(num_bin_), 1, file);
  fwrite(&is_trival_, sizeof(is_trival_), 1, file);
  fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
Guolin Ke's avatar
Guolin Ke committed
178
  fwrite(bin_upper_bound_.data(), sizeof(double), num_bin_, file);
Guolin Ke's avatar
Guolin Ke committed
179
180
181
}

size_t BinMapper::SizesInByte() const {
182
  return sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_) + sizeof(double) * num_bin_;
Guolin Ke's avatar
Guolin Ke committed
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
}

template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;

template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;

template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;


198
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, double sparse_rate, bool is_enable_sparse, bool* is_sparse, int default_bin) {
Guolin Ke's avatar
Guolin Ke committed
199
  // sparse threshold
200
  const double kSparseThreshold = 0.8f;
Guolin Ke's avatar
Guolin Ke committed
201
202
  if (sparse_rate >= kSparseThreshold && is_enable_sparse) {
    *is_sparse = true;
Guolin Ke's avatar
Guolin Ke committed
203
    return CreateSparseBin(num_data, num_bin, default_bin);
Guolin Ke's avatar
Guolin Ke committed
204
205
  } else {
    *is_sparse = false;
Guolin Ke's avatar
Guolin Ke committed
206
    return CreateDenseBin(num_data, num_bin, default_bin);
Guolin Ke's avatar
Guolin Ke committed
207
208
209
  }
}

Guolin Ke's avatar
Guolin Ke committed
210
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin, int default_bin) {
Guolin Ke's avatar
Guolin Ke committed
211
  if (num_bin <= 256) {
Guolin Ke's avatar
Guolin Ke committed
212
    return new DenseBin<uint8_t>(num_data, default_bin);
Guolin Ke's avatar
Guolin Ke committed
213
  } else if (num_bin <= 65536) {
Guolin Ke's avatar
Guolin Ke committed
214
    return new DenseBin<uint16_t>(num_data, default_bin);
Guolin Ke's avatar
Guolin Ke committed
215
  } else {
Guolin Ke's avatar
Guolin Ke committed
216
    return new DenseBin<uint32_t>(num_data, default_bin);
Guolin Ke's avatar
Guolin Ke committed
217
218
219
  }
}

Guolin Ke's avatar
Guolin Ke committed
220
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin, int default_bin) {
Guolin Ke's avatar
Guolin Ke committed
221
  if (num_bin <= 256) {
Guolin Ke's avatar
Guolin Ke committed
222
    return new SparseBin<uint8_t>(num_data, default_bin);
Guolin Ke's avatar
Guolin Ke committed
223
  } else if (num_bin <= 65536) {
Guolin Ke's avatar
Guolin Ke committed
224
    return new SparseBin<uint16_t>(num_data, default_bin);
Guolin Ke's avatar
Guolin Ke committed
225
  } else {
Guolin Ke's avatar
Guolin Ke committed
226
    return new SparseBin<uint32_t>(num_data, default_bin);
Guolin Ke's avatar
Guolin Ke committed
227
228
229
230
  }
}

}  // namespace LightGBM