bin.cpp 6.5 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#include <LightGBM/bin.h>

#include "dense_bin.hpp"
#include "sparse_bin.hpp"

#include <cmath>
#include <cstring>
#include <cstdint>

#include <limits>
#include <vector>
#include <algorithm>

namespace LightGBM {

BinMapper::BinMapper()
  :bin_upper_bound_(nullptr) {
}

// deep copy function for BinMapper
BinMapper::BinMapper(const BinMapper& other)
  : bin_upper_bound_(nullptr) {
  num_bin_ = other.num_bin_;
  is_trival_ = other.is_trival_;
  sparse_rate_ = other.sparse_rate_;
26
  bin_upper_bound_ = new float[num_bin_];
Guolin Ke's avatar
Guolin Ke committed
27
28
29
30
31
32
33
34
35
36
37
38
39
40
  for (int i = 0; i < num_bin_; ++i) {
    bin_upper_bound_[i] = other.bin_upper_bound_[i];
  }
}

BinMapper::BinMapper(const void* memory)
  :bin_upper_bound_(nullptr) {
  CopyFrom(reinterpret_cast<const char*>(memory));
}

BinMapper::~BinMapper() {
  delete[] bin_upper_bound_;
}

41
void BinMapper::FindBin(std::vector<float>* values, int max_bin) {
Guolin Ke's avatar
Guolin Ke committed
42
43
  size_t sample_size = values->size();
  // find distinct_values first
44
  float* distinct_values = new float[sample_size];
Guolin Ke's avatar
Guolin Ke committed
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
  int *counts = new int[sample_size];
  int num_values = 1;
  std::sort(values->begin(), values->end());
  distinct_values[0] = (*values)[0];
  counts[0] = 1;
  for (size_t i = 1; i < values->size(); ++i) {
    if ((*values)[i] != (*values)[i - 1]) {
      distinct_values[num_values] = (*values)[i];
      counts[num_values] = 1;
      ++num_values;
    } else {
      ++counts[num_values - 1];
    }
  }
  int cnt_in_bin0 = 0;

  if (num_values <= max_bin) {
    // use distinct value is enough
    num_bin_ = num_values;
64
    bin_upper_bound_ = new float[num_values];
Guolin Ke's avatar
Guolin Ke committed
65
66
67
68
    for (int i = 0; i < num_values - 1; ++i) {
      bin_upper_bound_[i] = (distinct_values[i] + distinct_values[i + 1]) / 2;
    }
    cnt_in_bin0 = counts[0];
69
    bin_upper_bound_[num_values - 1] = std::numeric_limits<float>::infinity();
Guolin Ke's avatar
Guolin Ke committed
70
71
72
  } else {
    // need find bins
    num_bin_ = max_bin;
73
74
    bin_upper_bound_ = new float[max_bin];
    float * bin_lower_bound = new float[max_bin];
Guolin Ke's avatar
Guolin Ke committed
75
    // mean size for one bin
76
    float mean_bin_size = sample_size / static_cast<float>(max_bin);
Guolin Ke's avatar
Guolin Ke committed
77
78
79
80
81
82
83
84
85
86
87
88
89
90
    int rest_sample_cnt = static_cast<int>(sample_size);
    int cur_cnt_inbin = 0;
    int bin_cnt = 0;
    bin_lower_bound[0] = distinct_values[0];
    for (int i = 0; i < num_values - 1; ++i) {
      rest_sample_cnt -= counts[i];
      cur_cnt_inbin += counts[i];
      // need a new bin
      if (cur_cnt_inbin >= mean_bin_size) {
        bin_upper_bound_[bin_cnt] = distinct_values[i];
        if (bin_cnt == 0) { cnt_in_bin0 = cur_cnt_inbin; }
        ++bin_cnt;
        bin_lower_bound[bin_cnt] = distinct_values[i + 1];
        cur_cnt_inbin = 0;
91
        mean_bin_size = rest_sample_cnt / static_cast<float>(max_bin - bin_cnt);
Guolin Ke's avatar
Guolin Ke committed
92
93
94
95
96
      }
    }
    cur_cnt_inbin += counts[num_values - 1];
    // update bin upper bound
    for (int i = 0; i < bin_cnt; ++i) {
97
      bin_upper_bound_[i] = (bin_upper_bound_[i] + bin_lower_bound[i + 1]) / 2.0f;
Guolin Ke's avatar
Guolin Ke committed
98
99
    }
    // last bin upper bound
100
    bin_upper_bound_[bin_cnt] = std::numeric_limits<float>::infinity();
Guolin Ke's avatar
Guolin Ke committed
101
102
103
104
105
    ++bin_cnt;
    delete[] bin_lower_bound;
    // if no so much bin
    if (bin_cnt < max_bin) {
      // old bin data
106
      float* tmp_bin_upper_bound = bin_upper_bound_;
Guolin Ke's avatar
Guolin Ke committed
107
      num_bin_ = bin_cnt;
108
      bin_upper_bound_ = new float[num_bin_];
Guolin Ke's avatar
Guolin Ke committed
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
      // copy back
      for (int i = 0; i < num_bin_; ++i) {
        bin_upper_bound_[i] = tmp_bin_upper_bound[i];
      }
      // free old space
      delete[] tmp_bin_upper_bound;
    }
  }
  delete[] distinct_values;
  delete[] counts;
  // check trival(num_bin_ == 1) feature
  if (num_bin_ <= 1) {
    is_trival_ = true;
  } else {
    is_trival_ = false;
  }
  // calculate sparse rate
126
  sparse_rate_ = static_cast<float>(cnt_in_bin0) / static_cast<float>(sample_size);
Guolin Ke's avatar
Guolin Ke committed
127
128
129
130
131
132
133
}


int BinMapper::SizeForSpecificBin(int bin) {
  int size = 0;
  size += sizeof(int);
  size += sizeof(bool);
134
135
  size += sizeof(float);
  size += bin * sizeof(float);
Guolin Ke's avatar
Guolin Ke committed
136
137
138
139
140
141
142
143
144
145
  return size;
}

void BinMapper::CopyTo(char * buffer) {
  std::memcpy(buffer, &num_bin_, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(buffer, &is_trival_, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(buffer, &sparse_rate_, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
146
  std::memcpy(buffer, bin_upper_bound_, num_bin_ * sizeof(float));
Guolin Ke's avatar
Guolin Ke committed
147
148
149
150
151
152
153
154
155
156
}

void BinMapper::CopyFrom(const char * buffer) {
  std::memcpy(&num_bin_, buffer, sizeof(num_bin_));
  buffer += sizeof(num_bin_);
  std::memcpy(&is_trival_, buffer, sizeof(is_trival_));
  buffer += sizeof(is_trival_);
  std::memcpy(&sparse_rate_, buffer, sizeof(sparse_rate_));
  buffer += sizeof(sparse_rate_);
  if (bin_upper_bound_ != nullptr) { delete[] bin_upper_bound_; }
157
158
  bin_upper_bound_ = new float[num_bin_];
  std::memcpy(bin_upper_bound_, buffer, num_bin_ * sizeof(float));
Guolin Ke's avatar
Guolin Ke committed
159
160
161
162
163
164
}

void BinMapper::SaveBinaryToFile(FILE* file) const {
  fwrite(&num_bin_, sizeof(num_bin_), 1, file);
  fwrite(&is_trival_, sizeof(is_trival_), 1, file);
  fwrite(&sparse_rate_, sizeof(sparse_rate_), 1, file);
165
  fwrite(bin_upper_bound_, sizeof(float), num_bin_, file);
Guolin Ke's avatar
Guolin Ke committed
166
167
168
}

size_t BinMapper::SizesInByte() const {
169
  return sizeof(num_bin_) + sizeof(is_trival_) + sizeof(sparse_rate_) + sizeof(float) * num_bin_;
Guolin Ke's avatar
Guolin Ke committed
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
}

template class DenseBin<uint8_t>;
template class DenseBin<uint16_t>;
template class DenseBin<uint32_t>;

template class SparseBin<uint8_t>;
template class SparseBin<uint16_t>;
template class SparseBin<uint32_t>;

template class OrderedSparseBin<uint8_t>;
template class OrderedSparseBin<uint16_t>;
template class OrderedSparseBin<uint32_t>;


185
Bin* Bin::CreateBin(data_size_t num_data, int num_bin, float sparse_rate, bool is_enable_sparse, bool* is_sparse, int default_bin) {
Guolin Ke's avatar
Guolin Ke committed
186
  // sparse threshold
187
  const float kSparseThreshold = 0.8f;
Guolin Ke's avatar
Guolin Ke committed
188
189
  if (sparse_rate >= kSparseThreshold && is_enable_sparse) {
    *is_sparse = true;
Guolin Ke's avatar
Guolin Ke committed
190
    return CreateSparseBin(num_data, num_bin, default_bin);
Guolin Ke's avatar
Guolin Ke committed
191
192
  } else {
    *is_sparse = false;
Guolin Ke's avatar
Guolin Ke committed
193
    return CreateDenseBin(num_data, num_bin, default_bin);
Guolin Ke's avatar
Guolin Ke committed
194
195
196
  }
}

Guolin Ke's avatar
Guolin Ke committed
197
Bin* Bin::CreateDenseBin(data_size_t num_data, int num_bin, int default_bin) {
Guolin Ke's avatar
Guolin Ke committed
198
  if (num_bin <= 256) {
Guolin Ke's avatar
Guolin Ke committed
199
    return new DenseBin<uint8_t>(num_data, default_bin);
Guolin Ke's avatar
Guolin Ke committed
200
  } else if (num_bin <= 65536) {
Guolin Ke's avatar
Guolin Ke committed
201
    return new DenseBin<uint16_t>(num_data, default_bin);
Guolin Ke's avatar
Guolin Ke committed
202
  } else {
Guolin Ke's avatar
Guolin Ke committed
203
    return new DenseBin<uint32_t>(num_data, default_bin);
Guolin Ke's avatar
Guolin Ke committed
204
205
206
  }
}

Guolin Ke's avatar
Guolin Ke committed
207
Bin* Bin::CreateSparseBin(data_size_t num_data, int num_bin, int default_bin) {
Guolin Ke's avatar
Guolin Ke committed
208
  if (num_bin <= 256) {
Guolin Ke's avatar
Guolin Ke committed
209
    return new SparseBin<uint8_t>(num_data, default_bin);
Guolin Ke's avatar
Guolin Ke committed
210
  } else if (num_bin <= 65536) {
Guolin Ke's avatar
Guolin Ke committed
211
    return new SparseBin<uint16_t>(num_data, default_bin);
Guolin Ke's avatar
Guolin Ke committed
212
  } else {
Guolin Ke's avatar
Guolin Ke committed
213
    return new SparseBin<uint32_t>(num_data, default_bin);
Guolin Ke's avatar
Guolin Ke committed
214
215
216
217
  }
}

}  // namespace LightGBM