".github/vscode:/vscode.git/clone" did not exist on "cc733f8595267f313886f92ed5d1285010ba8f3f"
ordered_sparse_bin.hpp 5.61 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
11
12
#ifndef LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_
#define LIGHTGBM_IO_ORDERED_SPARSE_BIN_HPP_

#include <LightGBM/bin.h>

#include <cstring>
#include <cstdint>

#include <vector>
#include <mutex>
#include <algorithm>

13
14
#include "sparse_bin.hpp"

Guolin Ke's avatar
Guolin Ke committed
15
16
17
namespace LightGBM {

/*!
Qiwei Ye's avatar
Qiwei Ye committed
18
19
20
21
22
23
* \brief Interface for ordered bin data. efficient for construct histogram, especially for sparse bin
*        There are 2 advantages by using ordered bin.
*        1. group the data by leafs to improve the cache hit.
*        2. only store the non-zero bin, which can speed up the histogram consturction for sparse features.
*        However it brings additional cost: it need re-order the bins after every split, which will cost much for dense feature.
*        So we only using ordered bin for sparse situations.
Guolin Ke's avatar
Guolin Ke committed
24
25
*/
template <typename VAL_T>
26
class OrderedSparseBin: public OrderedBin {
Guolin Ke's avatar
Guolin Ke committed
27
28
29
30
31
public:
  /*! \brief Pair to store one bin entry */
  struct SparsePair {
    data_size_t ridx;  // data(row) index
    VAL_T bin;  // bin for this data
32
    SparsePair() : ridx(0), bin(0) {}
Guolin Ke's avatar
Guolin Ke committed
33
34
  };

35
36
  OrderedSparseBin(const SparseBin<VAL_T>* bin_data)
    :bin_data_(bin_data) {
Guolin Ke's avatar
Guolin Ke committed
37
    data_size_t cur_pos = 0;
38
    data_size_t i_delta = -1;
39
    int non_zero_cnt = 0;
40
    while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
41
      ++non_zero_cnt;
Guolin Ke's avatar
Guolin Ke committed
42
    }
43
    ordered_pair_.resize(non_zero_cnt);
Guolin Ke's avatar
Guolin Ke committed
44
    leaf_cnt_.push_back(non_zero_cnt);
Guolin Ke's avatar
Guolin Ke committed
45
46
47
48
49
50
51
52
53
54
55
56
  }

  ~OrderedSparseBin() {
  }

  void Init(const char* used_idices, int num_leaves) override {
    // initialize the leaf information
    leaf_start_ = std::vector<data_size_t>(num_leaves, 0);
    leaf_cnt_ = std::vector<data_size_t>(num_leaves, 0);
    if (used_idices == nullptr) {
      // if using all data, copy all non-zero pair
      data_size_t j = 0;
57
58
59
60
61
62
      data_size_t cur_pos = 0;
      data_size_t i_delta = -1;
      while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
        ordered_pair_[j].ridx = cur_pos;
        ordered_pair_[j].bin = bin_data_->vals_[i_delta];
        ++j;
Guolin Ke's avatar
Guolin Ke committed
63
      }
64
      leaf_cnt_[0] = static_cast<data_size_t>(j);
Guolin Ke's avatar
Guolin Ke committed
65
66
67
68
    } else {
      // if using part of data(bagging)
      data_size_t j = 0;
      data_size_t cur_pos = 0;
69
70
71
      data_size_t i_delta = -1;
      while (bin_data_->NextNonzero(&i_delta, &cur_pos)) {
        if (used_idices[cur_pos]) {
Guolin Ke's avatar
Guolin Ke committed
72
          ordered_pair_[j].ridx = cur_pos;
73
          ordered_pair_[j].bin = bin_data_->vals_[i_delta];
Guolin Ke's avatar
Guolin Ke committed
74
75
76
77
78
79
80
81
          ++j;
        }
      }
      leaf_cnt_[0] = j;
    }
  }

  void ConstructHistogram(int leaf, const score_t* gradient, const score_t* hessian,
82
    HistogramBinEntry* out) const override {
Guolin Ke's avatar
Guolin Ke committed
83
84
85
    // get current leaf boundary
    const data_size_t start = leaf_start_[leaf];
    const data_size_t end = start + leaf_cnt_[leaf];
86
87
    const int rest = (end - start) % 4;
    data_size_t i = start;
Guolin Ke's avatar
Guolin Ke committed
88
    // use data on current leaf to construct histogram
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
    for (; i < end - rest; i += 4) {

      const VAL_T bin0 = ordered_pair_[i].bin;
      const VAL_T bin1 = ordered_pair_[i + 1].bin;
      const VAL_T bin2 = ordered_pair_[i + 2].bin;
      const VAL_T bin3 = ordered_pair_[i + 3].bin;

      const auto g0 = gradient[ordered_pair_[i].ridx];
      const auto h0 = hessian[ordered_pair_[i].ridx];
      const auto g1 = gradient[ordered_pair_[i + 1].ridx];
      const auto h1 = hessian[ordered_pair_[i + 1].ridx];
      const auto g2 = gradient[ordered_pair_[i + 2].ridx];
      const auto h2 = hessian[ordered_pair_[i + 2].ridx];
      const auto g3 = gradient[ordered_pair_[i + 3].ridx];
      const auto h3 = hessian[ordered_pair_[i + 3].ridx];

      out[bin0].sum_gradients += g0;
      out[bin1].sum_gradients += g1;
      out[bin2].sum_gradients += g2;
      out[bin3].sum_gradients += g3;

      out[bin0].sum_hessians += h0;
      out[bin1].sum_hessians += h1;
      out[bin2].sum_hessians += h2;
      out[bin3].sum_hessians += h3;

      ++out[bin0].cnt;
      ++out[bin1].cnt;
      ++out[bin2].cnt;
      ++out[bin3].cnt;
    }

    for (; i < end; ++i) {

      const VAL_T bin0 = ordered_pair_[i].bin;

      const auto g0 = gradient[ordered_pair_[i].ridx];
      const auto h0 = hessian[ordered_pair_[i].ridx];

      out[bin0].sum_gradients += g0;
      out[bin0].sum_hessians += h0;
      ++out[bin0].cnt;
Guolin Ke's avatar
Guolin Ke committed
131
132
133
    }
  }

Guolin Ke's avatar
Guolin Ke committed
134
  void Split(int leaf, int right_leaf, const char* is_in_leaf, char mark) override {
Guolin Ke's avatar
Guolin Ke committed
135
136
137
138
139
140
141
    // get current leaf boundary
    const data_size_t l_start = leaf_start_[leaf];
    const data_size_t l_end = l_start + leaf_cnt_[leaf];
    // new left leaf end after split
    data_size_t new_left_end = l_start;

    for (data_size_t i = l_start; i < l_end; ++i) {
Guolin Ke's avatar
Guolin Ke committed
142
      if (is_in_leaf[ordered_pair_[i].ridx] == mark) {
Guolin Ke's avatar
Guolin Ke committed
143
144
145
146
147
148
149
150
151
        std::swap(ordered_pair_[new_left_end], ordered_pair_[i]);
        ++new_left_end;
      }
    }

    leaf_start_[right_leaf] = new_left_end;
    leaf_cnt_[leaf] = new_left_end - l_start;
    leaf_cnt_[right_leaf] = l_end - new_left_end;
  }
Guolin Ke's avatar
Guolin Ke committed
152
153
154
  data_size_t NonZeroCount(int leaf) const override {
    return static_cast<data_size_t>(leaf_cnt_[leaf]);
  }
Guolin Ke's avatar
Guolin Ke committed
155
156
157
158
159
160
  /*! \brief Disable copy */
  OrderedSparseBin<VAL_T>& operator=(const OrderedSparseBin<VAL_T>&) = delete;
  /*! \brief Disable copy */
  OrderedSparseBin<VAL_T>(const OrderedSparseBin<VAL_T>&) = delete;

private:
161
  const SparseBin<VAL_T>* bin_data_;
Guolin Ke's avatar
Guolin Ke committed
162
163
164
165
166
167
168
  /*! \brief Store non-zero pair , group by leaf */
  std::vector<SparsePair> ordered_pair_;
  /*! \brief leaf_start_[i] means data in i-th leaf start from */
  std::vector<data_size_t> leaf_start_;
  /*! \brief leaf_cnt_[i] means number of data in i-th leaf */
  std::vector<data_size_t> leaf_cnt_;
};
169
170
171
172
173
174

template <typename VAL_T>
OrderedBin* SparseBin<VAL_T>::CreateOrderedBin() const {
  return new OrderedSparseBin<VAL_T>(this);
}

Guolin Ke's avatar
Guolin Ke committed
175
}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
176
#endif   // LightGBM_IO_ORDERED_SPARSE_BIN_HPP_