data_partition.hpp 7.59 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
#ifndef LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
#define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_

#include <LightGBM/meta.h>
Guolin Ke's avatar
Guolin Ke committed
5
#include <LightGBM/dataset.h>
Guolin Ke's avatar
Guolin Ke committed
6

7
#include <LightGBM/utils/openmp_wrapper.h>
Guolin Ke's avatar
Guolin Ke committed
8
9
10
11
12
13
14
15
16
17
18

#include <cstring>

#include <vector>

namespace LightGBM {
/*!
* \brief DataPartition is used to store the the partition of data on tree.
*/
class DataPartition {
public:
19
20
  DataPartition(data_size_t num_data, int num_leaves)
    :num_data_(num_data), num_leaves_(num_leaves) {
Guolin Ke's avatar
Guolin Ke committed
21
22
23
24
25
    leaf_begin_.resize(num_leaves_);
    leaf_count_.resize(num_leaves_);
    indices_.resize(num_data_);
    temp_left_indices_.resize(num_data_);
    temp_right_indices_.resize(num_data_);
Guolin Ke's avatar
Guolin Ke committed
26
    used_data_indices_ = nullptr;
Guolin Ke's avatar
Guolin Ke committed
27
28
    #pragma omp parallel
    #pragma omp master
Guolin Ke's avatar
Guolin Ke committed
29
30
31
    {
      num_threads_ = omp_get_num_threads();
    }
Guolin Ke's avatar
Guolin Ke committed
32
33
34
35
36
    offsets_buf_.resize(num_threads_);
    left_cnts_buf_.resize(num_threads_);
    right_cnts_buf_.resize(num_threads_);
    left_write_pos_buf_.resize(num_threads_);
    right_write_pos_buf_.resize(num_threads_);
Guolin Ke's avatar
Guolin Ke committed
37
  }
38
39
40
41
42
43

  void ResetLeaves(int num_leaves) {
    num_leaves_ = num_leaves;
    leaf_begin_.resize(num_leaves_);
    leaf_count_.resize(num_leaves_);
  }
Guolin Ke's avatar
Guolin Ke committed
44
45
46
47
48
49
  void ResetNumData(int num_data) {
    num_data_ = num_data;
    indices_.resize(num_data_);
    temp_left_indices_.resize(num_data_);
    temp_right_indices_.resize(num_data_);
  }
Guolin Ke's avatar
Guolin Ke committed
50
  ~DataPartition() {
Guolin Ke's avatar
Guolin Ke committed
51

Guolin Ke's avatar
Guolin Ke committed
52
53
54
55
56
57
  }

  /*!
  * \brief Init, will put all data on the root(leaf_idx = 0)
  */
  void Init() {
58
59
    std::fill(leaf_begin_.begin(), leaf_begin_.end(), 0);
    std::fill(leaf_count_.begin(), leaf_count_.end(), 0);
Guolin Ke's avatar
Guolin Ke committed
60
61
62
    if (used_data_indices_ == nullptr) {
      // if using all data
      leaf_count_[0] = num_data_;
Guolin Ke's avatar
Guolin Ke committed
63
      #pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
64
65
66
67
68
69
      for (data_size_t i = 0; i < num_data_; ++i) {
        indices_[i] = i;
      }
    } else {
      // if bagging
      leaf_count_[0] = used_data_count_;
Guolin Ke's avatar
Guolin Ke committed
70
      std::memcpy(indices_.data(), used_data_indices_, used_data_count_ * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
71
72
73
74
75
76
77
78
79
    }
  }

  /*!
  * \brief Get the data indices of one leaf
  * \param leaf index of leaf
  * \param indices output data indices
  * \return number of data on this leaf
  */
Guolin Ke's avatar
Guolin Ke committed
80
  const data_size_t* GetIndexOnLeaf(int leaf, data_size_t* out_len) const {
Guolin Ke's avatar
Guolin Ke committed
81
82
    // copy reference, maybe unsafe, but faster
    data_size_t begin = leaf_begin_[leaf];
Guolin Ke's avatar
Guolin Ke committed
83
84
    *out_len = leaf_count_[leaf];
    return indices_.data() + begin;
Guolin Ke's avatar
Guolin Ke committed
85
86
87
88
89
90
91
92
93
  }

  /*!
  * \brief Split the data
  * \param leaf index of leaf
  * \param feature_bins feature bin data
  * \param threshold threshold that want to split
  * \param right_leaf index of right leaf
  */
Guolin Ke's avatar
Guolin Ke committed
94
  void Split(int leaf, const Dataset* dataset, int feature, uint32_t threshold, uint32_t default_bin_for_zero, int right_leaf) {
Guolin Ke's avatar
Guolin Ke committed
95
    const data_size_t min_inner_size = 512;
Guolin Ke's avatar
Guolin Ke committed
96
97
98
99
100
101
102
    // get leaf boundary
    const data_size_t begin = leaf_begin_[leaf];
    const data_size_t cnt = leaf_count_[leaf];

    data_size_t inner_size = (cnt + num_threads_ - 1) / num_threads_;
    if (inner_size < min_inner_size) { inner_size = min_inner_size; }
    // split data multi-threading
103
    OMP_INIT_EX();
Guolin Ke's avatar
Guolin Ke committed
104
    #pragma omp parallel for schedule(static, 1)
Guolin Ke's avatar
Guolin Ke committed
105
    for (int i = 0; i < num_threads_; ++i) {
106
      OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
107
108
109
110
111
112
113
      left_cnts_buf_[i] = 0;
      right_cnts_buf_[i] = 0;
      data_size_t cur_start = i * inner_size;
      if (cur_start > cnt) { continue; }
      data_size_t cur_cnt = inner_size;
      if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; }
      // split data inner, reduce the times of function called
Guolin Ke's avatar
Guolin Ke committed
114
      data_size_t cur_left_count = dataset->Split(feature, threshold, default_bin_for_zero, indices_.data() + begin + cur_start, cur_cnt,
Guolin Ke's avatar
Guolin Ke committed
115
                                                  temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start);
Guolin Ke's avatar
Guolin Ke committed
116
117
118
      offsets_buf_[i] = cur_start;
      left_cnts_buf_[i] = cur_left_count;
      right_cnts_buf_[i] = cur_cnt - cur_left_count;
119
      OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
120
    }
121
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
122
123
124
125
126
127
128
129
130
    data_size_t left_cnt = 0;
    left_write_pos_buf_[0] = 0;
    right_write_pos_buf_[0] = 0;
    for (int i = 1; i < num_threads_; ++i) {
      left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
      right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
    }
    left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
    // copy back indices of right leaf to indices_
Guolin Ke's avatar
Guolin Ke committed
131
    #pragma omp parallel for schedule(static, 1)
Guolin Ke's avatar
Guolin Ke committed
132
133
    for (int i = 0; i < num_threads_; ++i) {
      if (left_cnts_buf_[i] > 0) {
Guolin Ke's avatar
Guolin Ke committed
134
135
        std::memcpy(indices_.data() + begin + left_write_pos_buf_[i],
                    temp_left_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
136
137
      }
      if (right_cnts_buf_[i] > 0) {
Guolin Ke's avatar
Guolin Ke committed
138
139
        std::memcpy(indices_.data() + begin + left_cnt + right_write_pos_buf_[i],
                    temp_right_indices_.data() + offsets_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
140
141
142
143
144
145
146
147
148
149
150
151
152
      }
    }
    // update leaf boundary
    leaf_count_[leaf] = left_cnt;
    leaf_begin_[right_leaf] = left_cnt + begin;
    leaf_count_[right_leaf] = cnt - left_cnt;
  }

  /*!
  * \brief SetLabelAt used data indices before training, used for bagging
  * \param used_data_indices indices of used data
  * \param num_used_data number of used data
  */
Guolin Ke's avatar
Guolin Ke committed
153
  void SetUsedDataIndices(const data_size_t* used_data_indices, data_size_t num_used_data) {
Guolin Ke's avatar
Guolin Ke committed
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
    used_data_indices_ = used_data_indices;
    used_data_count_ = num_used_data;
  }

  /*!
  * \brief Get number of data on one leaf
  * \param leaf index of leaf
  * \return number of data of this leaf
  */
  data_size_t leaf_count(int leaf) const { return leaf_count_[leaf]; }

  /*!
  * \brief Get leaf begin
  * \param leaf index of leaf
  * \return begin index of this leaf
  */
  data_size_t leaf_begin(int leaf) const { return leaf_begin_[leaf]; }

Guolin Ke's avatar
Guolin Ke committed
172
  const data_size_t* indices() const { return indices_.data(); }
Guolin Ke's avatar
Guolin Ke committed
173
174
175
176
177
178
179
180
181
182

  /*! \brief Get number of leaves */
  int num_leaves() const { return num_leaves_; }

private:
  /*! \brief Number of all data */
  data_size_t num_data_;
  /*! \brief Number of all leaves */
  int num_leaves_;
  /*! \brief start index of data on one leaf */
Guolin Ke's avatar
Guolin Ke committed
183
  std::vector<data_size_t> leaf_begin_;
Guolin Ke's avatar
Guolin Ke committed
184
  /*! \brief number of data on one leaf */
Guolin Ke's avatar
Guolin Ke committed
185
  std::vector<data_size_t> leaf_count_;
Guolin Ke's avatar
Guolin Ke committed
186
  /*! \brief Store all data's indices, order by leaf[data_in_leaf0,..,data_leaf1,..] */
Guolin Ke's avatar
Guolin Ke committed
187
  std::vector<data_size_t> indices_;
Guolin Ke's avatar
Guolin Ke committed
188
  /*! \brief team indices buffer for split */
Guolin Ke's avatar
Guolin Ke committed
189
  std::vector<data_size_t> temp_left_indices_;
Guolin Ke's avatar
Guolin Ke committed
190
  /*! \brief team indices buffer for split */
Guolin Ke's avatar
Guolin Ke committed
191
  std::vector<data_size_t> temp_right_indices_;
Guolin Ke's avatar
Guolin Ke committed
192
193
194
195
196
197
198
  /*! \brief used data indices, used for bagging */
  const data_size_t* used_data_indices_;
  /*! \brief used data count, used for bagging */
  data_size_t used_data_count_;
  /*! \brief number of threads */
  int num_threads_;
  /*! \brief Buffer for multi-threading data partition, used to store offset for different threads */
Guolin Ke's avatar
Guolin Ke committed
199
  std::vector<data_size_t> offsets_buf_;
Guolin Ke's avatar
Guolin Ke committed
200
  /*! \brief Buffer for multi-threading data partition, used to store left count after split for different threads */
Guolin Ke's avatar
Guolin Ke committed
201
  std::vector<data_size_t> left_cnts_buf_;
Guolin Ke's avatar
Guolin Ke committed
202
  /*! \brief Buffer for multi-threading data partition, used to store right count after split for different threads */
Guolin Ke's avatar
Guolin Ke committed
203
  std::vector<data_size_t> right_cnts_buf_;
Guolin Ke's avatar
Guolin Ke committed
204
  /*! \brief Buffer for multi-threading data partition, used to store write position of left leaf for different threads */
Guolin Ke's avatar
Guolin Ke committed
205
  std::vector<data_size_t> left_write_pos_buf_;
Guolin Ke's avatar
Guolin Ke committed
206
  /*! \brief Buffer for multi-threading data partition, used to store write position of right leaf for different threads */
Guolin Ke's avatar
Guolin Ke committed
207
  std::vector<data_size_t> right_write_pos_buf_;
Guolin Ke's avatar
Guolin Ke committed
208
209
210
};

}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
211
#endif   // LightGBM_TREELEARNER_DATA_PARTITION_HPP_