"vscode:/vscode.git/clone" did not exist on "47feda49821bc761eb01a7e1155e37a84b3794e4"
data_partition.hpp 7.44 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
#ifndef LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
#define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_

#include <LightGBM/meta.h>
Guolin Ke's avatar
Guolin Ke committed
5
#include <LightGBM/dataset.h>
Guolin Ke's avatar
Guolin Ke committed
6

7
#include <LightGBM/utils/openmp_wrapper.h>
Guolin Ke's avatar
Guolin Ke committed
8
9
10
11
12
13
14
15
16
17
18

#include <cstring>

#include <vector>

namespace LightGBM {
/*!
* \brief DataPartition is used to store the the partition of data on tree.
*/
class DataPartition {
public:
19
20
  DataPartition(data_size_t num_data, int num_leaves)
    :num_data_(num_data), num_leaves_(num_leaves) {
Guolin Ke's avatar
Guolin Ke committed
21
22
23
24
25
    leaf_begin_.resize(num_leaves_);
    leaf_count_.resize(num_leaves_);
    indices_.resize(num_data_);
    temp_left_indices_.resize(num_data_);
    temp_right_indices_.resize(num_data_);
Guolin Ke's avatar
Guolin Ke committed
26
27
28
29
30
31
    used_data_indices_ = nullptr;
#pragma omp parallel
#pragma omp master
    {
      num_threads_ = omp_get_num_threads();
    }
Guolin Ke's avatar
Guolin Ke committed
32
33
34
35
36
    offsets_buf_.resize(num_threads_);
    left_cnts_buf_.resize(num_threads_);
    right_cnts_buf_.resize(num_threads_);
    left_write_pos_buf_.resize(num_threads_);
    right_write_pos_buf_.resize(num_threads_);
Guolin Ke's avatar
Guolin Ke committed
37
  }
38
39
40
41
42
43

  void ResetLeaves(int num_leaves) {
    num_leaves_ = num_leaves;
    leaf_begin_.resize(num_leaves_);
    leaf_count_.resize(num_leaves_);
  }
Guolin Ke's avatar
Guolin Ke committed
44
45
46
47
48
49
  void ResetNumData(int num_data) {
    num_data_ = num_data;
    indices_.resize(num_data_);
    temp_left_indices_.resize(num_data_);
    temp_right_indices_.resize(num_data_);
  }
Guolin Ke's avatar
Guolin Ke committed
50
  ~DataPartition() {
Guolin Ke's avatar
Guolin Ke committed
51

Guolin Ke's avatar
Guolin Ke committed
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
  }

  /*!
  * \brief Init, will put all data on the root(leaf_idx = 0)
  */
  void Init() {
    for (int i = 0; i < num_leaves_; ++i) {
      leaf_count_[i] = 0;
    }
    leaf_begin_[0] = 0;
    if (used_data_indices_ == nullptr) {
      // if using all data
      leaf_count_[0] = num_data_;
#pragma omp parallel for schedule(static)
      for (data_size_t i = 0; i < num_data_; ++i) {
        indices_[i] = i;
      }
    } else {
      // if bagging
      leaf_count_[0] = used_data_count_;
Guolin Ke's avatar
Guolin Ke committed
72
      std::memcpy(indices_.data(), used_data_indices_, used_data_count_ * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
73
74
75
76
77
78
79
80
81
    }
  }

  /*!
  * \brief Get the data indices of one leaf
  * \param leaf index of leaf
  * \param indices output data indices
  * \return number of data on this leaf
  */
Guolin Ke's avatar
Guolin Ke committed
82
  const data_size_t* GetIndexOnLeaf(int leaf, data_size_t* out_len) const {
Guolin Ke's avatar
Guolin Ke committed
83
84
    // copy reference, maybe unsafe, but faster
    data_size_t begin = leaf_begin_[leaf];
Guolin Ke's avatar
Guolin Ke committed
85
86
    *out_len = leaf_count_[leaf];
    return indices_.data() + begin;
Guolin Ke's avatar
Guolin Ke committed
87
88
89
90
91
92
93
94
95
  }

  /*!
  * \brief Split the data
  * \param leaf index of leaf
  * \param feature_bins feature bin data
  * \param threshold threshold that want to split
  * \param right_leaf index of right leaf
  */
Guolin Ke's avatar
Guolin Ke committed
96
  void Split(int leaf, const Dataset* dataset, int feature, uint32_t threshold, int right_leaf) {
Guolin Ke's avatar
Guolin Ke committed
97
98
99
100
101
102
103
104
    const data_size_t min_inner_size = 1000;
    // get leaf boundary
    const data_size_t begin = leaf_begin_[leaf];
    const data_size_t cnt = leaf_count_[leaf];

    data_size_t inner_size = (cnt + num_threads_ - 1) / num_threads_;
    if (inner_size < min_inner_size) { inner_size = min_inner_size; }
    // split data multi-threading
105
    OMP_INIT_EX();
Guolin Ke's avatar
Guolin Ke committed
106
107
#pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < num_threads_; ++i) {
108
      OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
109
110
111
112
113
114
115
      left_cnts_buf_[i] = 0;
      right_cnts_buf_[i] = 0;
      data_size_t cur_start = i * inner_size;
      if (cur_start > cnt) { continue; }
      data_size_t cur_cnt = inner_size;
      if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; }
      // split data inner, reduce the times of function called
Guolin Ke's avatar
Guolin Ke committed
116
      data_size_t cur_left_count = dataset->Split(feature, threshold, indices_.data() + begin + cur_start, cur_cnt,
Guolin Ke's avatar
Guolin Ke committed
117
        temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start);
Guolin Ke's avatar
Guolin Ke committed
118
119
120
      offsets_buf_[i] = cur_start;
      left_cnts_buf_[i] = cur_left_count;
      right_cnts_buf_[i] = cur_cnt - cur_left_count;
121
      OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
122
    }
123
    OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
124
125
126
127
128
129
130
131
132
133
134
135
    data_size_t left_cnt = 0;
    left_write_pos_buf_[0] = 0;
    right_write_pos_buf_[0] = 0;
    for (int i = 1; i < num_threads_; ++i) {
      left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
      right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
    }
    left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
    // copy back indices of right leaf to indices_
#pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < num_threads_; ++i) {
      if (left_cnts_buf_[i] > 0) {
Guolin Ke's avatar
Guolin Ke committed
136
137
        std::memcpy(indices_.data() + begin + left_write_pos_buf_[i], 
          temp_left_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
138
139
      }
      if (right_cnts_buf_[i] > 0) {
Guolin Ke's avatar
Guolin Ke committed
140
141
        std::memcpy(indices_.data() + begin + left_cnt + right_write_pos_buf_[i], 
          temp_right_indices_.data() + offsets_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
142
143
144
145
146
147
148
149
150
151
152
153
154
      }
    }
    // update leaf boundary
    leaf_count_[leaf] = left_cnt;
    leaf_begin_[right_leaf] = left_cnt + begin;
    leaf_count_[right_leaf] = cnt - left_cnt;
  }

  /*!
  * \brief SetLabelAt used data indices before training, used for bagging
  * \param used_data_indices indices of used data
  * \param num_used_data number of used data
  */
Guolin Ke's avatar
Guolin Ke committed
155
  void SetUsedDataIndices(const data_size_t* used_data_indices, data_size_t num_used_data) {
Guolin Ke's avatar
Guolin Ke committed
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
    used_data_indices_ = used_data_indices;
    used_data_count_ = num_used_data;
  }

  /*!
  * \brief Get number of data on one leaf
  * \param leaf index of leaf
  * \return number of data of this leaf
  */
  data_size_t leaf_count(int leaf) const { return leaf_count_[leaf]; }

  /*!
  * \brief Get leaf begin
  * \param leaf index of leaf
  * \return begin index of this leaf
  */
  data_size_t leaf_begin(int leaf) const { return leaf_begin_[leaf]; }

Guolin Ke's avatar
Guolin Ke committed
174
  const data_size_t* indices() const { return indices_.data(); }
Guolin Ke's avatar
Guolin Ke committed
175
176
177
178
179
180
181
182
183
184

  /*! \brief Get number of leaves */
  int num_leaves() const { return num_leaves_; }

private:
  /*! \brief Number of all data */
  data_size_t num_data_;
  /*! \brief Number of all leaves */
  int num_leaves_;
  /*! \brief start index of data on one leaf */
Guolin Ke's avatar
Guolin Ke committed
185
  std::vector<data_size_t> leaf_begin_;
Guolin Ke's avatar
Guolin Ke committed
186
  /*! \brief number of data on one leaf */
Guolin Ke's avatar
Guolin Ke committed
187
  std::vector<data_size_t> leaf_count_;
Guolin Ke's avatar
Guolin Ke committed
188
  /*! \brief Store all data's indices, order by leaf[data_in_leaf0,..,data_leaf1,..] */
Guolin Ke's avatar
Guolin Ke committed
189
  std::vector<data_size_t> indices_;
Guolin Ke's avatar
Guolin Ke committed
190
  /*! \brief team indices buffer for split */
Guolin Ke's avatar
Guolin Ke committed
191
  std::vector<data_size_t> temp_left_indices_;
Guolin Ke's avatar
Guolin Ke committed
192
  /*! \brief team indices buffer for split */
Guolin Ke's avatar
Guolin Ke committed
193
  std::vector<data_size_t> temp_right_indices_;
Guolin Ke's avatar
Guolin Ke committed
194
195
196
197
198
199
200
  /*! \brief used data indices, used for bagging */
  const data_size_t* used_data_indices_;
  /*! \brief used data count, used for bagging */
  data_size_t used_data_count_;
  /*! \brief number of threads */
  int num_threads_;
  /*! \brief Buffer for multi-threading data partition, used to store offset for different threads */
Guolin Ke's avatar
Guolin Ke committed
201
  std::vector<data_size_t> offsets_buf_;
Guolin Ke's avatar
Guolin Ke committed
202
  /*! \brief Buffer for multi-threading data partition, used to store left count after split for different threads */
Guolin Ke's avatar
Guolin Ke committed
203
  std::vector<data_size_t> left_cnts_buf_;
Guolin Ke's avatar
Guolin Ke committed
204
  /*! \brief Buffer for multi-threading data partition, used to store right count after split for different threads */
Guolin Ke's avatar
Guolin Ke committed
205
  std::vector<data_size_t> right_cnts_buf_;
Guolin Ke's avatar
Guolin Ke committed
206
  /*! \brief Buffer for multi-threading data partition, used to store write position of left leaf for different threads */
Guolin Ke's avatar
Guolin Ke committed
207
  std::vector<data_size_t> left_write_pos_buf_;
Guolin Ke's avatar
Guolin Ke committed
208
  /*! \brief Buffer for multi-threading data partition, used to store write position of right leaf for different threads */
Guolin Ke's avatar
Guolin Ke committed
209
  std::vector<data_size_t> right_write_pos_buf_;
Guolin Ke's avatar
Guolin Ke committed
210
211
212
};

}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
213
#endif   // LightGBM_TREELEARNER_DATA_PARTITION_HPP_