data_partition.hpp 7.16 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
#ifndef LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
#define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_

#include <LightGBM/meta.h>
#include <LightGBM/feature.h>

7
#include <LightGBM/utils/openmp_wrapper.h>
Guolin Ke's avatar
Guolin Ke committed
8
9
10
11
12
13
14
15
16
17
18

#include <cstring>

#include <vector>

namespace LightGBM {
/*!
* \brief DataPartition is used to store the the partition of data on tree.
*/
class DataPartition {
public:
19
20
  DataPartition(data_size_t num_data, int num_leaves)
    :num_data_(num_data), num_leaves_(num_leaves) {
Guolin Ke's avatar
Guolin Ke committed
21
22
23
24
25
    leaf_begin_.resize(num_leaves_);
    leaf_count_.resize(num_leaves_);
    indices_.resize(num_data_);
    temp_left_indices_.resize(num_data_);
    temp_right_indices_.resize(num_data_);
Guolin Ke's avatar
Guolin Ke committed
26
27
28
29
30
31
    used_data_indices_ = nullptr;
#pragma omp parallel
#pragma omp master
    {
      num_threads_ = omp_get_num_threads();
    }
Guolin Ke's avatar
Guolin Ke committed
32
33
34
35
36
    offsets_buf_.resize(num_threads_);
    left_cnts_buf_.resize(num_threads_);
    right_cnts_buf_.resize(num_threads_);
    left_write_pos_buf_.resize(num_threads_);
    right_write_pos_buf_.resize(num_threads_);
Guolin Ke's avatar
Guolin Ke committed
37
  }
38
39
40
41
42
43
44

  void ResetLeaves(int num_leaves) {
    num_leaves_ = num_leaves;
    leaf_begin_.resize(num_leaves_);
    leaf_count_.resize(num_leaves_);
  }

Guolin Ke's avatar
Guolin Ke committed
45
  ~DataPartition() {
Guolin Ke's avatar
Guolin Ke committed
46

Guolin Ke's avatar
Guolin Ke committed
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
  }

  /*!
  * \brief Init, will put all data on the root(leaf_idx = 0)
  */
  void Init() {
    for (int i = 0; i < num_leaves_; ++i) {
      leaf_count_[i] = 0;
    }
    leaf_begin_[0] = 0;
    if (used_data_indices_ == nullptr) {
      // if using all data
      leaf_count_[0] = num_data_;
#pragma omp parallel for schedule(static)
      for (data_size_t i = 0; i < num_data_; ++i) {
        indices_[i] = i;
      }
    } else {
      // if bagging
      leaf_count_[0] = used_data_count_;
Guolin Ke's avatar
Guolin Ke committed
67
      std::memcpy(indices_.data(), used_data_indices_, used_data_count_ * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
68
69
70
71
72
73
74
75
76
    }
  }

  /*!
  * \brief Get the data indices of one leaf
  * \param leaf index of leaf
  * \param indices output data indices
  * \return number of data on this leaf
  */
Guolin Ke's avatar
Guolin Ke committed
77
  const data_size_t* GetIndexOnLeaf(int leaf, data_size_t* out_len) const {
Guolin Ke's avatar
Guolin Ke committed
78
79
    // copy reference, maybe unsafe, but faster
    data_size_t begin = leaf_begin_[leaf];
Guolin Ke's avatar
Guolin Ke committed
80
81
    *out_len = leaf_count_[leaf];
    return indices_.data() + begin;
Guolin Ke's avatar
Guolin Ke committed
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
  }

  /*!
  * \brief Split the data
  * \param leaf index of leaf
  * \param feature_bins feature bin data
  * \param threshold threshold that want to split
  * \param right_leaf index of right leaf
  */
  void Split(int leaf, const Bin* feature_bins, unsigned int threshold, int right_leaf) {
    const data_size_t min_inner_size = 1000;
    // get leaf boundary
    const data_size_t begin = leaf_begin_[leaf];
    const data_size_t cnt = leaf_count_[leaf];

    data_size_t inner_size = (cnt + num_threads_ - 1) / num_threads_;
    if (inner_size < min_inner_size) { inner_size = min_inner_size; }
    // split data multi-threading
#pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < num_threads_; ++i) {
      left_cnts_buf_[i] = 0;
      right_cnts_buf_[i] = 0;
      data_size_t cur_start = i * inner_size;
      if (cur_start > cnt) { continue; }
      data_size_t cur_cnt = inner_size;
      if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; }
      // split data inner, reduce the times of function called
Guolin Ke's avatar
Guolin Ke committed
109
110
      data_size_t cur_left_count = feature_bins->Split(threshold, indices_.data() + begin + cur_start, cur_cnt,
        temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start);
Guolin Ke's avatar
Guolin Ke committed
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
      offsets_buf_[i] = cur_start;
      left_cnts_buf_[i] = cur_left_count;
      right_cnts_buf_[i] = cur_cnt - cur_left_count;
    }
    data_size_t left_cnt = 0;
    left_write_pos_buf_[0] = 0;
    right_write_pos_buf_[0] = 0;
    for (int i = 1; i < num_threads_; ++i) {
      left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
      right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
    }
    left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
    // copy back indices of right leaf to indices_
#pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < num_threads_; ++i) {
      if (left_cnts_buf_[i] > 0) {
Guolin Ke's avatar
Guolin Ke committed
127
128
        std::memcpy(indices_.data() + begin + left_write_pos_buf_[i], 
          temp_left_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
129
130
      }
      if (right_cnts_buf_[i] > 0) {
Guolin Ke's avatar
Guolin Ke committed
131
132
        std::memcpy(indices_.data() + begin + left_cnt + right_write_pos_buf_[i], 
          temp_right_indices_.data() + offsets_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
133
134
135
136
137
138
139
140
141
142
143
144
145
      }
    }
    // update leaf boundary
    leaf_count_[leaf] = left_cnt;
    leaf_begin_[right_leaf] = left_cnt + begin;
    leaf_count_[right_leaf] = cnt - left_cnt;
  }

  /*!
  * \brief SetLabelAt used data indices before training, used for bagging
  * \param used_data_indices indices of used data
  * \param num_used_data number of used data
  */
Guolin Ke's avatar
Guolin Ke committed
146
  void SetUsedDataIndices(const data_size_t* used_data_indices, data_size_t num_used_data) {
Guolin Ke's avatar
Guolin Ke committed
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
    used_data_indices_ = used_data_indices;
    used_data_count_ = num_used_data;
  }

  /*!
  * \brief Get number of data on one leaf
  * \param leaf index of leaf
  * \return number of data of this leaf
  */
  data_size_t leaf_count(int leaf) const { return leaf_count_[leaf]; }

  /*!
  * \brief Get leaf begin
  * \param leaf index of leaf
  * \return begin index of this leaf
  */
  data_size_t leaf_begin(int leaf) const { return leaf_begin_[leaf]; }

Guolin Ke's avatar
Guolin Ke committed
165
  const data_size_t* indices() const { return indices_.data(); }
Guolin Ke's avatar
Guolin Ke committed
166
167
168
169
170
171
172
173
174
175

  /*! \brief Get number of leaves */
  int num_leaves() const { return num_leaves_; }

private:
  /*! \brief Number of all data */
  data_size_t num_data_;
  /*! \brief Number of all leaves */
  int num_leaves_;
  /*! \brief start index of data on one leaf */
Guolin Ke's avatar
Guolin Ke committed
176
  std::vector<data_size_t> leaf_begin_;
Guolin Ke's avatar
Guolin Ke committed
177
  /*! \brief number of data on one leaf */
Guolin Ke's avatar
Guolin Ke committed
178
  std::vector<data_size_t> leaf_count_;
Guolin Ke's avatar
Guolin Ke committed
179
  /*! \brief Store all data's indices, order by leaf[data_in_leaf0,..,data_leaf1,..] */
Guolin Ke's avatar
Guolin Ke committed
180
  std::vector<data_size_t> indices_;
Guolin Ke's avatar
Guolin Ke committed
181
  /*! \brief team indices buffer for split */
Guolin Ke's avatar
Guolin Ke committed
182
  std::vector<data_size_t> temp_left_indices_;
Guolin Ke's avatar
Guolin Ke committed
183
  /*! \brief team indices buffer for split */
Guolin Ke's avatar
Guolin Ke committed
184
  std::vector<data_size_t> temp_right_indices_;
Guolin Ke's avatar
Guolin Ke committed
185
186
187
188
189
190
191
  /*! \brief used data indices, used for bagging */
  const data_size_t* used_data_indices_;
  /*! \brief used data count, used for bagging */
  data_size_t used_data_count_;
  /*! \brief number of threads */
  int num_threads_;
  /*! \brief Buffer for multi-threading data partition, used to store offset for different threads */
Guolin Ke's avatar
Guolin Ke committed
192
  std::vector<data_size_t> offsets_buf_;
Guolin Ke's avatar
Guolin Ke committed
193
  /*! \brief Buffer for multi-threading data partition, used to store left count after split for different threads */
Guolin Ke's avatar
Guolin Ke committed
194
  std::vector<data_size_t> left_cnts_buf_;
Guolin Ke's avatar
Guolin Ke committed
195
  /*! \brief Buffer for multi-threading data partition, used to store right count after split for different threads */
Guolin Ke's avatar
Guolin Ke committed
196
  std::vector<data_size_t> right_cnts_buf_;
Guolin Ke's avatar
Guolin Ke committed
197
  /*! \brief Buffer for multi-threading data partition, used to store write position of left leaf for different threads */
Guolin Ke's avatar
Guolin Ke committed
198
  std::vector<data_size_t> left_write_pos_buf_;
Guolin Ke's avatar
Guolin Ke committed
199
  /*! \brief Buffer for multi-threading data partition, used to store write position of right leaf for different threads */
Guolin Ke's avatar
Guolin Ke committed
200
  std::vector<data_size_t> right_write_pos_buf_;
Guolin Ke's avatar
Guolin Ke committed
201
202
203
};

}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
204
#endif   // LightGBM_TREELEARNER_DATA_PARTITION_HPP_