"src/objective/cuda/cuda_multiclass_objective.cu" did not exist on "2e9848c67f2aceef05f80db7301c52a8a9fa14c9"
data_partition.hpp 7.34 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
#ifndef LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
#define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_

#include <LightGBM/meta.h>
#include <LightGBM/feature.h>

7
#include <LightGBM/utils/openmp_wrapper.h>
Guolin Ke's avatar
Guolin Ke committed
8
9
10
11
12
13
14
15
16
17
18

#include <cstring>

#include <vector>

namespace LightGBM {
/*!
* \brief DataPartition is used to store the the partition of data on tree.
*/
class DataPartition {
public:
19
20
  DataPartition(data_size_t num_data, int num_leaves)
    :num_data_(num_data), num_leaves_(num_leaves) {
Guolin Ke's avatar
Guolin Ke committed
21
22
23
24
25
    leaf_begin_.resize(num_leaves_);
    leaf_count_.resize(num_leaves_);
    indices_.resize(num_data_);
    temp_left_indices_.resize(num_data_);
    temp_right_indices_.resize(num_data_);
Guolin Ke's avatar
Guolin Ke committed
26
27
28
29
30
31
    used_data_indices_ = nullptr;
#pragma omp parallel
#pragma omp master
    {
      num_threads_ = omp_get_num_threads();
    }
Guolin Ke's avatar
Guolin Ke committed
32
33
34
35
36
    offsets_buf_.resize(num_threads_);
    left_cnts_buf_.resize(num_threads_);
    right_cnts_buf_.resize(num_threads_);
    left_write_pos_buf_.resize(num_threads_);
    right_write_pos_buf_.resize(num_threads_);
Guolin Ke's avatar
Guolin Ke committed
37
  }
38
39
40
41
42
43

  void ResetLeaves(int num_leaves) {
    num_leaves_ = num_leaves;
    leaf_begin_.resize(num_leaves_);
    leaf_count_.resize(num_leaves_);
  }
Guolin Ke's avatar
Guolin Ke committed
44
45
46
47
48
49
  void ResetNumData(int num_data) {
    num_data_ = num_data;
    indices_.resize(num_data_);
    temp_left_indices_.resize(num_data_);
    temp_right_indices_.resize(num_data_);
  }
Guolin Ke's avatar
Guolin Ke committed
50
  ~DataPartition() {
Guolin Ke's avatar
Guolin Ke committed
51

Guolin Ke's avatar
Guolin Ke committed
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
  }

  /*!
  * \brief Init, will put all data on the root(leaf_idx = 0)
  */
  void Init() {
    for (int i = 0; i < num_leaves_; ++i) {
      leaf_count_[i] = 0;
    }
    leaf_begin_[0] = 0;
    if (used_data_indices_ == nullptr) {
      // if using all data
      leaf_count_[0] = num_data_;
#pragma omp parallel for schedule(static)
      for (data_size_t i = 0; i < num_data_; ++i) {
        indices_[i] = i;
      }
    } else {
      // if bagging
      leaf_count_[0] = used_data_count_;
Guolin Ke's avatar
Guolin Ke committed
72
      std::memcpy(indices_.data(), used_data_indices_, used_data_count_ * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
73
74
75
76
77
78
79
80
81
    }
  }

  /*!
  * \brief Get the data indices of one leaf
  * \param leaf index of leaf
  * \param indices output data indices
  * \return number of data on this leaf
  */
Guolin Ke's avatar
Guolin Ke committed
82
  const data_size_t* GetIndexOnLeaf(int leaf, data_size_t* out_len) const {
Guolin Ke's avatar
Guolin Ke committed
83
84
    // copy reference, maybe unsafe, but faster
    data_size_t begin = leaf_begin_[leaf];
Guolin Ke's avatar
Guolin Ke committed
85
86
    *out_len = leaf_count_[leaf];
    return indices_.data() + begin;
Guolin Ke's avatar
Guolin Ke committed
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
  }

  /*!
  * \brief Split the data
  * \param leaf index of leaf
  * \param feature_bins feature bin data
  * \param threshold threshold that want to split
  * \param right_leaf index of right leaf
  */
  void Split(int leaf, const Bin* feature_bins, unsigned int threshold, int right_leaf) {
    const data_size_t min_inner_size = 1000;
    // get leaf boundary
    const data_size_t begin = leaf_begin_[leaf];
    const data_size_t cnt = leaf_count_[leaf];

    data_size_t inner_size = (cnt + num_threads_ - 1) / num_threads_;
    if (inner_size < min_inner_size) { inner_size = min_inner_size; }
    // split data multi-threading
#pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < num_threads_; ++i) {
      left_cnts_buf_[i] = 0;
      right_cnts_buf_[i] = 0;
      data_size_t cur_start = i * inner_size;
      if (cur_start > cnt) { continue; }
      data_size_t cur_cnt = inner_size;
      if (cur_start + cur_cnt > cnt) { cur_cnt = cnt - cur_start; }
      // split data inner, reduce the times of function called
Guolin Ke's avatar
Guolin Ke committed
114
115
      data_size_t cur_left_count = feature_bins->Split(threshold, indices_.data() + begin + cur_start, cur_cnt,
        temp_left_indices_.data() + cur_start, temp_right_indices_.data() + cur_start);
Guolin Ke's avatar
Guolin Ke committed
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
      offsets_buf_[i] = cur_start;
      left_cnts_buf_[i] = cur_left_count;
      right_cnts_buf_[i] = cur_cnt - cur_left_count;
    }
    data_size_t left_cnt = 0;
    left_write_pos_buf_[0] = 0;
    right_write_pos_buf_[0] = 0;
    for (int i = 1; i < num_threads_; ++i) {
      left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
      right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
    }
    left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];
    // copy back indices of right leaf to indices_
#pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < num_threads_; ++i) {
      if (left_cnts_buf_[i] > 0) {
Guolin Ke's avatar
Guolin Ke committed
132
133
        std::memcpy(indices_.data() + begin + left_write_pos_buf_[i], 
          temp_left_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
134
135
      }
      if (right_cnts_buf_[i] > 0) {
Guolin Ke's avatar
Guolin Ke committed
136
137
        std::memcpy(indices_.data() + begin + left_cnt + right_write_pos_buf_[i], 
          temp_right_indices_.data() + offsets_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
138
139
140
141
142
143
144
145
146
147
148
149
150
      }
    }
    // update leaf boundary
    leaf_count_[leaf] = left_cnt;
    leaf_begin_[right_leaf] = left_cnt + begin;
    leaf_count_[right_leaf] = cnt - left_cnt;
  }

  /*!
  * \brief SetLabelAt used data indices before training, used for bagging
  * \param used_data_indices indices of used data
  * \param num_used_data number of used data
  */
Guolin Ke's avatar
Guolin Ke committed
151
  void SetUsedDataIndices(const data_size_t* used_data_indices, data_size_t num_used_data) {
Guolin Ke's avatar
Guolin Ke committed
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
    used_data_indices_ = used_data_indices;
    used_data_count_ = num_used_data;
  }

  /*!
  * \brief Get number of data on one leaf
  * \param leaf index of leaf
  * \return number of data of this leaf
  */
  data_size_t leaf_count(int leaf) const { return leaf_count_[leaf]; }

  /*!
  * \brief Get leaf begin
  * \param leaf index of leaf
  * \return begin index of this leaf
  */
  data_size_t leaf_begin(int leaf) const { return leaf_begin_[leaf]; }

Guolin Ke's avatar
Guolin Ke committed
170
  const data_size_t* indices() const { return indices_.data(); }
Guolin Ke's avatar
Guolin Ke committed
171
172
173
174
175
176
177
178
179
180

  /*! \brief Get number of leaves */
  int num_leaves() const { return num_leaves_; }

private:
  /*! \brief Number of all data */
  data_size_t num_data_;
  /*! \brief Number of all leaves */
  int num_leaves_;
  /*! \brief start index of data on one leaf */
Guolin Ke's avatar
Guolin Ke committed
181
  std::vector<data_size_t> leaf_begin_;
Guolin Ke's avatar
Guolin Ke committed
182
  /*! \brief number of data on one leaf */
Guolin Ke's avatar
Guolin Ke committed
183
  std::vector<data_size_t> leaf_count_;
Guolin Ke's avatar
Guolin Ke committed
184
  /*! \brief Store all data's indices, order by leaf[data_in_leaf0,..,data_leaf1,..] */
Guolin Ke's avatar
Guolin Ke committed
185
  std::vector<data_size_t> indices_;
Guolin Ke's avatar
Guolin Ke committed
186
  /*! \brief team indices buffer for split */
Guolin Ke's avatar
Guolin Ke committed
187
  std::vector<data_size_t> temp_left_indices_;
Guolin Ke's avatar
Guolin Ke committed
188
  /*! \brief team indices buffer for split */
Guolin Ke's avatar
Guolin Ke committed
189
  std::vector<data_size_t> temp_right_indices_;
Guolin Ke's avatar
Guolin Ke committed
190
191
192
193
194
195
196
  /*! \brief used data indices, used for bagging */
  const data_size_t* used_data_indices_;
  /*! \brief used data count, used for bagging */
  data_size_t used_data_count_;
  /*! \brief number of threads */
  int num_threads_;
  /*! \brief Buffer for multi-threading data partition, used to store offset for different threads */
Guolin Ke's avatar
Guolin Ke committed
197
  std::vector<data_size_t> offsets_buf_;
Guolin Ke's avatar
Guolin Ke committed
198
  /*! \brief Buffer for multi-threading data partition, used to store left count after split for different threads */
Guolin Ke's avatar
Guolin Ke committed
199
  std::vector<data_size_t> left_cnts_buf_;
Guolin Ke's avatar
Guolin Ke committed
200
  /*! \brief Buffer for multi-threading data partition, used to store right count after split for different threads */
Guolin Ke's avatar
Guolin Ke committed
201
  std::vector<data_size_t> right_cnts_buf_;
Guolin Ke's avatar
Guolin Ke committed
202
  /*! \brief Buffer for multi-threading data partition, used to store write position of left leaf for different threads */
Guolin Ke's avatar
Guolin Ke committed
203
  std::vector<data_size_t> left_write_pos_buf_;
Guolin Ke's avatar
Guolin Ke committed
204
  /*! \brief Buffer for multi-threading data partition, used to store write position of right leaf for different threads */
Guolin Ke's avatar
Guolin Ke committed
205
  std::vector<data_size_t> right_write_pos_buf_;
Guolin Ke's avatar
Guolin Ke committed
206
207
208
};

}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
209
#endif   // LightGBM_TREELEARNER_DATA_PARTITION_HPP_