data_partition.hpp 5.53 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
#ifndef LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_
#define LIGHTGBM_TREELEARNER_DATA_PARTITION_HPP_

Guolin Ke's avatar
Guolin Ke committed
8
#include <LightGBM/dataset.h>
9
#include <LightGBM/meta.h>
10
#include <LightGBM/utils/openmp_wrapper.h>
Guolin Ke's avatar
Guolin Ke committed
11
#include <LightGBM/utils/threading.h>
Guolin Ke's avatar
Guolin Ke committed
12

13
#include <algorithm>
Guolin Ke's avatar
Guolin Ke committed
14
15
16
17
18
19
20
21
#include <cstring>
#include <vector>

namespace LightGBM {
/*!
* \brief DataPartition is used to store the the partition of data on tree.
*/
class DataPartition {
Nikita Titov's avatar
Nikita Titov committed
22
 public:
23
  DataPartition(data_size_t num_data, int num_leaves)
24
      : num_data_(num_data), num_leaves_(num_leaves), runner_(num_data, 512) {
Guolin Ke's avatar
Guolin Ke committed
25
26
27
    leaf_begin_.resize(num_leaves_);
    leaf_count_.resize(num_leaves_);
    indices_.resize(num_data_);
Guolin Ke's avatar
Guolin Ke committed
28
29
    used_data_indices_ = nullptr;
  }
30
31
32
33
34
35

  void ResetLeaves(int num_leaves) {
    num_leaves_ = num_leaves;
    leaf_begin_.resize(num_leaves_);
    leaf_count_.resize(num_leaves_);
  }
36

Guolin Ke's avatar
Guolin Ke committed
37
38
39
  void ResetNumData(int num_data) {
    num_data_ = num_data;
    indices_.resize(num_data_);
40
    runner_.ReSize(num_data_);
Guolin Ke's avatar
Guolin Ke committed
41
  }
42

Guolin Ke's avatar
Guolin Ke committed
43
44
45
46
47
48
49
  ~DataPartition() {
  }

  /*!
  * \brief Init, will put all data on the root(leaf_idx = 0)
  */
  void Init() {
50
51
    std::fill(leaf_begin_.begin(), leaf_begin_.end(), 0);
    std::fill(leaf_count_.begin(), leaf_count_.end(), 0);
Guolin Ke's avatar
Guolin Ke committed
52
53
54
    if (used_data_indices_ == nullptr) {
      // if using all data
      leaf_count_[0] = num_data_;
Guolin Ke's avatar
Guolin Ke committed
55
#pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024)
Guolin Ke's avatar
Guolin Ke committed
56
57
58
59
60
61
      for (data_size_t i = 0; i < num_data_; ++i) {
        indices_[i] = i;
      }
    } else {
      // if bagging
      leaf_count_[0] = used_data_count_;
Guolin Ke's avatar
Guolin Ke committed
62
      std::memcpy(indices_.data(), used_data_indices_, used_data_count_ * sizeof(data_size_t));
Guolin Ke's avatar
Guolin Ke committed
63
64
65
    }
  }

66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
  void ResetByLeafPred(const std::vector<int>& leaf_pred, int num_leaves) {
    ResetLeaves(num_leaves);
    std::vector<std::vector<data_size_t>> indices_per_leaf(num_leaves_);
    for (data_size_t i = 0; i < static_cast<data_size_t>(leaf_pred.size()); ++i) {
      indices_per_leaf[leaf_pred[i]].push_back(i);
    }
    data_size_t offset = 0;
    for (int i = 0; i < num_leaves_; ++i) {
      leaf_begin_[i] = offset;
      leaf_count_[i] = static_cast<data_size_t>(indices_per_leaf[i].size());
      std::copy(indices_per_leaf[i].begin(), indices_per_leaf[i].end(), indices_.begin() + leaf_begin_[i]);
      offset += leaf_count_[i];
    }
  }

Guolin Ke's avatar
Guolin Ke committed
81
82
83
84
85
86
  /*!
  * \brief Get the data indices of one leaf
  * \param leaf index of leaf
  * \param indices output data indices
  * \return number of data on this leaf
  */
Guolin Ke's avatar
Guolin Ke committed
87
  const data_size_t* GetIndexOnLeaf(int leaf, data_size_t* out_len) const {
Guolin Ke's avatar
Guolin Ke committed
88
89
    // copy reference, maybe unsafe, but faster
    data_size_t begin = leaf_begin_[leaf];
Guolin Ke's avatar
Guolin Ke committed
90
91
    *out_len = leaf_count_[leaf];
    return indices_.data() + begin;
Guolin Ke's avatar
Guolin Ke committed
92
93
94
95
96
97
98
99
100
  }

  /*!
  * \brief Split the data
  * \param leaf index of leaf
  * \param feature_bins feature bin data
  * \param threshold threshold that want to split
  * \param right_leaf index of right leaf
  */
101
102
103
104
  void Split(int leaf, const Dataset* dataset, int feature,
             const uint32_t* threshold, int num_threshold, bool default_left,
             int right_leaf) {
    Common::FunctionTimer fun_timer("DataPartition::Split", global_timer);
Guolin Ke's avatar
Guolin Ke committed
105
106
107
    // get leaf boundary
    const data_size_t begin = leaf_begin_[leaf];
    const data_size_t cnt = leaf_count_[leaf];
108
    auto left_start = indices_.data() + begin;
109
    const auto left_cnt = runner_.Run<false>(
110
111
112
113
114
115
116
        cnt,
        [=](int, data_size_t cur_start, data_size_t cur_cnt, data_size_t* left,
            data_size_t* right) {
          return dataset->Split(feature, threshold, num_threshold, default_left,
                                left_start + cur_start, cur_cnt, left, right);
        },
        left_start);
Guolin Ke's avatar
Guolin Ke committed
117
118
119
120
121
122
123
124
125
126
    leaf_count_[leaf] = left_cnt;
    leaf_begin_[right_leaf] = left_cnt + begin;
    leaf_count_[right_leaf] = cnt - left_cnt;
  }

  /*!
  * \brief SetLabelAt used data indices before training, used for bagging
  * \param used_data_indices indices of used data
  * \param num_used_data number of used data
  */
Guolin Ke's avatar
Guolin Ke committed
127
  void SetUsedDataIndices(const data_size_t* used_data_indices, data_size_t num_used_data) {
Guolin Ke's avatar
Guolin Ke committed
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
    used_data_indices_ = used_data_indices;
    used_data_count_ = num_used_data;
  }

  /*!
  * \brief Get number of data on one leaf
  * \param leaf index of leaf
  * \return number of data of this leaf
  */
  data_size_t leaf_count(int leaf) const { return leaf_count_[leaf]; }

  /*!
  * \brief Get leaf begin
  * \param leaf index of leaf
  * \return begin index of this leaf
  */
  data_size_t leaf_begin(int leaf) const { return leaf_begin_[leaf]; }

Guolin Ke's avatar
Guolin Ke committed
146
  const data_size_t* indices() const { return indices_.data(); }
Guolin Ke's avatar
Guolin Ke committed
147
148
149
150

  /*! \brief Get number of leaves */
  int num_leaves() const { return num_leaves_; }

Nikita Titov's avatar
Nikita Titov committed
151
 private:
Guolin Ke's avatar
Guolin Ke committed
152
153
154
155
156
  /*! \brief Number of all data */
  data_size_t num_data_;
  /*! \brief Number of all leaves */
  int num_leaves_;
  /*! \brief start index of data on one leaf */
Guolin Ke's avatar
Guolin Ke committed
157
  std::vector<data_size_t> leaf_begin_;
Guolin Ke's avatar
Guolin Ke committed
158
  /*! \brief number of data on one leaf */
Guolin Ke's avatar
Guolin Ke committed
159
  std::vector<data_size_t> leaf_count_;
Guolin Ke's avatar
Guolin Ke committed
160
  /*! \brief Store all data's indices, order by leaf[data_in_leaf0,..,data_leaf1,..] */
161
  std::vector<data_size_t, Common::AlignmentAllocator<data_size_t, kAlignedSize>> indices_;
Guolin Ke's avatar
Guolin Ke committed
162
163
164
165
  /*! \brief used data indices, used for bagging */
  const data_size_t* used_data_indices_;
  /*! \brief used data count, used for bagging */
  data_size_t used_data_count_;
166
  ParallelPartitionRunner<data_size_t, true> runner_;
Guolin Ke's avatar
Guolin Ke committed
167
168
169
};

}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
170
#endif   // LightGBM_TREELEARNER_DATA_PARTITION_HPP_