"vscode:/vscode.git/clone" did not exist on "aafedd8a1c57432ac3e38e4af236c00c6c54e84c"
serial_tree_learner.h 7.05 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
#ifndef LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_
#define LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_

#include <LightGBM/dataset.h>
#include <LightGBM/tree.h>
6
7
8
9
10
11
12
13
14
15
#include <LightGBM/tree_learner.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/random.h>

#include <string>
#include <cmath>
#include <cstdio>
#include <memory>
#include <random>
#include <vector>
Guolin Ke's avatar
Guolin Ke committed
16
17

#include "data_partition.hpp"
18
#include "feature_histogram.hpp"
Guolin Ke's avatar
Guolin Ke committed
19
#include "leaf_splits.hpp"
20
#include "split_info.hpp"
Guolin Ke's avatar
Guolin Ke committed
21

22
23
24
25
26
#ifdef USE_GPU
// Use 4KBytes aligned allocator for ordered gradients and ordered hessians when GPU is enabled.
// This is necessary to pin the two arrays in memory and make transferring faster.
#include <boost/align/aligned_allocator.hpp>
#endif
Guolin Ke's avatar
Guolin Ke committed
27

28
29
using namespace json11;

Guolin Ke's avatar
Guolin Ke committed
30
31
32
33
34
35
namespace LightGBM {

/*!
* \brief Used for learning a tree by single machine
*/
class SerialTreeLearner: public TreeLearner {
Nikita Titov's avatar
Nikita Titov committed
36
 public:
Guolin Ke's avatar
Guolin Ke committed
37
  explicit SerialTreeLearner(const Config* config);
Guolin Ke's avatar
Guolin Ke committed
38
39
40

  ~SerialTreeLearner();

41
  void Init(const Dataset* train_data, bool is_constant_hessian) override;
Guolin Ke's avatar
Guolin Ke committed
42

Guolin Ke's avatar
Guolin Ke committed
43
44
  void ResetTrainingData(const Dataset* train_data) override;

Guolin Ke's avatar
Guolin Ke committed
45
  void ResetConfig(const Config* config) override;
Guolin Ke's avatar
Guolin Ke committed
46

47
48
  Tree* Train(const score_t* gradients, const score_t *hessians, bool is_constant_hessian,
              Json& forced_split_json) override;
Guolin Ke's avatar
Guolin Ke committed
49

50
  Tree* FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const override;
Guolin Ke's avatar
Guolin Ke committed
51

52
53
54
  Tree* FitByExistingTree(const Tree* old_tree, const std::vector<int>& leaf_pred,
                          const score_t* gradients, const score_t* hessians) override;

Guolin Ke's avatar
Guolin Ke committed
55
56
57
58
  void SetBaggingData(const data_size_t* used_indices, data_size_t num_data) override {
    data_partition_->SetUsedDataIndices(used_indices, num_data);
  }

Guolin Ke's avatar
Guolin Ke committed
59
60
61
  void AddPredictionToScore(const Tree* tree, double* out_score) const override {
    if (tree->num_leaves() <= 1) { return; }
    CHECK(tree->num_leaves() <= data_partition_->num_leaves());
Guolin Ke's avatar
Guolin Ke committed
62
    #pragma omp parallel for schedule(static)
63
    for (int i = 0; i < tree->num_leaves(); ++i) {
Guolin Ke's avatar
Guolin Ke committed
64
      double output = static_cast<double>(tree->LeafOutput(i));
Guolin Ke's avatar
Guolin Ke committed
65
66
      data_size_t cnt_leaf_data = 0;
      auto tmp_idx = data_partition_->GetIndexOnLeaf(i, &cnt_leaf_data);
Guolin Ke's avatar
Guolin Ke committed
67
      for (data_size_t j = 0; j < cnt_leaf_data; ++j) {
68
        out_score[tmp_idx[j]] += output;
Guolin Ke's avatar
Guolin Ke committed
69
70
71
72
      }
    }
  }

73
74
75
  void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, const double* prediction,
                       data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override;

Guolin Ke's avatar
Guolin Ke committed
76
77
78
  void RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, double prediction,
                       data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const override;

Nikita Titov's avatar
Nikita Titov committed
79
 protected:
Guolin Ke's avatar
Guolin Ke committed
80
81
82
83
84
85
86
87
  /*!
  * \brief Some initial works before training
  */
  virtual void BeforeTrain();

  /*!
  * \brief Some initial works before FindBestSplit
  */
Guolin Ke's avatar
Guolin Ke committed
88
  virtual bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf);
Guolin Ke's avatar
Guolin Ke committed
89

Guolin Ke's avatar
Guolin Ke committed
90
  virtual void FindBestSplits();
Guolin Ke's avatar
Guolin Ke committed
91

Guolin Ke's avatar
Guolin Ke committed
92
  virtual void ConstructHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract);
Guolin Ke's avatar
Guolin Ke committed
93

Guolin Ke's avatar
Guolin Ke committed
94
  virtual void FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract);
Guolin Ke's avatar
Guolin Ke committed
95
96
97
98
99
100
101
102
103
104

  /*!
  * \brief Partition tree and data according best split.
  * \param tree Current tree, will be splitted on this function.
  * \param best_leaf The index of leaf that will be splitted.
  * \param left_leaf The index of left leaf after splitted.
  * \param right_leaf The index of right leaf after splitted.
  */
  virtual void Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf);

105
106
  /* Force splits with forced_split_json dict and then return num splits forced.*/
  virtual int32_t ForceSplits(Tree* tree, Json& forced_split_json, int* left_leaf,
107
                              int* right_leaf, int* cur_depth,
108
109
                              bool *aborted_last_force_split);

Guolin Ke's avatar
Guolin Ke committed
110
111
112
113
114
115
  /*!
  * \brief Get the number of data in a leaf
  * \param leaf_idx The index of leaf
  * \return The number of data in the leaf_idx leaf
  */
  inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const;
116
117
118

  double CalculateOndemandCosts(int feature_index, int leaf_index);

Guolin Ke's avatar
Guolin Ke committed
119
120
121
122
123
124
125
  /*! \brief number of data */
  data_size_t num_data_;
  /*! \brief number of features */
  int num_features_;
  /*! \brief training data */
  const Dataset* train_data_;
  /*! \brief gradients of current iteration */
126
  const score_t* gradients_;
Guolin Ke's avatar
Guolin Ke committed
127
  /*! \brief hessians of current iteration */
128
  const score_t* hessians_;
Guolin Ke's avatar
Guolin Ke committed
129
  /*! \brief training data partition on leaves */
Guolin Ke's avatar
Guolin Ke committed
130
  std::unique_ptr<DataPartition> data_partition_;
Guolin Ke's avatar
Guolin Ke committed
131
132
  /*! \brief used for generate used features */
  Random random_;
Hui Xue's avatar
Hui Xue committed
133
  /*! \brief used for sub feature training, is_feature_used_[i] = false means don't used feature i */
Guolin Ke's avatar
Guolin Ke committed
134
  std::vector<int8_t> is_feature_used_;
135
136
  /*! \brief pointer to histograms array of parent of current leaves */
  FeatureHistogram* parent_leaf_histogram_array_;
Guolin Ke's avatar
Guolin Ke committed
137
138
139
140
141
142
143
  /*! \brief pointer to histograms array of smaller leaf */
  FeatureHistogram* smaller_leaf_histogram_array_;
  /*! \brief pointer to histograms array of larger leaf */
  FeatureHistogram* larger_leaf_histogram_array_;

  /*! \brief store best split points for all leaves */
  std::vector<SplitInfo> best_split_per_leaf_;
144
145
  /*! \brief store best split per feature for all leaves */
  std::vector<SplitInfo> splits_per_leaf_;
Guolin Ke's avatar
Guolin Ke committed
146
147

  /*! \brief stores best thresholds for all feature for smaller leaf */
Guolin Ke's avatar
Guolin Ke committed
148
  std::unique_ptr<LeafSplits> smaller_leaf_splits_;
Guolin Ke's avatar
Guolin Ke committed
149
  /*! \brief stores best thresholds for all feature for larger leaf */
Guolin Ke's avatar
Guolin Ke committed
150
  std::unique_ptr<LeafSplits> larger_leaf_splits_;
151
  std::vector<int> valid_feature_indices_;
Guolin Ke's avatar
Guolin Ke committed
152

153
154
#ifdef USE_GPU
  /*! \brief gradients of current iteration, ordered for cache optimized, aligned to 4K page */
155
  std::vector<score_t, boost::alignment::aligned_allocator<score_t, 4096>> ordered_gradients_;
156
  /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */
157
  std::vector<score_t, boost::alignment::aligned_allocator<score_t, 4096>> ordered_hessians_;
158
#else
Guolin Ke's avatar
Guolin Ke committed
159
  /*! \brief gradients of current iteration, ordered for cache optimized */
160
  std::vector<score_t> ordered_gradients_;
Guolin Ke's avatar
Guolin Ke committed
161
  /*! \brief hessians of current iteration, ordered for cache optimized */
162
  std::vector<score_t> ordered_hessians_;
163
#endif
Guolin Ke's avatar
Guolin Ke committed
164
165

  /*! \brief Store ordered bin */
Guolin Ke's avatar
Guolin Ke committed
166
  std::vector<std::unique_ptr<OrderedBin>> ordered_bins_;
Guolin Ke's avatar
Guolin Ke committed
167
168
169
  /*! \brief True if has ordered bin */
  bool has_ordered_bin_ = false;
  /*! \brief  is_data_in_leaf_[i] != 0 means i-th data is marked */
Guolin Ke's avatar
Guolin Ke committed
170
  std::vector<char> is_data_in_leaf_;
171
  /*! \brief used to cache historical histogram to speed up*/
Guolin Ke's avatar
Guolin Ke committed
172
  HistogramPool histogram_pool_;
Guolin Ke's avatar
Guolin Ke committed
173
  /*! \brief config of tree learner*/
Guolin Ke's avatar
Guolin Ke committed
174
  const Config* config_;
Guolin Ke's avatar
Guolin Ke committed
175
  int num_threads_;
Guolin Ke's avatar
Guolin Ke committed
176
  std::vector<int> ordered_bin_indices_;
177
  bool is_constant_hessian_;
178
179
180

  std::vector<bool> feature_used;
  std::vector<uint32_t> feature_used_in_data;
Guolin Ke's avatar
Guolin Ke committed
181
182
};

Luke Gallagher's avatar
Luke Gallagher committed
183
184
185
/*!
* \brief Get the number of data in a leaf
* \param leaf_idx The index of leaf
* \return The leaf_count of the data partition for leaf_idx when leaf_idx >= 0, otherwise 0
*/
inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const {
  // a negative index is answered with 0 and never reaches the data partition
  if (leaf_idx < 0) {
    return 0;
  }
  return data_partition_->leaf_count(leaf_idx);
}

}  // namespace LightGBM
Guolin Ke's avatar
Guolin Ke committed
192
#endif   // LIGHTGBM_TREELEARNER_SERIAL_TREE_LEARNER_H_