/*! \file parallel_tree_learner.h
*  \brief Parallel tree learner interfaces: feature-, data-, and voting-parallel algorithms.
*/
#ifndef LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_
#define LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_

#include <LightGBM/utils/array_args.h>

#include <LightGBM/network.h>
#include "serial_tree_learner.h"
#include "gpu_tree_learner.h"

#include <cstring>
#include <memory>
#include <vector>

namespace LightGBM {

/*!
* \brief Feature parallel learning algorithm.
*        Different machine will find best split on different features, then sync global best split.
*        It is recommended when #data is small or #feature is large.
*/
template <typename TREELEARNER_T>
class FeatureParallelTreeLearner: public TREELEARNER_T {
public:
  explicit FeatureParallelTreeLearner(const TreeConfig* tree_config);
  ~FeatureParallelTreeLearner();
  /*! \brief Initialize with training data; sizes the network buffers and queries rank / machine count */
  void Init(const Dataset* train_data, bool is_constant_hessian) override;

protected:
  void BeforeTrain() override;
  /*! \brief Find local best splits from histograms, then sync up the global best split over the network */
  void FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) override;

private:
  /*! \brief rank of local machine */
  int rank_;
  /*! \brief Number of machines of this parallel task */
  int num_machines_;
  /*! \brief Buffer for network send */
  std::vector<char> input_buffer_;
  /*! \brief Buffer for network receive */
  std::vector<char> output_buffer_;
};

/*!
* \brief Data parallel learning algorithm.
*        Workers use local data to construct histograms locally, then sync up global histograms.
*        It is recommended when #data is large or #feature is small.
*/
template <typename TREELEARNER_T>
class DataParallelTreeLearner: public TREELEARNER_T {
public:
  explicit DataParallelTreeLearner(const TreeConfig* tree_config);
  ~DataParallelTreeLearner();
  /*! \brief Initialize with training data; sizes buffers and the reduce-scatter bookkeeping */
  void Init(const Dataset* train_data, bool is_constant_hessian) override;
  void ResetConfig(const TreeConfig* tree_config) override;

protected:
  void BeforeTrain() override;
  void FindBestSplits() override;
  /*! \brief Find best splits from the globally aggregated histograms for locally-owned features */
  void FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) override;
  /*! \brief Perform the split and update the global per-leaf data counts */
  void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;

  /*! \brief Global (all-machine) data count in a leaf; returns 0 for an invalid leaf index */
  inline data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const override {
    if (leaf_idx >= 0) {
      return global_data_count_in_leaf_[leaf_idx];
    } else {
      return 0;
    }
  }

private:
  /*! \brief Rank of local machine */
  int rank_;
  /*! \brief Number of machines of this parallel task */
  int num_machines_;
  /*! \brief Buffer for network send */
  std::vector<char> input_buffer_;
  /*! \brief Buffer for network receive */
  std::vector<char> output_buffer_;
  /*! \brief different machines will aggregate histograms for different features,
       use this to mark local aggregate features*/
  std::vector<bool> is_feature_aggregated_;
  /*! \brief Block start index for reduce scatter */
  std::vector<int> block_start_;
  /*! \brief Block size for reduce scatter */
  std::vector<int> block_len_;
  /*! \brief Write positions for feature histograms */
  std::vector<int> buffer_write_start_pos_;
  /*! \brief Read positions for local feature histograms */
  std::vector<int> buffer_read_start_pos_;
  /*! \brief Size for reduce scatter */
  int reduce_scatter_size_;
  /*! \brief Store global number of data in leaves  */
  std::vector<data_size_t> global_data_count_in_leaf_;
};

/*!
* \brief Voting based data parallel learning algorithm.
* Like data parallel, but not aggregate histograms for all features.
* Here using voting to reduce features, and only aggregate histograms for selected features.
* When #data is large and #feature is large, you can use this to have better speed-up
*/
template <typename TREELEARNER_T>
class VotingParallelTreeLearner: public TREELEARNER_T {
public:
  explicit VotingParallelTreeLearner(const TreeConfig* tree_config);
  ~VotingParallelTreeLearner() { }
  /*! \brief Initialize with training data; sets up global leaf splits / histogram storage */
  void Init(const Dataset* train_data, bool is_constant_hessian) override;
  void ResetConfig(const TreeConfig* tree_config) override;

protected:
  void BeforeTrain() override;
  bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override;
  void FindBestSplits() override;
  /*! \brief Find best splits from globally aggregated histograms of the voted (selected) features */
  void FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) override;
  /*! \brief Perform the split and update global per-leaf counts and global leaf splits */
  void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override;

  /*! \brief Global (all-machine) data count in a leaf; returns 0 for an invalid leaf index */
  inline data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const override {
    if (leaf_idx >= 0) {
      return global_data_count_in_leaf_[leaf_idx];
    } else {
      return 0;
    }
  }
  /*!
  * \brief Perform global voting
  * \param leaf_idx index of leaf
  * \param splits All splits from local voting
  * \param out Result of global voting, only store feature indices
  */
  void GlobalVoting(int leaf_idx, const std::vector<LightSplitInfo>& splits,
    std::vector<int>* out);
  /*!
  * \brief Copy local histogram to buffer
  * \param smaller_top_features Selected features for smaller leaf
  * \param larger_top_features Selected features for larger leaf
  */
  void CopyLocalHistogram(const std::vector<int>& smaller_top_features,
    const std::vector<int>& larger_top_features);

private:
  /*! \brief Tree config used in local mode */
  TreeConfig local_tree_config_;
  /*! \brief Voting size */
  int top_k_;
  /*! \brief Rank of local machine*/
  int rank_;
  /*! \brief Number of machines */
  int num_machines_;
  /*! \brief Buffer for network send */
  std::vector<char> input_buffer_;
  /*! \brief Buffer for network receive */
  std::vector<char> output_buffer_;
  /*! \brief different machines will aggregate histograms for different features,
  use this to mark local aggregate features*/
  std::vector<bool> smaller_is_feature_aggregated_;
  /*! \brief different machines will aggregate histograms for different features,
  use this to mark local aggregate features*/
  std::vector<bool> larger_is_feature_aggregated_;
  /*! \brief Block start index for reduce scatter */
  std::vector<int> block_start_;
  /*! \brief Block size for reduce scatter */
  std::vector<int> block_len_;
  /*! \brief Read positions for feature histograms at smaller leaf */
  std::vector<int> smaller_buffer_read_start_pos_;
  /*! \brief Read positions for feature histograms at larger leaf */
  std::vector<int> larger_buffer_read_start_pos_;
  /*! \brief Size for reduce scatter */
  int reduce_scatter_size_;
  /*! \brief Store global number of data in leaves  */
  std::vector<data_size_t> global_data_count_in_leaf_;
  /*! \brief Store global split information for smaller leaf  */
  std::unique_ptr<LeafSplits> smaller_leaf_splits_global_;
  /*! \brief Store global split information for larger leaf  */
  std::unique_ptr<LeafSplits> larger_leaf_splits_global_;
  /*! \brief Store global histogram for smaller leaf  */
  std::unique_ptr<FeatureHistogram[]> smaller_leaf_histogram_array_global_;
  /*! \brief Store global histogram for larger leaf  */
  std::unique_ptr<FeatureHistogram[]> larger_leaf_histogram_array_global_;
  /*! \brief Backing storage for the global smaller-leaf histogram */
  std::vector<HistogramBinEntry> smaller_leaf_histogram_data_;
  /*! \brief Backing storage for the global larger-leaf histogram */
  std::vector<HistogramBinEntry> larger_leaf_histogram_data_;
  /*! \brief Feature metadata used by the global histograms */
  std::vector<FeatureMetainfo> feature_metas_;
};
Guolin Ke's avatar
Guolin Ke committed
182

183
// To-do: reduce the communication cost by using bitset to communicate.
184
inline void SyncUpGlobalBestSplit(char* input_buffer_, char* output_buffer_, SplitInfo* smaller_best_split, SplitInfo* larger_best_split, int max_cat_threshold) {
185
  // sync global best info
186
  int size = SplitInfo::Size(max_cat_threshold);
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
  smaller_best_split->CopyTo(input_buffer_);
  larger_best_split->CopyTo(input_buffer_ + size);
  Network::Allreduce(input_buffer_, size * 2, size, output_buffer_, 
                     [&size] (const char* src, char* dst, int len) {
    int used_size = 0;
    LightSplitInfo p1, p2;
    while (used_size < len) {
      p1.CopyFrom(src);
      p2.CopyFrom(dst);
      if (p1 > p2) {
        std::memcpy(dst, src, size);
      }
      src += size;
      dst += size;
      used_size += size;
    }
  });
  // copy back
  smaller_best_split->CopyFrom(output_buffer_);
  larger_best_split->CopyFrom(output_buffer_ + size);
}

}  // namespace LightGBM
#endif   // LIGHTGBM_TREELEARNER_PARALLEL_TREE_LEARNER_H_