data_parallel_tree_learner.cpp 11.7 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
8
#include <cstring>
#include <tuple>
#include <vector>

9
10
#include "parallel_tree_learner.h"

Guolin Ke's avatar
Guolin Ke committed
11
12
namespace LightGBM {

13
template <typename TREELEARNER_T>
Guolin Ke's avatar
Guolin Ke committed
14
15
DataParallelTreeLearner<TREELEARNER_T>::DataParallelTreeLearner(const Config* config)
  :TREELEARNER_T(config) {
Guolin Ke's avatar
Guolin Ke committed
16
17
}

18
19
/*! \brief Destructor; the body is empty, so cleanup is whatever the members and the base class do themselves. */
template <typename TREELEARNER_T>
DataParallelTreeLearner<TREELEARNER_T>::~DataParallelTreeLearner() {}

22
23
/*!
 * \brief Initialize the base learner and size every communication-related container.
 * \param train_data Training data, forwarded to the base learner
 * \param is_constant_hessian Forwarded unchanged to the base learner
 */
template <typename TREELEARNER_T>
void DataParallelTreeLearner<TREELEARNER_T>::Init(const Dataset* train_data, bool is_constant_hessian) {
  // everything that is not communication related is set up by the base learner
  TREELEARNER_T::Init(train_data, is_constant_hessian);

  // position of this process in the cluster and the cluster size
  rank_ = Network::rank();
  num_machines_ = Network::num_machines();

  // send/receive buffers: room for one HistogramBinEntry per bin of the dataset
  // NOTE(review): BeforeTrain() and SyncUpGlobalBestSplit() reuse these buffers for
  // other payloads -- confirm this size is always large enough for them.
  const size_t comm_bytes = this->train_data_->NumTotalBin() * sizeof(HistogramBinEntry);
  input_buffer_.resize(comm_bytes);
  output_buffer_.resize(comm_bytes);

  // per-machine block layout used by the reduce-scatter
  block_start_.resize(num_machines_);
  block_len_.resize(num_machines_);

  // per-feature bookkeeping
  is_feature_aggregated_.resize(this->num_features_);
  buffer_write_start_pos_.resize(this->num_features_);
  buffer_read_start_pos_.resize(this->num_features_);

  // one global data count per leaf
  global_data_count_in_leaf_.resize(this->config_->num_leaves);
}

45
template <typename TREELEARNER_T>
Guolin Ke's avatar
Guolin Ke committed
46
47
48
void DataParallelTreeLearner<TREELEARNER_T>::ResetConfig(const Config* config) {
  TREELEARNER_T::ResetConfig(config);
  global_data_count_in_leaf_.resize(this->config_->num_leaves);
Guolin Ke's avatar
Guolin Ke committed
49
}
Guolin Ke's avatar
Guolin Ke committed
50

51
52
53
/*!
 * \brief Per-tree preparation for data-parallel training.
 *
 * After the base learner's BeforeTrain() this
 *  1. distributes the used features over the machines (each feature goes to the
 *     machine ArgMin picks from the running per-machine bin totals),
 *  2. derives the reduce-scatter block layout and the per-feature read/write
 *     offsets into the communication buffers, and
 *  3. sums the root leaf's data count, gradient sum and hessian sum over all
 *     machines and stores the global values.
 */
template <typename TREELEARNER_T>
void DataParallelTreeLearner<TREELEARNER_T>::BeforeTrain() {
  TREELEARNER_T::BeforeTrain();

  // Number of bins of a feature that take part in the exchange: the feature's
  // bin count, minus one when its default bin is bin 0.
  // (presumably that bin is rebuilt later by FixHistogram -- TODO confirm)
  const auto synced_num_bin = [this](int inner_feature_index) {
    auto num_bin = this->train_data_->FeatureNumBin(inner_feature_index);
    if (this->train_data_->FeatureBinMapper(inner_feature_index)->GetDefaultBin() == 0) {
      num_bin -= 1;
    }
    return num_bin;
  };

  // generate feature partition for current tree
  std::vector<std::vector<int>> feature_distribution(num_machines_, std::vector<int>());
  std::vector<int> num_bins_distributed(num_machines_, 0);
  for (int i = 0; i < this->train_data_->num_total_features(); ++i) {
    const int inner_feature_index = this->train_data_->InnerFeatureIndex(i);
    if (inner_feature_index == -1) { continue; }
    if (this->is_feature_used_[inner_feature_index]) {
      const int cur_min_machine = static_cast<int>(ArrayArgs<int>::ArgMin(num_bins_distributed));
      feature_distribution[cur_min_machine].push_back(inner_feature_index);
      num_bins_distributed[cur_min_machine] += synced_num_bin(inner_feature_index);
    }
    // reset; the features owned by this rank are switched on again below
    is_feature_aggregated_[inner_feature_index] = false;
  }
  // get local used feature
  for (auto fid : feature_distribution[rank_]) {
    is_feature_aggregated_[fid] = true;
  }

  // get block start and block len for reduce scatter
  reduce_scatter_size_ = 0;
  for (int i = 0; i < num_machines_; ++i) {
    block_len_[i] = 0;
    for (auto fid : feature_distribution[i]) {
      block_len_[i] += synced_num_bin(fid) * sizeof(HistogramBinEntry);
    }
    reduce_scatter_size_ += block_len_[i];
  }

  block_start_[0] = 0;
  for (int i = 1; i < num_machines_; ++i) {
    block_start_[i] = block_start_[i - 1] + block_len_[i - 1];
  }

  // get buffer_write_start_pos_: features are laid out machine by machine
  int bin_size = 0;
  for (int i = 0; i < num_machines_; ++i) {
    for (auto fid : feature_distribution[i]) {
      buffer_write_start_pos_[fid] = bin_size;
      bin_size += synced_num_bin(fid) * sizeof(HistogramBinEntry);
    }
  }

  // get buffer_read_start_pos_: only the features owned by this rank
  bin_size = 0;
  for (auto fid : feature_distribution[rank_]) {
    buffer_read_start_pos_[fid] = bin_size;
    bin_size += synced_num_bin(fid) * sizeof(HistogramBinEntry);
  }

  // sync global data sumup info
  // A plain struct is used as the wire record: it is trivially copyable, so the
  // memcpy calls below are well defined, and its layout does not depend on how
  // a particular standard library implements std::tuple.
  struct LeafSumup {
    data_size_t num_data;
    double sum_gradients;
    double sum_hessians;
  };
  LeafSumup sumup = LeafSumup();
  sumup.num_data = this->smaller_leaf_splits_->num_data_in_leaf();
  sumup.sum_gradients = this->smaller_leaf_splits_->sum_gradients();
  sumup.sum_hessians = this->smaller_leaf_splits_->sum_hessians();

  const int size = static_cast<int>(sizeof(LeafSumup));
  std::memcpy(input_buffer_.data(), &sumup, size);
  // global sumup reduce
  Network::Allreduce(input_buffer_.data(), size, sizeof(LeafSumup), output_buffer_.data(), [](const char *src, char *dst, int type_size, comm_size_t len) {
    comm_size_t used_size = 0;
    while (used_size < len) {
      // go through local copies: the raw buffers carry no alignment guarantee
      LeafSumup incoming;
      LeafSumup total;
      std::memcpy(&incoming, src, sizeof(LeafSumup));
      std::memcpy(&total, dst, sizeof(LeafSumup));
      total.num_data = total.num_data + incoming.num_data;
      total.sum_gradients = total.sum_gradients + incoming.sum_gradients;
      total.sum_hessians = total.sum_hessians + incoming.sum_hessians;
      std::memcpy(dst, &total, sizeof(LeafSumup));
      src += type_size;
      dst += type_size;
      used_size += type_size;
    }
  });
  // copy back
  std::memcpy(&sumup, output_buffer_.data(), size);
  // set global sumup info
  this->smaller_leaf_splits_->Init(sumup.sum_gradients, sumup.sum_hessians);
  // init global data count in leaf
  global_data_count_in_leaf_[0] = sumup.num_data;
}

148
template <typename TREELEARNER_T>
Guolin Ke's avatar
Guolin Ke committed
149
150
void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplits() {
  TREELEARNER_T::ConstructHistograms(this->is_feature_used_, true);
Guolin Ke's avatar
Guolin Ke committed
151
  // construct local histograms
152
  #pragma omp parallel for schedule(static)
153
154
  for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
    if ((!this->is_feature_used_.empty() && this->is_feature_used_[feature_index] == false)) continue;
Guolin Ke's avatar
Guolin Ke committed
155
    // copy to buffer
Guolin Ke's avatar
Guolin Ke committed
156
    std::memcpy(input_buffer_.data() + buffer_write_start_pos_[feature_index],
157
158
                this->smaller_leaf_histogram_array_[feature_index].RawData(),
                this->smaller_leaf_histogram_array_[feature_index].SizeOfHistgram());
Guolin Ke's avatar
Guolin Ke committed
159
160
  }
  // Reduce scatter for histogram
Guolin Ke's avatar
Guolin Ke committed
161
162
  Network::ReduceScatter(input_buffer_.data(), reduce_scatter_size_, sizeof(HistogramBinEntry), block_start_.data(),
                         block_len_.data(), output_buffer_.data(), static_cast<comm_size_t>(output_buffer_.size()), &HistogramBinEntry::SumReducer);
Guolin Ke's avatar
Guolin Ke committed
163
164
165
166
167
168
169
  this->FindBestSplitsFromHistograms(this->is_feature_used_, true);
}

template <typename TREELEARNER_T>
void DataParallelTreeLearner<TREELEARNER_T>::FindBestSplitsFromHistograms(const std::vector<int8_t>&, bool) {
  std::vector<SplitInfo> smaller_bests_per_thread(this->num_threads_, SplitInfo());
  std::vector<SplitInfo> larger_bests_per_thread(this->num_threads_, SplitInfo());
170
171
172
173
174
175
  std::vector<int8_t> smaller_node_used_features(this->num_features_, 1);
  std::vector<int8_t> larger_node_used_features(this->num_features_, 1);
  if (this->config_->feature_fraction_bynode) {
    smaller_node_used_features = this->GetUsedFeatures();
    larger_node_used_features = this->GetUsedFeatures();
  }
176
177
  OMP_INIT_EX();
  #pragma omp parallel for schedule(static)
178
  for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) {
179
    OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
180
    if (!is_feature_aggregated_[feature_index]) continue;
Guolin Ke's avatar
Guolin Ke committed
181
    const int tid = omp_get_thread_num();
Guolin Ke's avatar
Guolin Ke committed
182
    const int real_feature_index = this->train_data_->RealFeatureIndex(feature_index);
Guolin Ke's avatar
Guolin Ke committed
183
    // restore global histograms from buffer
184
    this->smaller_leaf_histogram_array_[feature_index].FromMemory(
Guolin Ke's avatar
Guolin Ke committed
185
      output_buffer_.data() + buffer_read_start_pos_[feature_index]);
Guolin Ke's avatar
Guolin Ke committed
186

187
    this->train_data_->FixHistogram(feature_index,
Guolin Ke's avatar
Guolin Ke committed
188
189
190
                                    this->smaller_leaf_splits_->sum_gradients(), this->smaller_leaf_splits_->sum_hessians(),
                                    GetGlobalDataCountInLeaf(this->smaller_leaf_splits_->LeafIndex()),
                                    this->smaller_leaf_histogram_array_[feature_index].RawData());
Guolin Ke's avatar
Guolin Ke committed
191
    SplitInfo smaller_split;
Guolin Ke's avatar
Guolin Ke committed
192
    // find best threshold for smaller child
193
194
195
196
    this->smaller_leaf_histogram_array_[feature_index].FindBestThreshold(
      this->smaller_leaf_splits_->sum_gradients(),
      this->smaller_leaf_splits_->sum_hessians(),
      GetGlobalDataCountInLeaf(this->smaller_leaf_splits_->LeafIndex()),
Guolin Ke's avatar
Guolin Ke committed
197
198
      this->smaller_leaf_splits_->min_constraint(),
      this->smaller_leaf_splits_->max_constraint(),
Guolin Ke's avatar
Guolin Ke committed
199
      &smaller_split);
Guolin Ke's avatar
Guolin Ke committed
200
    smaller_split.feature = real_feature_index;
201
    if (smaller_split > smaller_bests_per_thread[tid] && smaller_node_used_features[feature_index]) {
Guolin Ke's avatar
Guolin Ke committed
202
      smaller_bests_per_thread[tid] = smaller_split;
Guolin Ke's avatar
Guolin Ke committed
203
    }
Guolin Ke's avatar
Guolin Ke committed
204
205

    // only root leaf
206
    if (this->larger_leaf_splits_ == nullptr || this->larger_leaf_splits_->LeafIndex() < 0) continue;
Guolin Ke's avatar
Guolin Ke committed
207
208

    // construct histgroms for large leaf, we init larger leaf as the parent, so we can just subtract the smaller leaf's histograms
209
210
    this->larger_leaf_histogram_array_[feature_index].Subtract(
      this->smaller_leaf_histogram_array_[feature_index]);
Guolin Ke's avatar
Guolin Ke committed
211
    SplitInfo larger_split;
Guolin Ke's avatar
Guolin Ke committed
212
    // find best threshold for larger child
213
214
215
216
    this->larger_leaf_histogram_array_[feature_index].FindBestThreshold(
      this->larger_leaf_splits_->sum_gradients(),
      this->larger_leaf_splits_->sum_hessians(),
      GetGlobalDataCountInLeaf(this->larger_leaf_splits_->LeafIndex()),
Guolin Ke's avatar
Guolin Ke committed
217
218
      this->larger_leaf_splits_->min_constraint(),
      this->larger_leaf_splits_->max_constraint(),
Guolin Ke's avatar
Guolin Ke committed
219
      &larger_split);
Guolin Ke's avatar
Guolin Ke committed
220
    larger_split.feature = real_feature_index;
221
    if (larger_split > larger_bests_per_thread[tid] && larger_node_used_features[feature_index]) {
Guolin Ke's avatar
Guolin Ke committed
222
      larger_bests_per_thread[tid] = larger_split;
Guolin Ke's avatar
Guolin Ke committed
223
    }
224
    OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
225
  }
226
  OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
227

Guolin Ke's avatar
Guolin Ke committed
228
229
230
  auto smaller_best_idx = ArrayArgs<SplitInfo>::ArgMax(smaller_bests_per_thread);
  int leaf = this->smaller_leaf_splits_->LeafIndex();
  this->best_split_per_leaf_[leaf] = smaller_bests_per_thread[smaller_best_idx];
Guolin Ke's avatar
Guolin Ke committed
231

Guolin Ke's avatar
Guolin Ke committed
232
233
234
235
236
  if (this->larger_leaf_splits_ != nullptr &&  this->larger_leaf_splits_->LeafIndex() >= 0) {
    leaf = this->larger_leaf_splits_->LeafIndex();
    auto larger_best_idx = ArrayArgs<SplitInfo>::ArgMax(larger_bests_per_thread);
    this->best_split_per_leaf_[leaf] = larger_bests_per_thread[larger_best_idx];
  }
Guolin Ke's avatar
Guolin Ke committed
237

Guolin Ke's avatar
Guolin Ke committed
238
239
  SplitInfo smaller_best_split, larger_best_split;
  smaller_best_split = this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()];
Guolin Ke's avatar
Guolin Ke committed
240
  // find local best split for larger leaf
241
  if (this->larger_leaf_splits_->LeafIndex() >= 0) {
Guolin Ke's avatar
Guolin Ke committed
242
    larger_best_split = this->best_split_per_leaf_[this->larger_leaf_splits_->LeafIndex()];
Guolin Ke's avatar
Guolin Ke committed
243
244
245
  }

  // sync global best info
Guolin Ke's avatar
Guolin Ke committed
246
  SyncUpGlobalBestSplit(input_buffer_.data(), input_buffer_.data(), &smaller_best_split, &larger_best_split, this->config_->max_cat_threshold);
Guolin Ke's avatar
Guolin Ke committed
247
248

  // set best split
Guolin Ke's avatar
Guolin Ke committed
249
  this->best_split_per_leaf_[this->smaller_leaf_splits_->LeafIndex()] = smaller_best_split;
250
  if (this->larger_leaf_splits_->LeafIndex() >= 0) {
Guolin Ke's avatar
Guolin Ke committed
251
    this->best_split_per_leaf_[this->larger_leaf_splits_->LeafIndex()] = larger_best_split;
Guolin Ke's avatar
Guolin Ke committed
252
253
254
  }
}

255
256
257
258
/*!
 * \brief Split a leaf through the base learner, then record the global data
 *        counts of the two resulting leaves.
 * \param tree Tree being grown, forwarded to the base learner
 * \param best_Leaf Index of the leaf to split; also selects the split info that is read
 * \param left_leaf Forwarded to the base learner; the index it holds afterwards receives the left count
 * \param right_leaf Forwarded to the base learner; the index it holds afterwards receives the right count
 */
template <typename TREELEARNER_T>
void DataParallelTreeLearner<TREELEARNER_T>::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) {
  // the base learner performs the split before the counts are read
  TREELEARNER_T::Split(tree, best_Leaf, left_leaf, right_leaf);

  // need update global number of data in leaf
  const SplitInfo& applied_split = this->best_split_per_leaf_[best_Leaf];
  const int left_idx = *left_leaf;
  const int right_idx = *right_leaf;
  global_data_count_in_leaf_[left_idx] = applied_split.left_count;
  global_data_count_in_leaf_[right_idx] = applied_split.right_count;
}

264
265
266
// Explicit instantiations: the member templates above are defined in this
// translation unit, so every TREELEARNER_T used elsewhere must be instantiated
// here, otherwise the linker cannot find the code.
template class DataParallelTreeLearner<GPUTreeLearner>;
template class DataParallelTreeLearner<SerialTreeLearner>;
Guolin Ke's avatar
Guolin Ke committed
267
268

}  // namespace LightGBM