serial_tree_learner.cpp 28.4 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
#include "serial_tree_learner.h"

7
8
#include <LightGBM/network.h>
#include <LightGBM/objective_function.h>
Guolin Ke's avatar
Guolin Ke committed
9
#include <LightGBM/utils/array_args.h>
10
#include <LightGBM/utils/common.h>
Guolin Ke's avatar
Guolin Ke committed
11

Guolin Ke's avatar
Guolin Ke committed
12
#include <algorithm>
13
#include <queue>
14
15
#include <unordered_map>
#include <utility>
Guolin Ke's avatar
Guolin Ke committed
16

17
18
#include "cost_effective_gradient_boosting.hpp"

Guolin Ke's avatar
Guolin Ke committed
19
20
namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
21
SerialTreeLearner::SerialTreeLearner(const Config* config)
22
    : config_(config), col_sampler_(config) {
Guolin Ke's avatar
Guolin Ke committed
23
24
25
26
27
}

SerialTreeLearner::~SerialTreeLearner() {
}

28
void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) {
Guolin Ke's avatar
Guolin Ke committed
29
30
31
  train_data_ = train_data;
  num_data_ = train_data_->num_data();
  num_features_ = train_data_->num_features();
32
33
  int max_cache_size = 0;
  // Get the max size of pool
Guolin Ke's avatar
Guolin Ke committed
34
35
  if (config_->histogram_pool_size <= 0) {
    max_cache_size = config_->num_leaves;
36
37
38
  } else {
    size_t total_histogram_size = 0;
    for (int i = 0; i < train_data_->num_features(); ++i) {
39
      total_histogram_size += kHistEntrySize * train_data_->FeatureNumBin(i);
40
    }
Guolin Ke's avatar
Guolin Ke committed
41
    max_cache_size = static_cast<int>(config_->histogram_pool_size * 1024 * 1024 / total_histogram_size);
42
43
  }
  // at least need 2 leaves
Guolin Ke's avatar
Guolin Ke committed
44
  max_cache_size = std::max(2, max_cache_size);
Guolin Ke's avatar
Guolin Ke committed
45
  max_cache_size = std::min(max_cache_size, config_->num_leaves);
Guolin Ke's avatar
Guolin Ke committed
46

Guolin Ke's avatar
Guolin Ke committed
47
  // push split information for all leaves
Guolin Ke's avatar
Guolin Ke committed
48
  best_split_per_leaf_.resize(config_->num_leaves);
49
  constraints_.reset(new LeafConstraints<ConstraintEntry>(config_->num_leaves));
Guolin Ke's avatar
Guolin Ke committed
50

wxchan's avatar
wxchan committed
51
  // initialize splits for leaf
Guolin Ke's avatar
Guolin Ke committed
52
53
  smaller_leaf_splits_.reset(new LeafSplits(train_data_->num_data()));
  larger_leaf_splits_.reset(new LeafSplits(train_data_->num_data()));
Guolin Ke's avatar
Guolin Ke committed
54
55

  // initialize data partition
Guolin Ke's avatar
Guolin Ke committed
56
  data_partition_.reset(new DataPartition(num_data_, config_->num_leaves));
57
  col_sampler_.SetTrainingData(train_data_);
Guolin Ke's avatar
Guolin Ke committed
58
  // initialize ordered gradients and hessians
Guolin Ke's avatar
Guolin Ke committed
59
60
  ordered_gradients_.resize(num_data_);
  ordered_hessians_.resize(num_data_);
61

62
63
  GetShareStates(train_data_, is_constant_hessian, true);
  histogram_pool_.DynamicChangeSize(train_data_, share_state_->is_colwise, config_, max_cache_size, config_->num_leaves);
64
  Log::Info("Number of data points in the train set: %d, number of used features: %d", num_data_, num_features_);
65
66
67
  if (CostEfficientGradientBoosting::IsEnable(config_)) {
    cegb_.reset(new CostEfficientGradientBoosting(this));
    cegb_->Init();
68
  }
Guolin Ke's avatar
Guolin Ke committed
69
70
}

71
72
73
void SerialTreeLearner::GetShareStates(const Dataset* dataset,
                                       bool is_constant_hessian,
                                       bool is_first_time) {
74
  if (is_first_time) {
75
    share_state_.reset(dataset->GetShareStates(
76
77
78
        ordered_gradients_.data(), ordered_hessians_.data(),
        col_sampler_.is_feature_used_bytree(), is_constant_hessian,
        config_->force_col_wise, config_->force_row_wise));
79
  } else {
Nikita Titov's avatar
Nikita Titov committed
80
    CHECK_NOTNULL(share_state_);
81
    // cannot change is_hist_col_wise during training
82
    share_state_.reset(dataset->GetShareStates(
83
        ordered_gradients_.data(), ordered_hessians_.data(), col_sampler_.is_feature_used_bytree(),
84
85
        is_constant_hessian, share_state_->is_colwise,
        !share_state_->is_colwise));
86
  }
Nikita Titov's avatar
Nikita Titov committed
87
  CHECK_NOTNULL(share_state_);
88
89
}

90
91
92
void SerialTreeLearner::ResetTrainingDataInner(const Dataset* train_data,
                                               bool is_constant_hessian,
                                               bool reset_multi_val_bin) {
Guolin Ke's avatar
Guolin Ke committed
93
94
  train_data_ = train_data;
  num_data_ = train_data_->num_data();
95
  CHECK_EQ(num_features_, train_data_->num_features());
Guolin Ke's avatar
Guolin Ke committed
96
97
98
99
100
101
102

  // initialize splits for leaf
  smaller_leaf_splits_->ResetNumData(num_data_);
  larger_leaf_splits_->ResetNumData(num_data_);

  // initialize data partition
  data_partition_->ResetNumData(num_data_);
103
  if (reset_multi_val_bin) {
104
    col_sampler_.SetTrainingData(train_data_);
105
106
    GetShareStates(train_data_, is_constant_hessian, false);
  }
107

Guolin Ke's avatar
Guolin Ke committed
108
109
110
  // initialize ordered gradients and hessians
  ordered_gradients_.resize(num_data_);
  ordered_hessians_.resize(num_data_);
111
112
113
  if (cegb_ != nullptr) {
    cegb_->Init();
  }
Guolin Ke's avatar
Guolin Ke committed
114
}
Guolin Ke's avatar
Guolin Ke committed
115

Guolin Ke's avatar
Guolin Ke committed
116
117
118
void SerialTreeLearner::ResetConfig(const Config* config) {
  if (config_->num_leaves != config->num_leaves) {
    config_ = config;
Guolin Ke's avatar
Guolin Ke committed
119
120
    int max_cache_size = 0;
    // Get the max size of pool
Guolin Ke's avatar
Guolin Ke committed
121
122
    if (config->histogram_pool_size <= 0) {
      max_cache_size = config_->num_leaves;
Guolin Ke's avatar
Guolin Ke committed
123
124
125
    } else {
      size_t total_histogram_size = 0;
      for (int i = 0; i < train_data_->num_features(); ++i) {
126
        total_histogram_size += kHistEntrySize * train_data_->FeatureNumBin(i);
Guolin Ke's avatar
Guolin Ke committed
127
      }
Guolin Ke's avatar
Guolin Ke committed
128
      max_cache_size = static_cast<int>(config_->histogram_pool_size * 1024 * 1024 / total_histogram_size);
Guolin Ke's avatar
Guolin Ke committed
129
130
131
    }
    // at least need 2 leaves
    max_cache_size = std::max(2, max_cache_size);
Guolin Ke's avatar
Guolin Ke committed
132
    max_cache_size = std::min(max_cache_size, config_->num_leaves);
133
    histogram_pool_.DynamicChangeSize(train_data_, share_state_->is_colwise, config_, max_cache_size, config_->num_leaves);
Guolin Ke's avatar
Guolin Ke committed
134
135

    // push split information for all leaves
Guolin Ke's avatar
Guolin Ke committed
136
137
    best_split_per_leaf_.resize(config_->num_leaves);
    data_partition_->ResetLeaves(config_->num_leaves);
Guolin Ke's avatar
Guolin Ke committed
138
  } else {
Guolin Ke's avatar
Guolin Ke committed
139
    config_ = config;
Guolin Ke's avatar
Guolin Ke committed
140
  }
141
  col_sampler_.SetConfig(config_);
142
  histogram_pool_.ResetConfig(train_data_, config_);
143
144
145
146
  if (CostEfficientGradientBoosting::IsEnable(config_)) {
    cegb_.reset(new CostEfficientGradientBoosting(this));
    cegb_->Init();
  }
Guolin Ke's avatar
Guolin Ke committed
147
148
}

149
Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians) {
150
  Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer);
Guolin Ke's avatar
Guolin Ke committed
151
152
  gradients_ = gradients;
  hessians_ = hessians;
153
  int num_threads = OMP_NUM_THREADS();
Nikita Titov's avatar
Nikita Titov committed
154
  if (share_state_->num_threads != num_threads && share_state_->num_threads > 0) {
155
    Log::Warning(
Nikita Titov's avatar
Nikita Titov committed
156
157
        "Detected that num_threads changed during training (from %d to %d), "
        "it may cause unexpected errors.",
158
159
160
        share_state_->num_threads, num_threads);
  }
  share_state_->num_threads = num_threads;
Nikita Titov's avatar
Nikita Titov committed
161

Guolin Ke's avatar
Guolin Ke committed
162
163
  // some initial works before training
  BeforeTrain();
Guolin Ke's avatar
Guolin Ke committed
164

Guolin Ke's avatar
Guolin Ke committed
165
  auto tree = std::unique_ptr<Tree>(new Tree(config_->num_leaves));
166
  auto tree_prt = tree.get();
Guolin Ke's avatar
Guolin Ke committed
167
168
  // root leaf
  int left_leaf = 0;
169
  int cur_depth = 1;
Guolin Ke's avatar
Guolin Ke committed
170
171
  // only root leaf can be splitted on first time
  int right_leaf = -1;
172

173
  int init_splits = ForceSplits(tree_prt, &left_leaf, &right_leaf, &cur_depth);
174

Guolin Ke's avatar
Guolin Ke committed
175
  for (int split = init_splits; split < config_->num_leaves - 1; ++split) {
Guolin Ke's avatar
Guolin Ke committed
176
    // some initial works before finding best split
177
    if (BeforeFindBestSplit(tree_prt, left_leaf, right_leaf)) {
Guolin Ke's avatar
Guolin Ke committed
178
      // find best threshold for every feature
Guolin Ke's avatar
Guolin Ke committed
179
      FindBestSplits();
180
    } 
Guolin Ke's avatar
Guolin Ke committed
181
182
183
184
185
186
    // Get a leaf with max split gain
    int best_leaf = static_cast<int>(ArrayArgs<SplitInfo>::ArgMax(best_split_per_leaf_));
    // Get split information for best leaf
    const SplitInfo& best_leaf_SplitInfo = best_split_per_leaf_[best_leaf];
    // cannot split, quit
    if (best_leaf_SplitInfo.gain <= 0.0) {
Guolin Ke's avatar
Guolin Ke committed
187
      Log::Warning("No further splits with positive gain, best gain: %f", best_leaf_SplitInfo.gain);
Guolin Ke's avatar
Guolin Ke committed
188
189
190
      break;
    }
    // split tree with best leaf
191
    Split(tree_prt, best_leaf, &left_leaf, &right_leaf);
192
    cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf));
Guolin Ke's avatar
Guolin Ke committed
193
  }
194
  Log::Debug("Trained a tree with leaves = %d and max_depth = %d", tree->num_leaves(), cur_depth);
Guolin Ke's avatar
Guolin Ke committed
195
  return tree.release();
Guolin Ke's avatar
Guolin Ke committed
196
197
}

198
Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t *hessians) const {
Guolin Ke's avatar
Guolin Ke committed
199
  auto tree = std::unique_ptr<Tree>(new Tree(*old_tree));
Nikita Titov's avatar
Nikita Titov committed
200
  CHECK_GE(data_partition_->num_leaves(), tree->num_leaves());
201
  OMP_INIT_EX();
Guolin Ke's avatar
Guolin Ke committed
202
  #pragma omp parallel for schedule(static)
203
  for (int i = 0; i < tree->num_leaves(); ++i) {
204
    OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
205
206
207
    data_size_t cnt_leaf_data = 0;
    auto tmp_idx = data_partition_->GetIndexOnLeaf(i, &cnt_leaf_data);
    double sum_grad = 0.0f;
208
    double sum_hess = kEpsilon;
Guolin Ke's avatar
Guolin Ke committed
209
210
211
212
213
    for (data_size_t j = 0; j < cnt_leaf_data; ++j) {
      auto idx = tmp_idx[j];
      sum_grad += gradients[idx];
      sum_hess += hessians[idx];
    }
214
215
216
    double output = FeatureHistogram::CalculateSplittedLeafOutput<true, true>(
        sum_grad, sum_hess, config_->lambda_l1, config_->lambda_l2,
        config_->max_delta_step);
Guolin Ke's avatar
Guolin Ke committed
217
218
219
    auto old_leaf_output = tree->LeafOutput(i);
    auto new_leaf_output = output * tree->shrinkage();
    tree->SetLeafOutput(i, config_->refit_decay_rate * old_leaf_output + (1.0 - config_->refit_decay_rate) * new_leaf_output);
220
    OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
221
  }
222
  OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
223
224
225
  return tree.release();
}

226
227
228
229
230
Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const std::vector<int>& leaf_pred, const score_t* gradients, const score_t *hessians) {
  data_partition_->ResetByLeafPred(leaf_pred, old_tree->num_leaves());
  return FitByExistingTree(old_tree, gradients, hessians);
}

Guolin Ke's avatar
Guolin Ke committed
231
void SerialTreeLearner::BeforeTrain() {
232
  Common::FunctionTimer fun_timer("SerialTreeLearner::BeforeTrain", global_timer);
233
234
  // reset histogram pool
  histogram_pool_.ResetMap();
Guolin Ke's avatar
Guolin Ke committed
235

236
237
  col_sampler_.ResetByTree();
  train_data_->InitTrain(col_sampler_.is_feature_used_bytree(), share_state_.get());
Guolin Ke's avatar
Guolin Ke committed
238
239
240
  // initialize data partition
  data_partition_->Init();

241
242
  constraints_->Reset();

Guolin Ke's avatar
Guolin Ke committed
243
  // reset the splits for leaves
Guolin Ke's avatar
Guolin Ke committed
244
  for (int i = 0; i < config_->num_leaves; ++i) {
Guolin Ke's avatar
Guolin Ke committed
245
246
247
248
249
250
251
    best_split_per_leaf_[i].Reset();
  }

  // Sumup for root
  if (data_partition_->leaf_count(0) == num_data_) {
    // use all data
    smaller_leaf_splits_->Init(gradients_, hessians_);
Guolin Ke's avatar
Guolin Ke committed
252

Guolin Ke's avatar
Guolin Ke committed
253
254
  } else {
    // use bagging, only use part of data
Guolin Ke's avatar
Guolin Ke committed
255
    smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_);
Guolin Ke's avatar
Guolin Ke committed
256
257
258
259
260
  }

  larger_leaf_splits_->Init();
}

Guolin Ke's avatar
Guolin Ke committed
261
bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) {
262
  Common::FunctionTimer fun_timer("SerialTreeLearner::BeforeFindBestSplit", global_timer);
Guolin Ke's avatar
Guolin Ke committed
263
  // check depth of current leaf
Guolin Ke's avatar
Guolin Ke committed
264
  if (config_->max_depth > 0) {
Guolin Ke's avatar
Guolin Ke committed
265
    // only need to check left leaf, since right leaf is in same level of left leaf
Guolin Ke's avatar
Guolin Ke committed
266
    if (tree->leaf_depth(left_leaf) >= config_->max_depth) {
Guolin Ke's avatar
Guolin Ke committed
267
268
269
270
271
272
273
      best_split_per_leaf_[left_leaf].gain = kMinScore;
      if (right_leaf >= 0) {
        best_split_per_leaf_[right_leaf].gain = kMinScore;
      }
      return false;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
274
275
276
  data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf);
  data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
  // no enough data to continue
Guolin Ke's avatar
Guolin Ke committed
277
278
  if (num_data_in_right_child < static_cast<data_size_t>(config_->min_data_in_leaf * 2)
      && num_data_in_left_child < static_cast<data_size_t>(config_->min_data_in_leaf * 2)) {
Guolin Ke's avatar
Guolin Ke committed
279
280
281
282
283
284
    best_split_per_leaf_[left_leaf].gain = kMinScore;
    if (right_leaf >= 0) {
      best_split_per_leaf_[right_leaf].gain = kMinScore;
    }
    return false;
  }
285
  parent_leaf_histogram_array_ = nullptr;
Guolin Ke's avatar
Guolin Ke committed
286
287
  // only have root
  if (right_leaf < 0) {
288
    histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
Guolin Ke's avatar
Guolin Ke committed
289
290
    larger_leaf_histogram_array_ = nullptr;
  } else if (num_data_in_left_child < num_data_in_right_child) {
Hui Xue's avatar
Hui Xue committed
291
    // put parent(left) leaf's histograms into larger leaf's histograms
292
293
294
    if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
    histogram_pool_.Move(left_leaf, right_leaf);
    histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
Guolin Ke's avatar
Guolin Ke committed
295
  } else {
Hui Xue's avatar
Hui Xue committed
296
    // put parent(left) leaf's histograms to larger leaf's histograms
297
298
    if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
    histogram_pool_.Get(right_leaf, &smaller_leaf_histogram_array_);
Guolin Ke's avatar
Guolin Ke committed
299
300
301
302
  }
  return true;
}

Guolin Ke's avatar
Guolin Ke committed
303
304
void SerialTreeLearner::FindBestSplits() {
  std::vector<int8_t> is_feature_used(num_features_, 0);
305
  #pragma omp parallel for schedule(static, 256) if (num_features_ >= 512)
Guolin Ke's avatar
Guolin Ke committed
306
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
307
    if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue;
Guolin Ke's avatar
Guolin Ke committed
308
309
310
311
312
313
314
315
316
317
318
319
    if (parent_leaf_histogram_array_ != nullptr
        && !parent_leaf_histogram_array_[feature_index].is_splittable()) {
      smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
      continue;
    }
    is_feature_used[feature_index] = 1;
  }
  bool use_subtract = parent_leaf_histogram_array_ != nullptr;
  ConstructHistograms(is_feature_used, use_subtract);
  FindBestSplitsFromHistograms(is_feature_used, use_subtract);
}

320
321
322
323
void SerialTreeLearner::ConstructHistograms(
    const std::vector<int8_t>& is_feature_used, bool use_subtract) {
  Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms",
                                  global_timer);
Guolin Ke's avatar
Guolin Ke committed
324
  // construct smaller leaf
325
326
  hist_t* ptr_smaller_leaf_hist_data =
      smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
327
328
329
  train_data_->ConstructHistograms(
      is_feature_used, smaller_leaf_splits_->data_indices(),
      smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
330
331
      ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(),
      ptr_smaller_leaf_hist_data);
Guolin Ke's avatar
Guolin Ke committed
332
333
334

  if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
    // construct larger leaf
335
336
    hist_t* ptr_larger_leaf_hist_data =
        larger_leaf_histogram_array_[0].RawData() - kHistOffset;
337
338
339
    train_data_->ConstructHistograms(
        is_feature_used, larger_leaf_splits_->data_indices(),
        larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
340
        ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(),
341
        ptr_larger_leaf_hist_data);
Guolin Ke's avatar
Guolin Ke committed
342
  }
343
344
}

Guolin Ke's avatar
Guolin Ke committed
345
346
347
348
void SerialTreeLearner::FindBestSplitsFromHistograms(
    const std::vector<int8_t>& is_feature_used, bool use_subtract) {
  Common::FunctionTimer fun_timer(
      "SerialTreeLearner::FindBestSplitsFromHistograms", global_timer);
349
350
  std::vector<SplitInfo> smaller_best(share_state_->num_threads);
  std::vector<SplitInfo> larger_best(share_state_->num_threads);
351
352
  std::vector<int8_t> smaller_node_used_features = col_sampler_.GetByNode();
  std::vector<int8_t> larger_node_used_features = col_sampler_.GetByNode();
353
  OMP_INIT_EX();
Guolin Ke's avatar
Guolin Ke committed
354
355
// find splits
#pragma omp parallel for schedule(static)
Guolin Ke's avatar
Guolin Ke committed
356
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
357
    OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
358
359
360
    if (!is_feature_used[feature_index]) {
      continue;
    }
Guolin Ke's avatar
Guolin Ke committed
361
    const int tid = omp_get_thread_num();
Guolin Ke's avatar
Guolin Ke committed
362
363
364
365
    train_data_->FixHistogram(
        feature_index, smaller_leaf_splits_->sum_gradients(),
        smaller_leaf_splits_->sum_hessians(),
        smaller_leaf_histogram_array_[feature_index].RawData());
366
    int real_fidx = train_data_->RealFeatureIndex(feature_index);
367
368
369
370
371
372
373

    ComputeBestSplitForFeature(smaller_leaf_histogram_array_, feature_index,
                               real_fidx,
                               smaller_node_used_features[feature_index],
                               smaller_leaf_splits_->num_data_in_leaf(),
                               smaller_leaf_splits_.get(), &smaller_best[tid]);

Guolin Ke's avatar
Guolin Ke committed
374
    // only has root leaf
Guolin Ke's avatar
Guolin Ke committed
375
376
377
378
    if (larger_leaf_splits_ == nullptr ||
        larger_leaf_splits_->leaf_index() < 0) {
      continue;
    }
Guolin Ke's avatar
Guolin Ke committed
379

Guolin Ke's avatar
Guolin Ke committed
380
    if (use_subtract) {
Guolin Ke's avatar
Guolin Ke committed
381
382
      larger_leaf_histogram_array_[feature_index].Subtract(
          smaller_leaf_histogram_array_[feature_index]);
383
    } else {
Guolin Ke's avatar
Guolin Ke committed
384
385
386
387
      train_data_->FixHistogram(
          feature_index, larger_leaf_splits_->sum_gradients(),
          larger_leaf_splits_->sum_hessians(),
          larger_leaf_histogram_array_[feature_index].RawData());
388
    }
389
390
391
392
393

    ComputeBestSplitForFeature(larger_leaf_histogram_array_, feature_index,
                               real_fidx,
                               larger_node_used_features[feature_index],
                               larger_leaf_splits_->num_data_in_leaf(),
Guolin Ke's avatar
Guolin Ke committed
394
                               larger_leaf_splits_.get(), &larger_best[tid]);
395

396
    OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
397
  }
398
  OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
399
  auto smaller_best_idx = ArrayArgs<SplitInfo>::ArgMax(smaller_best);
400
  int leaf = smaller_leaf_splits_->leaf_index();
Guolin Ke's avatar
Guolin Ke committed
401
402
  best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx];

Guolin Ke's avatar
Guolin Ke committed
403
404
  if (larger_leaf_splits_ != nullptr &&
      larger_leaf_splits_->leaf_index() >= 0) {
405
    leaf = larger_leaf_splits_->leaf_index();
Guolin Ke's avatar
Guolin Ke committed
406
407
408
409
410
    auto larger_best_idx = ArrayArgs<SplitInfo>::ArgMax(larger_best);
    best_split_per_leaf_[leaf] = larger_best[larger_best_idx];
  }
}

411
412
413
414
415
416
int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf,
                                       int* right_leaf, int *cur_depth) {
  bool abort_last_forced_split = false;
  if (forced_split_json_ == nullptr) {
    return 0;
  }
417
418
419
420
  int32_t result_count = 0;
  // start at root leaf
  *left_leaf = 0;
  std::queue<std::pair<Json, int>> q;
421
  Json left = *forced_split_json_;
422
423
424
  Json right;
  bool left_smaller = true;
  std::unordered_map<int, SplitInfo> forceSplitMap;
425
  q.push(std::make_pair(left, *left_leaf));
426
  while (!q.empty()) {
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
    // before processing next node from queue, store info for current left/right leaf
    // store "best split" for left and right, even if they might be overwritten by forced split
    if (BeforeFindBestSplit(tree, *left_leaf, *right_leaf)) {
      FindBestSplits();
    }
    // then, compute own splits
    SplitInfo left_split;
    SplitInfo right_split;

    if (!left.is_null()) {
      const int left_feature = left["feature"].int_value();
      const double left_threshold_double = left["threshold"].number_value();
      const int left_inner_feature_index = train_data_->InnerFeatureIndex(left_feature);
      const uint32_t left_threshold = train_data_->BinThreshold(
              left_inner_feature_index, left_threshold_double);
      auto leaf_histogram_array = (left_smaller) ? smaller_leaf_histogram_array_ : larger_leaf_histogram_array_;
      auto left_leaf_splits = (left_smaller) ? smaller_leaf_splits_.get() : larger_leaf_splits_.get();
      leaf_histogram_array[left_inner_feature_index].GatherInfoForThreshold(
              left_leaf_splits->sum_gradients(),
              left_leaf_splits->sum_hessians(),
              left_threshold,
              left_leaf_splits->num_data_in_leaf(),
              &left_split);
      left_split.feature = left_feature;
      forceSplitMap[*left_leaf] = left_split;
      if (left_split.gain < 0) {
        forceSplitMap.erase(*left_leaf);
      }
    }

    if (!right.is_null()) {
      const int right_feature = right["feature"].int_value();
      const double right_threshold_double = right["threshold"].number_value();
      const int right_inner_feature_index = train_data_->InnerFeatureIndex(right_feature);
      const uint32_t right_threshold = train_data_->BinThreshold(
              right_inner_feature_index, right_threshold_double);
      auto leaf_histogram_array = (left_smaller) ? larger_leaf_histogram_array_ : smaller_leaf_histogram_array_;
      auto right_leaf_splits = (left_smaller) ? larger_leaf_splits_.get() : smaller_leaf_splits_.get();
      leaf_histogram_array[right_inner_feature_index].GatherInfoForThreshold(
        right_leaf_splits->sum_gradients(),
        right_leaf_splits->sum_hessians(),
        right_threshold,
        right_leaf_splits->num_data_in_leaf(),
        &right_split);
      right_split.feature = right_feature;
      forceSplitMap[*right_leaf] = right_split;
      if (right_split.gain < 0) {
        forceSplitMap.erase(*right_leaf);
      }
    }

    std::pair<Json, int> pair = q.front();
    q.pop();
    int current_leaf = pair.second;
    // split info should exist because searching in bfs fashion - should have added from parent
    if (forceSplitMap.find(current_leaf) == forceSplitMap.end()) {
483
        abort_last_forced_split = true;
484
485
        break;
    }
486
487
488
489
    best_split_per_leaf_[current_leaf] = forceSplitMap[current_leaf];
    Split(tree, current_leaf, left_leaf, right_leaf);
    left_smaller = best_split_per_leaf_[current_leaf].left_count <
                   best_split_per_leaf_[current_leaf].right_count;
490
491
492
493
    left = Json();
    right = Json();
    if ((pair.first).object_items().count("left") > 0) {
      left = (pair.first)["left"];
494
495
496
      if (left.object_items().count("feature") > 0 && left.object_items().count("threshold") > 0) {
        q.push(std::make_pair(left, *left_leaf));
      }
497
498
499
    }
    if ((pair.first).object_items().count("right") > 0) {
      right = (pair.first)["right"];
500
501
502
      if (right.object_items().count("feature") > 0 && right.object_items().count("threshold") > 0) {
        q.push(std::make_pair(right, *right_leaf));
      }
503
504
505
506
    }
    result_count++;
    *(cur_depth) = std::max(*(cur_depth), tree->leaf_depth(*left_leaf));
  }
507
508
509
510
511
512
513
514
515
516
517
518
519
  if (abort_last_forced_split) {
    int best_leaf =
        static_cast<int>(ArrayArgs<SplitInfo>::ArgMax(best_split_per_leaf_));
    const SplitInfo& best_leaf_SplitInfo = best_split_per_leaf_[best_leaf];
    if (best_leaf_SplitInfo.gain <= 0.0) {
      Log::Warning("No further splits with positive gain, best gain: %f",
                   best_leaf_SplitInfo.gain);
      return config_->num_leaves;
    }
    Split(tree, best_leaf, left_leaf, right_leaf);
    *(cur_depth) = std::max(*(cur_depth), tree->leaf_depth(*left_leaf));
    result_count++;
  }
520
521
  return result_count;
}
Guolin Ke's avatar
Guolin Ke committed
522

523
524
525
void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf,
                                   int* right_leaf, bool update_cnt) {
  Common::FunctionTimer fun_timer("SerialTreeLearner::SplitInner", global_timer);
526
  SplitInfo& best_split_info = best_split_per_leaf_[best_leaf];
527
528
  const int inner_feature_index =
      train_data_->InnerFeatureIndex(best_split_info.feature);
529
  if (cegb_ != nullptr) {
530
531
    cegb_->UpdateLeafBestSplits(tree, best_leaf, &best_split_info,
                                &best_split_per_leaf_);
532
  }
533
  *left_leaf = best_leaf;
534
535
  auto next_leaf_id = tree->NextLeafId();

536
537
538
  bool is_numerical_split =
      train_data_->FeatureBinMapper(inner_feature_index)->bin_type() ==
      BinType::NumericalBin;
Guolin Ke's avatar
Guolin Ke committed
539
  if (is_numerical_split) {
540
541
    auto threshold_double = train_data_->RealThreshold(
        inner_feature_index, best_split_info.threshold);
542
    data_partition_->Split(best_leaf, train_data_, inner_feature_index,
543
544
545
546
547
548
549
                           &best_split_info.threshold, 1,
                           best_split_info.default_left, next_leaf_id);
    if (update_cnt) {
      // don't need to update this in data-based parallel model
      best_split_info.left_count = data_partition_->leaf_count(*left_leaf);
      best_split_info.right_count = data_partition_->leaf_count(next_leaf_id);
    }
550
    // split tree, will return right leaf
551
552
553
554
555
556
557
558
559
560
561
562
    *right_leaf = tree->Split(
        best_leaf, inner_feature_index, best_split_info.feature,
        best_split_info.threshold, threshold_double,
        static_cast<double>(best_split_info.left_output),
        static_cast<double>(best_split_info.right_output),
        static_cast<data_size_t>(best_split_info.left_count),
        static_cast<data_size_t>(best_split_info.right_count),
        static_cast<double>(best_split_info.left_sum_hessian),
        static_cast<double>(best_split_info.right_sum_hessian),
        static_cast<float>(best_split_info.gain),
        train_data_->FeatureBinMapper(inner_feature_index)->missing_type(),
        best_split_info.default_left);
563
  } else {
564
565
566
    std::vector<uint32_t> cat_bitset_inner =
        Common::ConstructBitset(best_split_info.cat_threshold.data(),
                                best_split_info.num_cat_threshold);
567
568
    std::vector<int> threshold_int(best_split_info.num_cat_threshold);
    for (int i = 0; i < best_split_info.num_cat_threshold; ++i) {
569
570
      threshold_int[i] = static_cast<int>(train_data_->RealThreshold(
          inner_feature_index, best_split_info.cat_threshold[i]));
571
    }
572
573
    std::vector<uint32_t> cat_bitset = Common::ConstructBitset(
        threshold_int.data(), best_split_info.num_cat_threshold);
574

575
    data_partition_->Split(best_leaf, train_data_, inner_feature_index,
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
                           cat_bitset_inner.data(),
                           static_cast<int>(cat_bitset_inner.size()),
                           best_split_info.default_left, next_leaf_id);

    if (update_cnt) {
      // don't need to update this in data-based parallel model
      best_split_info.left_count = data_partition_->leaf_count(*left_leaf);
      best_split_info.right_count = data_partition_->leaf_count(next_leaf_id);
    }

    *right_leaf = tree->SplitCategorical(
        best_leaf, inner_feature_index, best_split_info.feature,
        cat_bitset_inner.data(), static_cast<int>(cat_bitset_inner.size()),
        cat_bitset.data(), static_cast<int>(cat_bitset.size()),
        static_cast<double>(best_split_info.left_output),
        static_cast<double>(best_split_info.right_output),
        static_cast<data_size_t>(best_split_info.left_count),
        static_cast<data_size_t>(best_split_info.right_count),
        static_cast<double>(best_split_info.left_sum_hessian),
        static_cast<double>(best_split_info.right_sum_hessian),
        static_cast<float>(best_split_info.gain),
        train_data_->FeatureBinMapper(inner_feature_index)->missing_type());
598
  }
599

600
#ifdef DEBUG
601
  CHECK(*right_leaf == next_leaf_id);
602
#endif
603

Guolin Ke's avatar
Guolin Ke committed
604
605
  // init the leaves that used on next iteration
  if (best_split_info.left_count < best_split_info.right_count) {
606
    CHECK_GT(best_split_info.left_count, 0);
607
608
609
610
611
612
    smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(),
                               best_split_info.left_sum_gradient,
                               best_split_info.left_sum_hessian);
    larger_leaf_splits_->Init(*right_leaf, data_partition_.get(),
                              best_split_info.right_sum_gradient,
                              best_split_info.right_sum_hessian);
Guolin Ke's avatar
Guolin Ke committed
613
  } else {
614
    CHECK_GT(best_split_info.right_count, 0);
615
616
617
618
619
620
    smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(),
                               best_split_info.right_sum_gradient,
                               best_split_info.right_sum_hessian);
    larger_leaf_splits_->Init(*left_leaf, data_partition_.get(),
                              best_split_info.left_sum_gradient,
                              best_split_info.left_sum_hessian);
Guolin Ke's avatar
Guolin Ke committed
621
  }
622
623
624
625
626
  constraints_->UpdateConstraints(is_numerical_split, *left_leaf, *right_leaf,
                                  best_split_info.monotone_type,
                                  best_split_info.right_output,
                                  best_split_info.left_output);
}
Guolin Ke's avatar
Guolin Ke committed
627

628
void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function<double(const label_t*, int)> residual_getter,
629
630
                                        data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const {
  if (obj != nullptr && obj->IsRenewTreeOutput()) {
Nikita Titov's avatar
Nikita Titov committed
631
    CHECK_LE(tree->num_leaves(), data_partition_->num_leaves());
632
633
    const data_size_t* bag_mapper = nullptr;
    if (total_num_data != num_data_) {
634
      CHECK_EQ(bag_cnt, num_data_);
635
636
      bag_mapper = bag_indices;
    }
Guolin Ke's avatar
Guolin Ke committed
637
    std::vector<int> n_nozeroworker_perleaf(tree->num_leaves(), 1);
638
    int num_machines = Network::num_machines();
639
640
641
642
643
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < tree->num_leaves(); ++i) {
      const double output = static_cast<double>(tree->LeafOutput(i));
      data_size_t cnt_leaf_data = 0;
      auto index_mapper = data_partition_->GetIndexOnLeaf(i, &cnt_leaf_data);
Guolin Ke's avatar
Guolin Ke committed
644
645
      if (cnt_leaf_data > 0) {
        // bag_mapper[index_mapper[i]]
646
        const double new_output = obj->RenewTreeOutput(output, residual_getter, index_mapper, bag_mapper, cnt_leaf_data);
Guolin Ke's avatar
Guolin Ke committed
647
648
        tree->SetLeafOutput(i, new_output);
      } else {
649
        CHECK_GT(num_machines, 1);
Guolin Ke's avatar
Guolin Ke committed
650
651
652
653
654
655
656
657
658
        tree->SetLeafOutput(i, 0.0);
        n_nozeroworker_perleaf[i] = 0;
      }
    }
    if (num_machines > 1) {
      std::vector<double> outputs(tree->num_leaves());
      for (int i = 0; i < tree->num_leaves(); ++i) {
        outputs[i] = static_cast<double>(tree->LeafOutput(i));
      }
Guolin Ke's avatar
Guolin Ke committed
659
660
      outputs = Network::GlobalSum(&outputs);
      n_nozeroworker_perleaf = Network::GlobalSum(&n_nozeroworker_perleaf);
Guolin Ke's avatar
Guolin Ke committed
661
662
663
664
665
666
667
      for (int i = 0; i < tree->num_leaves(); ++i) {
        tree->SetLeafOutput(i, outputs[i] / n_nozeroworker_perleaf[i]);
      }
    }
  }
}

668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
void SerialTreeLearner::ComputeBestSplitForFeature(
    FeatureHistogram* histogram_array_, int feature_index, int real_fidx,
    bool is_feature_used, int num_data, const LeafSplits* leaf_splits,
    SplitInfo* best_split) {
  if (!is_feature_used) {
    return;
  }
  SplitInfo new_split;
  histogram_array_[feature_index].FindBestThreshold(
      leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data,
      constraints_->Get(leaf_splits->leaf_index()), &new_split);
  new_split.feature = real_fidx;
  if (cegb_ != nullptr) {
    new_split.gain -=
        cegb_->DetlaGain(feature_index, real_fidx, leaf_splits->leaf_index(),
                         num_data, new_split);
  }
  if (new_split > *best_split) {
    *best_split = new_split;
  }
}

Guolin Ke's avatar
Guolin Ke committed
690
}  // namespace LightGBM