serial_tree_learner.cpp 32.2 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
#include "serial_tree_learner.h"

7
8
9
10
11
#include <LightGBM/network.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/common.h>

12
13
14
15
16
#include <algorithm>
#include <queue>
#include <unordered_map>
#include <utility>

17
18
#include "cost_effective_gradient_boosting.hpp"

Guolin Ke's avatar
Guolin Ke committed
19
20
namespace LightGBM {

Guolin Ke's avatar
Guolin Ke committed
21
SerialTreeLearner::SerialTreeLearner(const Config* config)
22
    : config_(config), col_sampler_(config) {
Guolin Ke's avatar
Guolin Ke committed
23
24
25
26
27
}

SerialTreeLearner::~SerialTreeLearner() {
}

28
void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) {
Guolin Ke's avatar
Guolin Ke committed
29
30
31
  train_data_ = train_data;
  num_data_ = train_data_->num_data();
  num_features_ = train_data_->num_features();
32
33
  int max_cache_size = 0;
  // Get the max size of pool
Guolin Ke's avatar
Guolin Ke committed
34
35
  if (config_->histogram_pool_size <= 0) {
    max_cache_size = config_->num_leaves;
36
37
38
  } else {
    size_t total_histogram_size = 0;
    for (int i = 0; i < train_data_->num_features(); ++i) {
39
      total_histogram_size += kHistEntrySize * train_data_->FeatureNumBin(i);
40
    }
Guolin Ke's avatar
Guolin Ke committed
41
    max_cache_size = static_cast<int>(config_->histogram_pool_size * 1024 * 1024 / total_histogram_size);
42
43
  }
  // at least need 2 leaves
Guolin Ke's avatar
Guolin Ke committed
44
  max_cache_size = std::max(2, max_cache_size);
Guolin Ke's avatar
Guolin Ke committed
45
  max_cache_size = std::min(max_cache_size, config_->num_leaves);
Guolin Ke's avatar
Guolin Ke committed
46

Guolin Ke's avatar
Guolin Ke committed
47
  // push split information for all leaves
Guolin Ke's avatar
Guolin Ke committed
48
  best_split_per_leaf_.resize(config_->num_leaves);
49
  constraints_.reset(LeafConstraintsBase::Create(config_, config_->num_leaves));
Guolin Ke's avatar
Guolin Ke committed
50

wxchan's avatar
wxchan committed
51
  // initialize splits for leaf
Guolin Ke's avatar
Guolin Ke committed
52
53
  smaller_leaf_splits_.reset(new LeafSplits(train_data_->num_data()));
  larger_leaf_splits_.reset(new LeafSplits(train_data_->num_data()));
Guolin Ke's avatar
Guolin Ke committed
54
55

  // initialize data partition
Guolin Ke's avatar
Guolin Ke committed
56
  data_partition_.reset(new DataPartition(num_data_, config_->num_leaves));
57
  col_sampler_.SetTrainingData(train_data_);
Guolin Ke's avatar
Guolin Ke committed
58
  // initialize ordered gradients and hessians
Guolin Ke's avatar
Guolin Ke committed
59
60
  ordered_gradients_.resize(num_data_);
  ordered_hessians_.resize(num_data_);
61

62
63
  GetShareStates(train_data_, is_constant_hessian, true);
  histogram_pool_.DynamicChangeSize(train_data_, share_state_->is_colwise, config_, max_cache_size, config_->num_leaves);
64
  Log::Info("Number of data points in the train set: %d, number of used features: %d", num_data_, num_features_);
65
66
67
  if (CostEfficientGradientBoosting::IsEnable(config_)) {
    cegb_.reset(new CostEfficientGradientBoosting(this));
    cegb_->Init();
68
  }
Guolin Ke's avatar
Guolin Ke committed
69
70
}

71
72
73
void SerialTreeLearner::GetShareStates(const Dataset* dataset,
                                       bool is_constant_hessian,
                                       bool is_first_time) {
74
  if (is_first_time) {
75
    share_state_.reset(dataset->GetShareStates(
76
77
78
        ordered_gradients_.data(), ordered_hessians_.data(),
        col_sampler_.is_feature_used_bytree(), is_constant_hessian,
        config_->force_col_wise, config_->force_row_wise));
79
  } else {
Nikita Titov's avatar
Nikita Titov committed
80
    CHECK_NOTNULL(share_state_);
81
    // cannot change is_hist_col_wise during training
82
    share_state_.reset(dataset->GetShareStates(
83
        ordered_gradients_.data(), ordered_hessians_.data(), col_sampler_.is_feature_used_bytree(),
84
85
        is_constant_hessian, share_state_->is_colwise,
        !share_state_->is_colwise));
86
  }
Nikita Titov's avatar
Nikita Titov committed
87
  CHECK_NOTNULL(share_state_);
88
89
}

90
91
92
void SerialTreeLearner::ResetTrainingDataInner(const Dataset* train_data,
                                               bool is_constant_hessian,
                                               bool reset_multi_val_bin) {
Guolin Ke's avatar
Guolin Ke committed
93
94
  train_data_ = train_data;
  num_data_ = train_data_->num_data();
95
  CHECK_EQ(num_features_, train_data_->num_features());
Guolin Ke's avatar
Guolin Ke committed
96
97
98
99
100
101
102

  // initialize splits for leaf
  smaller_leaf_splits_->ResetNumData(num_data_);
  larger_leaf_splits_->ResetNumData(num_data_);

  // initialize data partition
  data_partition_->ResetNumData(num_data_);
103
  if (reset_multi_val_bin) {
104
    col_sampler_.SetTrainingData(train_data_);
105
106
    GetShareStates(train_data_, is_constant_hessian, false);
  }
107

Guolin Ke's avatar
Guolin Ke committed
108
109
110
  // initialize ordered gradients and hessians
  ordered_gradients_.resize(num_data_);
  ordered_hessians_.resize(num_data_);
111
112
113
  if (cegb_ != nullptr) {
    cegb_->Init();
  }
Guolin Ke's avatar
Guolin Ke committed
114
}
Guolin Ke's avatar
Guolin Ke committed
115

Guolin Ke's avatar
Guolin Ke committed
116
117
118
void SerialTreeLearner::ResetConfig(const Config* config) {
  if (config_->num_leaves != config->num_leaves) {
    config_ = config;
Guolin Ke's avatar
Guolin Ke committed
119
120
    int max_cache_size = 0;
    // Get the max size of pool
Guolin Ke's avatar
Guolin Ke committed
121
122
    if (config->histogram_pool_size <= 0) {
      max_cache_size = config_->num_leaves;
Guolin Ke's avatar
Guolin Ke committed
123
124
125
    } else {
      size_t total_histogram_size = 0;
      for (int i = 0; i < train_data_->num_features(); ++i) {
126
        total_histogram_size += kHistEntrySize * train_data_->FeatureNumBin(i);
Guolin Ke's avatar
Guolin Ke committed
127
      }
Guolin Ke's avatar
Guolin Ke committed
128
      max_cache_size = static_cast<int>(config_->histogram_pool_size * 1024 * 1024 / total_histogram_size);
Guolin Ke's avatar
Guolin Ke committed
129
130
131
    }
    // at least need 2 leaves
    max_cache_size = std::max(2, max_cache_size);
Guolin Ke's avatar
Guolin Ke committed
132
    max_cache_size = std::min(max_cache_size, config_->num_leaves);
133
    histogram_pool_.DynamicChangeSize(train_data_, share_state_->is_colwise, config_, max_cache_size, config_->num_leaves);
Guolin Ke's avatar
Guolin Ke committed
134
135

    // push split information for all leaves
Guolin Ke's avatar
Guolin Ke committed
136
137
    best_split_per_leaf_.resize(config_->num_leaves);
    data_partition_->ResetLeaves(config_->num_leaves);
Guolin Ke's avatar
Guolin Ke committed
138
  } else {
Guolin Ke's avatar
Guolin Ke committed
139
    config_ = config;
Guolin Ke's avatar
Guolin Ke committed
140
  }
141
  col_sampler_.SetConfig(config_);
142
  histogram_pool_.ResetConfig(train_data_, config_);
143
144
145
146
  if (CostEfficientGradientBoosting::IsEnable(config_)) {
    cegb_.reset(new CostEfficientGradientBoosting(this));
    cegb_->Init();
  }
147
  constraints_.reset(LeafConstraintsBase::Create(config_, config_->num_leaves));
Guolin Ke's avatar
Guolin Ke committed
148
149
}

150
Tree* SerialTreeLearner::Train(const score_t* gradients, const score_t *hessians) {
151
  Common::FunctionTimer fun_timer("SerialTreeLearner::Train", global_timer);
Guolin Ke's avatar
Guolin Ke committed
152
153
  gradients_ = gradients;
  hessians_ = hessians;
154
  int num_threads = OMP_NUM_THREADS();
Nikita Titov's avatar
Nikita Titov committed
155
  if (share_state_->num_threads != num_threads && share_state_->num_threads > 0) {
156
    Log::Warning(
Nikita Titov's avatar
Nikita Titov committed
157
158
        "Detected that num_threads changed during training (from %d to %d), "
        "it may cause unexpected errors.",
159
160
161
        share_state_->num_threads, num_threads);
  }
  share_state_->num_threads = num_threads;
Nikita Titov's avatar
Nikita Titov committed
162

Guolin Ke's avatar
Guolin Ke committed
163
164
  // some initial works before training
  BeforeTrain();
Guolin Ke's avatar
Guolin Ke committed
165

166
167
  bool track_branch_features = !(config_->interaction_constraints_vector.empty());
  auto tree = std::unique_ptr<Tree>(new Tree(config_->num_leaves, track_branch_features));
168
  auto tree_prt = tree.get();
169
170
  constraints_->ShareTreePointer(tree_prt);

Guolin Ke's avatar
Guolin Ke committed
171
172
  // root leaf
  int left_leaf = 0;
173
  int cur_depth = 1;
Guolin Ke's avatar
Guolin Ke committed
174
175
  // only root leaf can be splitted on first time
  int right_leaf = -1;
176

177
  int init_splits = ForceSplits(tree_prt, &left_leaf, &right_leaf, &cur_depth);
178

Guolin Ke's avatar
Guolin Ke committed
179
  for (int split = init_splits; split < config_->num_leaves - 1; ++split) {
Guolin Ke's avatar
Guolin Ke committed
180
    // some initial works before finding best split
181
    if (BeforeFindBestSplit(tree_prt, left_leaf, right_leaf)) {
Guolin Ke's avatar
Guolin Ke committed
182
      // find best threshold for every feature
183
      FindBestSplits(tree_prt);
184
    }
Guolin Ke's avatar
Guolin Ke committed
185
186
187
188
189
190
    // Get a leaf with max split gain
    int best_leaf = static_cast<int>(ArrayArgs<SplitInfo>::ArgMax(best_split_per_leaf_));
    // Get split information for best leaf
    const SplitInfo& best_leaf_SplitInfo = best_split_per_leaf_[best_leaf];
    // cannot split, quit
    if (best_leaf_SplitInfo.gain <= 0.0) {
Guolin Ke's avatar
Guolin Ke committed
191
      Log::Warning("No further splits with positive gain, best gain: %f", best_leaf_SplitInfo.gain);
Guolin Ke's avatar
Guolin Ke committed
192
193
194
      break;
    }
    // split tree with best leaf
195
    Split(tree_prt, best_leaf, &left_leaf, &right_leaf);
196
    cur_depth = std::max(cur_depth, tree->leaf_depth(left_leaf));
Guolin Ke's avatar
Guolin Ke committed
197
  }
198
  Log::Debug("Trained a tree with leaves = %d and max_depth = %d", tree->num_leaves(), cur_depth);
Guolin Ke's avatar
Guolin Ke committed
199
  return tree.release();
Guolin Ke's avatar
Guolin Ke committed
200
201
}

202
Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t *hessians) const {
Guolin Ke's avatar
Guolin Ke committed
203
  auto tree = std::unique_ptr<Tree>(new Tree(*old_tree));
Nikita Titov's avatar
Nikita Titov committed
204
  CHECK_GE(data_partition_->num_leaves(), tree->num_leaves());
205
  OMP_INIT_EX();
Guolin Ke's avatar
Guolin Ke committed
206
  #pragma omp parallel for schedule(static)
207
  for (int i = 0; i < tree->num_leaves(); ++i) {
208
    OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
209
210
211
    data_size_t cnt_leaf_data = 0;
    auto tmp_idx = data_partition_->GetIndexOnLeaf(i, &cnt_leaf_data);
    double sum_grad = 0.0f;
212
    double sum_hess = kEpsilon;
Guolin Ke's avatar
Guolin Ke committed
213
214
215
216
217
    for (data_size_t j = 0; j < cnt_leaf_data; ++j) {
      auto idx = tmp_idx[j];
      sum_grad += gradients[idx];
      sum_hess += hessians[idx];
    }
Belinda Trotta's avatar
Belinda Trotta committed
218
219
220
221
222
223
224
225
226
227
    double output;
    if ((config_->path_smooth > kEpsilon) & (i > 0)) {
      output = FeatureHistogram::CalculateSplittedLeafOutput<true, true, true>(
          sum_grad, sum_hess, config_->lambda_l1, config_->lambda_l2,
          config_->max_delta_step, config_->path_smooth, cnt_leaf_data, tree->leaf_parent(i));
    } else {
      output = FeatureHistogram::CalculateSplittedLeafOutput<true, true, false>(
          sum_grad, sum_hess, config_->lambda_l1, config_->lambda_l2,
          config_->max_delta_step, config_->path_smooth, cnt_leaf_data, 0);
    }
Guolin Ke's avatar
Guolin Ke committed
228
229
230
    auto old_leaf_output = tree->LeafOutput(i);
    auto new_leaf_output = output * tree->shrinkage();
    tree->SetLeafOutput(i, config_->refit_decay_rate * old_leaf_output + (1.0 - config_->refit_decay_rate) * new_leaf_output);
231
    OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
232
  }
233
  OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
234
235
236
  return tree.release();
}

237
238
239
240
241
Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const std::vector<int>& leaf_pred, const score_t* gradients, const score_t *hessians) {
  data_partition_->ResetByLeafPred(leaf_pred, old_tree->num_leaves());
  return FitByExistingTree(old_tree, gradients, hessians);
}

Guolin Ke's avatar
Guolin Ke committed
242
void SerialTreeLearner::BeforeTrain() {
243
  Common::FunctionTimer fun_timer("SerialTreeLearner::BeforeTrain", global_timer);
244
245
  // reset histogram pool
  histogram_pool_.ResetMap();
Guolin Ke's avatar
Guolin Ke committed
246

247
248
  col_sampler_.ResetByTree();
  train_data_->InitTrain(col_sampler_.is_feature_used_bytree(), share_state_.get());
Guolin Ke's avatar
Guolin Ke committed
249
250
251
  // initialize data partition
  data_partition_->Init();

252
253
  constraints_->Reset();

Guolin Ke's avatar
Guolin Ke committed
254
  // reset the splits for leaves
Guolin Ke's avatar
Guolin Ke committed
255
  for (int i = 0; i < config_->num_leaves; ++i) {
Guolin Ke's avatar
Guolin Ke committed
256
257
258
259
260
261
262
    best_split_per_leaf_[i].Reset();
  }

  // Sumup for root
  if (data_partition_->leaf_count(0) == num_data_) {
    // use all data
    smaller_leaf_splits_->Init(gradients_, hessians_);
Guolin Ke's avatar
Guolin Ke committed
263

Guolin Ke's avatar
Guolin Ke committed
264
265
  } else {
    // use bagging, only use part of data
Guolin Ke's avatar
Guolin Ke committed
266
    smaller_leaf_splits_->Init(0, data_partition_.get(), gradients_, hessians_);
Guolin Ke's avatar
Guolin Ke committed
267
268
269
270
271
  }

  larger_leaf_splits_->Init();
}

Guolin Ke's avatar
Guolin Ke committed
272
bool SerialTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) {
273
  Common::FunctionTimer fun_timer("SerialTreeLearner::BeforeFindBestSplit", global_timer);
Guolin Ke's avatar
Guolin Ke committed
274
  // check depth of current leaf
Guolin Ke's avatar
Guolin Ke committed
275
  if (config_->max_depth > 0) {
Guolin Ke's avatar
Guolin Ke committed
276
    // only need to check left leaf, since right leaf is in same level of left leaf
Guolin Ke's avatar
Guolin Ke committed
277
    if (tree->leaf_depth(left_leaf) >= config_->max_depth) {
Guolin Ke's avatar
Guolin Ke committed
278
279
280
281
282
283
284
      best_split_per_leaf_[left_leaf].gain = kMinScore;
      if (right_leaf >= 0) {
        best_split_per_leaf_[right_leaf].gain = kMinScore;
      }
      return false;
    }
  }
Guolin Ke's avatar
Guolin Ke committed
285
286
287
  data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf);
  data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf);
  // no enough data to continue
Guolin Ke's avatar
Guolin Ke committed
288
289
  if (num_data_in_right_child < static_cast<data_size_t>(config_->min_data_in_leaf * 2)
      && num_data_in_left_child < static_cast<data_size_t>(config_->min_data_in_leaf * 2)) {
Guolin Ke's avatar
Guolin Ke committed
290
291
292
293
294
295
    best_split_per_leaf_[left_leaf].gain = kMinScore;
    if (right_leaf >= 0) {
      best_split_per_leaf_[right_leaf].gain = kMinScore;
    }
    return false;
  }
296
  parent_leaf_histogram_array_ = nullptr;
Guolin Ke's avatar
Guolin Ke committed
297
298
  // only have root
  if (right_leaf < 0) {
299
    histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
Guolin Ke's avatar
Guolin Ke committed
300
301
    larger_leaf_histogram_array_ = nullptr;
  } else if (num_data_in_left_child < num_data_in_right_child) {
Hui Xue's avatar
Hui Xue committed
302
    // put parent(left) leaf's histograms into larger leaf's histograms
303
304
305
    if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
    histogram_pool_.Move(left_leaf, right_leaf);
    histogram_pool_.Get(left_leaf, &smaller_leaf_histogram_array_);
Guolin Ke's avatar
Guolin Ke committed
306
  } else {
Hui Xue's avatar
Hui Xue committed
307
    // put parent(left) leaf's histograms to larger leaf's histograms
308
309
    if (histogram_pool_.Get(left_leaf, &larger_leaf_histogram_array_)) { parent_leaf_histogram_array_ = larger_leaf_histogram_array_; }
    histogram_pool_.Get(right_leaf, &smaller_leaf_histogram_array_);
Guolin Ke's avatar
Guolin Ke committed
310
311
312
313
  }
  return true;
}

314
void SerialTreeLearner::FindBestSplits(const Tree* tree) {
Guolin Ke's avatar
Guolin Ke committed
315
  std::vector<int8_t> is_feature_used(num_features_, 0);
316
  #pragma omp parallel for schedule(static, 256) if (num_features_ >= 512)
Guolin Ke's avatar
Guolin Ke committed
317
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
318
    if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue;
Guolin Ke's avatar
Guolin Ke committed
319
320
321
322
323
324
325
326
327
    if (parent_leaf_histogram_array_ != nullptr
        && !parent_leaf_histogram_array_[feature_index].is_splittable()) {
      smaller_leaf_histogram_array_[feature_index].set_is_splittable(false);
      continue;
    }
    is_feature_used[feature_index] = 1;
  }
  bool use_subtract = parent_leaf_histogram_array_ != nullptr;
  ConstructHistograms(is_feature_used, use_subtract);
328
  FindBestSplitsFromHistograms(is_feature_used, use_subtract, tree);
Guolin Ke's avatar
Guolin Ke committed
329
330
}

331
332
333
334
void SerialTreeLearner::ConstructHistograms(
    const std::vector<int8_t>& is_feature_used, bool use_subtract) {
  Common::FunctionTimer fun_timer("SerialTreeLearner::ConstructHistograms",
                                  global_timer);
Guolin Ke's avatar
Guolin Ke committed
335
  // construct smaller leaf
336
337
  hist_t* ptr_smaller_leaf_hist_data =
      smaller_leaf_histogram_array_[0].RawData() - kHistOffset;
338
339
340
  train_data_->ConstructHistograms(
      is_feature_used, smaller_leaf_splits_->data_indices(),
      smaller_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
341
342
      ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(),
      ptr_smaller_leaf_hist_data);
Guolin Ke's avatar
Guolin Ke committed
343
344
345

  if (larger_leaf_histogram_array_ != nullptr && !use_subtract) {
    // construct larger leaf
346
347
    hist_t* ptr_larger_leaf_hist_data =
        larger_leaf_histogram_array_[0].RawData() - kHistOffset;
348
349
350
    train_data_->ConstructHistograms(
        is_feature_used, larger_leaf_splits_->data_indices(),
        larger_leaf_splits_->num_data_in_leaf(), gradients_, hessians_,
351
        ordered_gradients_.data(), ordered_hessians_.data(), share_state_.get(),
352
        ptr_larger_leaf_hist_data);
Guolin Ke's avatar
Guolin Ke committed
353
  }
354
355
}

Guolin Ke's avatar
Guolin Ke committed
356
void SerialTreeLearner::FindBestSplitsFromHistograms(
357
    const std::vector<int8_t>& is_feature_used, bool use_subtract, const Tree* tree) {
Guolin Ke's avatar
Guolin Ke committed
358
359
  Common::FunctionTimer fun_timer(
      "SerialTreeLearner::FindBestSplitsFromHistograms", global_timer);
360
361
  std::vector<SplitInfo> smaller_best(share_state_->num_threads);
  std::vector<SplitInfo> larger_best(share_state_->num_threads);
362
363
364
365
366
  std::vector<int8_t> smaller_node_used_features = col_sampler_.GetByNode(tree, smaller_leaf_splits_->leaf_index());
  std::vector<int8_t> larger_node_used_features;
  if (larger_leaf_splits_->leaf_index() >= 0) {
    larger_node_used_features = col_sampler_.GetByNode(tree, larger_leaf_splits_->leaf_index());
  }
367
  OMP_INIT_EX();
Guolin Ke's avatar
Guolin Ke committed
368
// find splits
369
#pragma omp parallel for schedule(static) num_threads(share_state_->num_threads)
Guolin Ke's avatar
Guolin Ke committed
370
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
371
    OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
372
373
374
    if (!is_feature_used[feature_index]) {
      continue;
    }
Guolin Ke's avatar
Guolin Ke committed
375
    const int tid = omp_get_thread_num();
Guolin Ke's avatar
Guolin Ke committed
376
377
378
379
    train_data_->FixHistogram(
        feature_index, smaller_leaf_splits_->sum_gradients(),
        smaller_leaf_splits_->sum_hessians(),
        smaller_leaf_histogram_array_[feature_index].RawData());
380
    int real_fidx = train_data_->RealFeatureIndex(feature_index);
381
382
383
384
385
386
387

    ComputeBestSplitForFeature(smaller_leaf_histogram_array_, feature_index,
                               real_fidx,
                               smaller_node_used_features[feature_index],
                               smaller_leaf_splits_->num_data_in_leaf(),
                               smaller_leaf_splits_.get(), &smaller_best[tid]);

Guolin Ke's avatar
Guolin Ke committed
388
    // only has root leaf
Guolin Ke's avatar
Guolin Ke committed
389
390
391
392
    if (larger_leaf_splits_ == nullptr ||
        larger_leaf_splits_->leaf_index() < 0) {
      continue;
    }
Guolin Ke's avatar
Guolin Ke committed
393

Guolin Ke's avatar
Guolin Ke committed
394
    if (use_subtract) {
Guolin Ke's avatar
Guolin Ke committed
395
396
      larger_leaf_histogram_array_[feature_index].Subtract(
          smaller_leaf_histogram_array_[feature_index]);
397
    } else {
Guolin Ke's avatar
Guolin Ke committed
398
399
400
401
      train_data_->FixHistogram(
          feature_index, larger_leaf_splits_->sum_gradients(),
          larger_leaf_splits_->sum_hessians(),
          larger_leaf_histogram_array_[feature_index].RawData());
402
    }
403
404
405
406
407

    ComputeBestSplitForFeature(larger_leaf_histogram_array_, feature_index,
                               real_fidx,
                               larger_node_used_features[feature_index],
                               larger_leaf_splits_->num_data_in_leaf(),
Guolin Ke's avatar
Guolin Ke committed
408
                               larger_leaf_splits_.get(), &larger_best[tid]);
409

410
    OMP_LOOP_EX_END();
Guolin Ke's avatar
Guolin Ke committed
411
  }
412
  OMP_THROW_EX();
Guolin Ke's avatar
Guolin Ke committed
413
  auto smaller_best_idx = ArrayArgs<SplitInfo>::ArgMax(smaller_best);
414
  int leaf = smaller_leaf_splits_->leaf_index();
Guolin Ke's avatar
Guolin Ke committed
415
416
  best_split_per_leaf_[leaf] = smaller_best[smaller_best_idx];

Guolin Ke's avatar
Guolin Ke committed
417
418
  if (larger_leaf_splits_ != nullptr &&
      larger_leaf_splits_->leaf_index() >= 0) {
419
    leaf = larger_leaf_splits_->leaf_index();
Guolin Ke's avatar
Guolin Ke committed
420
421
422
423
424
    auto larger_best_idx = ArrayArgs<SplitInfo>::ArgMax(larger_best);
    best_split_per_leaf_[leaf] = larger_best[larger_best_idx];
  }
}

425
426
427
428
429
430
int32_t SerialTreeLearner::ForceSplits(Tree* tree, int* left_leaf,
                                       int* right_leaf, int *cur_depth) {
  bool abort_last_forced_split = false;
  if (forced_split_json_ == nullptr) {
    return 0;
  }
431
432
433
434
  int32_t result_count = 0;
  // start at root leaf
  *left_leaf = 0;
  std::queue<std::pair<Json, int>> q;
435
  Json left = *forced_split_json_;
436
437
438
  Json right;
  bool left_smaller = true;
  std::unordered_map<int, SplitInfo> forceSplitMap;
439
  q.push(std::make_pair(left, *left_leaf));
440
  while (!q.empty()) {
441
442
443
    // before processing next node from queue, store info for current left/right leaf
    // store "best split" for left and right, even if they might be overwritten by forced split
    if (BeforeFindBestSplit(tree, *left_leaf, *right_leaf)) {
444
      FindBestSplits(tree);
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
    }
    // then, compute own splits
    SplitInfo left_split;
    SplitInfo right_split;

    if (!left.is_null()) {
      const int left_feature = left["feature"].int_value();
      const double left_threshold_double = left["threshold"].number_value();
      const int left_inner_feature_index = train_data_->InnerFeatureIndex(left_feature);
      const uint32_t left_threshold = train_data_->BinThreshold(
              left_inner_feature_index, left_threshold_double);
      auto leaf_histogram_array = (left_smaller) ? smaller_leaf_histogram_array_ : larger_leaf_histogram_array_;
      auto left_leaf_splits = (left_smaller) ? smaller_leaf_splits_.get() : larger_leaf_splits_.get();
      leaf_histogram_array[left_inner_feature_index].GatherInfoForThreshold(
              left_leaf_splits->sum_gradients(),
              left_leaf_splits->sum_hessians(),
              left_threshold,
              left_leaf_splits->num_data_in_leaf(),
Belinda Trotta's avatar
Belinda Trotta committed
463
              left_leaf_splits->weight(),
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
              &left_split);
      left_split.feature = left_feature;
      forceSplitMap[*left_leaf] = left_split;
      if (left_split.gain < 0) {
        forceSplitMap.erase(*left_leaf);
      }
    }

    if (!right.is_null()) {
      const int right_feature = right["feature"].int_value();
      const double right_threshold_double = right["threshold"].number_value();
      const int right_inner_feature_index = train_data_->InnerFeatureIndex(right_feature);
      const uint32_t right_threshold = train_data_->BinThreshold(
              right_inner_feature_index, right_threshold_double);
      auto leaf_histogram_array = (left_smaller) ? larger_leaf_histogram_array_ : smaller_leaf_histogram_array_;
      auto right_leaf_splits = (left_smaller) ? larger_leaf_splits_.get() : smaller_leaf_splits_.get();
      leaf_histogram_array[right_inner_feature_index].GatherInfoForThreshold(
        right_leaf_splits->sum_gradients(),
        right_leaf_splits->sum_hessians(),
        right_threshold,
        right_leaf_splits->num_data_in_leaf(),
Belinda Trotta's avatar
Belinda Trotta committed
485
        right_leaf_splits->weight(),
486
487
488
489
490
491
492
493
494
495
496
497
498
        &right_split);
      right_split.feature = right_feature;
      forceSplitMap[*right_leaf] = right_split;
      if (right_split.gain < 0) {
        forceSplitMap.erase(*right_leaf);
      }
    }

    std::pair<Json, int> pair = q.front();
    q.pop();
    int current_leaf = pair.second;
    // split info should exist because searching in bfs fashion - should have added from parent
    if (forceSplitMap.find(current_leaf) == forceSplitMap.end()) {
499
        abort_last_forced_split = true;
500
501
        break;
    }
502
503
504
505
    best_split_per_leaf_[current_leaf] = forceSplitMap[current_leaf];
    Split(tree, current_leaf, left_leaf, right_leaf);
    left_smaller = best_split_per_leaf_[current_leaf].left_count <
                   best_split_per_leaf_[current_leaf].right_count;
506
507
508
509
    left = Json();
    right = Json();
    if ((pair.first).object_items().count("left") > 0) {
      left = (pair.first)["left"];
510
511
512
      if (left.object_items().count("feature") > 0 && left.object_items().count("threshold") > 0) {
        q.push(std::make_pair(left, *left_leaf));
      }
513
514
515
    }
    if ((pair.first).object_items().count("right") > 0) {
      right = (pair.first)["right"];
516
517
518
      if (right.object_items().count("feature") > 0 && right.object_items().count("threshold") > 0) {
        q.push(std::make_pair(right, *right_leaf));
      }
519
520
521
522
    }
    result_count++;
    *(cur_depth) = std::max(*(cur_depth), tree->leaf_depth(*left_leaf));
  }
523
524
525
526
527
528
529
530
531
532
533
  if (abort_last_forced_split) {
    int best_leaf =
        static_cast<int>(ArrayArgs<SplitInfo>::ArgMax(best_split_per_leaf_));
    const SplitInfo& best_leaf_SplitInfo = best_split_per_leaf_[best_leaf];
    if (best_leaf_SplitInfo.gain <= 0.0) {
      Log::Warning("No further splits with positive gain, best gain: %f",
                   best_leaf_SplitInfo.gain);
      return config_->num_leaves;
    }
    Split(tree, best_leaf, left_leaf, right_leaf);
    *(cur_depth) = std::max(*(cur_depth), tree->leaf_depth(*left_leaf));
534
    ++result_count;
535
  }
536
537
  return result_count;
}
Guolin Ke's avatar
Guolin Ke committed
538

539
540
541
void SerialTreeLearner::SplitInner(Tree* tree, int best_leaf, int* left_leaf,
                                   int* right_leaf, bool update_cnt) {
  Common::FunctionTimer fun_timer("SerialTreeLearner::SplitInner", global_timer);
542
  SplitInfo& best_split_info = best_split_per_leaf_[best_leaf];
543
544
  const int inner_feature_index =
      train_data_->InnerFeatureIndex(best_split_info.feature);
545
  if (cegb_ != nullptr) {
546
547
    cegb_->UpdateLeafBestSplits(tree, best_leaf, &best_split_info,
                                &best_split_per_leaf_);
548
  }
549
  *left_leaf = best_leaf;
550
551
  auto next_leaf_id = tree->NextLeafId();

552
553
554
555
  // update before tree split
  constraints_->BeforeSplit(tree, best_leaf, next_leaf_id,
                            best_split_info.monotone_type);

556
557
558
  bool is_numerical_split =
      train_data_->FeatureBinMapper(inner_feature_index)->bin_type() ==
      BinType::NumericalBin;
Guolin Ke's avatar
Guolin Ke committed
559
  if (is_numerical_split) {
560
561
    auto threshold_double = train_data_->RealThreshold(
        inner_feature_index, best_split_info.threshold);
562
    data_partition_->Split(best_leaf, train_data_, inner_feature_index,
563
564
565
566
567
568
569
                           &best_split_info.threshold, 1,
                           best_split_info.default_left, next_leaf_id);
    if (update_cnt) {
      // don't need to update this in data-based parallel model
      best_split_info.left_count = data_partition_->leaf_count(*left_leaf);
      best_split_info.right_count = data_partition_->leaf_count(next_leaf_id);
    }
570
    // split tree, will return right leaf
571
572
573
574
575
576
577
578
579
    *right_leaf = tree->Split(
        best_leaf, inner_feature_index, best_split_info.feature,
        best_split_info.threshold, threshold_double,
        static_cast<double>(best_split_info.left_output),
        static_cast<double>(best_split_info.right_output),
        static_cast<data_size_t>(best_split_info.left_count),
        static_cast<data_size_t>(best_split_info.right_count),
        static_cast<double>(best_split_info.left_sum_hessian),
        static_cast<double>(best_split_info.right_sum_hessian),
580
581
        // store the true split gain in tree model
        static_cast<float>(best_split_info.gain + config_->min_gain_to_split),
582
583
        train_data_->FeatureBinMapper(inner_feature_index)->missing_type(),
        best_split_info.default_left);
584
  } else {
585
586
587
    std::vector<uint32_t> cat_bitset_inner =
        Common::ConstructBitset(best_split_info.cat_threshold.data(),
                                best_split_info.num_cat_threshold);
588
589
    std::vector<int> threshold_int(best_split_info.num_cat_threshold);
    for (int i = 0; i < best_split_info.num_cat_threshold; ++i) {
590
591
      threshold_int[i] = static_cast<int>(train_data_->RealThreshold(
          inner_feature_index, best_split_info.cat_threshold[i]));
592
    }
593
594
    std::vector<uint32_t> cat_bitset = Common::ConstructBitset(
        threshold_int.data(), best_split_info.num_cat_threshold);
595

596
    data_partition_->Split(best_leaf, train_data_, inner_feature_index,
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
                           cat_bitset_inner.data(),
                           static_cast<int>(cat_bitset_inner.size()),
                           best_split_info.default_left, next_leaf_id);

    if (update_cnt) {
      // don't need to update this in data-based parallel model
      best_split_info.left_count = data_partition_->leaf_count(*left_leaf);
      best_split_info.right_count = data_partition_->leaf_count(next_leaf_id);
    }

    *right_leaf = tree->SplitCategorical(
        best_leaf, inner_feature_index, best_split_info.feature,
        cat_bitset_inner.data(), static_cast<int>(cat_bitset_inner.size()),
        cat_bitset.data(), static_cast<int>(cat_bitset.size()),
        static_cast<double>(best_split_info.left_output),
        static_cast<double>(best_split_info.right_output),
        static_cast<data_size_t>(best_split_info.left_count),
        static_cast<data_size_t>(best_split_info.right_count),
        static_cast<double>(best_split_info.left_sum_hessian),
        static_cast<double>(best_split_info.right_sum_hessian),
617
618
        // store the true split gain in tree model
        static_cast<float>(best_split_info.gain + config_->min_gain_to_split),
619
        train_data_->FeatureBinMapper(inner_feature_index)->missing_type());
620
  }
621

622
#ifdef DEBUG
623
  CHECK(*right_leaf == next_leaf_id);
624
#endif
625

Guolin Ke's avatar
Guolin Ke committed
626
627
  // init the leaves that used on next iteration
  if (best_split_info.left_count < best_split_info.right_count) {
628
    CHECK_GT(best_split_info.left_count, 0);
629
630
    smaller_leaf_splits_->Init(*left_leaf, data_partition_.get(),
                               best_split_info.left_sum_gradient,
Belinda Trotta's avatar
Belinda Trotta committed
631
632
                               best_split_info.left_sum_hessian,
                               best_split_info.left_output);
633
634
    larger_leaf_splits_->Init(*right_leaf, data_partition_.get(),
                              best_split_info.right_sum_gradient,
Belinda Trotta's avatar
Belinda Trotta committed
635
636
                              best_split_info.right_sum_hessian,
                              best_split_info.right_output);
Guolin Ke's avatar
Guolin Ke committed
637
  } else {
638
    CHECK_GT(best_split_info.right_count, 0);
639
640
    smaller_leaf_splits_->Init(*right_leaf, data_partition_.get(),
                               best_split_info.right_sum_gradient,
Belinda Trotta's avatar
Belinda Trotta committed
641
642
                               best_split_info.right_sum_hessian,
                               best_split_info.right_output);
643
644
    larger_leaf_splits_->Init(*left_leaf, data_partition_.get(),
                              best_split_info.left_sum_gradient,
Belinda Trotta's avatar
Belinda Trotta committed
645
646
                              best_split_info.left_sum_hessian,
                              best_split_info.left_output);
Guolin Ke's avatar
Guolin Ke committed
647
  }
648
649
650
651
652
653
654
655
656
  auto leaves_need_update = constraints_->Update(
      tree, is_numerical_split, *left_leaf, *right_leaf,
      best_split_info.monotone_type, best_split_info.right_output,
      best_split_info.left_output, inner_feature_index, best_split_info,
      best_split_per_leaf_);
  // update leave outputs if needed
  for (auto leaf : leaves_need_update) {
    RecomputeBestSplitForLeaf(leaf, &best_split_per_leaf_[leaf]);
  }
657
}
Guolin Ke's avatar
Guolin Ke committed
658

659
void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj, std::function<double(const label_t*, int)> residual_getter,
660
661
                                        data_size_t total_num_data, const data_size_t* bag_indices, data_size_t bag_cnt) const {
  if (obj != nullptr && obj->IsRenewTreeOutput()) {
Nikita Titov's avatar
Nikita Titov committed
662
    CHECK_LE(tree->num_leaves(), data_partition_->num_leaves());
663
664
    const data_size_t* bag_mapper = nullptr;
    if (total_num_data != num_data_) {
665
      CHECK_EQ(bag_cnt, num_data_);
666
667
      bag_mapper = bag_indices;
    }
Guolin Ke's avatar
Guolin Ke committed
668
    std::vector<int> n_nozeroworker_perleaf(tree->num_leaves(), 1);
669
    int num_machines = Network::num_machines();
670
671
672
673
674
    #pragma omp parallel for schedule(static)
    for (int i = 0; i < tree->num_leaves(); ++i) {
      const double output = static_cast<double>(tree->LeafOutput(i));
      data_size_t cnt_leaf_data = 0;
      auto index_mapper = data_partition_->GetIndexOnLeaf(i, &cnt_leaf_data);
Guolin Ke's avatar
Guolin Ke committed
675
676
      if (cnt_leaf_data > 0) {
        // bag_mapper[index_mapper[i]]
677
        const double new_output = obj->RenewTreeOutput(output, residual_getter, index_mapper, bag_mapper, cnt_leaf_data);
Guolin Ke's avatar
Guolin Ke committed
678
679
        tree->SetLeafOutput(i, new_output);
      } else {
680
        CHECK_GT(num_machines, 1);
Guolin Ke's avatar
Guolin Ke committed
681
682
683
684
685
686
687
688
689
        tree->SetLeafOutput(i, 0.0);
        n_nozeroworker_perleaf[i] = 0;
      }
    }
    if (num_machines > 1) {
      std::vector<double> outputs(tree->num_leaves());
      for (int i = 0; i < tree->num_leaves(); ++i) {
        outputs[i] = static_cast<double>(tree->LeafOutput(i));
      }
Guolin Ke's avatar
Guolin Ke committed
690
691
      outputs = Network::GlobalSum(&outputs);
      n_nozeroworker_perleaf = Network::GlobalSum(&n_nozeroworker_perleaf);
Guolin Ke's avatar
Guolin Ke committed
692
693
694
695
696
697
698
      for (int i = 0; i < tree->num_leaves(); ++i) {
        tree->SetLeafOutput(i, outputs[i] / n_nozeroworker_perleaf[i]);
      }
    }
  }
}

699
700
701
702
703
704
705
706
void SerialTreeLearner::ComputeBestSplitForFeature(
    FeatureHistogram* histogram_array_, int feature_index, int real_fidx,
    bool is_feature_used, int num_data, const LeafSplits* leaf_splits,
    SplitInfo* best_split) {
  if (!is_feature_used) {
    return;
  }
  SplitInfo new_split;
Belinda Trotta's avatar
Belinda Trotta committed
707
708
709
710
711
712
713
714
715
716
  double parent_output;
  if (leaf_splits->leaf_index() == 0) {
    // for root leaf the "parent" output is its own output because we don't apply any smoothing to the root
    parent_output = FeatureHistogram::CalculateSplittedLeafOutput<true, true, true, false>(
        leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), config_->lambda_l1,
        config_->lambda_l2, config_->max_delta_step, constraints_->Get(leaf_splits->leaf_index()),
        config_->path_smooth, static_cast<data_size_t>(num_data), 0);
  } else {
    parent_output = leaf_splits->weight();
  }
717
718
  histogram_array_[feature_index].FindBestThreshold(
      leaf_splits->sum_gradients(), leaf_splits->sum_hessians(), num_data,
Belinda Trotta's avatar
Belinda Trotta committed
719
      constraints_->Get(leaf_splits->leaf_index()),  parent_output, &new_split);
720
721
722
723
724
725
  new_split.feature = real_fidx;
  if (cegb_ != nullptr) {
    new_split.gain -=
        cegb_->DetlaGain(feature_index, real_fidx, leaf_splits->leaf_index(),
                         num_data, new_split);
  }
726
727
728
729
730
  if (new_split.monotone_type != 0) {
    double penalty = constraints_->ComputeMonotoneSplitGainPenalty(
        leaf_splits->leaf_index(), config_->monotone_penalty);
    new_split.gain *= penalty;
  }
731
732
733
734
735
  if (new_split > *best_split) {
    *best_split = new_split;
  }
}

736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
void SerialTreeLearner::RecomputeBestSplitForLeaf(int leaf, SplitInfo* split) {
  FeatureHistogram* histogram_array_;
  if (!histogram_pool_.Get(leaf, &histogram_array_)) {
    Log::Warning(
        "Get historical Histogram for leaf %d failed, will skip the "
        "``RecomputeBestSplitForLeaf``",
        leaf);
    return;
  }
  double sum_gradients = split->left_sum_gradient + split->right_sum_gradient;
  double sum_hessians = split->left_sum_hessian + split->right_sum_hessian;
  int num_data = split->left_count + split->right_count;

  std::vector<SplitInfo> bests(share_state_->num_threads);
  LeafSplits leaf_splits(num_data);
  leaf_splits.Init(leaf, sum_gradients, sum_hessians);

  OMP_INIT_EX();
// find splits
#pragma omp parallel for schedule(static) num_threads(share_state_->num_threads)
  for (int feature_index = 0; feature_index < num_features_; ++feature_index) {
    OMP_LOOP_EX_BEGIN();
    if (!col_sampler_.is_feature_used_bytree()[feature_index] ||
        !histogram_array_[feature_index].is_splittable()) {
      continue;
    }
    const int tid = omp_get_thread_num();
    int real_fidx = train_data_->RealFeatureIndex(feature_index);
    ComputeBestSplitForFeature(
        histogram_array_, feature_index, real_fidx,
        true,
        num_data, &leaf_splits, &bests[tid]);

    OMP_LOOP_EX_END();
  }
  OMP_THROW_EX();
  auto best_idx = ArrayArgs<SplitInfo>::ArgMax(bests);
  *split = bests[best_idx];
}

Guolin Ke's avatar
Guolin Ke committed
776
}  // namespace LightGBM