goss.hpp 7.43 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#ifndef LIGHTGBM_BOOSTING_GOSS_H_
#define LIGHTGBM_BOOSTING_GOSS_H_

#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/boosting.h>

#include "score_updater.hpp"
#include "gbdt.h"

#include <cstdio>
#include <vector>
#include <string>
#include <fstream>
#include <chrono>

namespace LightGBM {

20
21
22
23
24
#ifdef TIMETAG
std::chrono::duration<double, std::milli> subset_time;
std::chrono::duration<double, std::milli> re_init_tree_time;
#endif

Guolin Ke's avatar
Guolin Ke committed
25
26
27
28
29
30
31
32
33
34
class GOSS: public GBDT {
public:
  /*!
  * \brief Constructor
  */
  GOSS() : GBDT() { 

  }

  ~GOSS() {
35
36
37
38
#ifdef TIMETAG
    Log::Info("GOSS::subset costs %f", subset_time * 1e-3);
    Log::Info("GOSS::re_init_tree costs %f", re_init_tree_time * 1e-3);
#endif
Guolin Ke's avatar
Guolin Ke committed
39
40
41
42
43
44
45
46
  }

  void Init(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
    const std::vector<const Metric*>& training_metrics) override {
    GBDT::Init(config, train_data, object_function, training_metrics);
    CHECK(gbdt_config_->top_rate + gbdt_config_->other_rate <= 1.0f);
    CHECK(gbdt_config_->top_rate > 0.0f && gbdt_config_->other_rate > 0.0f);
    if (gbdt_config_->bagging_freq > 0 && gbdt_config_->bagging_fraction != 1.0f) {
Guolin Ke's avatar
Guolin Ke committed
47
      Log::Fatal("cannot use bagging in GOSS");
Guolin Ke's avatar
Guolin Ke committed
48
49
50
51
52
53
54
    }
    Log::Info("using GOSS");
  }

  void ResetTrainingData(const BoostingConfig* config, const Dataset* train_data, const ObjectiveFunction* object_function,
    const std::vector<const Metric*>& training_metrics) override {
    if (config->bagging_freq > 0 && config->bagging_fraction != 1.0f) {
Guolin Ke's avatar
Guolin Ke committed
55
      Log::Fatal("cannot use bagging in GOSS");
Guolin Ke's avatar
Guolin Ke committed
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
    }
    GBDT::ResetTrainingData(config, train_data, object_function, training_metrics);
    if (train_data_ == nullptr) { return; }
    bag_data_indices_.resize(num_data_);
    tmp_indices_.resize(num_data_);
    tmp_indice_right_.resize(num_data_);
    offsets_buf_.resize(num_threads_);
    left_cnts_buf_.resize(num_threads_);
    right_cnts_buf_.resize(num_threads_);
    left_write_pos_buf_.resize(num_threads_);
    right_write_pos_buf_.resize(num_threads_);

    is_use_subset_ = false;
    if (config->top_rate + config->other_rate <= 0.5) {
      auto bag_data_cnt = static_cast<data_size_t>((config->top_rate + config->other_rate) * num_data_);
      tmp_subset_.reset(new Dataset(bag_data_cnt));
      tmp_subset_->CopyFeatureMapperFrom(train_data_);
      is_use_subset_ = true;
    }
    // flag to not bagging first
    bag_data_cnt_ = num_data_;
  }

  data_size_t BaggingHelper(Random& cur_rand, data_size_t start, data_size_t cnt, data_size_t* buffer, data_size_t* buffer_right) {
Guolin Ke's avatar
Guolin Ke committed
80
    std::vector<score_t> tmp_gradients(cnt, 0.0f);
Guolin Ke's avatar
Guolin Ke committed
81
    for (data_size_t i = 0; i < cnt; ++i) {
Guolin Ke's avatar
Guolin Ke committed
82
83
84
85
      for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
        int idx = curr_class * num_data_ + start + i;
        tmp_gradients[i] += std::fabs(gradients_[idx] * hessians_[idx]);
      }
Guolin Ke's avatar
Guolin Ke committed
86
87
88
89
90
91
92
93
94
95
96
97
    }
    data_size_t top_k = static_cast<data_size_t>(cnt * gbdt_config_->top_rate);
    data_size_t other_k = static_cast<data_size_t>(cnt * gbdt_config_->other_rate);
    top_k = std::max(1, top_k);
    ArrayArgs<score_t>::ArgMaxAtK(&tmp_gradients, 0, static_cast<int>(tmp_gradients.size()), top_k);
    score_t threshold = tmp_gradients[top_k - 1];

    score_t multiply = static_cast<score_t>(cnt - top_k) / other_k;
    data_size_t cur_left_cnt = 0;
    data_size_t cur_right_cnt = 0;
    data_size_t big_weight_cnt = 0;
    for (data_size_t i = 0; i < cnt; ++i) {
Guolin Ke's avatar
Guolin Ke committed
98
99
100
101
102
103
      score_t grad = 0.0f;
      for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
        int idx = curr_class * num_data_ + start + i;
        grad += std::fabs(gradients_[idx] * hessians_[idx]);
      }
      if (grad >= threshold) {
Guolin Ke's avatar
Guolin Ke committed
104
105
106
107
108
109
110
111
112
        buffer[cur_left_cnt++] = start + i;
        ++big_weight_cnt;
      } else {
        data_size_t sampled = cur_left_cnt - big_weight_cnt;
        data_size_t rest_need = other_k - sampled;
        data_size_t rest_all = (cnt - i) - (top_k - big_weight_cnt);
        double prob = (rest_need) / static_cast<double>(rest_all);
        if (cur_rand.NextFloat() < prob) {
          buffer[cur_left_cnt++] = start + i;
Guolin Ke's avatar
Guolin Ke committed
113
114
115
116
117
          for (int curr_class = 0; curr_class < num_class_; ++curr_class) {
            int idx = curr_class * num_data_ + start + i;
            gradients_[idx] *= multiply;
            hessians_[idx] *= multiply;
          }
Guolin Ke's avatar
Guolin Ke committed
118
119
120
121
122
123
124
125
126
127
128
129
130
        } else {
          buffer_right[cur_right_cnt++] = start + i;
        }
      }
    }
    return cur_left_cnt;
  }

  void Bagging(int iter) override {
    bag_data_cnt_ = num_data_;
    // not subsample for first iterations
    if (iter < static_cast<int>(1.0f / gbdt_config_->learning_rate)) { return; }

Guolin Ke's avatar
Guolin Ke committed
131
    const data_size_t min_inner_size = 100;
Guolin Ke's avatar
Guolin Ke committed
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
    data_size_t inner_size = (num_data_ + num_threads_ - 1) / num_threads_;
    if (inner_size < min_inner_size) { inner_size = min_inner_size; }

#pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < num_threads_; ++i) {
      left_cnts_buf_[i] = 0;
      right_cnts_buf_[i] = 0;
      data_size_t cur_start = i * inner_size;
      if (cur_start > num_data_) { continue; }
      data_size_t cur_cnt = inner_size;
      if (cur_start + cur_cnt > num_data_) { cur_cnt = num_data_ - cur_start; }
      Random cur_rand(gbdt_config_->bagging_seed + iter * num_threads_ + i);
      data_size_t cur_left_count = BaggingHelper(cur_rand, cur_start, cur_cnt,
        tmp_indices_.data() + cur_start, tmp_indice_right_.data() + cur_start);
      offsets_buf_[i] = cur_start;
      left_cnts_buf_[i] = cur_left_count;
      right_cnts_buf_[i] = cur_cnt - cur_left_count;
    }
    data_size_t left_cnt = 0;
    left_write_pos_buf_[0] = 0;
    right_write_pos_buf_[0] = 0;
    for (int i = 1; i < num_threads_; ++i) {
      left_write_pos_buf_[i] = left_write_pos_buf_[i - 1] + left_cnts_buf_[i - 1];
      right_write_pos_buf_[i] = right_write_pos_buf_[i - 1] + right_cnts_buf_[i - 1];
    }
    left_cnt = left_write_pos_buf_[num_threads_ - 1] + left_cnts_buf_[num_threads_ - 1];

#pragma omp parallel for schedule(static, 1)
    for (int i = 0; i < num_threads_; ++i) {
      if (left_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_write_pos_buf_[i],
          tmp_indices_.data() + offsets_buf_[i], left_cnts_buf_[i] * sizeof(data_size_t));
      }
      if (right_cnts_buf_[i] > 0) {
        std::memcpy(bag_data_indices_.data() + left_cnt + right_write_pos_buf_[i],
          tmp_indice_right_.data() + offsets_buf_[i], right_cnts_buf_[i] * sizeof(data_size_t));
      }
    }
    bag_data_cnt_ = left_cnt;
    // set bagging data to tree learner
    if (!is_use_subset_) {
      tree_learner_->SetBaggingData(bag_data_indices_.data(), bag_data_cnt_);
    } else {
      // get subset
176
177
178
#ifdef TIMETAG
      auto start_time = std::chrono::steady_clock::now();
#endif
Guolin Ke's avatar
Guolin Ke committed
179
180
      tmp_subset_->ReSize(bag_data_cnt_);
      tmp_subset_->CopySubset(train_data_, bag_data_indices_.data(), bag_data_cnt_, false);
181
182
183
184
185
186
#ifdef TIMETAG
      subset_time += std::chrono::steady_clock::now() - start_time;
#endif
#ifdef TIMETAG
      start_time = std::chrono::steady_clock::now();
#endif
Guolin Ke's avatar
Guolin Ke committed
187
      tree_learner_->ResetTrainingData(tmp_subset_.get());
188
189
190
#ifdef TIMETAG
      re_init_tree_time += std::chrono::steady_clock::now() - start_time;
#endif
Guolin Ke's avatar
Guolin Ke committed
191
192
193
194
195
196
197
198
199
200
201
202
203
204
    }
  }

  /*!
  * \brief Get Type name of this boosting object
  */
  const char* SubModelName() const override { return "tree"; }

private:
  std::vector<data_size_t> tmp_indice_right_;
};

}  // namespace LightGBM
#endif   // LIGHTGBM_BOOSTING_GOSS_H_