predictor.hpp 12.1 KB
Newer Older
1
2
3
4
/*!
 * Copyright (c) 2016 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See LICENSE file in the project root for license information.
 */
Guolin Ke's avatar
Guolin Ke committed
5
6
7
#ifndef LIGHTGBM_PREDICTOR_HPP_
#define LIGHTGBM_PREDICTOR_HPP_

8
9
10
#include <LightGBM/boosting.h>
#include <LightGBM/dataset.h>
#include <LightGBM/meta.h>
11
#include <LightGBM/utils/common.h>
12
13
14
#include <LightGBM/utils/openmp_wrapper.h>
#include <LightGBM/utils/text_reader.h>

15
#include <string>
Guolin Ke's avatar
Guolin Ke committed
16
#include <cstdio>
17
#include <cstring>
Guolin Ke's avatar
Guolin Ke committed
18
#include <functional>
19
#include <map>
Guolin Ke's avatar
Guolin Ke committed
20
#include <memory>
21
22
23
#include <unordered_map>
#include <utility>
#include <vector>
Guolin Ke's avatar
Guolin Ke committed
24
25
26
27

namespace LightGBM {

/*!
* \brief Used to predict data with input model
*/
class Predictor {
Nikita Titov's avatar
Nikita Titov committed
31
 public:
Guolin Ke's avatar
Guolin Ke committed
32
33
34
  /*!
  * \brief Constructor
  * \param boosting Input boosting model
35
  * \param start_iteration Start index of the iteration to predict
Guolin Ke's avatar
Guolin Ke committed
36
  * \param num_iteration Number of boosting round
37
  * \param is_raw_score True if need to predict result with raw score
Guolin Ke's avatar
Guolin Ke committed
38
39
  * \param predict_leaf_index True to output leaf index instead of prediction score
  * \param predict_contrib True to output feature contributions instead of prediction score
Guolin Ke's avatar
Guolin Ke committed
40
  */
41
  Predictor(Boosting* boosting, int start_iteration, int num_iteration, bool is_raw_score,
Guolin Ke's avatar
Guolin Ke committed
42
43
44
45
            bool predict_leaf_index, bool predict_contrib, bool early_stop,
            int early_stop_freq, double early_stop_margin) {
    early_stop_ = CreatePredictionEarlyStopInstance(
        "none", LightGBM::PredictionEarlyStopConfig());
46
47
    if (early_stop && !boosting->NeedAccuratePrediction()) {
      PredictionEarlyStopConfig pred_early_stop_config;
48
49
      CHECK_GT(early_stop_freq, 0);
      CHECK_GE(early_stop_margin, 0);
50
51
52
      pred_early_stop_config.margin_threshold = early_stop_margin;
      pred_early_stop_config.round_period = early_stop_freq;
      if (boosting->NumberOfClasses() == 1) {
Guolin Ke's avatar
Guolin Ke committed
53
54
        early_stop_ =
            CreatePredictionEarlyStopInstance("binary", pred_early_stop_config);
55
      } else {
Guolin Ke's avatar
Guolin Ke committed
56
57
        early_stop_ = CreatePredictionEarlyStopInstance("multiclass",
                                                        pred_early_stop_config);
58
59
60
      }
    }

61
    boosting->InitPredict(start_iteration, num_iteration, predict_contrib);
Guolin Ke's avatar
Guolin Ke committed
62
    boosting_ = boosting;
63
    num_pred_one_row_ = boosting_->NumPredictOneRow(start_iteration,
Guolin Ke's avatar
Guolin Ke committed
64
        num_iteration, predict_leaf_index, predict_contrib);
65
    num_feature_ = boosting_->MaxFeatureIdx() + 1;
Guolin Ke's avatar
Guolin Ke committed
66
    predict_buf_.resize(
67
        OMP_NUM_THREADS(),
Guolin Ke's avatar
Guolin Ke committed
68
69
        std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>(
            num_feature_, 0.0f));
70
71
    const int kFeatureThreshold = 100000;
    const size_t KSparseThreshold = static_cast<size_t>(0.01 * num_feature_);
Guolin Ke's avatar
Guolin Ke committed
72
    if (predict_leaf_index) {
Guolin Ke's avatar
Guolin Ke committed
73
74
      predict_fun_ = [=](const std::vector<std::pair<int, double>>& features,
                         double* output) {
Guolin Ke's avatar
Guolin Ke committed
75
        int tid = omp_get_thread_num();
Guolin Ke's avatar
Guolin Ke committed
76
77
        if (num_feature_ > kFeatureThreshold &&
            features.size() < KSparseThreshold) {
Guolin Ke's avatar
Guolin Ke committed
78
79
          auto buf = CopyToPredictMap(features);
          boosting_->PredictLeafIndexByMap(buf, output);
80
81
82
83
        } else {
          CopyToPredictBuffer(predict_buf_[tid].data(), features);
          // get result for leaf index
          boosting_->PredictLeafIndex(predict_buf_[tid].data(), output);
Guolin Ke's avatar
Guolin Ke committed
84
85
          ClearPredictBuffer(predict_buf_[tid].data(), predict_buf_[tid].size(),
                             features);
86
        }
Guolin Ke's avatar
Guolin Ke committed
87
      };
Guolin Ke's avatar
Guolin Ke committed
88
    } else if (predict_contrib) {
89
90
91
      if (boosting_->IsLinear()) {
        Log::Fatal("Predicting SHAP feature contributions is not implemented for linear trees.");
      }
Guolin Ke's avatar
Guolin Ke committed
92
93
94
95
      predict_fun_ = [=](const std::vector<std::pair<int, double>>& features,
                         double* output) {
        int tid = omp_get_thread_num();
        CopyToPredictBuffer(predict_buf_[tid].data(), features);
96
97
        // get feature importances
        boosting_->PredictContrib(predict_buf_[tid].data(), output);
Guolin Ke's avatar
Guolin Ke committed
98
99
100
        ClearPredictBuffer(predict_buf_[tid].data(), predict_buf_[tid].size(),
                           features);
      };
101
102
103
104
105
106
107
      predict_sparse_fun_ = [=](const std::vector<std::pair<int, double>>& features,
                                std::vector<std::unordered_map<int, double>>* output) {
        auto buf = CopyToPredictMap(features);
        // get sparse feature importances
        boosting_->PredictContribByMap(buf, output);
      };

Guolin Ke's avatar
Guolin Ke committed
108
    } else {
Guolin Ke's avatar
Guolin Ke committed
109
      if (is_raw_score) {
Guolin Ke's avatar
Guolin Ke committed
110
111
        predict_fun_ = [=](const std::vector<std::pair<int, double>>& features,
                           double* output) {
Guolin Ke's avatar
Guolin Ke committed
112
          int tid = omp_get_thread_num();
Guolin Ke's avatar
Guolin Ke committed
113
114
          if (num_feature_ > kFeatureThreshold &&
              features.size() < KSparseThreshold) {
Guolin Ke's avatar
Guolin Ke committed
115
116
            auto buf = CopyToPredictMap(features);
            boosting_->PredictRawByMap(buf, output, &early_stop_);
117
118
          } else {
            CopyToPredictBuffer(predict_buf_[tid].data(), features);
Guolin Ke's avatar
Guolin Ke committed
119
120
121
122
            boosting_->PredictRaw(predict_buf_[tid].data(), output,
                                  &early_stop_);
            ClearPredictBuffer(predict_buf_[tid].data(),
                               predict_buf_[tid].size(), features);
123
          }
Guolin Ke's avatar
Guolin Ke committed
124
125
        };
      } else {
Guolin Ke's avatar
Guolin Ke committed
126
127
        predict_fun_ = [=](const std::vector<std::pair<int, double>>& features,
                           double* output) {
Guolin Ke's avatar
Guolin Ke committed
128
          int tid = omp_get_thread_num();
Guolin Ke's avatar
Guolin Ke committed
129
130
          if (num_feature_ > kFeatureThreshold &&
              features.size() < KSparseThreshold) {
Guolin Ke's avatar
Guolin Ke committed
131
132
            auto buf = CopyToPredictMap(features);
            boosting_->PredictByMap(buf, output, &early_stop_);
133
134
135
          } else {
            CopyToPredictBuffer(predict_buf_[tid].data(), features);
            boosting_->Predict(predict_buf_[tid].data(), output, &early_stop_);
Guolin Ke's avatar
Guolin Ke committed
136
137
            ClearPredictBuffer(predict_buf_[tid].data(),
                               predict_buf_[tid].size(), features);
138
          }
Guolin Ke's avatar
Guolin Ke committed
139
140
141
        };
      }
    }
Guolin Ke's avatar
Guolin Ke committed
142
  }
143

Guolin Ke's avatar
Guolin Ke committed
144
145
146
147
148
149
  /*!
  * \brief Destructor; performs no work of its own beyond destroying the members
  */
  ~Predictor() = default;

zhangyafeikimi's avatar
zhangyafeikimi committed
150
  /*!
  * \brief Get the prediction function that was selected in the constructor
  * \return Const reference to the stored prediction function
  */
  inline const PredictFunction& GetPredictFunction() const { return predict_fun_; }
153

154
155
156
157
158

  /*!
  * \brief Get the sparse prediction function
  * \return Const reference to the stored sparse prediction function; the constructor
  *         only assigns it on the feature-contribution (predict_contrib) path
  */
  inline const PredictSparseFunction& GetPredictSparseFunction() const { return predict_sparse_fun_; }

Guolin Ke's avatar
Guolin Ke committed
159
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
160
  * \brief predicting on data, then saving result to disk
Guolin Ke's avatar
Guolin Ke committed
161
162
163
  * \param data_filename Filename of data
  * \param result_filename Filename of output result
  */
Chen Yufei's avatar
Chen Yufei committed
164
  void Predict(const char* data_filename, const char* result_filename, bool header, bool disable_shape_check, bool precise_float_parser) {
165
166
    auto writer = VirtualFileWriter::Make(result_filename);
    if (!writer->Init()) {
Chen Yufei's avatar
Chen Yufei committed
167
      Log::Fatal("Prediction results file %s cannot be created", result_filename);
Guolin Ke's avatar
Guolin Ke committed
168
    }
Guolin Ke's avatar
Guolin Ke committed
169
    auto label_idx = header ? -1 : boosting_->LabelIdx();
Chen Yufei's avatar
Chen Yufei committed
170
    auto parser = std::unique_ptr<Parser>(Parser::CreateParser(data_filename, header, boosting_->MaxFeatureIdx() + 1, label_idx,
171
                                                               precise_float_parser, boosting_->ParserConfigStr()));
Guolin Ke's avatar
Guolin Ke committed
172
173

    if (parser == nullptr) {
174
      Log::Fatal("Could not recognize the data format of data file %s", data_filename);
Guolin Ke's avatar
Guolin Ke committed
175
    }
176
177
178
    if (!header && !disable_shape_check && parser->NumFeatures() != boosting_->MaxFeatureIdx() + 1) {
      Log::Fatal("The number of features in data (%d) is not the same as it was in training data (%d).\n" \
                 "You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.", parser->NumFeatures(), boosting_->MaxFeatureIdx() + 1);
179
    }
Guolin Ke's avatar
Guolin Ke committed
180
    TextReader<data_size_t> predict_data_reader(data_filename, header);
Guolin Ke's avatar
Guolin Ke committed
181
    std::vector<int> feature_remapper(parser->NumFeatures(), -1);
ww's avatar
ww committed
182
    bool need_adjust = false;
183
184
    // skip raw feature remapping if trained model has parser config str which may contain actual feature names.
    if (header && boosting_->ParserConfigStr().empty()) {
ww's avatar
ww committed
185
      std::string first_line = predict_data_reader.first_line();
Guolin Ke's avatar
Guolin Ke committed
186
      std::vector<std::string> header_words = Common::Split(first_line.c_str(), "\t,");
Guolin Ke's avatar
Guolin Ke committed
187
      std::unordered_map<std::string, int> header_mapper;
Guolin Ke's avatar
Guolin Ke committed
188
      for (int i = 0; i < static_cast<int>(header_words.size()); ++i) {
Guolin Ke's avatar
Guolin Ke committed
189
190
191
192
193
194
195
196
197
198
199
        if (header_mapper.count(header_words[i]) > 0) {
          Log::Fatal("Feature (%s) appears more than one time.", header_words[i].c_str());
        }
        header_mapper[header_words[i]] = i;
      }
      const auto& fnames = boosting_->FeatureNames();
      for (int i = 0; i < static_cast<int>(fnames.size()); ++i) {
        if (header_mapper.count(fnames[i]) <= 0) {
          Log::Warning("Feature (%s) is missed in data file. If it is weight/query/group/ignore_column, you can ignore this warning.", fnames[i].c_str());
        } else {
          feature_remapper[header_mapper.at(fnames[i])] = i;
ww's avatar
ww committed
200
201
        }
      }
Guolin Ke's avatar
Guolin Ke committed
202
203
      for (int i = 0; i < static_cast<int>(feature_remapper.size()); ++i) {
        if (feature_remapper[i] >= 0 && i != feature_remapper[i]) {
ww's avatar
ww committed
204
205
206
207
208
          need_adjust = true;
          break;
        }
      }
    }
Guolin Ke's avatar
Guolin Ke committed
209
    // function for parse data
210
211
    std::function<void(const char*, std::vector<std::pair<int, double>>*)> parser_fun;
    double tmp_label;
Guolin Ke's avatar
Guolin Ke committed
212
    parser_fun = [&parser, &feature_remapper, &tmp_label, need_adjust]
213
    (const char* buffer, std::vector<std::pair<int, double>>* feature) {
Guolin Ke's avatar
Guolin Ke committed
214
      parser->ParseOneLine(buffer, feature, &tmp_label);
Guolin Ke's avatar
Guolin Ke committed
215
      if (need_adjust) {
ww's avatar
ww committed
216
        int i = 0, j = static_cast<int>(feature->size());
Guolin Ke's avatar
Guolin Ke committed
217
        while (i < j) {
Guolin Ke's avatar
Guolin Ke committed
218
219
          if (feature_remapper[(*feature)[i].first] >= 0) {
            (*feature)[i].first = feature_remapper[(*feature)[i].first];
ww's avatar
ww committed
220
            ++i;
Guolin Ke's avatar
Guolin Ke committed
221
          } else {
222
            // move the non-used features to the end of the feature vector
ww's avatar
ww committed
223
224
225
226
227
            std::swap((*feature)[i], (*feature)[--j]);
          }
        }
        feature->resize(i);
      }
Guolin Ke's avatar
Guolin Ke committed
228
229
    };

Guolin Ke's avatar
Guolin Ke committed
230
231
232
    std::function<void(data_size_t, const std::vector<std::string>&)>
        process_fun = [&parser_fun, &writer, this](
                          data_size_t, const std::vector<std::string>& lines) {
233
      std::vector<std::pair<int, double>> oneline_features;
234
235
236
      std::vector<std::string> result_to_write(lines.size());
      OMP_INIT_EX();
      #pragma omp parallel for schedule(static) firstprivate(oneline_features)
237
      for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); ++i) {
238
        OMP_LOOP_EX_BEGIN();
Guolin Ke's avatar
Guolin Ke committed
239
240
241
242
        oneline_features.clear();
        // parser
        parser_fun(lines[i].c_str(), &oneline_features);
        // predict
Guolin Ke's avatar
Guolin Ke committed
243
244
245
        std::vector<double> result(num_pred_one_row_);
        predict_fun_(oneline_features, result.data());
        auto str_result = Common::Join<double>(result, "\t");
246
247
248
249
250
        result_to_write[i] = str_result;
        OMP_LOOP_EX_END();
      }
      OMP_THROW_EX();
      for (data_size_t i = 0; i < static_cast<data_size_t>(result_to_write.size()); ++i) {
251
252
        writer->Write(result_to_write[i].c_str(), result_to_write[i].size());
        writer->Write("\n", 1);
Guolin Ke's avatar
Guolin Ke committed
253
254
255
256
257
      }
    };
    predict_data_reader.ReadAllAndProcessParallel(process_fun);
  }

Nikita Titov's avatar
Nikita Titov committed
258
 private:
Guolin Ke's avatar
Guolin Ke committed
259
  void CopyToPredictBuffer(double* pred_buf, const std::vector<std::pair<int, double>>& features) {
260
261
262
    for (const auto &feature : features) {
      if (feature.first < num_feature_) {
        pred_buf[feature.first] = feature.second;
263
      }
264
265
266
    }
  }

Guolin Ke's avatar
Guolin Ke committed
267
  void ClearPredictBuffer(double* pred_buf, size_t buf_size, const std::vector<std::pair<int, double>>& features) {
268
    if (features.size() > static_cast<size_t>(buf_size / 2)) {
Guolin Ke's avatar
Guolin Ke committed
269
      std::memset(pred_buf, 0, sizeof(double)*(buf_size));
270
    } else {
271
272
273
      for (const auto &feature : features) {
        if (feature.first < num_feature_) {
          pred_buf[feature.first] = 0.0f;
Guolin Ke's avatar
Guolin Ke committed
274
        }
Guolin Ke's avatar
Guolin Ke committed
275
276
277
      }
    }
  }
278

Guolin Ke's avatar
Guolin Ke committed
279
280
  std::unordered_map<int, double> CopyToPredictMap(const std::vector<std::pair<int, double>>& features) {
    std::unordered_map<int, double> buf;
281
282
283
    for (const auto &feature : features) {
      if (feature.first < num_feature_) {
        buf[feature.first] = feature.second;
284
285
      }
    }
286
    return buf;
287
288
  }

Guolin Ke's avatar
Guolin Ke committed
289
290
  /*! \brief Boosting model */
  const Boosting* boosting_;
Guolin Ke's avatar
Guolin Ke committed
291
292
  /*! \brief function for prediction */
  PredictFunction predict_fun_;
293
  PredictSparseFunction predict_sparse_fun_;
294
  PredictionEarlyStopInstance early_stop_;
295
  int num_feature_;
Guolin Ke's avatar
Guolin Ke committed
296
  int num_pred_one_row_;
297
  std::vector<std::vector<double, Common::AlignmentAllocator<double, kAlignedSize>>> predict_buf_;
Guolin Ke's avatar
Guolin Ke committed
298
299
300
301
};

}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
302
#endif   // LIGHTGBM_PREDICTOR_HPP_