predictor.hpp 6.85 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#ifndef LIGHTGBM_PREDICTOR_HPP_
#define LIGHTGBM_PREDICTOR_HPP_

#include <LightGBM/meta.h>
#include <LightGBM/boosting.h>
#include <LightGBM/utils/text_reader.h>
#include <LightGBM/dataset.h>

#include <omp.h>

#include <cstring>
#include <cstdio>
#include <vector>
#include <utility>
#include <functional>
#include <memory>
#include <string>

namespace LightGBM {

/*!
* \brief Used to prediction data with input model
*/
class Predictor {
public:
  /*!
  * \brief Constructor
  * \param boosting Input boosting model
  * \param is_sigmoid True if need to predict result with sigmoid transform(if needed, like binary classification)
wxchan's avatar
wxchan committed
29
  * \param predict_leaf_index True if output leaf index instead of prediction score
Guolin Ke's avatar
Guolin Ke committed
30
  */
wxchan's avatar
wxchan committed
31
32
  Predictor(const Boosting* boosting, bool is_simgoid, bool predict_leaf_index)
    : is_simgoid_(is_simgoid), predict_leaf_index(predict_leaf_index) {
Guolin Ke's avatar
Guolin Ke committed
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
    boosting_ = boosting;
    num_features_ = boosting_->MaxFeatureIdx() + 1;
#pragma omp parallel
#pragma omp master
    {
      num_threads_ = omp_get_num_threads();
    }
    features_ = new double*[num_threads_];
    for (int i = 0; i < num_threads_; ++i) {
      features_[i] = new double[num_features_];
    }
  }
  /*!
  * \brief Destructor
  */
  ~Predictor() {
    if (features_ != nullptr) {
      for (int i = 0; i < num_threads_; ++i) {
        delete[] features_[i];
      }
      delete[] features_;
    }
  }

  /*!
Qiwei Ye's avatar
Qiwei Ye committed
58
  * \brief prediction for one record, only raw result(without sigmoid transformation)
Guolin Ke's avatar
Guolin Ke committed
59
60
61
62
  * \param features Feature for this record
  * \return Prediction result
  */
  double PredictRawOneLine(const std::vector<std::pair<int, double>>& features) {
Guolin Ke's avatar
Guolin Ke committed
63
    const int tid = PutFeatureValuesToBuffer(features);
Qiwei Ye's avatar
Qiwei Ye committed
64
    // get result without sigmoid transformation
Guolin Ke's avatar
Guolin Ke committed
65
66
    return boosting_->PredictRaw(features_[tid]);
  }
wxchan's avatar
wxchan committed
67
68
69
70
71
72
73
  
  /*!
  * \brief prediction for one record, only raw result(without sigmoid transformation)
  * \param features Feature for this record
  * \return Predictied leaf index
  */
  std::vector<int> PredictLeafIndexOneLine(const std::vector<std::pair<int, double>>& features) {
Guolin Ke's avatar
Guolin Ke committed
74
    const int tid = PutFeatureValuesToBuffer(features);
wxchan's avatar
wxchan committed
75
76
77
    // get result for leaf index
    return boosting_->PredictLeafIndex(features_[tid]);
  }
Guolin Ke's avatar
Guolin Ke committed
78
79

  /*!
Qiwei Ye's avatar
Qiwei Ye committed
80
81
  * \brief prediction for one record, will use sigmoid transformation if needed(only enabled for binary classification noe)
  * \param features Feature of this record
Guolin Ke's avatar
Guolin Ke committed
82
83
84
  * \return Prediction result
  */
  double PredictOneLine(const std::vector<std::pair<int, double>>& features) {
Guolin Ke's avatar
Guolin Ke committed
85
86
    const int tid = PutFeatureValuesToBuffer(features);
    // get result with sigmoid transform if needed
Guolin Ke's avatar
Guolin Ke committed
87
88
89
    return boosting_->Predict(features_[tid]);
  }
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
90
  * \brief predicting on data, then saving result to disk
Guolin Ke's avatar
Guolin Ke committed
91
92
93
94
  * \param data_filename Filename of data
  * \param has_label True if this data contains label
  * \param result_filename Filename of output result
  */
95
  void Predict(const char* data_filename, const char* result_filename) {
Guolin Ke's avatar
Guolin Ke committed
96
97
98
99
100
101
102
103
104
    FILE* result_file;

#ifdef _MSC_VER
    fopen_s(&result_file, result_filename, "w");
#else
    result_file = fopen(result_filename, "w");
#endif

    if (result_file == NULL) {
Qiwei Ye's avatar
Qiwei Ye committed
105
      Log::Fatal("Predition result file %s doesn't exists", data_filename);
Guolin Ke's avatar
Guolin Ke committed
106
    }
107
108
    bool has_label = false;
    Parser* parser = Parser::CreateParser(data_filename, num_features_, &has_label);
Guolin Ke's avatar
Guolin Ke committed
109
110

    if (parser == nullptr) {
Qiwei Ye's avatar
Qiwei Ye committed
111
      Log::Fatal("Recongnizing input data format failed, filename %s", data_filename);
Guolin Ke's avatar
Guolin Ke committed
112
113
114
115
116
117
118
119
120
121
122
    }

    // function for parse data
    std::function<void(const char*, std::vector<std::pair<int, double>>*)> parser_fun;
    double tmp_label;
    if (has_label) {
      // parse function with label
      parser_fun = [this, &parser, &tmp_label]
      (const char* buffer, std::vector<std::pair<int, double>>* feature) {
        parser->ParseOneLine(buffer, feature, &tmp_label);
      };
123
      Log::Info("Start prediction for data %s with labels", data_filename);
Guolin Ke's avatar
Guolin Ke committed
124
125
126
127
128
129
    } else {
      // parse function without label
      parser_fun = [this, &parser]
      (const char* buffer, std::vector<std::pair<int, double>>* feature) {
        parser->ParseOneLine(buffer, feature);
      };
130
      Log::Info("Start prediction for data %s without label", data_filename);
Guolin Ke's avatar
Guolin Ke committed
131
    }
wxchan's avatar
wxchan committed
132
133
134
135
136
137
138
139
140
141
142
143
    std::function<std::string(const std::vector<std::pair<int, double>>&)> predict_fun;
    if (predict_leaf_index) {
      predict_fun = [this](const std::vector<std::pair<int, double>>& features){
        std::vector<int> predicted_leaf_index = PredictLeafIndexOneLine(features);
        std::stringstream result_ss;
        for (size_t i = 0; i < predicted_leaf_index.size(); ++i){
          if (i > 0) {
            result_ss << '\t';
          }
          result_ss << predicted_leaf_index[i];
        }
        return result_ss.str();  
Guolin Ke's avatar
Guolin Ke committed
144
145
      };
    }
wxchan's avatar
wxchan committed
146
147
148
149
150
151
152
153
154
155
156
157
    else {
      if (is_simgoid_) {
        predict_fun = [this](const std::vector<std::pair<int, double>>& features){
          return std::to_string(PredictOneLine(features));
        };
      } 
      else {
        predict_fun = [this](const std::vector<std::pair<int, double>>& features){
          return std::to_string(PredictRawOneLine(features));
        };
      } 
    }
Guolin Ke's avatar
Guolin Ke committed
158
159
160
161
    std::function<void(data_size_t, const std::vector<std::string>&)> process_fun =
      [this, &parser_fun, &predict_fun, &result_file]
    (data_size_t, const std::vector<std::string>& lines) {
      std::vector<std::pair<int, double>> oneline_features;
wxchan's avatar
wxchan committed
162
      std::vector<std::string> pred_result(lines.size(), "");
Guolin Ke's avatar
Guolin Ke committed
163
164
165
166
167
168
169
170
171
172
#pragma omp parallel for schedule(static) private(oneline_features)
      for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); i++) {
        oneline_features.clear();
        // parser
        parser_fun(lines[i].c_str(), &oneline_features);
        // predict
        pred_result[i] = predict_fun(oneline_features);
      }

      for (size_t i = 0; i < pred_result.size(); ++i) {
wxchan's avatar
wxchan committed
173
        fprintf(result_file, "%s\n", pred_result[i].c_str());
Guolin Ke's avatar
Guolin Ke committed
174
175
176
177
178
179
180
181
182
183
      }
    };
    TextReader<data_size_t> predict_data_reader(data_filename);
    predict_data_reader.ReadAllAndProcessParallel(process_fun);

    fclose(result_file);
    delete parser;
  }

private:
Guolin Ke's avatar
Guolin Ke committed
184
185
186
187
188
189
190
191
192
193
194
195
  int PutFeatureValuesToBuffer(const std::vector<std::pair<int, double>>& features) {
    int tid = omp_get_thread_num();
    // init feature value
    std::memset(features_[tid], 0, sizeof(double)*num_features_);
    // put feature value
    for (const auto& p : features) {
      if (p.first < num_features_) {
        features_[tid][p.first] = p.second;
      }
    }
    return tid;
  }
Guolin Ke's avatar
Guolin Ke committed
196
197
198
199
200
201
202
203
204
205
  /*! \brief Boosting model */
  const Boosting* boosting_;
  /*! \brief Buffer for feature values */
  double** features_;
  /*! \brief Number of features */
  int num_features_;
  /*! \brief True if need to predict result with sigmoid transform */
  bool is_simgoid_;
  /*! \brief Number of threads */
  int num_threads_;
wxchan's avatar
wxchan committed
206
207
  /*! \brief True if output leaf index instead of prediction score */
  bool predict_leaf_index;
Guolin Ke's avatar
Guolin Ke committed
208
209
210
211
};

}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
212
#endif   // LIGHTGBM_PREDICTOR_HPP_