predictor.hpp 7.15 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#ifndef LIGHTGBM_PREDICTOR_HPP_
#define LIGHTGBM_PREDICTOR_HPP_

#include <LightGBM/meta.h>
#include <LightGBM/boosting.h>
#include <LightGBM/utils/text_reader.h>
#include <LightGBM/dataset.h>

#include <omp.h>

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <functional>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

namespace LightGBM {

/*!
* \brief Used to predict on data with an input model
*/
class Predictor {
public:
  /*!
  * \brief Constructor
  * \param boosting Input boosting model
  * \param is_sigmoid True if need to predict result with sigmoid transform(if needed, like binary classification)
wxchan's avatar
wxchan committed
29
  * \param predict_leaf_index True if output leaf index instead of prediction score
Guolin Ke's avatar
Guolin Ke committed
30
  */
wxchan's avatar
wxchan committed
31
32
  Predictor(const Boosting* boosting, bool is_simgoid, bool predict_leaf_index)
    : is_simgoid_(is_simgoid), predict_leaf_index(predict_leaf_index) {
Guolin Ke's avatar
Guolin Ke committed
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
    boosting_ = boosting;
    num_features_ = boosting_->MaxFeatureIdx() + 1;
#pragma omp parallel
#pragma omp master
    {
      num_threads_ = omp_get_num_threads();
    }
    features_ = new double*[num_threads_];
    for (int i = 0; i < num_threads_; ++i) {
      features_[i] = new double[num_features_];
    }
  }
  /*!
  * \brief Destructor
  */
  ~Predictor() {
    if (features_ != nullptr) {
      for (int i = 0; i < num_threads_; ++i) {
        delete[] features_[i];
      }
      delete[] features_;
    }
  }

  /*!
Qiwei Ye's avatar
Qiwei Ye committed
58
  * \brief prediction for one record, only raw result(without sigmoid transformation)
Guolin Ke's avatar
Guolin Ke committed
59
60
61
62
63
64
65
66
67
68
69
70
71
  * \param features Feature for this record
  * \return Prediction result
  */
  double PredictRawOneLine(const std::vector<std::pair<int, double>>& features) {
    const int tid = omp_get_thread_num();
    // init feature value
    std::memset(features_[tid], 0, sizeof(double)*num_features_);
    // put feature value
    for (const auto& p : features) {
      if (p.first < num_features_) {
        features_[tid][p.first] = p.second;
      }
    }
Qiwei Ye's avatar
Qiwei Ye committed
72
    // get result without sigmoid transformation
Guolin Ke's avatar
Guolin Ke committed
73
74
    return boosting_->PredictRaw(features_[tid]);
  }
wxchan's avatar
wxchan committed
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
  
  /*!
  * \brief prediction for one record, only raw result(without sigmoid transformation)
  * \param features Feature for this record
  * \return Predictied leaf index
  */
  std::vector<int> PredictLeafIndexOneLine(const std::vector<std::pair<int, double>>& features) {
    const int tid = omp_get_thread_num();
    // init feature value
    std::memset(features_[tid], 0, sizeof(double)*num_features_);
    // put feature value
    for (const auto& p : features) {
      if (p.first < num_features_) {
        features_[tid][p.first] = p.second;
      }
    }
    // get result for leaf index
    return boosting_->PredictLeafIndex(features_[tid]);
  }
Guolin Ke's avatar
Guolin Ke committed
94
95

  /*!
Qiwei Ye's avatar
Qiwei Ye committed
96
97
  * \brief prediction for one record, will use sigmoid transformation if needed(only enabled for binary classification noe)
  * \param features Feature of this record
Guolin Ke's avatar
Guolin Ke committed
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
  * \return Prediction result
  */
  double PredictOneLine(const std::vector<std::pair<int, double>>& features) {
    const int tid = omp_get_thread_num();
    // init feature value
    std::memset(features_[tid], 0, sizeof(double)*num_features_);
    // put feature value
    for (const auto& p : features) {
      if (p.first < num_features_) {
        features_[tid][p.first] = p.second;
      }
    }
    // get result with sigmoid transform
    return boosting_->Predict(features_[tid]);
  }
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
114
  * \brief predicting on data, then saving result to disk
Guolin Ke's avatar
Guolin Ke committed
115
116
117
118
  * \param data_filename Filename of data
  * \param has_label True if this data contains label
  * \param result_filename Filename of output result
  */
119
  void Predict(const char* data_filename, const char* result_filename) {
Guolin Ke's avatar
Guolin Ke committed
120
121
122
123
124
125
126
127
128
    FILE* result_file;

#ifdef _MSC_VER
    fopen_s(&result_file, result_filename, "w");
#else
    result_file = fopen(result_filename, "w");
#endif

    if (result_file == NULL) {
Qiwei Ye's avatar
Qiwei Ye committed
129
      Log::Fatal("Predition result file %s doesn't exists", data_filename);
Guolin Ke's avatar
Guolin Ke committed
130
    }
131
132
    bool has_label = false;
    Parser* parser = Parser::CreateParser(data_filename, num_features_, &has_label);
Guolin Ke's avatar
Guolin Ke committed
133
134

    if (parser == nullptr) {
Qiwei Ye's avatar
Qiwei Ye committed
135
      Log::Fatal("Recongnizing input data format failed, filename %s", data_filename);
Guolin Ke's avatar
Guolin Ke committed
136
137
138
139
140
141
142
143
144
145
146
    }

    // function for parse data
    std::function<void(const char*, std::vector<std::pair<int, double>>*)> parser_fun;
    double tmp_label;
    if (has_label) {
      // parse function with label
      parser_fun = [this, &parser, &tmp_label]
      (const char* buffer, std::vector<std::pair<int, double>>* feature) {
        parser->ParseOneLine(buffer, feature, &tmp_label);
      };
147
      Log::Info("Start prediction for data %s with labels", data_filename);
Guolin Ke's avatar
Guolin Ke committed
148
149
150
151
152
153
    } else {
      // parse function without label
      parser_fun = [this, &parser]
      (const char* buffer, std::vector<std::pair<int, double>>* feature) {
        parser->ParseOneLine(buffer, feature);
      };
154
      Log::Info("Start prediction for data %s without label", data_filename);
Guolin Ke's avatar
Guolin Ke committed
155
    }
wxchan's avatar
wxchan committed
156
157
158
159
160
161
162
163
164
165
166
167
    std::function<std::string(const std::vector<std::pair<int, double>>&)> predict_fun;
    if (predict_leaf_index) {
      predict_fun = [this](const std::vector<std::pair<int, double>>& features){
        std::vector<int> predicted_leaf_index = PredictLeafIndexOneLine(features);
        std::stringstream result_ss;
        for (size_t i = 0; i < predicted_leaf_index.size(); ++i){
          if (i > 0) {
            result_ss << '\t';
          }
          result_ss << predicted_leaf_index[i];
        }
        return result_ss.str();  
Guolin Ke's avatar
Guolin Ke committed
168
169
      };
    }
wxchan's avatar
wxchan committed
170
171
172
173
174
175
176
177
178
179
180
181
    else {
      if (is_simgoid_) {
        predict_fun = [this](const std::vector<std::pair<int, double>>& features){
          return std::to_string(PredictOneLine(features));
        };
      } 
      else {
        predict_fun = [this](const std::vector<std::pair<int, double>>& features){
          return std::to_string(PredictRawOneLine(features));
        };
      } 
    }
Guolin Ke's avatar
Guolin Ke committed
182
183
184
185
    std::function<void(data_size_t, const std::vector<std::string>&)> process_fun =
      [this, &parser_fun, &predict_fun, &result_file]
    (data_size_t, const std::vector<std::string>& lines) {
      std::vector<std::pair<int, double>> oneline_features;
wxchan's avatar
wxchan committed
186
      std::vector<std::string> pred_result(lines.size(), "");
Guolin Ke's avatar
Guolin Ke committed
187
188
189
190
191
192
193
194
195
196
#pragma omp parallel for schedule(static) private(oneline_features)
      for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); i++) {
        oneline_features.clear();
        // parser
        parser_fun(lines[i].c_str(), &oneline_features);
        // predict
        pred_result[i] = predict_fun(oneline_features);
      }

      for (size_t i = 0; i < pred_result.size(); ++i) {
wxchan's avatar
wxchan committed
197
        fprintf(result_file, "%s\n", pred_result[i].c_str());
Guolin Ke's avatar
Guolin Ke committed
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
      }
    };
    TextReader<data_size_t> predict_data_reader(data_filename);
    predict_data_reader.ReadAllAndProcessParallel(process_fun);

    fclose(result_file);
    delete parser;
  }

private:
  /*! \brief Boosting model */
  const Boosting* boosting_;
  /*! \brief Buffer for feature values */
  double** features_;
  /*! \brief Number of features */
  int num_features_;
  /*! \brief True if need to predict result with sigmoid transform */
  bool is_simgoid_;
  /*! \brief Number of threads */
  int num_threads_;
wxchan's avatar
wxchan committed
218
219
  /*! \brief True if output leaf index instead of prediction score */
  bool predict_leaf_index;
Guolin Ke's avatar
Guolin Ke committed
220
221
222
223
};

}  // namespace LightGBM

Guolin Ke's avatar
Guolin Ke committed
224
#endif   // LIGHTGBM_PREDICTOR_HPP_