#ifndef LIGHTGBM_PREDICTOR_HPP_
#define LIGHTGBM_PREDICTOR_HPP_

#include <LightGBM/meta.h>
#include <LightGBM/boosting.h>
#include <LightGBM/utils/text_reader.h>
#include <LightGBM/dataset.h>

#include <omp.h>

#include <cstring>
#include <cstdio>
#include <vector>
#include <utility>
#include <functional>
#include <string>

namespace LightGBM {

/*!
* \brief Used to predict data with the input model
*/
class Predictor {
public:
  /*!
  * \brief Constructor
  * \param boosting Input boosting model
  * \param is_sigmoid True if need to predict result with sigmoid transform(if needed, like binary classification)
wxchan's avatar
wxchan committed
29
  * \param predict_leaf_index True if output leaf index instead of prediction score
Guolin Ke's avatar
Guolin Ke committed
30
  */
wxchan's avatar
wxchan committed
31
32
  Predictor(const Boosting* boosting, bool is_simgoid, bool predict_leaf_index)
    : is_simgoid_(is_simgoid), predict_leaf_index(predict_leaf_index) {
Guolin Ke's avatar
Guolin Ke committed
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
    boosting_ = boosting;
    num_features_ = boosting_->MaxFeatureIdx() + 1;
#pragma omp parallel
#pragma omp master
    {
      num_threads_ = omp_get_num_threads();
    }
    features_ = new double*[num_threads_];
    for (int i = 0; i < num_threads_; ++i) {
      features_[i] = new double[num_features_];
    }
  }
  /*!
  * \brief Destructor
  */
  ~Predictor() {
    if (features_ != nullptr) {
      for (int i = 0; i < num_threads_; ++i) {
        delete[] features_[i];
      }
      delete[] features_;
    }
  }

  /*!
Qiwei Ye's avatar
Qiwei Ye committed
58
  * \brief prediction for one record, only raw result(without sigmoid transformation)
Guolin Ke's avatar
Guolin Ke committed
59
60
61
62
  * \param features Feature for this record
  * \return Prediction result
  */
  double PredictRawOneLine(const std::vector<std::pair<int, double>>& features) {
Guolin Ke's avatar
Guolin Ke committed
63
    const int tid = PutFeatureValuesToBuffer(features);
Qiwei Ye's avatar
Qiwei Ye committed
64
    // get result without sigmoid transformation
Guolin Ke's avatar
Guolin Ke committed
65
66
    return boosting_->PredictRaw(features_[tid]);
  }
wxchan's avatar
wxchan committed
67
68
69
70
71
72
73
  
  /*!
  * \brief prediction for one record, only raw result(without sigmoid transformation)
  * \param features Feature for this record
  * \return Predictied leaf index
  */
  std::vector<int> PredictLeafIndexOneLine(const std::vector<std::pair<int, double>>& features) {
Guolin Ke's avatar
Guolin Ke committed
74
    const int tid = PutFeatureValuesToBuffer(features);
wxchan's avatar
wxchan committed
75
76
77
    // get result for leaf index
    return boosting_->PredictLeafIndex(features_[tid]);
  }
Guolin Ke's avatar
Guolin Ke committed
78
79

  /*!
Qiwei Ye's avatar
Qiwei Ye committed
80
81
  * \brief prediction for one record, will use sigmoid transformation if needed(only enabled for binary classification noe)
  * \param features Feature of this record
Guolin Ke's avatar
Guolin Ke committed
82
83
84
  * \return Prediction result
  */
  double PredictOneLine(const std::vector<std::pair<int, double>>& features) {
Guolin Ke's avatar
Guolin Ke committed
85
86
    const int tid = PutFeatureValuesToBuffer(features);
    // get result with sigmoid transform if needed
Guolin Ke's avatar
Guolin Ke committed
87
88
89
    return boosting_->Predict(features_[tid]);
  }
  /*!
Qiwei Ye's avatar
Qiwei Ye committed
90
  * \brief predicting on data, then saving result to disk
Guolin Ke's avatar
Guolin Ke committed
91
92
93
94
  * \param data_filename Filename of data
  * \param has_label True if this data contains label
  * \param result_filename Filename of output result
  */
Guolin Ke's avatar
Guolin Ke committed
95
  void Predict(const char* data_filename, const char* result_filename, bool has_header) {
Guolin Ke's avatar
Guolin Ke committed
96
97
98
99
100
101
102
103
104
    FILE* result_file;

#ifdef _MSC_VER
    fopen_s(&result_file, result_filename, "w");
#else
    result_file = fopen(result_filename, "w");
#endif

    if (result_file == NULL) {
Qiwei Ye's avatar
Qiwei Ye committed
105
      Log::Fatal("Predition result file %s doesn't exists", data_filename);
Guolin Ke's avatar
Guolin Ke committed
106
    }
Guolin Ke's avatar
Guolin Ke committed
107
    Parser* parser = Parser::CreateParser(data_filename, has_header, num_features_, boosting_->LabelIdx());
Guolin Ke's avatar
Guolin Ke committed
108
109

    if (parser == nullptr) {
Qiwei Ye's avatar
Qiwei Ye committed
110
      Log::Fatal("Recongnizing input data format failed, filename %s", data_filename);
Guolin Ke's avatar
Guolin Ke committed
111
112
113
114
115
    }

    // function for parse data
    std::function<void(const char*, std::vector<std::pair<int, double>>*)> parser_fun;
    double tmp_label;
Guolin Ke's avatar
Guolin Ke committed
116
117
118
119
120
121

    parser_fun = [this, &parser, &tmp_label]
    (const char* buffer, std::vector<std::pair<int, double>>* feature) {
      parser->ParseOneLine(buffer, feature, &tmp_label);
    };

wxchan's avatar
wxchan committed
122
123
124
125
126
127
128
129
130
131
132
133
    std::function<std::string(const std::vector<std::pair<int, double>>&)> predict_fun;
    if (predict_leaf_index) {
      predict_fun = [this](const std::vector<std::pair<int, double>>& features){
        std::vector<int> predicted_leaf_index = PredictLeafIndexOneLine(features);
        std::stringstream result_ss;
        for (size_t i = 0; i < predicted_leaf_index.size(); ++i){
          if (i > 0) {
            result_ss << '\t';
          }
          result_ss << predicted_leaf_index[i];
        }
        return result_ss.str();  
Guolin Ke's avatar
Guolin Ke committed
134
135
      };
    }
wxchan's avatar
wxchan committed
136
137
138
139
140
141
142
143
144
145
146
147
    else {
      if (is_simgoid_) {
        predict_fun = [this](const std::vector<std::pair<int, double>>& features){
          return std::to_string(PredictOneLine(features));
        };
      } 
      else {
        predict_fun = [this](const std::vector<std::pair<int, double>>& features){
          return std::to_string(PredictRawOneLine(features));
        };
      } 
    }
Guolin Ke's avatar
Guolin Ke committed
148
149
150
151
    std::function<void(data_size_t, const std::vector<std::string>&)> process_fun =
      [this, &parser_fun, &predict_fun, &result_file]
    (data_size_t, const std::vector<std::string>& lines) {
      std::vector<std::pair<int, double>> oneline_features;
wxchan's avatar
wxchan committed
152
      std::vector<std::string> pred_result(lines.size(), "");
Guolin Ke's avatar
Guolin Ke committed
153
154
155
156
157
158
159
160
161
162
#pragma omp parallel for schedule(static) private(oneline_features)
      for (data_size_t i = 0; i < static_cast<data_size_t>(lines.size()); i++) {
        oneline_features.clear();
        // parser
        parser_fun(lines[i].c_str(), &oneline_features);
        // predict
        pred_result[i] = predict_fun(oneline_features);
      }

      for (size_t i = 0; i < pred_result.size(); ++i) {
wxchan's avatar
wxchan committed
163
        fprintf(result_file, "%s\n", pred_result[i].c_str());
Guolin Ke's avatar
Guolin Ke committed
164
165
      }
    };
Guolin Ke's avatar
Guolin Ke committed
166
    TextReader<data_size_t> predict_data_reader(data_filename, has_header);
Guolin Ke's avatar
Guolin Ke committed
167
168
169
170
171
172
173
    predict_data_reader.ReadAllAndProcessParallel(process_fun);

    fclose(result_file);
    delete parser;
  }

private:
Guolin Ke's avatar
Guolin Ke committed
174
175
176
177
178
179
180
181
182
183
184
185
  int PutFeatureValuesToBuffer(const std::vector<std::pair<int, double>>& features) {
    int tid = omp_get_thread_num();
    // init feature value
    std::memset(features_[tid], 0, sizeof(double)*num_features_);
    // put feature value
    for (const auto& p : features) {
      if (p.first < num_features_) {
        features_[tid][p.first] = p.second;
      }
    }
    return tid;
  }
Guolin Ke's avatar
Guolin Ke committed
186
187
188
189
190
191
192
193
194
195
  /*! \brief Boosting model */
  const Boosting* boosting_;
  /*! \brief Buffer for feature values */
  double** features_;
  /*! \brief Number of features */
  int num_features_;
  /*! \brief True if need to predict result with sigmoid transform */
  bool is_simgoid_;
  /*! \brief Number of threads */
  int num_threads_;
wxchan's avatar
wxchan committed
196
197
  /*! \brief True if output leaf index instead of prediction score */
  bool predict_leaf_index;
Guolin Ke's avatar
Guolin Ke committed
198
199
200
201
};

}  // namespace LightGBM

#endif   // LIGHTGBM_PREDICTOR_HPP_