"include/git@developer.sourcefind.cn:tianlh/lightgbm-dcu.git" did not exist on "68bd9ab97559453e438c34ac74e18f21e80c2c50"
c_api.cpp 26.7 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
#include <omp.h>

#include <LightGBM/utils/common.h>
#include <LightGBM/utils/random.h>
Guolin Ke's avatar
Guolin Ke committed
5
#include <LightGBM/c_api.h>
Guolin Ke's avatar
Guolin Ke committed
6
#include <LightGBM/dataset_loader.h>
Guolin Ke's avatar
Guolin Ke committed
7
8
9
10
11
12
13
14
15
16
#include <LightGBM/dataset.h>
#include <LightGBM/boosting.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
#include <LightGBM/config.h>

#include <cstdio>
#include <vector>
#include <string>
#include <cstring>
Guolin Ke's avatar
Guolin Ke committed
17
#include <memory>
Guolin Ke's avatar
Guolin Ke committed
18

Guolin Ke's avatar
Guolin Ke committed
19
20
#include "./application/predictor.hpp"

Guolin Ke's avatar
Guolin Ke committed
21
22
23
24
25
namespace LightGBM {

class Booster {
public:
  explicit Booster(const char* filename):
26
27
28
    boosting_(Boosting::CreateBoosting(filename)), 
    objective_fun_(nullptr), 
    predictor_(nullptr) {
Guolin Ke's avatar
Guolin Ke committed
29
30
31
32
33
34
  }

  Booster(const Dataset* train_data, 
    std::vector<const Dataset*> valid_data, 
    std::vector<std::string> valid_names,
    const char* parameters)
Guolin Ke's avatar
Guolin Ke committed
35
    :train_data_(train_data), valid_datas_(valid_data), predictor_(nullptr) {
Guolin Ke's avatar
Guolin Ke committed
36
37
38
    config_.LoadFromString(parameters);
    // create boosting
    if (config_.io_config.input_model.size() > 0) {
Guolin Ke's avatar
Guolin Ke committed
39
      Log::Warning("continued train from model is not support for c_api, \
Guolin Ke's avatar
Guolin Ke committed
40
41
42
43
44
45
46
47
        please use continued train with input score");
    }
    boosting_ = Boosting::CreateBoosting(config_.boosting_type, "");
    // create objective function
    objective_fun_ =
      ObjectiveFunction::CreateObjectiveFunction(config_.objective_type,
        config_.objective_config);
    // create training metric
Guolin Ke's avatar
Guolin Ke committed
48
49
50
51
52
53
54
    for (auto metric_type : config_.metric_types) {
      Metric* metric =
        Metric::CreateMetric(metric_type, config_.metric_config);
      if (metric == nullptr) { continue; }
      metric->Init("training", train_data_->metadata(),
        train_data_->num_data());
      train_metric_.push_back(metric);
Guolin Ke's avatar
Guolin Ke committed
55
    }
Guolin Ke's avatar
Guolin Ke committed
56
    
Guolin Ke's avatar
Guolin Ke committed
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
    // add metric for validation data
    for (size_t i = 0; i < valid_datas_.size(); ++i) {
      valid_metrics_.emplace_back();
      for (auto metric_type : config_.metric_types) {
        Metric* metric = Metric::CreateMetric(metric_type, config_.metric_config);
        if (metric == nullptr) { continue; }
        metric->Init(valid_names[i].c_str(),
          valid_datas_[i]->metadata(),
          valid_datas_[i]->num_data());
        valid_metrics_.back().push_back(metric);
      }
    }
    // initialize the objective function
    objective_fun_->Init(train_data_->metadata(), train_data_->num_data());
    // initialize the boosting
    boosting_->Init(config_.boosting_config, train_data_, objective_fun_,
      ConstPtrInVectorWarpper<Metric>(train_metric_));
    // add validation data into boosting
    for (size_t i = 0; i < valid_datas_.size(); ++i) {
      boosting_->AddDataset(valid_datas_[i],
        ConstPtrInVectorWarpper<Metric>(valid_metrics_[i]));
    }
  }

  ~Booster() {
    for (auto& metric : train_metric_) {
      if (metric != nullptr) { delete metric; }
    }
    for (auto& metric : valid_metrics_) {
      for (auto& sub_metric : metric) {
        if (sub_metric != nullptr) { delete sub_metric; }
      }
    }
    valid_metrics_.clear();
    if (boosting_ != nullptr) { delete boosting_; }
    if (objective_fun_ != nullptr) { delete objective_fun_; }
Guolin Ke's avatar
Guolin Ke committed
93
    if (predictor_ != nullptr) { delete predictor_; }
Guolin Ke's avatar
Guolin Ke committed
94
  }
95
96
97
98
99
100
101
102
103
104
105

  bool TrainOneIter() {
    return boosting_->TrainOneIter(nullptr, nullptr, false);
  }

  bool TrainOneIter(const float* gradients, const float* hessians) {
    return boosting_->TrainOneIter(gradients, hessians, false);
  }

  void PrepareForPrediction(int num_used_model, int predict_type) {
    boosting_->SetNumUsedModel(num_used_model);
Guolin Ke's avatar
Guolin Ke committed
106
107
108
    if (predictor_ != nullptr) { delete predictor_; }
    bool is_predict_leaf = false;
    bool is_raw_score = false;
Guolin Ke's avatar
Guolin Ke committed
109
    if (predict_type == C_API_PREDICT_LEAF_INDEX) {
Guolin Ke's avatar
Guolin Ke committed
110
      is_predict_leaf = true;
Guolin Ke's avatar
Guolin Ke committed
111
    } else if (predict_type == C_API_PREDICT_RAW_SCORE) {
Guolin Ke's avatar
Guolin Ke committed
112
      is_raw_score = true;
Guolin Ke's avatar
Guolin Ke committed
113
114
    } else {
      is_raw_score = false;
Guolin Ke's avatar
Guolin Ke committed
115
116
117
118
119
120
    }
    predictor_ = new Predictor(boosting_, is_raw_score, is_predict_leaf);
  }

  std::vector<double> Predict(const std::vector<std::pair<int, double>>& features) {
    return predictor_->GetPredictFunction()(features);
121
122
  }

123
124
125
126
  void PredictForFile(const char* data_filename, const char* result_filename, bool data_has_header) {
    predictor_->Predict(data_filename, result_filename, data_has_header);
  }

Guolin Ke's avatar
Guolin Ke committed
127
128
129
  void SaveModelToFile(int num_used_model, const char* filename) {
    boosting_->SaveModelToFile(num_used_model, true, filename);
  }
130
131
  const Boosting* GetBoosting() const { return boosting_; }

Guolin Ke's avatar
Guolin Ke committed
132
  const inline int NumberOfClasses() const { return boosting_->NumberOfClasses(); }
133

Guolin Ke's avatar
Guolin Ke committed
134
private:
135

Guolin Ke's avatar
Guolin Ke committed
136
137
138
139
140
141
142
143
144
145
146
147
148
  Boosting* boosting_;
  /*! \brief All configs */
  OverallConfig config_;
  /*! \brief Training data */
  const Dataset* train_data_;
  /*! \brief Validation data */
  std::vector<const Dataset*> valid_datas_;
  /*! \brief Metric for training data */
  std::vector<Metric*> train_metric_;
  /*! \brief Metrics for validation data */
  std::vector<std::vector<Metric*>> valid_metrics_;
  /*! \brief Training objective function */
  ObjectiveFunction* objective_fun_;
Guolin Ke's avatar
Guolin Ke committed
149
150
  /*! \brief Using predictor for prediction task */
  Predictor* predictor_;
151

Guolin Ke's avatar
Guolin Ke committed
152
153
154
};

}
Guolin Ke's avatar
Guolin Ke committed
155
156
157

using namespace LightGBM;

Guolin Ke's avatar
Guolin Ke committed
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172

/*! \brief Placeholder: always returns the same fixed message */
DllExport const char* LGBM_GetLastError() {
  static const char* const kPlaceholderMessage = "Not error msg now, will support soon";
  return kPlaceholderMessage;
}



/*!
* \brief Load a dataset from a data file.
* \param filename Path of the data file
* \param parameters Parameter string parsed into an OverallConfig
* \param reference If not null, the new dataset is loaded aligned with *reference
* \param out Receives the loaded dataset
* \return 0
*/
DllExport int LGBM_CreateDatasetFromFile(const char* filename,
  const char* parameters,
  const DatesetHandle* reference,
  DatesetHandle* out) {

  OverallConfig config;
  config.LoadFromString(parameters);
  DatasetLoader loader(config.io_config, nullptr);
  loader.SetHeader(filename);
  if (reference != nullptr) {
    const Dataset* aligned_to = reinterpret_cast<const Dataset*>(*reference);
    *out = loader.LoadFromFileAlignWithOtherDataset(filename, aligned_to);
    return 0;
  }
  *out = loader.LoadFromFile(filename);
  return 0;
}


/*!
* \brief Load a dataset from a binary file using a default-constructed config.
* \param filename Path of the binary file
* \param out Receives the loaded dataset
* \return 0
*/
DllExport int LGBM_CreateDatasetFromBinaryFile(const char* filename,
  DatesetHandle* out) {

  OverallConfig default_config;
  DatasetLoader bin_loader(default_config.io_config, nullptr);
  // NOTE(review): the trailing (0, 1) arguments are passed through as in the
  // original call; their meaning is defined by DatasetLoader::LoadFromBinFile
  auto loaded = bin_loader.LoadFromBinFile(filename, 0, 1);
  *out = loaded;
  return 0;
}

/*!
* \brief Build a dataset from a dense matrix held in memory.
* \param data Pointer to the matrix elements
* \param data_type C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nrow Number of rows
* \param ncol Number of columns
* \param is_row_major Non-zero when the matrix is stored row by row
* \param parameters Parameter string parsed into an OverallConfig
* \param reference If not null, feature mappers are copied from *reference
*        instead of being constructed from sampled rows
* \param out Receives the new dataset
* \return 0
*/
DllExport int LGBM_CreateDatasetFromMat(const void* data,
  int data_type,
  int32_t nrow,
  int32_t ncol,
  int is_row_major,
  const char* parameters,
  const DatesetHandle* reference,
  DatesetHandle* out) {

  OverallConfig config;
  config.LoadFromString(parameters);
  DatasetLoader loader(config.io_config, nullptr);
  auto get_row_fun = RowFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major);
  Dataset* ret = nullptr;
  if (reference != nullptr) {
    ret = new Dataset(nrow, config.io_config.num_class);
    ret->CopyFeatureMapperFrom(reinterpret_cast<const Dataset*>(*reference), config.io_config.is_enable_sparse);
  } else {
    // sample rows first, keeping only the values that are not (near) zero
    Random rand(config.io_config.data_random_seed);
    const size_t sample_cnt = static_cast<size_t>(nrow < config.io_config.bin_construct_sample_cnt ? nrow : config.io_config.bin_construct_sample_cnt);
    auto sample_indices = rand.Sample(nrow, sample_cnt);
    std::vector<std::vector<double>> sample_values(ncol);
    for (auto sampled_row : sample_indices) {
      auto values = get_row_fun(static_cast<int>(sampled_row));
      for (size_t col = 0; col < values.size(); ++col) {
        const double value = values[col];
        if (std::fabs(value) > 1e-15) {
          sample_values[col].push_back(value);
        }
      }
    }
    ret = loader.CostructFromSampleData(sample_values, sample_cnt, nrow);
  }

  // push every row into the dataset, one row per loop iteration
#pragma omp parallel for schedule(guided)
  for (int i = 0; i < nrow; ++i) {
    const int tid = omp_get_thread_num();
    auto one_row = get_row_fun(i);
    ret->PushOneRow(tid, i, one_row);
  }
  ret->FinishLoad();
  *out = ret;
  return 0;
}

238
239
/*!
* \brief Build a dataset from a CSR (compressed sparse row) matrix.
* \param indptr Row pointer array
* \param indptr_type C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices Column index of every stored element
* \param data Value of every stored element
* \param data_type C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nindptr Length of indptr (number of rows + 1)
* \param nelem Number of stored elements
* \param num_col Number of columns; checked against the sampled feature count
* \param parameters Parameter string parsed into an OverallConfig
* \param reference If not null, feature mappers are copied from *reference
* \param out Receives the new dataset
* \return 0
*/
DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,
  int indptr_type,
  const int32_t* indices,
  const void* data,
  int data_type,
  int64_t nindptr,
  int64_t nelem,
  int64_t num_col,
  const char* parameters,
  const DatesetHandle* reference,
  DatesetHandle* out) {

  OverallConfig config;
  config.LoadFromString(parameters);
  DatasetLoader loader(config.io_config, nullptr);
  auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
  const int32_t nrow = static_cast<int32_t>(nindptr - 1);
  Dataset* ret = nullptr;
  if (reference != nullptr) {
    ret = new Dataset(nrow, config.io_config.num_class);
    ret->CopyFeatureMapperFrom(reinterpret_cast<const Dataset*>(*reference), config.io_config.is_enable_sparse);
  } else {
    // sample rows first
    Random rand(config.io_config.data_random_seed);
    const size_t sample_cnt = static_cast<size_t>(nrow < config.io_config.bin_construct_sample_cnt ? nrow : config.io_config.bin_construct_sample_cnt);
    auto sample_indices = rand.Sample(nrow, sample_cnt);
    std::vector<std::vector<double>> sample_values;
    for (auto sampled_row : sample_indices) {
      auto entries = get_row_fun(static_cast<int>(sampled_row));
      for (const auto& entry : entries) {
        if (std::fabs(entry.second) > 1e-15) {
          // grow the per-feature buckets until this feature index is valid
          while (sample_values.size() <= static_cast<size_t>(entry.first)) {
            sample_values.emplace_back();
          }
          sample_values[entry.first].push_back(entry.second);
        }
      }
    }
    CHECK(num_col >= static_cast<int>(sample_values.size()));
    ret = loader.CostructFromSampleData(sample_values, sample_cnt, nrow);
  }

  // push every row into the dataset, one row per loop iteration
#pragma omp parallel for schedule(guided)
  for (int i = 0; i < nindptr - 1; ++i) {
    const int tid = omp_get_thread_num();
    auto one_row = get_row_fun(i);
    ret->PushOneRow(tid, i, one_row);
  }
  ret->FinishLoad();
  *out = ret;
  return 0;
}

Guolin Ke's avatar
Guolin Ke committed
298

299
300
/*!
* \brief Build a dataset from a CSC (compressed sparse column) matrix.
* \param col_ptr Column pointer array
* \param col_ptr_type C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices Row index of every stored element
* \param data Value of every stored element
* \param data_type C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param ncol_ptr Length of col_ptr (number of columns + 1)
* \param nelem Number of stored elements
* \param num_row Number of rows
* \param parameters Parameter string parsed into an OverallConfig
* \param reference If not null, feature mappers are copied from *reference
* \param out Receives the new dataset
* \return 0
*/
DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,
  int col_ptr_type,
  const int32_t* indices,
  const void* data,
  int data_type,
  int64_t ncol_ptr,
  int64_t nelem,
  int64_t num_row,
  const char* parameters,
  const DatesetHandle* reference,
  DatesetHandle* out) {
  OverallConfig config;
  config.LoadFromString(parameters);
  DatasetLoader loader(config.io_config, nullptr);
  auto get_col_fun = ColumnFunctionFromCSC(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem);
  const int32_t nrow = static_cast<int32_t>(num_row);
  Dataset* ret = nullptr;
  if (reference != nullptr) {
    ret = new Dataset(nrow, config.io_config.num_class);
    ret->CopyFeatureMapperFrom(reinterpret_cast<const Dataset*>(*reference), config.io_config.is_enable_sparse);
  } else {
    Log::Warning("Construct from CSC format is not efficient");
    // sample rows first, then pick the sampled rows out of every column
    Random rand(config.io_config.data_random_seed);
    const size_t sample_cnt = static_cast<size_t>(nrow < config.io_config.bin_construct_sample_cnt ? nrow : config.io_config.bin_construct_sample_cnt);
    auto sample_indices = rand.Sample(nrow, sample_cnt);
    std::vector<std::vector<double>> sample_values(ncol_ptr - 1);
    const int num_sampled_cols = static_cast<int>(sample_values.size());
#pragma omp parallel for schedule(guided)
    for (int col = 0; col < num_sampled_cols; ++col) {
      auto column_entries = get_col_fun(col);
      sample_values[col] = SampleFromOneColumn(column_entries, sample_indices);
    }
    ret = loader.CostructFromSampleData(sample_values, sample_cnt, nrow);
  }

  // push every column into the dataset, one column per loop iteration
#pragma omp parallel for schedule(guided)
  for (int i = 0; i < ncol_ptr - 1; ++i) {
    const int tid = omp_get_thread_num();
    auto one_col = get_col_fun(i);
    ret->PushOneColumn(tid, i, one_col);
  }
  ret->FinishLoad();
  *out = ret;
  return 0;
}

345
346
/*!
* \brief Delete the dataset behind a handle.
* \param handle Dataset handle
* \return 0
*/
DllExport int LGBM_DatasetFree(DatesetHandle handle) {
  delete reinterpret_cast<Dataset*>(handle);
  return 0;
}

/*!
* \brief Save a dataset to a binary file.
* \param handle Dataset handle
* \param filename Output file name
* \return 0
*/
DllExport int LGBM_DatasetSaveBinary(DatesetHandle handle,
  const char* filename) {
  Dataset* target = reinterpret_cast<Dataset*>(handle);
  target->SaveBinaryFile(filename);
  return 0;
}

/*!
* \brief Set a named field of a dataset from a float32 or int32 array.
* \param handle Dataset handle
* \param field_name Name of the field
* \param field_data Pointer to the field values
* \param num_element Number of values
* \param type C_API_DTYPE_FLOAT32 or C_API_DTYPE_INT32
* \return 0 when the dataset accepted the field, -1 otherwise (including an
*         unsupported type)
*/
DllExport int LGBM_DatasetSetField(DatesetHandle handle,
  const char* field_name,
  const void* field_data,
  int64_t num_element,
  int type) {
  Dataset* target = reinterpret_cast<Dataset*>(handle);
  const int32_t count = static_cast<int32_t>(num_element);
  bool accepted = false;
  if (type == C_API_DTYPE_FLOAT32) {
    accepted = target->SetFloatField(field_name, reinterpret_cast<const float*>(field_data), count);
  } else if (type == C_API_DTYPE_INT32) {
    accepted = target->SetIntField(field_name, reinterpret_cast<const int*>(field_data), count);
  }
  return accepted ? 0 : -1;
}

/*!
* \brief Look up a named field of a dataset, trying float fields first and
*        int fields second.
* \param handle Dataset handle
* \param field_name Name of the field
* \param out_len Receives the number of values
* \param out_ptr Receives a pointer to the values
* \param out_type Receives C_API_DTYPE_FLOAT32 or C_API_DTYPE_INT32
* \return 0 when the field was found, -1 otherwise
*/
DllExport int LGBM_DatasetGetField(DatesetHandle handle,
  const char* field_name,
  int64_t* out_len,
  const void** out_ptr,
  int* out_type) {
  Dataset* source = reinterpret_cast<Dataset*>(handle);
  int status = -1;
  if (source->GetFloatField(field_name, out_len, reinterpret_cast<const float**>(out_ptr))) {
    *out_type = C_API_DTYPE_FLOAT32;
    status = 0;
  } else if (source->GetIntField(field_name, out_len, reinterpret_cast<const int**>(out_ptr))) {
    *out_type = C_API_DTYPE_INT32;
    status = 0;
  }
  return status;
}

/*!
* \brief Report the number of data rows of a dataset.
* \param handle Dataset handle
* \param out Receives Dataset::num_data()
* \return 0
*/
DllExport int LGBM_DatasetGetNumData(DatesetHandle handle,
  int64_t* out) {
  Dataset* source = reinterpret_cast<Dataset*>(handle);
  *out = source->num_data();
  return 0;
}

/*!
* \brief Report the total number of features of a dataset.
* \param handle Dataset handle
* \param out Receives Dataset::num_total_features()
* \return 0
*/
DllExport int LGBM_DatasetGetNumFeature(DatesetHandle handle,
  int64_t* out) {
  Dataset* source = reinterpret_cast<Dataset*>(handle);
  *out = source->num_total_features();
  return 0;
}
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462


// ---- start of booster

/*!
* \brief Create a booster for training.
* \param train_data Training dataset handle
* \param valid_datas Array of n_valid_datas validation dataset handles
* \param valid_names Array of n_valid_datas names, one per validation dataset
* \param n_valid_datas Number of validation datasets
* \param parameters Parameter string forwarded to the Booster constructor
* \param out Receives the new booster
* \return 0
*/
DllExport int LGBM_BoosterCreate(const DatesetHandle train_data,
  const DatesetHandle valid_datas[],
  const char* valid_names[],
  int n_valid_datas,
  const char* parameters,
  BoosterHandle* out) {
  std::vector<const Dataset*> validation_sets;
  std::vector<std::string> validation_labels;
  for (int k = 0; k < n_valid_datas; ++k) {
    validation_sets.push_back(reinterpret_cast<const Dataset*>(valid_datas[k]));
    validation_labels.emplace_back(valid_names[k]);
  }
  const Dataset* training_set = reinterpret_cast<const Dataset*>(train_data);
  *out = new Booster(training_set, validation_sets, validation_labels, parameters);
  return 0;
}

/*!
* \brief Create a booster from a model file.
* \param filename Model file name
* \param out Receives the new booster
* \return 0
*/
DllExport int LGBM_BoosterLoadFromModelfile(
  const char* filename,
  BoosterHandle* out) {
  Booster* loaded = new Booster(filename);
  *out = loaded;
  return 0;
}

/*!
* \brief Delete the booster behind a handle.
* \param handle Booster handle
* \return 0
*/
DllExport int LGBM_BoosterFree(BoosterHandle handle) {
  delete reinterpret_cast<Booster*>(handle);
  return 0;
}


/*!
* \brief Run one training iteration.
* \param handle Booster handle
* \param is_finished Receives 1 when Booster::TrainOneIter returned true, else 0
* \return 0
*/
DllExport int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished) {
  Booster* booster = reinterpret_cast<Booster*>(handle);
  const bool finished = booster->TrainOneIter();
  *is_finished = finished ? 1 : 0;
  return 0;
}

/*!
* \brief Run one training iteration with caller-supplied gradients and hessians.
* \param handle Booster handle
* \param grad Gradients
* \param hess Hessians
* \param is_finished Receives 1 when Booster::TrainOneIter returned true, else 0
* \return 0
*/
DllExport int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle,
  const float* grad,
  const float* hess,
  int* is_finished) {
  Booster* booster = reinterpret_cast<Booster*>(handle);
  const bool finished = booster->TrainOneIter(grad, hess);
  *is_finished = finished ? 1 : 0;
  return 0;
}

/*!
* \brief Fetch the evaluation results for one dataset.
* \param handle Booster handle
* \param data Dataset index forwarded to Boosting::GetEvalAt
* \param out_len Receives the number of results
* \param out_results Caller-provided buffer that receives the results as float
* \return 0
*/
DllExport int LGBM_BoosterEval(BoosterHandle handle,
  int data,
  int64_t* out_len,
  float* out_results) {

  Booster* booster = reinterpret_cast<Booster*>(handle);
  auto eval_values = booster->GetBoosting()->GetEvalAt(data);
  const size_t num_values = eval_values.size();
  *out_len = static_cast<int64_t>(num_values);
  for (size_t k = 0; k < num_values; ++k) {
    out_results[k] = static_cast<float>(eval_values[k]);
  }
  return 0;
}

/*!
* \brief Expose the training score held by the boosting object.
* \param handle Booster handle
* \param out_len Receives the length reported by Boosting::GetTrainingScore
* \param out_result Receives the pointer returned by Boosting::GetTrainingScore
* \return 0
*/
DllExport int LGBM_BoosterGetScore(BoosterHandle handle,
  int64_t* out_len,
  const float** out_result) {

  Booster* booster = reinterpret_cast<Booster*>(handle);
  int score_count = 0;
  *out_result = booster->GetBoosting()->GetTrainingScore(&score_count);
  *out_len = static_cast<int64_t>(score_count);
  return 0;
}

Guolin Ke's avatar
Guolin Ke committed
489
490
/*!
* \brief Fetch the predictions the boosting object holds for one dataset.
* \param handle Booster handle
* \param data Dataset index forwarded to Boosting::GetPredictAt
* \param out_len Receives the length reported by Boosting::GetPredictAt
* \param out_result Caller-provided buffer filled by Boosting::GetPredictAt
* \return 0
*/
DllExport int LGBM_BoosterGetPredict(BoosterHandle handle,
  int data,
  int64_t* out_len,
  float* out_result) {

  Booster* booster = reinterpret_cast<Booster*>(handle);
  int prediction_count = 0;
  booster->GetBoosting()->GetPredictAt(data, out_result, &prediction_count);
  *out_len = static_cast<int64_t>(prediction_count);
  return 0;
}

502
503
504
505
506
507
508
509
510
511
512
513
514
515
/*!
* \brief Predict every row of a data file and write the results to a file.
* \param handle Booster handle
* \param predict_type Prediction mode forwarded to Booster::PrepareForPrediction
* \param n_used_trees Forwarded (as int) to Booster::PrepareForPrediction
* \param data_has_header Treated as true when greater than zero
* \param data_filename Input data file
* \param result_filename Output file
* \return 0
*/
DllExport int LGBM_BoosterPredictForFile(BoosterHandle handle,
  int predict_type,
  int64_t n_used_trees,
  int data_has_header,
  const char* data_filename,
  const char* result_filename) {

  Booster* booster = reinterpret_cast<Booster*>(handle);
  booster->PrepareForPrediction(static_cast<int>(n_used_trees), predict_type);
  const bool has_header = (data_has_header > 0);
  booster->PredictForFile(data_filename, result_filename, has_header);
  return 0;
}

516
/*!
* \brief Predict every row of a CSR (compressed sparse row) matrix.
* \param handle Booster handle
* \param indptr Row pointer array
* \param indptr_type C_API_DTYPE_INT32 or C_API_DTYPE_INT64
* \param indices Column index of every stored element
* \param data Value of every stored element
* \param data_type C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nindptr Length of indptr (number of rows + 1)
* \param nelem Number of stored elements
* \param (unnamed) Unused
* \param predict_type Prediction mode forwarded to Booster::PrepareForPrediction
* \param n_used_trees Forwarded (as int) to Booster::PrepareForPrediction
* \param out_result Caller-provided buffer; row i writes num_class values
*        starting at offset i * num_class
* \return 0
*/
DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
  const void* indptr,
  int indptr_type,
  const int32_t* indices,
  const void* data,
  int data_type,
  int64_t nindptr,
  int64_t nelem,
  int64_t,
  int predict_type,
  int64_t n_used_trees,
  double* out_result) {

  Booster* ref_booster = reinterpret_cast<Booster*>(handle);
  ref_booster->PrepareForPrediction(static_cast<int>(n_used_trees), predict_type);

  auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
  const int num_class = ref_booster->NumberOfClasses();
  const int nrow = static_cast<int>(nindptr - 1);
#pragma omp parallel for schedule(guided)
  for (int i = 0; i < nrow; ++i) {
    auto one_row = get_row_fun(i);
    auto predicton_result = ref_booster->Predict(one_row);
    // compute the output offset in 64 bits: i * num_class can exceed INT_MAX
    const int64_t row_offset = static_cast<int64_t>(i) * num_class;
    for (int j = 0; j < num_class; ++j) {
      out_result[row_offset + j] = predicton_result[j];
    }
  }
  return 0;
}
545
546
547

/*!
* \brief Predict every row of a dense matrix held in memory.
* \param handle Booster handle
* \param data Pointer to the matrix elements
* \param data_type C_API_DTYPE_FLOAT32 or C_API_DTYPE_FLOAT64
* \param nrow Number of rows
* \param ncol Number of columns
* \param is_row_major Non-zero when the matrix is stored row by row
* \param predict_type Prediction mode forwarded to Booster::PrepareForPrediction
* \param n_used_trees Forwarded (as int) to Booster::PrepareForPrediction
* \param out_result Caller-provided buffer; row i writes num_class values
*        starting at offset i * num_class
* \return 0
*/
DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
  const void* data,
  int data_type,
  int32_t nrow,
  int32_t ncol,
  int is_row_major,
  int predict_type,
  int64_t n_used_trees,
  double* out_result) {

  Booster* ref_booster = reinterpret_cast<Booster*>(handle);
  ref_booster->PrepareForPrediction(static_cast<int>(n_used_trees), predict_type);

  auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major);
  const int num_class = ref_booster->NumberOfClasses();
#pragma omp parallel for schedule(guided)
  for (int i = 0; i < nrow; ++i) {
    auto one_row = get_row_fun(i);
    auto predicton_result = ref_booster->Predict(one_row);
    // compute the output offset in 64 bits: i * num_class can exceed INT_MAX
    const int64_t row_offset = static_cast<int64_t>(i) * num_class;
    for (int j = 0; j < num_class; ++j) {
      out_result[row_offset + j] = predicton_result[j];
    }
  }
  return 0;
}
571
572
573

/*!
* \brief Save the model of a booster to a file.
* \param handle Booster handle
* \param num_used_model Forwarded to Booster::SaveModelToFile
* \param filename Output file name
* \return 0
*/
DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
  int num_used_model,
  const char* filename) {

  Booster* booster = reinterpret_cast<Booster*>(handle);
  booster->SaveModelToFile(num_used_model, filename);
  return 0;
}
580

Guolin Ke's avatar
Guolin Ke committed
581
// ---- start of some help functions
582
583
584

std::function<std::vector<double>(int row_idx)>
RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major) {
Guolin Ke's avatar
Guolin Ke committed
585
  if (data_type == C_API_DTYPE_FLOAT32) {
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
    const float* data_ptr = reinterpret_cast<const float*>(data);
    if (is_row_major) {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<double> ret;
        auto tmp_ptr = data_ptr + num_col * row_idx;
        for (int i = 0; i < num_col; ++i) {
          ret.push_back(static_cast<double>(*(tmp_ptr + i)));
        }
        return ret;
      };
    } else {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<double> ret;
        for (int i = 0; i < num_col; ++i) {
          ret.push_back(static_cast<double>(*(data_ptr + num_row * i + row_idx)));
        }
        return ret;
      };
    }
Guolin Ke's avatar
Guolin Ke committed
607
  } else if (data_type == C_API_DTYPE_FLOAT64) {
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
    const double* data_ptr = reinterpret_cast<const double*>(data);
    if (is_row_major) {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<double> ret;
        auto tmp_ptr = data_ptr + num_col * row_idx;
        for (int i = 0; i < num_col; ++i) {
          ret.push_back(static_cast<double>(*(tmp_ptr + i)));
        }
        return ret;
      };
    } else {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<double> ret;
        for (int i = 0; i < num_col; ++i) {
          ret.push_back(static_cast<double>(*(data_ptr + num_row * i + row_idx)));
        }
        return ret;
      };
    }
  } else {
    Log::Fatal("unknown data type in RowFunctionFromDenseMatric");
  }
Guolin Ke's avatar
Guolin Ke committed
632
  return nullptr;
633
634
635
636
}

std::function<std::vector<std::pair<int, double>>(int row_idx)>
RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major) {
Guolin Ke's avatar
Guolin Ke committed
637
638
639
640
641
642
643
644
  auto inner_function = RowFunctionFromDenseMatric(data, num_row, num_col, data_type, is_row_major);
  if (inner_function != nullptr) {
    return [inner_function](int row_idx) {
      auto raw_values = inner_function(row_idx);
      std::vector<std::pair<int, double>> ret;
      for (int i = 0; i < static_cast<int>(raw_values.size()); ++i) {
        if (std::fabs(raw_values[i]) > 1e-15) {
          ret.emplace_back(i, raw_values[i]);
645
        }
Guolin Ke's avatar
Guolin Ke committed
646
647
648
      }
      return ret;
    };
649
  }
Guolin Ke's avatar
Guolin Ke committed
650
  return nullptr;
651
652
653
654
}

std::function<std::vector<std::pair<int, double>>(int idx)>
RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices, const void* data, int data_type, int64_t nindptr, int64_t nelem) {
Guolin Ke's avatar
Guolin Ke committed
655
  if (data_type == C_API_DTYPE_FLOAT32) {
656
    const float* data_ptr = reinterpret_cast<const float*>(data);
Guolin Ke's avatar
Guolin Ke committed
657
    if (indptr_type == C_API_DTYPE_INT32) {
658
659
660
661
662
663
      const int32_t* ptr_indptr = reinterpret_cast<const int32_t*>(indptr);
      return [ptr_indptr, indices, data_ptr, nindptr, nelem](int idx) {
        CHECK(idx + 1 < nindptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_indptr[idx];
        int64_t end = ptr_indptr[idx + 1];
Guolin Ke's avatar
Guolin Ke committed
664
        CHECK(start >= 0 && end <= nelem);
665
666
667
668
669
        for (int64_t i = start; i <= end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
Guolin Ke's avatar
Guolin Ke committed
670
    } else if (indptr_type == C_API_DTYPE_INT64) {
671
672
673
674
675
676
      const int64_t* ptr_indptr = reinterpret_cast<const int64_t*>(indptr);
      return [ptr_indptr, indices, data_ptr, nindptr, nelem](int idx) {
        CHECK(idx + 1 < nindptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_indptr[idx];
        int64_t end = ptr_indptr[idx + 1];
Guolin Ke's avatar
Guolin Ke committed
677
        CHECK(start >= 0 && end <= nelem);
678
679
680
681
682
683
684
685
        for (int64_t i = start; i <= end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else {
      Log::Fatal("unknown data type in RowFunctionFromCSR");
    }
Guolin Ke's avatar
Guolin Ke committed
686
  } else if (data_type == C_API_DTYPE_FLOAT64) {
687
    const double* data_ptr = reinterpret_cast<const double*>(data);
Guolin Ke's avatar
Guolin Ke committed
688
    if (indptr_type == C_API_DTYPE_INT32) {
689
690
691
692
693
694
      const int32_t* ptr_indptr = reinterpret_cast<const int32_t*>(indptr);
      return [ptr_indptr, indices, data_ptr, nindptr, nelem](int idx) {
        CHECK(idx + 1 < nindptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_indptr[idx];
        int64_t end = ptr_indptr[idx + 1];
Guolin Ke's avatar
Guolin Ke committed
695
        CHECK(start >= 0 && end <= nelem);
696
697
698
699
700
        for (int64_t i = start; i <= end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
Guolin Ke's avatar
Guolin Ke committed
701
    } else if (indptr_type == C_API_DTYPE_INT64) {
702
703
704
705
706
707
      const int64_t* ptr_indptr = reinterpret_cast<const int64_t*>(indptr);
      return [ptr_indptr, indices, data_ptr, nindptr, nelem](int idx) {
        CHECK(idx + 1 < nindptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_indptr[idx];
        int64_t end = ptr_indptr[idx + 1];
Guolin Ke's avatar
Guolin Ke committed
708
        CHECK(start >= 0 && end <= nelem);
709
710
711
712
713
714
715
716
717
718
719
        for (int64_t i = start; i <= end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else {
      Log::Fatal("unknown data type in RowFunctionFromCSR");
    }
  } else {
    Log::Fatal("unknown data type in RowFunctionFromCSR");
  }
Guolin Ke's avatar
Guolin Ke committed
720
  return nullptr;
721
722
723
724
}

std::function<std::vector<std::pair<int, double>>(int idx)>
ColumnFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* indices, const void* data, int data_type, int64_t ncol_ptr, int64_t nelem) {
Guolin Ke's avatar
Guolin Ke committed
725
  if (data_type == C_API_DTYPE_FLOAT32) {
726
    const float* data_ptr = reinterpret_cast<const float*>(data);
Guolin Ke's avatar
Guolin Ke committed
727
    if (col_ptr_type == C_API_DTYPE_INT32) {
728
729
730
731
732
733
734
735
736
737
738
739
      const int32_t* ptr_col_ptr = reinterpret_cast<const int32_t*>(col_ptr);
      return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem](int idx) {
        CHECK(idx + 1 < ncol_ptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_col_ptr[idx];
        int64_t end = ptr_col_ptr[idx + 1];
        CHECK(start >= 0 && end <= nelem);
        for (int64_t i = start; i < end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
Guolin Ke's avatar
Guolin Ke committed
740
    } else if (col_ptr_type == C_API_DTYPE_INT64) {
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
      const int64_t* ptr_col_ptr = reinterpret_cast<const int64_t*>(col_ptr);
      return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem](int idx) {
        CHECK(idx + 1 < ncol_ptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_col_ptr[idx];
        int64_t end = ptr_col_ptr[idx + 1];
        CHECK(start >= 0 && end <= nelem);
        for (int64_t i = start; i < end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else {
      Log::Fatal("unknown data type in ColumnFunctionFromCSC");
    }
Guolin Ke's avatar
Guolin Ke committed
756
  } else if (data_type == C_API_DTYPE_FLOAT64) {
757
    const double* data_ptr = reinterpret_cast<const double*>(data);
Guolin Ke's avatar
Guolin Ke committed
758
    if (col_ptr_type == C_API_DTYPE_INT32) {
759
760
761
762
763
764
765
766
767
768
769
770
      const int32_t* ptr_col_ptr = reinterpret_cast<const int32_t*>(col_ptr);
      return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem](int idx) {
        CHECK(idx + 1 < ncol_ptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_col_ptr[idx];
        int64_t end = ptr_col_ptr[idx + 1];
        CHECK(start >= 0 && end <= nelem);
        for (int64_t i = start; i < end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
Guolin Ke's avatar
Guolin Ke committed
771
    } else if (col_ptr_type == C_API_DTYPE_INT64) {
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
      const int64_t* ptr_col_ptr = reinterpret_cast<const int64_t*>(col_ptr);
      return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem](int idx) {
        CHECK(idx + 1 < ncol_ptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_col_ptr[idx];
        int64_t end = ptr_col_ptr[idx + 1];
        CHECK(start >= 0 && end <= nelem);
        for (int64_t i = start; i < end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else {
      Log::Fatal("unknown data type in ColumnFunctionFromCSC");
    }
  } else {
    Log::Fatal("unknown data type in ColumnFunctionFromCSC");
  }
Guolin Ke's avatar
Guolin Ke committed
790
  return nullptr;
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
}

/*!
* \brief Pick the values of one sparse column at the requested rows.
*
* Walks `data` with a single forward cursor that is never rewound, so a
* requested row is only found if its entry lies at or after the cursor position
* left by the previous request.
* \param data (row index, value) pairs of one column
* \param indices Requested row indices
* \return One value per requested row: the stored value when the cursor lands
*         on that row, 0 otherwise
*/
std::vector<double> SampleFromOneColumn(const std::vector<std::pair<int, double>>& data, const std::vector<size_t>& indices) {
  std::vector<double> sampled;
  auto cursor = data.begin();
  const auto last = data.end();
  for (const size_t wanted : indices) {
    const int target_row = static_cast<int>(wanted);
    // skip the entries that belong to rows before the requested one
    while (cursor != last && cursor->first < target_row) {
      ++cursor;
    }
    const bool hit = (cursor != last) && (cursor->first == target_row);
    sampled.push_back(hit ? cursor->second : 0.0);
  }
  return sampled;
}