c_api.cpp 26.9 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
#include <omp.h>

#include <LightGBM/utils/common.h>
#include <LightGBM/utils/random.h>
Guolin Ke's avatar
Guolin Ke committed
5
#include <LightGBM/c_api.h>
Guolin Ke's avatar
Guolin Ke committed
6
#include <LightGBM/dataset_loader.h>
Guolin Ke's avatar
Guolin Ke committed
7
8
9
10
11
12
13
14
15
16
#include <LightGBM/dataset.h>
#include <LightGBM/boosting.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
#include <LightGBM/config.h>

#include <cstdio>
#include <vector>
#include <string>
#include <cstring>
Guolin Ke's avatar
Guolin Ke committed
17
#include <memory>
Guolin Ke's avatar
Guolin Ke committed
18

Guolin Ke's avatar
Guolin Ke committed
19
20
#include "./application/predictor.hpp"

Guolin Ke's avatar
Guolin Ke committed
21
22
23
24
25
namespace LightGBM {

class Booster {
public:
  explicit Booster(const char* filename):
Guolin Ke's avatar
Guolin Ke committed
26
    boosting_(Boosting::CreateBoosting(filename)), predictor_(nullptr) {
Guolin Ke's avatar
Guolin Ke committed
27
28
29
30
31
32
  }

  Booster(const Dataset* train_data, 
    std::vector<const Dataset*> valid_data, 
    std::vector<std::string> valid_names,
    const char* parameters)
Guolin Ke's avatar
Guolin Ke committed
33
    :train_data_(train_data), valid_datas_(valid_data), predictor_(nullptr) {
Guolin Ke's avatar
Guolin Ke committed
34
35
36
    config_.LoadFromString(parameters);
    // create boosting
    if (config_.io_config.input_model.size() > 0) {
Guolin Ke's avatar
Guolin Ke committed
37
      Log::Warning("continued train from model is not support for c_api, \
Guolin Ke's avatar
Guolin Ke committed
38
39
40
41
42
43
44
45
        please use continued train with input score");
    }
    boosting_ = Boosting::CreateBoosting(config_.boosting_type, "");
    // create objective function
    objective_fun_ =
      ObjectiveFunction::CreateObjectiveFunction(config_.objective_type,
        config_.objective_config);
    // create training metric
Guolin Ke's avatar
Guolin Ke committed
46
47
48
49
50
51
52
    for (auto metric_type : config_.metric_types) {
      Metric* metric =
        Metric::CreateMetric(metric_type, config_.metric_config);
      if (metric == nullptr) { continue; }
      metric->Init("training", train_data_->metadata(),
        train_data_->num_data());
      train_metric_.push_back(metric);
Guolin Ke's avatar
Guolin Ke committed
53
    }
Guolin Ke's avatar
Guolin Ke committed
54
    
Guolin Ke's avatar
Guolin Ke committed
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
    // add metric for validation data
    for (size_t i = 0; i < valid_datas_.size(); ++i) {
      valid_metrics_.emplace_back();
      for (auto metric_type : config_.metric_types) {
        Metric* metric = Metric::CreateMetric(metric_type, config_.metric_config);
        if (metric == nullptr) { continue; }
        metric->Init(valid_names[i].c_str(),
          valid_datas_[i]->metadata(),
          valid_datas_[i]->num_data());
        valid_metrics_.back().push_back(metric);
      }
    }
    // initialize the objective function
    objective_fun_->Init(train_data_->metadata(), train_data_->num_data());
    // initialize the boosting
    boosting_->Init(config_.boosting_config, train_data_, objective_fun_,
      ConstPtrInVectorWarpper<Metric>(train_metric_));
    // add validation data into boosting
    for (size_t i = 0; i < valid_datas_.size(); ++i) {
      boosting_->AddDataset(valid_datas_[i],
        ConstPtrInVectorWarpper<Metric>(valid_metrics_[i]));
    }
  }

  ~Booster() {
    for (auto& metric : train_metric_) {
      if (metric != nullptr) { delete metric; }
    }
    for (auto& metric : valid_metrics_) {
      for (auto& sub_metric : metric) {
        if (sub_metric != nullptr) { delete sub_metric; }
      }
    }
    valid_metrics_.clear();
    if (boosting_ != nullptr) { delete boosting_; }
    if (objective_fun_ != nullptr) { delete objective_fun_; }
Guolin Ke's avatar
Guolin Ke committed
91
    if (predictor_ != nullptr) { delete predictor_; }
Guolin Ke's avatar
Guolin Ke committed
92
  }
93
94
95
96
97
98
99
100
101
102
103

  bool TrainOneIter() {
    return boosting_->TrainOneIter(nullptr, nullptr, false);
  }

  bool TrainOneIter(const float* gradients, const float* hessians) {
    return boosting_->TrainOneIter(gradients, hessians, false);
  }

  void PrepareForPrediction(int num_used_model, int predict_type) {
    boosting_->SetNumUsedModel(num_used_model);
Guolin Ke's avatar
Guolin Ke committed
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
    if (predictor_ != nullptr) { delete predictor_; }
    bool is_predict_leaf = false;
    bool is_raw_score = false;
    if (predict_type == 2) {
      is_predict_leaf = true;
    } else if (predict_type == 1) {
      is_raw_score = false;
    } else {
      is_raw_score = true;
    }
    predictor_ = new Predictor(boosting_, is_raw_score, is_predict_leaf);
  }

  std::vector<double> Predict(const std::vector<std::pair<int, double>>& features) {
    return predictor_->GetPredictFunction()(features);
119
120
  }

Guolin Ke's avatar
Guolin Ke committed
121
122
123
  void SaveModelToFile(int num_used_model, const char* filename) {
    boosting_->SaveModelToFile(num_used_model, true, filename);
  }
124
125
  const Boosting* GetBoosting() const { return boosting_; }

Guolin Ke's avatar
Guolin Ke committed
126
  const inline int NumberOfClasses() const { return boosting_->NumberOfClasses(); }
127

Guolin Ke's avatar
Guolin Ke committed
128
private:
129

Guolin Ke's avatar
Guolin Ke committed
130
131
132
133
134
135
136
137
138
139
140
141
142
  Boosting* boosting_;
  /*! \brief All configs */
  OverallConfig config_;
  /*! \brief Training data */
  const Dataset* train_data_;
  /*! \brief Validation data */
  std::vector<const Dataset*> valid_datas_;
  /*! \brief Metric for training data */
  std::vector<Metric*> train_metric_;
  /*! \brief Metrics for validation data */
  std::vector<std::vector<Metric*>> valid_metrics_;
  /*! \brief Training objective function */
  ObjectiveFunction* objective_fun_;
Guolin Ke's avatar
Guolin Ke committed
143
144
  /*! \brief Using predictor for prediction task */
  Predictor* predictor_;
145

Guolin Ke's avatar
Guolin Ke committed
146
147
148
};

}
Guolin Ke's avatar
Guolin Ke committed
149
150
151

using namespace LightGBM;

Guolin Ke's avatar
Guolin Ke committed
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169

/*! \brief Return a fixed placeholder text; no real error message is tracked here. */
DllExport const char* LGBM_GetLastError() {
  static const char* const kPlaceholderMessage = "Not error msg now, will support soon";
  return kPlaceholderMessage;
}



/*!
 * \brief Load a dataset from a file and store it in *out
 * \param filename File to load
 * \param parameters Parameter string parsed by OverallConfig::LoadFromString
 * \param reference When non-null, the new dataset is loaded aligned with *reference
 * \param out Receives the loaded dataset
 * \return Always 0
 */
DllExport int LGBM_CreateDatasetFromFile(const char* filename,
  const char* parameters,
  const DatesetHandle* reference,
  DatesetHandle* out) {
  OverallConfig config;
  config.LoadFromString(parameters);
  DatasetLoader loader(config.io_config, nullptr);

  if (reference != nullptr) {
    const Dataset* aligned_with = reinterpret_cast<const Dataset*>(*reference);
    *out = loader.LoadFromFileAlignWithOtherDataset(filename, aligned_with);
    return 0;
  }

  *out = loader.LoadFromFile(filename);
  return 0;
}


/*!
 * \brief Load a dataset from a binary file using a default-constructed config
 * \param filename Binary file to load
 * \param out Receives the loaded dataset
 * \return Always 0
 */
DllExport int LGBM_CreateDatasetFromBinaryFile(const char* filename,
  DatesetHandle* out) {
  OverallConfig default_config;
  DatasetLoader bin_loader(default_config.io_config, nullptr);
  auto loaded = bin_loader.LoadFromBinFile(filename, 0, 1);
  *out = loaded;
  return 0;
}

/*!
 * \brief Build a dataset from a dense matrix and store it in *out
 * \param data Matrix values, interpreted according to data_type
 * \param data_type Element type tag forwarded to RowFunctionFromDenseMatric
 * \param nrow Number of rows
 * \param ncol Number of columns
 * \param is_row_major Non-zero when data is laid out row by row
 * \param parameters Parameter string parsed by OverallConfig::LoadFromString
 * \param reference When non-null, feature bin mappers are copied from *reference
 *        instead of being constructed from sampled rows
 * \param out Receives the finished dataset
 * \return Always 0
 */
DllExport int LGBM_CreateDatasetFromMat(const void* data,
  int data_type,
  int32_t nrow,
  int32_t ncol,
  int is_row_major,
  const char* parameters,
  const DatesetHandle* reference,
  DatesetHandle* out) {
  OverallConfig config;
  config.LoadFromString(parameters);
  DatasetLoader loader(config.io_config, nullptr);
  auto get_row_fun = RowFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major);

  Dataset* dataset = nullptr;
  if (reference != nullptr) {
    // reuse the bin mappers of the reference dataset
    dataset = new Dataset(nrow, config.io_config.num_class);
    reinterpret_cast<const Dataset*>(*reference)->CopyFeatureBinMapperTo(dataset, config.io_config.is_enable_sparse);
  } else {
    // sample rows first; the sample is capped at bin_construct_sample_cnt
    Random rand(config.io_config.data_random_seed);
    const size_t sample_cnt = static_cast<size_t>(
      nrow < config.io_config.bin_construct_sample_cnt
        ? nrow
        : config.io_config.bin_construct_sample_cnt);
    auto sample_indices = rand.Sample(nrow, sample_cnt);
    // one value list per column
    std::vector<std::vector<double>> sample_values(ncol);
    for (auto sampled_row : sample_indices) {
      auto row_values = get_row_fun(static_cast<int>(sampled_row));
      for (size_t col = 0; col < row_values.size(); ++col) {
        sample_values[col].push_back(row_values[col]);
      }
    }
    dataset = loader.CostructFromSampleData(sample_values, nrow);
  }

  // push every row; each iteration passes its OpenMP thread number along
#pragma omp parallel for schedule(guided)
  for (int row_idx = 0; row_idx < nrow; ++row_idx) {
    const int tid = omp_get_thread_num();
    auto one_row = get_row_fun(row_idx);
    dataset->PushOneRow(tid, row_idx, one_row);
  }
  dataset->FinishLoad();
  *out = dataset;
  return 0;
}

229
230
/*!
 * \brief Build a dataset from CSR data and store it in *out
 * \param indptr Row pointer array, interpreted according to indptr_type
 * \param indptr_type Element type tag of indptr
 * \param indices Column index of every stored element
 * \param data Stored values, interpreted according to data_type
 * \param data_type Element type tag of data
 * \param nindptr Length of indptr; the row count is nindptr - 1
 * \param nelem Number of stored elements
 * \param num_col Column count; checked to be at least the number of sampled features
 * \param parameters Parameter string parsed by OverallConfig::LoadFromString
 * \param reference When non-null, feature bin mappers are copied from *reference
 * \param out Receives the finished dataset
 * \return Always 0
 */
DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,
  int indptr_type,
  const int32_t* indices,
  const void* data,
  int data_type,
  int64_t nindptr,
  int64_t nelem,
  int64_t num_col,
  const char* parameters,
  const DatesetHandle* reference,
  DatesetHandle* out) {
  OverallConfig config;
  config.LoadFromString(parameters);
  DatasetLoader loader(config.io_config, nullptr);
  auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
  const int32_t nrow = static_cast<int32_t>(nindptr - 1);

  Dataset* dataset = nullptr;
  if (reference != nullptr) {
    dataset = new Dataset(nrow, config.io_config.num_class);
    reinterpret_cast<const Dataset*>(*reference)->CopyFeatureBinMapperTo(dataset, config.io_config.is_enable_sparse);
  } else {
    // sample data first
    Random rand(config.io_config.data_random_seed);
    const size_t sample_cnt = static_cast<size_t>(
      nrow < config.io_config.bin_construct_sample_cnt
        ? nrow
        : config.io_config.bin_construct_sample_cnt);
    auto sample_indices = rand.Sample(nrow, sample_cnt);
    // sample_values[feature][k] holds the value of the k-th sampled row
    std::vector<std::vector<double>> sample_values;
    for (size_t sample_no = 0; sample_no < sample_indices.size(); ++sample_no) {
      auto sparse_row = get_row_fun(static_cast<int>(sample_indices[sample_no]));
      // every known feature gets a zero for this row first
      for (auto& feature_values : sample_values) {
        feature_values.push_back(0.0);
      }
      for (const auto& entry : sparse_row) {
        const size_t feature_idx = static_cast<size_t>(entry.first);
        // unseen feature index: append zero-filled columns up to it, each
        // already covering the sample_no + 1 rows processed so far
        while (sample_values.size() <= feature_idx) {
          sample_values.emplace_back(sample_no + 1, 0.0f);
        }
        // overwrite the zero with the stored value
        sample_values[feature_idx][sample_no] = entry.second;
      }
    }
    CHECK(num_col >= static_cast<int>(sample_values.size()));
    dataset = loader.CostructFromSampleData(sample_values, nrow);
  }

  // push every row; each iteration passes its OpenMP thread number along
#pragma omp parallel for schedule(guided)
  for (int row_idx = 0; row_idx < nindptr - 1; ++row_idx) {
    const int tid = omp_get_thread_num();
    auto one_row = get_row_fun(row_idx);
    dataset->PushOneRow(tid, row_idx, one_row);
  }
  dataset->FinishLoad();
  *out = dataset;

  return 0;
}

Guolin Ke's avatar
Guolin Ke committed
292

293
294
/*!
 * \brief Build a dataset from CSC data and store it in *out
 * \param col_ptr Column pointer array, interpreted according to col_ptr_type
 * \param col_ptr_type Element type tag of col_ptr
 * \param indices Row index of every stored element
 * \param data Stored values, interpreted according to data_type
 * \param data_type Element type tag of data
 * \param ncol_ptr Length of col_ptr; the column count is ncol_ptr - 1
 * \param nelem Number of stored elements
 * \param num_row Number of rows
 * \param parameters Parameter string parsed by OverallConfig::LoadFromString
 * \param reference When non-null, feature bin mappers are copied from *reference
 * \param out Receives the finished dataset
 * \return Always 0
 */
DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,
  int col_ptr_type,
  const int32_t* indices,
  const void* data,
  int data_type,
  int64_t ncol_ptr,
  int64_t nelem,
  int64_t num_row,
  const char* parameters,
  const DatesetHandle* reference,
  DatesetHandle* out) {
  OverallConfig config;
  config.LoadFromString(parameters);
  DatasetLoader loader(config.io_config, nullptr);
  auto get_col_fun = ColumnFunctionFromCSC(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem);
  const int32_t nrow = static_cast<int32_t>(num_row);

  Dataset* dataset = nullptr;
  if (reference != nullptr) {
    dataset = new Dataset(nrow, config.io_config.num_class);
    reinterpret_cast<const Dataset*>(*reference)->CopyFeatureBinMapperTo(dataset, config.io_config.is_enable_sparse);
  } else {
    Log::Warning("Construct from CSC format is not efficient");
    // sample data first
    Random rand(config.io_config.data_random_seed);
    const size_t sample_cnt = static_cast<size_t>(
      nrow < config.io_config.bin_construct_sample_cnt
        ? nrow
        : config.io_config.bin_construct_sample_cnt);
    auto sample_indices = rand.Sample(nrow, sample_cnt);
    // one list of sampled values per column
    std::vector<std::vector<double>> sample_values(ncol_ptr - 1);
    const int num_sampled_cols = static_cast<int>(sample_values.size());
#pragma omp parallel for schedule(guided)
    for (int col_idx = 0; col_idx < num_sampled_cols; ++col_idx) {
      auto column = get_col_fun(col_idx);
      sample_values[col_idx] = SampleFromOneColumn(column, sample_indices);
    }
    dataset = loader.CostructFromSampleData(sample_values, nrow);
  }

  // push every column; each iteration passes its OpenMP thread number along
#pragma omp parallel for schedule(guided)
  for (int col_idx = 0; col_idx < ncol_ptr - 1; ++col_idx) {
    const int tid = omp_get_thread_num();
    auto one_col = get_col_fun(col_idx);
    dataset->PushOneColumn(tid, col_idx, one_col);
  }
  dataset->FinishLoad();
  *out = dataset;
  return 0;
}

339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
/*!
 * \brief Delete the Dataset that *handle points to
 * \note *handle itself is left unchanged
 * \return Always 0
 */
DllExport int LGBM_DatasetFree(DatesetHandle* handle) {
  delete reinterpret_cast<Dataset*>(*handle);
  return 0;
}

/*!
 * \brief Forward to Dataset::SaveBinaryFile for the dataset behind handle
 * \return Always 0
 */
DllExport int LGBM_DatasetSaveBinary(DatesetHandle handle,
  const char* filename) {
  Dataset* target = reinterpret_cast<Dataset*>(handle);
  target->SaveBinaryFile(filename);
  return 0;
}

/*!
 * \brief Set a named field of the dataset from a float32 or int32 array
 * \param handle Dataset to modify
 * \param field_name Name of the field
 * \param field_data Array of values, interpreted according to type
 * \param num_element Number of values; narrowed to int32_t before use
 * \param type dtype_float32 or dtype_int32; any other value fails
 * \return 0 when the matching setter reports success, -1 otherwise
 */
DllExport int LGBM_DatasetSetField(DatesetHandle handle,
  const char* field_name,
  const void* field_data,
  int64_t num_element,
  int type) {
  Dataset* target = reinterpret_cast<Dataset*>(handle);
  const int32_t count = static_cast<int32_t>(num_element);

  if (type == dtype_float32) {
    const bool stored = target->SetFloatField(field_name, reinterpret_cast<const float*>(field_data), count);
    return stored ? 0 : -1;
  }
  if (type == dtype_int32) {
    const bool stored = target->SetIntField(field_name, reinterpret_cast<const int*>(field_data), count);
    return stored ? 0 : -1;
  }
  // unsupported element type
  return -1;
}

/*!
 * \brief Look up a named field, trying the float getter first and the int getter second
 * \param handle Dataset to query
 * \param field_name Name of the field
 * \param out_len Passed to the getters to receive the length
 * \param out_ptr Passed to the getters to receive the data pointer
 * \param out_type Set to dtype_float32 or dtype_int32, matching the getter that succeeded
 * \return 0 when one of the getters succeeds, -1 otherwise
 */
DllExport int LGBM_DatasetGetField(DatesetHandle handle,
  const char* field_name,
  int64_t* out_len,
  const void** out_ptr,
  int* out_type) {
  Dataset* source = reinterpret_cast<Dataset*>(handle);

  const bool found_float =
    source->GetFloatField(field_name, out_len, reinterpret_cast<const float**>(out_ptr));
  if (found_float) {
    *out_type = dtype_float32;
    return 0;
  }

  const bool found_int =
    source->GetIntField(field_name, out_len, reinterpret_cast<const int**>(out_ptr));
  if (found_int) {
    *out_type = dtype_int32;
    return 0;
  }

  return -1;
}

/*!
 * \brief Write Dataset::num_data() of the given dataset to *out
 * \return Always 0
 */
DllExport int LGBM_DatasetGetNumData(DatesetHandle handle,
  int64_t* out) {
  const Dataset* source = reinterpret_cast<Dataset*>(handle);
  *out = source->num_data();
  return 0;
}

/*!
 * \brief Write Dataset::num_total_features() of the given dataset to *out
 * \return Always 0
 */
DllExport int LGBM_DatasetGetNumFeature(DatesetHandle handle,
  int64_t* out) {
  const Dataset* source = reinterpret_cast<Dataset*>(handle);
  *out = source->num_total_features();
  return 0;
}
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456


// ---- start of booster

/*!
 * \brief Create a Booster for training and store it in *out
 * \param train_data Handle of the training dataset
 * \param valid_datas Handles of the validation datasets
 * \param valid_names One name per validation dataset
 * \param n_valid_datas Number of entries read from valid_datas and valid_names
 * \param parameters Parameter string handed to the Booster constructor
 * \param out Receives the newly allocated Booster
 * \return Always 0
 */
DllExport int LGBM_BoosterCreate(const DatesetHandle train_data,
  const DatesetHandle valid_datas[],
  const char* valid_names[],
  int n_valid_datas,
  const char* parameters,
  BoosterHandle* out) {
  std::vector<const Dataset*> validation_sets;
  std::vector<std::string> validation_names;
  for (int k = 0; k < n_valid_datas; ++k) {
    validation_sets.push_back(reinterpret_cast<const Dataset*>(valid_datas[k]));
    validation_names.push_back(std::string(valid_names[k]));
  }
  const Dataset* training_set = reinterpret_cast<const Dataset*>(train_data);
  *out = new Booster(training_set, validation_sets, validation_names, parameters);
  return 0;
}

/*!
 * \brief Create a Booster from a model file and store it in *out
 * \return Always 0
 */
DllExport int LGBM_BoosterLoadFromModelfile(
  const char* filename,
  BoosterHandle* out) {
  Booster* loaded = new Booster(filename);
  *out = loaded;
  return 0;
}

/*!
 * \brief Delete the Booster behind handle
 * \return Always 0
 */
DllExport int LGBM_BoosterFree(BoosterHandle handle) {
  delete reinterpret_cast<Booster*>(handle);
  return 0;
}


/*!
 * \brief Run one training iteration
 * \param handle Booster to train
 * \param is_finished Set to 1 when Booster::TrainOneIter returns true, otherwise 0
 * \return Always 0
 */
DllExport int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished) {
  Booster* booster = reinterpret_cast<Booster*>(handle);
  const bool finished = booster->TrainOneIter();
  *is_finished = finished ? 1 : 0;
  return 0;
}

/*!
 * \brief Run one training iteration with caller-supplied gradients and hessians
 * \param handle Booster to train
 * \param grad Gradients forwarded to Booster::TrainOneIter
 * \param hess Hessians forwarded to Booster::TrainOneIter
 * \param is_finished Set to 1 when Booster::TrainOneIter returns true, otherwise 0
 * \return Always 0
 */
DllExport int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle,
  const float* grad,
  const float* hess,
  int* is_finished) {
  Booster* booster = reinterpret_cast<Booster*>(handle);
  const bool finished = booster->TrainOneIter(grad, hess);
  *is_finished = finished ? 1 : 0;
  return 0;
}

/*!
 * \brief Copy the results of Boosting::GetEvalAt(data) into a float buffer
 * \param handle Booster to query
 * \param data Index forwarded to Boosting::GetEvalAt
 * \param out_len Receives the number of results
 * \param out_results Caller-provided buffer; one float is written per result
 * \return Always 0
 */
DllExport int LGBM_BoosterEval(BoosterHandle handle,
  int data,
  int64_t* out_len,
  float* out_results) {
  Booster* booster = reinterpret_cast<Booster*>(handle);
  auto eval_values = booster->GetBoosting()->GetEvalAt(data);
  *out_len = static_cast<int64_t>(eval_values.size());

  float* dst = out_results;
  for (const auto& value : eval_values) {
    *dst = static_cast<float>(value);
    ++dst;
  }
  return 0;
}

/*!
 * \brief Expose the pointer returned by Boosting::GetTrainingScore
 * \param handle Booster to query
 * \param out_len Receives the length reported by GetTrainingScore
 * \param out_result Receives the pointer returned by GetTrainingScore
 * \return Always 0
 */
DllExport int LGBM_BoosterGetScore(BoosterHandle handle,
  int64_t* out_len,
  const float** out_result) {
  Booster* booster = reinterpret_cast<Booster*>(handle);
  int score_count = 0;
  *out_result = booster->GetBoosting()->GetTrainingScore(&score_count);
  *out_len = static_cast<int64_t>(score_count);
  return 0;
}

Guolin Ke's avatar
Guolin Ke committed
483
484
/*!
 * \brief Let Boosting::GetPredictAt fill a caller-provided buffer
 * \param handle Booster to query
 * \param data Index forwarded to Boosting::GetPredictAt
 * \param out_len Receives the length reported by GetPredictAt
 * \param out_result Caller-provided buffer handed to GetPredictAt
 * \return Always 0
 */
DllExport int LGBM_BoosterGetPredict(BoosterHandle handle,
  int data,
  int64_t* out_len,
  float* out_result) {
  Booster* booster = reinterpret_cast<Booster*>(handle);
  int written = 0;
  booster->GetBoosting()->GetPredictAt(data, out_result, &written);
  *out_len = static_cast<int64_t>(written);
  return 0;
}

496
/*!
 * \brief Predict for every row of a CSR matrix
 * \param handle Booster used for prediction
 * \param indptr Row pointer array, interpreted according to indptr_type
 * \param indptr_type Element type tag of indptr
 * \param indices Column index of every stored element
 * \param data Stored values, interpreted according to data_type
 * \param data_type Element type tag of data
 * \param nindptr Length of indptr; the row count is nindptr - 1
 * \param nelem Number of stored elements
 * \param predict_type Forwarded to Booster::PrepareForPrediction
 * \param n_used_trees Narrowed to int and forwarded to Booster::PrepareForPrediction
 * \param out_result Caller-provided buffer; NumberOfClasses() values are written
 *        per row, at offset row * NumberOfClasses()
 * \return Always 0
 * \note The unnamed int64_t parameter is not used.
 */
DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
  const void* indptr,
  int indptr_type,
  const int32_t* indices,
  const void* data,
  int data_type,
  int64_t nindptr,
  int64_t nelem,
  int64_t,
  int predict_type,
  int64_t n_used_trees,
  double* out_result) {

  Booster* ref_booster = reinterpret_cast<Booster*>(handle);
  ref_booster->PrepareForPrediction(static_cast<int>(n_used_trees), predict_type);

  auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
  const int num_class = ref_booster->NumberOfClasses();
  const int nrow = static_cast<int>(nindptr - 1);
#pragma omp parallel for schedule(guided)
  for (int i = 0; i < nrow; ++i) {
    auto one_row = get_row_fun(i);
    auto predicton_result = ref_booster->Predict(one_row);
    // Compute the offset in 64 bits: i * num_class overflows int once
    // nrow * num_class reaches 2^31.
    double* out_row = out_result + static_cast<int64_t>(i) * num_class;
    for (int j = 0; j < num_class; ++j) {
      out_row[j] = predicton_result[j];
    }
  }
  return 0;
}
525
526
527

/*!
 * \brief Predict for every row of a dense matrix
 * \param handle Booster used for prediction
 * \param data Matrix values, interpreted according to data_type
 * \param data_type Element type tag forwarded to RowPairFunctionFromDenseMatric
 * \param nrow Number of rows
 * \param ncol Number of columns
 * \param is_row_major Non-zero when data is laid out row by row
 * \param predict_type Forwarded to Booster::PrepareForPrediction
 * \param n_used_trees Narrowed to int and forwarded to Booster::PrepareForPrediction
 * \param out_result Caller-provided buffer; NumberOfClasses() values are written
 *        per row, at offset row * NumberOfClasses()
 * \return Always 0
 */
DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
  const void* data,
  int data_type,
  int32_t nrow,
  int32_t ncol,
  int is_row_major,
  int predict_type,
  int64_t n_used_trees,
  double* out_result) {

  Booster* ref_booster = reinterpret_cast<Booster*>(handle);
  ref_booster->PrepareForPrediction(static_cast<int>(n_used_trees), predict_type);

  auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major);
  const int num_class = ref_booster->NumberOfClasses();
#pragma omp parallel for schedule(guided)
  for (int i = 0; i < nrow; ++i) {
    auto one_row = get_row_fun(i);
    auto predicton_result = ref_booster->Predict(one_row);
    // Compute the offset in 64 bits: i * num_class overflows int once
    // nrow * num_class reaches 2^31.
    double* out_row = out_result + static_cast<int64_t>(i) * num_class;
    for (int j = 0; j < num_class; ++j) {
      out_row[j] = predicton_result[j];
    }
  }
  return 0;
}
551
552
553

/*!
 * \brief Forward to Booster::SaveModelToFile for the booster behind handle
 * \param handle Booster to save
 * \param num_used_model Forwarded unchanged
 * \param filename Forwarded unchanged
 * \return Always 0
 */
DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
  int num_used_model,
  const char* filename) {
  Booster* booster = reinterpret_cast<Booster*>(handle);
  booster->SaveModelToFile(num_used_model, filename);
  return 0;
}
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817



std::function<std::vector<double>(int row_idx)>
RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major) {
  if (data_type == dtype_float32) {
    const float* data_ptr = reinterpret_cast<const float*>(data);
    if (is_row_major) {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<double> ret;
        auto tmp_ptr = data_ptr + num_col * row_idx;
        for (int i = 0; i < num_col; ++i) {
          ret.push_back(static_cast<double>(*(tmp_ptr + i)));
        }
        return ret;
      };
    } else {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<double> ret;
        for (int i = 0; i < num_col; ++i) {
          ret.push_back(static_cast<double>(*(data_ptr + num_row * i + row_idx)));
        }
        return ret;
      };
    }
  } else if (data_type == dtype_float64) {
    const double* data_ptr = reinterpret_cast<const double*>(data);
    if (is_row_major) {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<double> ret;
        auto tmp_ptr = data_ptr + num_col * row_idx;
        for (int i = 0; i < num_col; ++i) {
          ret.push_back(static_cast<double>(*(tmp_ptr + i)));
        }
        return ret;
      };
    } else {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<double> ret;
        for (int i = 0; i < num_col; ++i) {
          ret.push_back(static_cast<double>(*(data_ptr + num_row * i + row_idx)));
        }
        return ret;
      };
    }
  } else {
    Log::Fatal("unknown data type in RowFunctionFromDenseMatric");
  }
}

std::function<std::vector<std::pair<int, double>>(int row_idx)>
RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major) {
  if (data_type == dtype_float32) {
    const float* data_ptr = reinterpret_cast<const float*>(data);
    if (is_row_major) {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<std::pair<int, double>> ret;
        auto tmp_ptr = data_ptr + num_col * row_idx;
        for (int i = 0; i < num_col; ++i) {
          ret.emplace_back(i, static_cast<double>(*(tmp_ptr + i)));
        }
        return ret;
      };
    } else {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<std::pair<int, double>> ret;
        for (int i = 0; i < num_col; ++i) {
          ret.emplace_back(i, static_cast<double>(*(data_ptr + num_row * i + row_idx)));
        }
        return ret;
      };
    }
  } else if (data_type == dtype_float64) {
    const double* data_ptr = reinterpret_cast<const double*>(data);
    if (is_row_major) {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<std::pair<int, double>> ret;
        auto tmp_ptr = data_ptr + num_col * row_idx;
        for (int i = 0; i < num_col; ++i) {
          ret.emplace_back(i, static_cast<double>(*(tmp_ptr + i)));
        }
        return ret;
      };
    } else {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<std::pair<int, double>> ret;
        for (int i = 0; i < num_col; ++i) {
          ret.emplace_back(i, static_cast<double>(*(data_ptr + num_row * i + row_idx)));
        }
        return ret;
      };
    }
  } else {
    Log::Fatal("unknown data type in RowPairFunctionFromDenseMatric");
  }
}

std::function<std::vector<std::pair<int, double>>(int idx)>
RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices, const void* data, int data_type, int64_t nindptr, int64_t nelem) {
  if (data_type == dtype_float32) {
    const float* data_ptr = reinterpret_cast<const float*>(data);
    if (indptr_type == dtype_int32) {
      const int32_t* ptr_indptr = reinterpret_cast<const int32_t*>(indptr);
      return [ptr_indptr, indices, data_ptr, nindptr, nelem](int idx) {
        CHECK(idx + 1 < nindptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_indptr[idx];
        int64_t end = ptr_indptr[idx + 1];
        CHECK(start >= 0 && end < nelem);
        for (int64_t i = start; i <= end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else if (indptr_type == dtype_int64) {
      const int64_t* ptr_indptr = reinterpret_cast<const int64_t*>(indptr);
      return [ptr_indptr, indices, data_ptr, nindptr, nelem](int idx) {
        CHECK(idx + 1 < nindptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_indptr[idx];
        int64_t end = ptr_indptr[idx + 1];
        CHECK(start >= 0 && end < nelem);
        for (int64_t i = start; i <= end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else {
      Log::Fatal("unknown data type in RowFunctionFromCSR");
    }
  } else if (data_type == dtype_float64) {
    const double* data_ptr = reinterpret_cast<const double*>(data);
    if (indptr_type == dtype_int32) {
      const int32_t* ptr_indptr = reinterpret_cast<const int32_t*>(indptr);
      return [ptr_indptr, indices, data_ptr, nindptr, nelem](int idx) {
        CHECK(idx + 1 < nindptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_indptr[idx];
        int64_t end = ptr_indptr[idx + 1];
        CHECK(start >= 0 && end < nelem);
        for (int64_t i = start; i <= end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else if (indptr_type == dtype_int64) {
      const int64_t* ptr_indptr = reinterpret_cast<const int64_t*>(indptr);
      return [ptr_indptr, indices, data_ptr, nindptr, nelem](int idx) {
        CHECK(idx + 1 < nindptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_indptr[idx];
        int64_t end = ptr_indptr[idx + 1];
        CHECK(start >= 0 && end < nelem);
        for (int64_t i = start; i <= end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else {
      Log::Fatal("unknown data type in RowFunctionFromCSR");
    }
  } else {
    Log::Fatal("unknown data type in RowFunctionFromCSR");
  }
}

std::function<std::vector<std::pair<int, double>>(int idx)>
ColumnFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* indices, const void* data, int data_type, int64_t ncol_ptr, int64_t nelem) {
  if (data_type == dtype_float32) {
    const float* data_ptr = reinterpret_cast<const float*>(data);
    if (col_ptr_type == dtype_int32) {
      const int32_t* ptr_col_ptr = reinterpret_cast<const int32_t*>(col_ptr);
      return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem](int idx) {
        CHECK(idx + 1 < ncol_ptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_col_ptr[idx];
        int64_t end = ptr_col_ptr[idx + 1];
        CHECK(start >= 0 && end <= nelem);
        for (int64_t i = start; i < end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else if (col_ptr_type == dtype_int64) {
      const int64_t* ptr_col_ptr = reinterpret_cast<const int64_t*>(col_ptr);
      return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem](int idx) {
        CHECK(idx + 1 < ncol_ptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_col_ptr[idx];
        int64_t end = ptr_col_ptr[idx + 1];
        CHECK(start >= 0 && end <= nelem);
        for (int64_t i = start; i < end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else {
      Log::Fatal("unknown data type in ColumnFunctionFromCSC");
    }
  } else if (data_type == dtype_float64) {
    const double* data_ptr = reinterpret_cast<const double*>(data);
    if (col_ptr_type == dtype_int32) {
      const int32_t* ptr_col_ptr = reinterpret_cast<const int32_t*>(col_ptr);
      return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem](int idx) {
        CHECK(idx + 1 < ncol_ptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_col_ptr[idx];
        int64_t end = ptr_col_ptr[idx + 1];
        CHECK(start >= 0 && end <= nelem);
        for (int64_t i = start; i < end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else if (col_ptr_type == dtype_int64) {
      const int64_t* ptr_col_ptr = reinterpret_cast<const int64_t*>(col_ptr);
      return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem](int idx) {
        CHECK(idx + 1 < ncol_ptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_col_ptr[idx];
        int64_t end = ptr_col_ptr[idx + 1];
        CHECK(start >= 0 && end <= nelem);
        for (int64_t i = start; i < end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else {
      Log::Fatal("unknown data type in ColumnFunctionFromCSC");
    }
  } else {
    Log::Fatal("unknown data type in ColumnFunctionFromCSC");
  }
}

/*!
 * \brief Pick the values of one sparse column at the requested row indices
 * \param data (row index, value) pairs of the column
 * \param indices Row indices to sample
 * \return One value per entry of indices: the stored value when the column
 *         has that row, 0 otherwise
 * \note The cursor into data only ever moves forward, so this presumably
 *       expects both data and indices in ascending row order — verify
 *       against callers.
 */
std::vector<double> SampleFromOneColumn(const std::vector<std::pair<int, double>>& data, const std::vector<size_t>& indices) {
  std::vector<double> sampled;
  auto cursor = data.begin();
  const auto column_end = data.end();
  for (const size_t wanted : indices) {
    const int target_row = static_cast<int>(wanted);
    // skip stored entries that lie before the wanted row
    while (cursor != column_end && cursor->first < target_row) {
      ++cursor;
    }
    const bool has_value = (cursor != column_end) && (cursor->first == target_row);
    sampled.push_back(has_value ? cursor->second : 0.0);
  }
  return sampled;
}