c_api.cpp 27 KB
Newer Older
Guolin Ke's avatar
Guolin Ke committed
1
2
3
4
#include <omp.h>

#include <LightGBM/utils/common.h>
#include <LightGBM/utils/random.h>
Guolin Ke's avatar
Guolin Ke committed
5
#include <LightGBM/c_api.h>
Guolin Ke's avatar
Guolin Ke committed
6
#include <LightGBM/dataset_loader.h>
Guolin Ke's avatar
Guolin Ke committed
7
8
9
10
11
12
13
14
15
16
#include <LightGBM/dataset.h>
#include <LightGBM/boosting.h>
#include <LightGBM/objective_function.h>
#include <LightGBM/metric.h>
#include <LightGBM/config.h>

#include <cstdio>
#include <vector>
#include <string>
#include <cstring>
Guolin Ke's avatar
Guolin Ke committed
17
#include <memory>
Guolin Ke's avatar
Guolin Ke committed
18

Guolin Ke's avatar
Guolin Ke committed
19
20
#include "./application/predictor.hpp"

Guolin Ke's avatar
Guolin Ke committed
21
22
23
24
25
namespace LightGBM {

class Booster {
public:
  explicit Booster(const char* filename):
Guolin Ke's avatar
Guolin Ke committed
26
    boosting_(Boosting::CreateBoosting(filename)), predictor_(nullptr) {
Guolin Ke's avatar
Guolin Ke committed
27
28
29
30
31
32
  }

  Booster(const Dataset* train_data, 
    std::vector<const Dataset*> valid_data, 
    std::vector<std::string> valid_names,
    const char* parameters)
Guolin Ke's avatar
Guolin Ke committed
33
    :train_data_(train_data), valid_datas_(valid_data), predictor_(nullptr) {
Guolin Ke's avatar
Guolin Ke committed
34
35
36
    config_.LoadFromString(parameters);
    // create boosting
    if (config_.io_config.input_model.size() > 0) {
Guolin Ke's avatar
Guolin Ke committed
37
      Log::Warning("continued train from model is not support for c_api, \
Guolin Ke's avatar
Guolin Ke committed
38
39
40
41
42
43
44
45
        please use continued train with input score");
    }
    boosting_ = Boosting::CreateBoosting(config_.boosting_type, "");
    // create objective function
    objective_fun_ =
      ObjectiveFunction::CreateObjectiveFunction(config_.objective_type,
        config_.objective_config);
    // create training metric
Guolin Ke's avatar
Guolin Ke committed
46
47
48
49
50
51
52
    for (auto metric_type : config_.metric_types) {
      Metric* metric =
        Metric::CreateMetric(metric_type, config_.metric_config);
      if (metric == nullptr) { continue; }
      metric->Init("training", train_data_->metadata(),
        train_data_->num_data());
      train_metric_.push_back(metric);
Guolin Ke's avatar
Guolin Ke committed
53
    }
Guolin Ke's avatar
Guolin Ke committed
54
    
Guolin Ke's avatar
Guolin Ke committed
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
    // add metric for validation data
    for (size_t i = 0; i < valid_datas_.size(); ++i) {
      valid_metrics_.emplace_back();
      for (auto metric_type : config_.metric_types) {
        Metric* metric = Metric::CreateMetric(metric_type, config_.metric_config);
        if (metric == nullptr) { continue; }
        metric->Init(valid_names[i].c_str(),
          valid_datas_[i]->metadata(),
          valid_datas_[i]->num_data());
        valid_metrics_.back().push_back(metric);
      }
    }
    // initialize the objective function
    objective_fun_->Init(train_data_->metadata(), train_data_->num_data());
    // initialize the boosting
    boosting_->Init(config_.boosting_config, train_data_, objective_fun_,
      ConstPtrInVectorWarpper<Metric>(train_metric_));
    // add validation data into boosting
    for (size_t i = 0; i < valid_datas_.size(); ++i) {
      boosting_->AddDataset(valid_datas_[i],
        ConstPtrInVectorWarpper<Metric>(valid_metrics_[i]));
    }
  }

  ~Booster() {
    for (auto& metric : train_metric_) {
      if (metric != nullptr) { delete metric; }
    }
    for (auto& metric : valid_metrics_) {
      for (auto& sub_metric : metric) {
        if (sub_metric != nullptr) { delete sub_metric; }
      }
    }
    valid_metrics_.clear();
    if (boosting_ != nullptr) { delete boosting_; }
    if (objective_fun_ != nullptr) { delete objective_fun_; }
Guolin Ke's avatar
Guolin Ke committed
91
    if (predictor_ != nullptr) { delete predictor_; }
Guolin Ke's avatar
Guolin Ke committed
92
  }
93
94
95
96
97
98
99
100
101
102
103

  bool TrainOneIter() {
    return boosting_->TrainOneIter(nullptr, nullptr, false);
  }

  bool TrainOneIter(const float* gradients, const float* hessians) {
    return boosting_->TrainOneIter(gradients, hessians, false);
  }

  void PrepareForPrediction(int num_used_model, int predict_type) {
    boosting_->SetNumUsedModel(num_used_model);
Guolin Ke's avatar
Guolin Ke committed
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
    if (predictor_ != nullptr) { delete predictor_; }
    bool is_predict_leaf = false;
    bool is_raw_score = false;
    if (predict_type == 2) {
      is_predict_leaf = true;
    } else if (predict_type == 1) {
      is_raw_score = false;
    } else {
      is_raw_score = true;
    }
    predictor_ = new Predictor(boosting_, is_raw_score, is_predict_leaf);
  }

  std::vector<double> Predict(const std::vector<std::pair<int, double>>& features) {
    return predictor_->GetPredictFunction()(features);
119
120
  }

Guolin Ke's avatar
Guolin Ke committed
121
122
123
  void SaveModelToFile(int num_used_model, const char* filename) {
    boosting_->SaveModelToFile(num_used_model, true, filename);
  }
124
125
  const Boosting* GetBoosting() const { return boosting_; }

Guolin Ke's avatar
Guolin Ke committed
126
  const inline int NumberOfClasses() const { return boosting_->NumberOfClasses(); }
127

Guolin Ke's avatar
Guolin Ke committed
128
private:
129

Guolin Ke's avatar
Guolin Ke committed
130
131
132
133
134
135
136
137
138
139
140
141
142
  Boosting* boosting_;
  /*! \brief All configs */
  OverallConfig config_;
  /*! \brief Training data */
  const Dataset* train_data_;
  /*! \brief Validation data */
  std::vector<const Dataset*> valid_datas_;
  /*! \brief Metric for training data */
  std::vector<Metric*> train_metric_;
  /*! \brief Metrics for validation data */
  std::vector<std::vector<Metric*>> valid_metrics_;
  /*! \brief Training objective function */
  ObjectiveFunction* objective_fun_;
Guolin Ke's avatar
Guolin Ke committed
143
144
  /*! \brief Using predictor for prediction task */
  Predictor* predictor_;
145

Guolin Ke's avatar
Guolin Ke committed
146
147
148
};

}
Guolin Ke's avatar
Guolin Ke committed
149
150
151

using namespace LightGBM;

Guolin Ke's avatar
Guolin Ke committed
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169

/*!
 * \brief Return the last error message.
 * \return Currently always the same fixed placeholder string; no error state
 *         is recorded by this function.
 */
DllExport const char* LGBM_GetLastError() {
  static const char* const kPlaceholderMessage = "Not error msg now, will support soon";
  return kPlaceholderMessage;
}



/*!
 * \brief Load a dataset from a data file.
 * \param filename Path of the data file
 * \param parameters Parameter string parsed by OverallConfig::LoadFromString
 * \param reference Optional handle of an existing dataset; when non-null the
 *        file is loaded aligned with that dataset
 * \param out Receives the created dataset
 * \return 0
 */
DllExport int LGBM_CreateDatasetFromFile(const char* filename,
  const char* parameters,
  const DatesetHandle* reference,
  DatesetHandle* out) {
  OverallConfig config;
  config.LoadFromString(parameters);
  DatasetLoader loader(config.io_config, nullptr);
  if (reference != nullptr) {
    const Dataset* aligned_with = reinterpret_cast<const Dataset*>(*reference);
    *out = loader.LoadFromFileAlignWithOtherDataset(filename, aligned_with);
    return 0;
  }
  *out = loader.LoadFromFile(filename);
  return 0;
}


/*!
 * \brief Load a dataset from a binary file using a default-constructed config.
 * \param filename Path of the binary file
 * \param out Receives the created dataset
 * \return 0
 */
DllExport int LGBM_CreateDatasetFromBinaryFile(const char* filename,
  DatesetHandle* out) {
  OverallConfig default_config;
  DatasetLoader bin_loader(default_config.io_config, nullptr);
  // NOTE(review): the trailing 0 and 1 are presumably rank and machine count - confirm
  auto loaded = bin_loader.LoadFromBinFile(filename, 0, 1);
  *out = loaded;
  return 0;
}

/*!
 * \brief Build a dataset from a dense matrix.
 * \param data Pointer to the matrix values
 * \param data_type Element type tag forwarded to RowFunctionFromDenseMatric
 * \param nrow Number of rows
 * \param ncol Number of columns
 * \param is_row_major Non-zero when the matrix is stored row by row
 * \param parameters Parameter string parsed by OverallConfig::LoadFromString
 * \param reference Optional existing dataset whose feature mappers are copied;
 *        when null, rows are sampled to construct the dataset
 * \param out Receives the created dataset
 * \return 0
 */
DllExport int LGBM_CreateDatasetFromMat(const void* data,
  int data_type,
  int32_t nrow,
  int32_t ncol,
  int is_row_major,
  const char* parameters,
  const DatesetHandle* reference,
  DatesetHandle* out) {
  OverallConfig config;
  config.LoadFromString(parameters);
  DatasetLoader loader(config.io_config, nullptr);
  auto get_row_fun = RowFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major);
  Dataset* dataset = nullptr;
  if (reference != nullptr) {
    dataset = new Dataset(nrow, config.io_config.num_class);
    dataset->CopyFeatureMapperFrom(reinterpret_cast<const Dataset*>(*reference), config.io_config.is_enable_sparse);
  } else {
    // sample rows and collect, per column, the values that are not (near) zero
    Random rand(config.io_config.data_random_seed);
    const size_t sample_cnt = static_cast<size_t>(nrow < config.io_config.bin_construct_sample_cnt ? nrow : config.io_config.bin_construct_sample_cnt);
    auto sample_indices = rand.Sample(nrow, sample_cnt);
    std::vector<std::vector<double>> sample_values(ncol);
    for (auto sampled_row : sample_indices) {
      auto row = get_row_fun(static_cast<int>(sampled_row));
      for (size_t col = 0; col < row.size(); ++col) {
        const double value = row[col];
        if (std::fabs(value) > 1e-15) {
          sample_values[col].push_back(value);
        }
      }
    }
    dataset = loader.CostructFromSampleData(sample_values, sample_cnt, nrow);
  }

  // push every row, each OpenMP thread using its own thread id
#pragma omp parallel for schedule(guided)
  for (int row_idx = 0; row_idx < nrow; ++row_idx) {
    const int thread_id = omp_get_thread_num();
    auto row_values = get_row_fun(row_idx);
    dataset->PushOneRow(thread_id, row_idx, row_values);
  }
  dataset->FinishLoad();
  *out = dataset;
  return 0;
}

231
232
/*!
 * \brief Build a dataset from a CSR sparse matrix.
 * \param indptr Row pointer array
 * \param indptr_type Type tag of indptr, forwarded to RowFunctionFromCSR
 * \param indices Column indices
 * \param data Values
 * \param data_type Type tag of data, forwarded to RowFunctionFromCSR
 * \param nindptr Number of entries in indptr (row count is nindptr - 1)
 * \param nelem Number of stored elements
 * \param num_col Number of columns; must not be smaller than the largest
 *        sampled feature index + 1
 * \param parameters Parameter string parsed by OverallConfig::LoadFromString
 * \param reference Optional existing dataset whose feature mappers are copied;
 *        when null, rows are sampled to construct the dataset
 * \param out Receives the created dataset
 * \return 0
 */
DllExport int LGBM_CreateDatasetFromCSR(const void* indptr,
  int indptr_type,
  const int32_t* indices,
  const void* data,
  int data_type,
  int64_t nindptr,
  int64_t nelem,
  int64_t num_col,
  const char* parameters,
  const DatesetHandle* reference,
  DatesetHandle* out) {
  OverallConfig config;
  config.LoadFromString(parameters);
  DatasetLoader loader(config.io_config, nullptr);
  auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
  const int32_t nrow = static_cast<int32_t>(nindptr - 1);
  Dataset* dataset = nullptr;
  if (reference != nullptr) {
    dataset = new Dataset(nrow, config.io_config.num_class);
    dataset->CopyFeatureMapperFrom(reinterpret_cast<const Dataset*>(*reference), config.io_config.is_enable_sparse);
  } else {
    // sample rows and collect, per feature, the values that are not (near) zero
    Random rand(config.io_config.data_random_seed);
    const size_t sample_cnt = static_cast<size_t>(nrow < config.io_config.bin_construct_sample_cnt ? nrow : config.io_config.bin_construct_sample_cnt);
    auto sample_indices = rand.Sample(nrow, sample_cnt);
    std::vector<std::vector<double>> sample_values;
    for (auto sampled_row : sample_indices) {
      auto row = get_row_fun(static_cast<int>(sampled_row));
      for (const auto& entry : row) {
        if (!(std::fabs(entry.second) > 1e-15)) { continue; }
        // grow the per-feature lists until this feature index is addressable
        const size_t feature_idx = static_cast<size_t>(entry.first);
        while (sample_values.size() <= feature_idx) {
          sample_values.emplace_back();
        }
        sample_values[entry.first].push_back(entry.second);
      }
    }
    CHECK(num_col >= static_cast<int>(sample_values.size()));
    dataset = loader.CostructFromSampleData(sample_values, sample_cnt, nrow);
  }

  // push every row, each OpenMP thread using its own thread id
#pragma omp parallel for schedule(guided)
  for (int row_idx = 0; row_idx < nindptr - 1; ++row_idx) {
    const int thread_id = omp_get_thread_num();
    auto row_pairs = get_row_fun(row_idx);
    dataset->PushOneRow(thread_id, row_idx, row_pairs);
  }
  dataset->FinishLoad();
  *out = dataset;

  return 0;
}

Guolin Ke's avatar
Guolin Ke committed
291

292
293
/*!
 * \brief Build a dataset from a CSC sparse matrix.
 * \param col_ptr Column pointer array
 * \param col_ptr_type Type tag of col_ptr, forwarded to ColumnFunctionFromCSC
 * \param indices Row indices
 * \param data Values
 * \param data_type Type tag of data, forwarded to ColumnFunctionFromCSC
 * \param ncol_ptr Number of entries in col_ptr (column count is ncol_ptr - 1)
 * \param nelem Number of stored elements
 * \param num_row Number of rows
 * \param parameters Parameter string parsed by OverallConfig::LoadFromString
 * \param reference Optional existing dataset whose feature mappers are copied;
 *        when null, rows are sampled to construct the dataset
 * \param out Receives the created dataset
 * \return 0
 */
DllExport int LGBM_CreateDatasetFromCSC(const void* col_ptr,
  int col_ptr_type,
  const int32_t* indices,
  const void* data,
  int data_type,
  int64_t ncol_ptr,
  int64_t nelem,
  int64_t num_row,
  const char* parameters,
  const DatesetHandle* reference,
  DatesetHandle* out) {
  OverallConfig config;
  config.LoadFromString(parameters);
  DatasetLoader loader(config.io_config, nullptr);
  auto get_col_fun = ColumnFunctionFromCSC(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem);
  const int32_t nrow = static_cast<int32_t>(num_row);
  Dataset* dataset = nullptr;
  if (reference != nullptr) {
    dataset = new Dataset(nrow, config.io_config.num_class);
    dataset->CopyFeatureMapperFrom(reinterpret_cast<const Dataset*>(*reference), config.io_config.is_enable_sparse);
  } else {
    Log::Warning("Construct from CSC format is not efficient");
    // choose the sampled rows once, then sample every column in parallel
    Random rand(config.io_config.data_random_seed);
    const size_t sample_cnt = static_cast<size_t>(nrow < config.io_config.bin_construct_sample_cnt ? nrow : config.io_config.bin_construct_sample_cnt);
    auto sample_indices = rand.Sample(nrow, sample_cnt);
    std::vector<std::vector<double>> sample_values(ncol_ptr - 1);
    const int num_sampled_cols = static_cast<int>(sample_values.size());
#pragma omp parallel for schedule(guided)
    for (int col_idx = 0; col_idx < num_sampled_cols; ++col_idx) {
      auto column = get_col_fun(col_idx);
      sample_values[col_idx] = SampleFromOneColumn(column, sample_indices);
    }
    dataset = loader.CostructFromSampleData(sample_values, sample_cnt, nrow);
  }

  // push every column, each OpenMP thread using its own thread id
#pragma omp parallel for schedule(guided)
  for (int col_idx = 0; col_idx < ncol_ptr - 1; ++col_idx) {
    const int thread_id = omp_get_thread_num();
    auto column = get_col_fun(col_idx);
    dataset->PushOneColumn(thread_id, col_idx, column);
  }
  dataset->FinishLoad();
  *out = dataset;
  return 0;
}

338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
/*!
 * \brief Destroy the dataset referenced by a handle.
 * \param handle Pointer to the handle whose dataset is deleted; the handle
 *        value itself is left unchanged
 * \return 0
 */
DllExport int LGBM_DatasetFree(DatesetHandle* handle) {
  delete reinterpret_cast<Dataset*>(*handle);
  return 0;
}

/*!
 * \brief Write a dataset to a binary file.
 * \param handle Dataset handle
 * \param filename Output path passed to Dataset::SaveBinaryFile
 * \return 0
 */
DllExport int LGBM_DatasetSaveBinary(DatesetHandle handle,
  const char* filename) {
  Dataset* target = reinterpret_cast<Dataset*>(handle);
  target->SaveBinaryFile(filename);
  return 0;
}

/*!
 * \brief Set a named field of a dataset.
 * \param handle Dataset handle
 * \param field_name Name of the field
 * \param field_data Pointer to the field values
 * \param num_element Number of values (narrowed to int32_t)
 * \param type dtype_float32 or dtype_int32; any other value fails
 * \return 0 when the dataset accepted the field, -1 otherwise
 */
DllExport int LGBM_DatasetSetField(DatesetHandle handle,
  const char* field_name,
  const void* field_data,
  int64_t num_element,
  int type) {
  Dataset* target = reinterpret_cast<Dataset*>(handle);
  const int32_t count = static_cast<int32_t>(num_element);
  bool accepted = false;
  if (type == dtype_float32) {
    accepted = target->SetFloatField(field_name, reinterpret_cast<const float*>(field_data), count);
  } else if (type == dtype_int32) {
    accepted = target->SetIntField(field_name, reinterpret_cast<const int*>(field_data), count);
  }
  return accepted ? 0 : -1;
}

/*!
 * \brief Look up a named field of a dataset.
 *
 * The float fields are tried first, then the int fields.
 * \param handle Dataset handle
 * \param field_name Name of the field
 * \param out_len Receives the number of values
 * \param out_ptr Receives a pointer to the values
 * \param out_type Receives dtype_float32 or dtype_int32 on success
 * \return 0 when the field was found, -1 otherwise
 */
DllExport int LGBM_DatasetGetField(DatesetHandle handle,
  const char* field_name,
  int64_t* out_len,
  const void** out_ptr,
  int* out_type) {
  Dataset* source = reinterpret_cast<Dataset*>(handle);
  if (source->GetFloatField(field_name, out_len, reinterpret_cast<const float**>(out_ptr))) {
    *out_type = dtype_float32;
    return 0;
  }
  if (source->GetIntField(field_name, out_len, reinterpret_cast<const int**>(out_ptr))) {
    *out_type = dtype_int32;
    return 0;
  }
  return -1;
}

/*!
 * \brief Report the number of data rows of a dataset.
 * \param handle Dataset handle
 * \param out Receives Dataset::num_data()
 * \return 0
 */
DllExport int LGBM_DatasetGetNumData(DatesetHandle handle,
  int64_t* out) {
  const Dataset* source = reinterpret_cast<Dataset*>(handle);
  *out = source->num_data();
  return 0;
}

/*!
 * \brief Report the total number of features of a dataset.
 * \param handle Dataset handle
 * \param out Receives Dataset::num_total_features()
 * \return 0
 */
DllExport int LGBM_DatasetGetNumFeature(DatesetHandle handle,
  int64_t* out) {
  const Dataset* source = reinterpret_cast<Dataset*>(handle);
  *out = source->num_total_features();
  return 0;
}
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455


// ---- start of booster

/*!
 * \brief Create a booster for training.
 * \param train_data Handle of the training dataset
 * \param valid_datas Handles of the validation datasets
 * \param valid_names Names of the validation datasets, parallel to valid_datas
 * \param n_valid_datas Number of entries in valid_datas / valid_names
 * \param parameters Parameter string forwarded to the Booster constructor
 * \param out Receives the new booster handle
 * \return 0
 */
DllExport int LGBM_BoosterCreate(const DatesetHandle train_data,
  const DatesetHandle valid_datas[],
  const char* valid_names[],
  int n_valid_datas,
  const char* parameters,
  BoosterHandle* out) {
  std::vector<const Dataset*> validation_sets;
  std::vector<std::string> validation_labels;
  for (int k = 0; k != n_valid_datas && k < n_valid_datas; ++k) {
    validation_sets.push_back(reinterpret_cast<const Dataset*>(valid_datas[k]));
    validation_labels.push_back(std::string(valid_names[k]));
  }
  const Dataset* training_set = reinterpret_cast<const Dataset*>(train_data);
  *out = new Booster(training_set, validation_sets, validation_labels, parameters);
  return 0;
}

/*!
 * \brief Create a booster from a saved model file.
 * \param filename Path of the model file
 * \param out Receives the new booster handle
 * \return 0
 */
DllExport int LGBM_BoosterLoadFromModelfile(
  const char* filename,
  BoosterHandle* out) {
  Booster* loaded = new Booster(filename);
  *out = loaded;
  return 0;
}

/*!
 * \brief Destroy a booster.
 * \param handle Booster handle to delete
 * \return 0
 */
DllExport int LGBM_BoosterFree(BoosterHandle handle) {
  delete reinterpret_cast<Booster*>(handle);
  return 0;
}


/*!
 * \brief Run one training iteration.
 * \param handle Booster handle
 * \param is_finished Receives 1 when Booster::TrainOneIter returns true,
 *        otherwise 0
 * \return 0
 */
DllExport int LGBM_BoosterUpdateOneIter(BoosterHandle handle, int* is_finished) {
  Booster* booster = reinterpret_cast<Booster*>(handle);
  const bool finished = booster->TrainOneIter();
  *is_finished = finished ? 1 : 0;
  return 0;
}

/*!
 * \brief Run one training iteration with caller-supplied gradients.
 * \param handle Booster handle
 * \param grad Gradient buffer
 * \param hess Hessian buffer
 * \param is_finished Receives 1 when Booster::TrainOneIter returns true,
 *        otherwise 0
 * \return 0
 */
DllExport int LGBM_BoosterUpdateOneIterCustom(BoosterHandle handle,
  const float* grad,
  const float* hess,
  int* is_finished) {
  Booster* booster = reinterpret_cast<Booster*>(handle);
  const bool finished = booster->TrainOneIter(grad, hess);
  *is_finished = finished ? 1 : 0;
  return 0;
}

/*!
 * \brief Copy the evaluation results for one dataset into a float buffer.
 * \param handle Booster handle
 * \param data Dataset selector forwarded to Boosting::GetEvalAt
 * \param out_len Receives the number of results
 * \param out_results Caller-provided buffer that receives the results, each
 *        converted to float
 * \return 0
 */
DllExport int LGBM_BoosterEval(BoosterHandle handle,
  int data,
  int64_t* out_len,
  float* out_results) {
  Booster* booster = reinterpret_cast<Booster*>(handle);
  auto eval_values = booster->GetBoosting()->GetEvalAt(data);
  *out_len = static_cast<int64_t>(eval_values.size());
  float* dst = out_results;
  for (const auto& value : eval_values) {
    *dst = static_cast<float>(value);
    ++dst;
  }
  return 0;
}

/*!
 * \brief Expose the training score buffer of the boosting object.
 * \param handle Booster handle
 * \param out_len Receives the length reported by Boosting::GetTrainingScore
 * \param out_result Receives the pointer returned by
 *        Boosting::GetTrainingScore
 * \return 0
 */
DllExport int LGBM_BoosterGetScore(BoosterHandle handle,
  int64_t* out_len,
  const float** out_result) {
  Booster* booster = reinterpret_cast<Booster*>(handle);
  int score_len = 0;
  *out_result = booster->GetBoosting()->GetTrainingScore(&score_len);
  *out_len = static_cast<int64_t>(score_len);
  return 0;
}

Guolin Ke's avatar
Guolin Ke committed
482
483
/*!
 * \brief Fetch the predictions the boosting object holds for one dataset.
 * \param handle Booster handle
 * \param data Dataset selector forwarded to Boosting::GetPredictAt
 * \param out_len Receives the length reported by Boosting::GetPredictAt
 * \param out_result Caller-provided buffer filled by Boosting::GetPredictAt
 * \return 0
 */
DllExport int LGBM_BoosterGetPredict(BoosterHandle handle,
  int data,
  int64_t* out_len,
  float* out_result) {
  Booster* booster = reinterpret_cast<Booster*>(handle);
  int predict_len = 0;
  booster->GetBoosting()->GetPredictAt(data, out_result, &predict_len);
  *out_len = static_cast<int64_t>(predict_len);
  return 0;
}

495
/*!
 * \brief Predict every row of a CSR sparse matrix.
 * \param handle Booster handle
 * \param indptr Row pointer array
 * \param indptr_type Type tag of indptr, forwarded to RowFunctionFromCSR
 * \param indices Column indices
 * \param data Values
 * \param data_type Type tag of data, forwarded to RowFunctionFromCSR
 * \param nindptr Number of entries in indptr (row count is nindptr - 1)
 * \param nelem Number of stored elements
 * \param (unnamed) Unused int64_t argument kept for the public signature
 * \param predict_type Forwarded to Booster::PrepareForPrediction
 * \param n_used_trees Forwarded (narrowed to int) to
 *        Booster::PrepareForPrediction
 * \param out_result Caller-provided buffer; row i writes NumberOfClasses()
 *        values starting at offset i * NumberOfClasses()
 * \return 0
 */
DllExport int LGBM_BoosterPredictForCSR(BoosterHandle handle,
  const void* indptr,
  int indptr_type,
  const int32_t* indices,
  const void* data,
  int data_type,
  int64_t nindptr,
  int64_t nelem,
  int64_t,
  int predict_type,
  int64_t n_used_trees,
  double* out_result) {

  Booster* ref_booster = reinterpret_cast<Booster*>(handle);
  ref_booster->PrepareForPrediction(static_cast<int>(n_used_trees), predict_type);

  auto get_row_fun = RowFunctionFromCSR(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
  const int num_class = ref_booster->NumberOfClasses();
  const int nrow = static_cast<int>(nindptr - 1);
#pragma omp parallel for schedule(guided)
  for (int i = 0; i < nrow; ++i) {
    auto one_row = get_row_fun(i);
    auto prediction_result = ref_booster->Predict(one_row);
    // 64-bit offset: i * num_class can exceed INT_MAX for large inputs
    double* row_out = out_result + static_cast<int64_t>(i) * num_class;
    for (int j = 0; j < num_class; ++j) {
      row_out[j] = prediction_result[j];
    }
  }
  return 0;
}
524
525
526

/*!
 * \brief Predict every row of a dense matrix.
 * \param handle Booster handle
 * \param data Pointer to the matrix values
 * \param data_type Element type tag forwarded to
 *        RowPairFunctionFromDenseMatric
 * \param nrow Number of rows
 * \param ncol Number of columns
 * \param is_row_major Non-zero when the matrix is stored row by row
 * \param predict_type Forwarded to Booster::PrepareForPrediction
 * \param n_used_trees Forwarded (narrowed to int) to
 *        Booster::PrepareForPrediction
 * \param out_result Caller-provided buffer; row i writes NumberOfClasses()
 *        values starting at offset i * NumberOfClasses()
 * \return 0
 */
DllExport int LGBM_BoosterPredictForMat(BoosterHandle handle,
  const void* data,
  int data_type,
  int32_t nrow,
  int32_t ncol,
  int is_row_major,
  int predict_type,
  int64_t n_used_trees,
  double* out_result) {

  Booster* ref_booster = reinterpret_cast<Booster*>(handle);
  ref_booster->PrepareForPrediction(static_cast<int>(n_used_trees), predict_type);

  auto get_row_fun = RowPairFunctionFromDenseMatric(data, nrow, ncol, data_type, is_row_major);
  const int num_class = ref_booster->NumberOfClasses();
#pragma omp parallel for schedule(guided)
  for (int i = 0; i < nrow; ++i) {
    auto one_row = get_row_fun(i);
    auto prediction_result = ref_booster->Predict(one_row);
    // 64-bit offset: i * num_class can exceed INT_MAX for large inputs
    double* row_out = out_result + static_cast<int64_t>(i) * num_class;
    for (int j = 0; j < num_class; ++j) {
      row_out[j] = prediction_result[j];
    }
  }
  return 0;
}
550
551
552

/*!
 * \brief Save the booster's model to a file.
 * \param handle Booster handle
 * \param num_used_model Forwarded to Booster::SaveModelToFile
 * \param filename Output path
 * \return 0
 */
DllExport int LGBM_BoosterSaveModel(BoosterHandle handle,
  int num_used_model,
  const char* filename) {
  Booster* booster = reinterpret_cast<Booster*>(handle);
  booster->SaveModelToFile(num_used_model, filename);
  return 0;
}
559

Guolin Ke's avatar
Guolin Ke committed
560
// ---- start of helper functions
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610

std::function<std::vector<double>(int row_idx)>
RowFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major) {
  if (data_type == dtype_float32) {
    const float* data_ptr = reinterpret_cast<const float*>(data);
    if (is_row_major) {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<double> ret;
        auto tmp_ptr = data_ptr + num_col * row_idx;
        for (int i = 0; i < num_col; ++i) {
          ret.push_back(static_cast<double>(*(tmp_ptr + i)));
        }
        return ret;
      };
    } else {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<double> ret;
        for (int i = 0; i < num_col; ++i) {
          ret.push_back(static_cast<double>(*(data_ptr + num_row * i + row_idx)));
        }
        return ret;
      };
    }
  } else if (data_type == dtype_float64) {
    const double* data_ptr = reinterpret_cast<const double*>(data);
    if (is_row_major) {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<double> ret;
        auto tmp_ptr = data_ptr + num_col * row_idx;
        for (int i = 0; i < num_col; ++i) {
          ret.push_back(static_cast<double>(*(tmp_ptr + i)));
        }
        return ret;
      };
    } else {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<double> ret;
        for (int i = 0; i < num_col; ++i) {
          ret.push_back(static_cast<double>(*(data_ptr + num_row * i + row_idx)));
        }
        return ret;
      };
    }
  } else {
    Log::Fatal("unknown data type in RowFunctionFromDenseMatric");
  }
Guolin Ke's avatar
Guolin Ke committed
611
  return nullptr;
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
}

std::function<std::vector<std::pair<int, double>>(int row_idx)>
RowPairFunctionFromDenseMatric(const void* data, int num_row, int num_col, int data_type, int is_row_major) {
  if (data_type == dtype_float32) {
    const float* data_ptr = reinterpret_cast<const float*>(data);
    if (is_row_major) {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<std::pair<int, double>> ret;
        auto tmp_ptr = data_ptr + num_col * row_idx;
        for (int i = 0; i < num_col; ++i) {
          ret.emplace_back(i, static_cast<double>(*(tmp_ptr + i)));
        }
        return ret;
      };
    } else {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<std::pair<int, double>> ret;
        for (int i = 0; i < num_col; ++i) {
          ret.emplace_back(i, static_cast<double>(*(data_ptr + num_row * i + row_idx)));
        }
        return ret;
      };
    }
  } else if (data_type == dtype_float64) {
    const double* data_ptr = reinterpret_cast<const double*>(data);
    if (is_row_major) {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<std::pair<int, double>> ret;
        auto tmp_ptr = data_ptr + num_col * row_idx;
        for (int i = 0; i < num_col; ++i) {
          ret.emplace_back(i, static_cast<double>(*(tmp_ptr + i)));
        }
        return ret;
      };
    } else {
      return [data_ptr, num_col, num_row](int row_idx) {
        CHECK(row_idx < num_row);
        std::vector<std::pair<int, double>> ret;
        for (int i = 0; i < num_col; ++i) {
          ret.emplace_back(i, static_cast<double>(*(data_ptr + num_row * i + row_idx)));
        }
        return ret;
      };
    }
  } else {
    Log::Fatal("unknown data type in RowPairFunctionFromDenseMatric");
  }
Guolin Ke's avatar
Guolin Ke committed
663
  return nullptr;
664
665
666
667
668
669
670
671
672
673
674
675
676
}

std::function<std::vector<std::pair<int, double>>(int idx)>
RowFunctionFromCSR(const void* indptr, int indptr_type, const int32_t* indices, const void* data, int data_type, int64_t nindptr, int64_t nelem) {
  if (data_type == dtype_float32) {
    const float* data_ptr = reinterpret_cast<const float*>(data);
    if (indptr_type == dtype_int32) {
      const int32_t* ptr_indptr = reinterpret_cast<const int32_t*>(indptr);
      return [ptr_indptr, indices, data_ptr, nindptr, nelem](int idx) {
        CHECK(idx + 1 < nindptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_indptr[idx];
        int64_t end = ptr_indptr[idx + 1];
Guolin Ke's avatar
Guolin Ke committed
677
        CHECK(start >= 0 && end <= nelem);
678
679
680
681
682
683
684
685
686
687
688
689
        for (int64_t i = start; i <= end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else if (indptr_type == dtype_int64) {
      const int64_t* ptr_indptr = reinterpret_cast<const int64_t*>(indptr);
      return [ptr_indptr, indices, data_ptr, nindptr, nelem](int idx) {
        CHECK(idx + 1 < nindptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_indptr[idx];
        int64_t end = ptr_indptr[idx + 1];
Guolin Ke's avatar
Guolin Ke committed
690
        CHECK(start >= 0 && end <= nelem);
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
        for (int64_t i = start; i <= end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else {
      Log::Fatal("unknown data type in RowFunctionFromCSR");
    }
  } else if (data_type == dtype_float64) {
    const double* data_ptr = reinterpret_cast<const double*>(data);
    if (indptr_type == dtype_int32) {
      const int32_t* ptr_indptr = reinterpret_cast<const int32_t*>(indptr);
      return [ptr_indptr, indices, data_ptr, nindptr, nelem](int idx) {
        CHECK(idx + 1 < nindptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_indptr[idx];
        int64_t end = ptr_indptr[idx + 1];
Guolin Ke's avatar
Guolin Ke committed
708
        CHECK(start >= 0 && end <= nelem);
709
710
711
712
713
714
715
716
717
718
719
720
        for (int64_t i = start; i <= end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else if (indptr_type == dtype_int64) {
      const int64_t* ptr_indptr = reinterpret_cast<const int64_t*>(indptr);
      return [ptr_indptr, indices, data_ptr, nindptr, nelem](int idx) {
        CHECK(idx + 1 < nindptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_indptr[idx];
        int64_t end = ptr_indptr[idx + 1];
Guolin Ke's avatar
Guolin Ke committed
721
        CHECK(start >= 0 && end <= nelem);
722
723
724
725
726
727
728
729
730
731
732
        for (int64_t i = start; i <= end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else {
      Log::Fatal("unknown data type in RowFunctionFromCSR");
    }
  } else {
    Log::Fatal("unknown data type in RowFunctionFromCSR");
  }
Guolin Ke's avatar
Guolin Ke committed
733
  return nullptr;
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
}

std::function<std::vector<std::pair<int, double>>(int idx)>
ColumnFunctionFromCSC(const void* col_ptr, int col_ptr_type, const int32_t* indices, const void* data, int data_type, int64_t ncol_ptr, int64_t nelem) {
  if (data_type == dtype_float32) {
    const float* data_ptr = reinterpret_cast<const float*>(data);
    if (col_ptr_type == dtype_int32) {
      const int32_t* ptr_col_ptr = reinterpret_cast<const int32_t*>(col_ptr);
      return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem](int idx) {
        CHECK(idx + 1 < ncol_ptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_col_ptr[idx];
        int64_t end = ptr_col_ptr[idx + 1];
        CHECK(start >= 0 && end <= nelem);
        for (int64_t i = start; i < end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else if (col_ptr_type == dtype_int64) {
      const int64_t* ptr_col_ptr = reinterpret_cast<const int64_t*>(col_ptr);
      return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem](int idx) {
        CHECK(idx + 1 < ncol_ptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_col_ptr[idx];
        int64_t end = ptr_col_ptr[idx + 1];
        CHECK(start >= 0 && end <= nelem);
        for (int64_t i = start; i < end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else {
      Log::Fatal("unknown data type in ColumnFunctionFromCSC");
    }
  } else if (data_type == dtype_float64) {
    const double* data_ptr = reinterpret_cast<const double*>(data);
    if (col_ptr_type == dtype_int32) {
      const int32_t* ptr_col_ptr = reinterpret_cast<const int32_t*>(col_ptr);
      return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem](int idx) {
        CHECK(idx + 1 < ncol_ptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_col_ptr[idx];
        int64_t end = ptr_col_ptr[idx + 1];
        CHECK(start >= 0 && end <= nelem);
        for (int64_t i = start; i < end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else if (col_ptr_type == dtype_int64) {
      const int64_t* ptr_col_ptr = reinterpret_cast<const int64_t*>(col_ptr);
      return [ptr_col_ptr, indices, data_ptr, ncol_ptr, nelem](int idx) {
        CHECK(idx + 1 < ncol_ptr);
        std::vector<std::pair<int, double>> ret;
        int64_t start = ptr_col_ptr[idx];
        int64_t end = ptr_col_ptr[idx + 1];
        CHECK(start >= 0 && end <= nelem);
        for (int64_t i = start; i < end; ++i) {
          ret.emplace_back(indices[i], data_ptr[i]);
        }
        return ret;
      };
    } else {
      Log::Fatal("unknown data type in ColumnFunctionFromCSC");
    }
  } else {
    Log::Fatal("unknown data type in ColumnFunctionFromCSC");
  }
Guolin Ke's avatar
Guolin Ke committed
803
  return nullptr;
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
}

/*!
 * \brief Pick the values of one sparse column at the requested rows.
 *
 * A single forward-only cursor walks the column, so a row is only found when
 * its entry lies at or after the position reached by earlier lookups.
 * \param data (row index, value) pairs of the column
 * \param indices Rows to sample
 * \return One value per entry of indices: the stored value when the row is
 *         found, otherwise 0
 */
std::vector<double> SampleFromOneColumn(const std::vector<std::pair<int, double>>& data, const std::vector<size_t>& indices) {
  std::vector<double> sampled;
  auto cursor = data.begin();
  const auto stop = data.end();
  for (const size_t wanted : indices) {
    const int target_row = static_cast<int>(wanted);
    // skip entries that belong to rows before the requested one
    for (; cursor != stop && cursor->first < target_row; ++cursor) {}
    const bool found = (cursor != stop) && (cursor->first == target_row);
    sampled.push_back(found ? cursor->second : 0.0);
  }
  return sampled;
}