#' @importFrom methods is new
#' @importFrom R6 R6Class
#' @importFrom utils read.delim
#' @importClassesFrom Matrix dsparseMatrix dsparseVector dgCMatrix dgRMatrix CsparseMatrix RsparseMatrix
Predictor <- R6::R6Class(

  classname = "lgb.Predictor",
  cloneable = FALSE,
  public = list(

    # Create a predictor around a booster handle.
    #
    # modelfile           : one of
    #                         - a character path to a model file (a new handle is created from it)
    #                         - an existing "lgb.Booster.handle" / external pointer
    #                         - an object accepted by .is_Booster(), whose get_handle() is used
    # params              : list of parameters, serialized once with .params2str()
    # fast_predict_config : configuration for fast single-row prediction; stored as-is
    #                       (an empty list means "not configured")
    #
    # Only a handle created here from a model file is marked for release in finalize();
    # handles received from the caller are never freed by this object.
    #
    # Raises an error when 'modelfile' is none of the supported kinds.
    initialize = function(modelfile, params = list(), fast_predict_config = list()) {
      private$params <- .params2str(params = params)
      handle <- NULL

      if (is.character(modelfile)) {

        # Create handle on it
        handle <- .Call(
          LGBM_BoosterCreateFromModelfile_R
          , path.expand(modelfile)
        )
        private$need_free_handle <- TRUE

      } else if (methods::is(modelfile, "lgb.Booster.handle") || inherits(modelfile, "externalptr")) {

        # Check if model file is a booster handle already
        handle <- modelfile
        private$need_free_handle <- FALSE

      } else if (.is_Booster(modelfile)) {

        handle <- modelfile$get_handle()
        private$need_free_handle <- FALSE

      } else {

        stop("lgb.Predictor: modelfile must be either a character filename or an lgb.Booster.handle")

      }

      private$fast_predict_config <- fast_predict_config

      # Override class and store it
      class(handle) <- "lgb.Booster.handle"
      private$handle <- handle

      return(invisible(NULL))

    },

    # Get current iteration
    #
    # The return value of the native call is discarded: the routine is expected to
    # write its result into 'cur_iter' in place, and that variable is returned.
    current_iter = function() {

      cur_iter <- 0L
      .Call(
        LGBM_BoosterGetCurrentIteration_R
        , private$handle
        , cur_iter
      )
      return(cur_iter)

    },

    # Predict from data
    #
    # data            : one of
    #                     - a single character string, treated as the path of a data file
    #                     - a base R matrix
    #                     - a 'dsparseVector', 'dgRMatrix' or 'dgCMatrix'
    # start_iteration : first iteration to use; NULL is replaced by 0
    # num_iteration   : number of iterations to use; NULL is replaced by -1
    # rawscore        : flag forwarded to the native prediction routines
    # predleaf        : flag forwarded to the native prediction routines
    # predcontrib     : flag forwarded to the native prediction routines; combined with a
    #                   sparse input it selects the sparse-output branch below
    # header          : only used for file input, forwarded to the file prediction routine
    #
    # Returns
    #   - for sparse input with predcontrib = TRUE: a sparse object of the same class as
    #     the input ('dsparseVector', 'dgRMatrix' or 'dgCMatrix')
    #   - otherwise: a numeric vector, or a matrix with one row per input row when there
    #     is more than one prediction per row or when predleaf / predcontrib is set
    #
    # Raises an error when the input class is not supported, when the input has an
    # incompatible number of columns, or when the number of predictions is not a
    # multiple of the number of input rows.
    predict = function(data,
                       start_iteration = NULL,
                       num_iteration = NULL,
                       rawscore = FALSE,
                       predleaf = FALSE,
                       predcontrib = FALSE,
                       header = FALSE) {

      # Check if number of iterations is existing - if not, then set it to -1 (use all)
      if (is.null(num_iteration)) {
        num_iteration <- -1L
      }
      # Check if start iterations is existing - if not, then set it to 0 (start from the first iteration)
      if (is.null(start_iteration)) {
        start_iteration <- 0L
      }

      # Check if data is a file name and not a matrix
      if (identical(class(data), "character") && length(data) == 1L) {

        data <- path.expand(data)

        # Data is a filename, create a temporary file with a "lightgbm_" pattern in it.
        # The temporary file is removed when predict() exits, even on error.
        tmp_filename <- tempfile(pattern = "lightgbm_")
        on.exit(unlink(tmp_filename), add = TRUE)

        # Predict from temporary file
        .Call(
          LGBM_BoosterPredictForFile_R
          , private$handle
          , data
          , as.integer(header)
          , as.integer(rawscore)
          , as.integer(predleaf)
          , as.integer(predcontrib)
          , as.integer(start_iteration)
          , as.integer(num_iteration)
          , private$params
          , tmp_filename
        )

        # Get predictions from file
        preds <- utils::read.delim(tmp_filename, header = FALSE, sep = "\t")
        num_row <- nrow(preds)
        # flatten row by row, matching the byrow = TRUE reshape at the end of this method
        preds <- as.vector(t(preds))

      } else if (predcontrib && inherits(data, c("dsparseMatrix", "dsparseVector"))) {

        ncols <- .Call(LGBM_BoosterGetNumFeature_R, private$handle)

        # The class count is kept in its own variable so that it stays available
        # after the output width has been computed.
        # (the native routine is expected to fill 'num_class' in place)
        num_class <- integer(1L)
        .Call(LGBM_BoosterGetNumClasses_R, private$handle, num_class)
        num_class <- max(num_class, 1L)

        # Output width: (number of features + 1) columns for every class.
        # The product is computed in double precision because integer multiplication
        # overflows to NA (with a warning) for very wide multiclass models. It is
        # converted back to integer whenever it fits.
        ncols_out <- (as.numeric(ncols) + 1.0) * as.numeric(num_class)
        if (ncols_out <= .Machine$integer.max) {
          ncols_out <- as.integer(ncols_out)
        }
        if (!inherits(data, "dsparseVector") && ncols_out > .Machine$integer.max) {
          stop("Resulting matrix of feature contributions is too large for R to handle.")
        }

        if (inherits(data, "dsparseVector")) {

          if (length(data) > ncols) {
            stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns."
                         , ncols
                         , length(data)))
          }
          # A sparse vector is passed as a one-row CSR structure:
          # row pointer c(0, nnz) and indices shifted from 1-based to 0-based.
          res <- .Call(
            LGBM_BoosterPredictSparseOutput_R
            , private$handle
            , c(0L, as.integer(length(data@x)))
            , data@i - 1L
            , data@x
            , TRUE
            , 1L
            , ncols
            , start_iteration
            , num_iteration
            , private$params
          )
          out <- methods::new("dsparseVector")
          # shift the returned indices back to 1-based
          out@i <- res$indices + 1L
          out@x <- res$data
          out@length <- ncols_out
          return(out)

        } else if (inherits(data, "dgRMatrix")) {

          if (ncol(data) > ncols) {
            stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns."
                         , ncols
                         , ncol(data)))
          }
          res <- .Call(
            LGBM_BoosterPredictSparseOutput_R
            , private$handle
            , data@p
            , data@j
            , data@x
            , TRUE
            , nrow(data)
            , ncols
            , start_iteration
            , num_iteration
            , private$params
          )
          out <- methods::new("dgRMatrix")
          out@p <- res$indptr
          out@j <- res$indices
          out@x <- res$data
          out@Dim <- as.integer(c(nrow(data), ncols_out))

        } else if (inherits(data, "dgCMatrix")) {

          # unlike the row-oriented inputs above, an exact column match is required here
          if (ncol(data) != ncols) {
            stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns."
                         , ncols
                         , ncol(data)))
          }
          res <- .Call(
            LGBM_BoosterPredictSparseOutput_R
            , private$handle
            , data@p
            , data@i
            , data@x
            , FALSE
            , nrow(data)
            , ncols
            , start_iteration
            , num_iteration
            , private$params
          )
          out <- methods::new("dgCMatrix")
          out@p <- res$indptr
          out@i <- res$indices
          out@x <- res$data
          # number of output columns is taken from the returned column pointer
          out@Dim <- as.integer(c(nrow(data), length(res$indptr) - 1L))

        } else {

          stop(sprintf("Predictions on sparse inputs are only allowed for '%s', '%s', '%s' - got: %s"
                       , "dsparseVector"
                       , "dgRMatrix"
                       , "dgCMatrix"
                       , toString(class(data))))

        }

        # Keep row names if the input has any
        if (NROW(row.names(data))) {
          out@Dimnames[[1L]] <- row.names(data)
        }
        return(out)

      } else {

        # Not a file, we need to predict from R object
        num_row <- nrow(data)
        # objects without a row dimension (e.g. sparse vectors) count as a single row
        if (is.null(num_row)) {
          num_row <- 1L
        }

        npred <- 0L

        # Check number of predictions to do
        # (the native routine is expected to write the count into 'npred' in place)
        .Call(
          LGBM_BoosterCalcNumPredict_R
          , private$handle
          , as.integer(num_row)
          , as.integer(rawscore)
          , as.integer(predleaf)
          , as.integer(predcontrib)
          , as.integer(start_iteration)
          , as.integer(num_iteration)
          , npred
        )

        # Pre-allocate empty vector
        # (the prediction routines below receive it as their last argument)
        preds <- numeric(npred)

        # Check if data is a matrix
        if (is.matrix(data)) {
          # this if() prevents the memory and computational costs
          # of converting something that is already "double" to "double"
          if (storage.mode(data) != "double") {
            storage.mode(data) <- "double"
          }

          if (nrow(data) == 1L) {

            use_fast_config <- private$check_can_use_fast_predict_config(
              csr = FALSE
              , rawscore = rawscore
              , predleaf = predleaf
              , predcontrib = predcontrib
              , start_iteration = start_iteration
              , num_iteration = num_iteration
            )

            if (use_fast_config) {
              .Call(
                LGBM_BoosterPredictForMatSingleRowFast_R
                , private$fast_predict_config$handle
                , data
                , preds
              )
            } else {
              .Call(
                LGBM_BoosterPredictForMatSingleRow_R
                , private$handle
                , data
                , rawscore
                , predleaf
                , predcontrib
                , start_iteration
                , num_iteration
                , private$params
                , preds
              )
            }

          } else {
            .Call(
              LGBM_BoosterPredictForMat_R
              , private$handle
              , data
              , as.integer(nrow(data))
              , as.integer(ncol(data))
              , as.integer(rawscore)
              , as.integer(predleaf)
              , as.integer(predcontrib)
              , as.integer(start_iteration)
              , as.integer(num_iteration)
              , private$params
              , preds
            )
          }

        } else if (inherits(data, "dsparseVector")) {

          # The fast-predict configuration is a private field: reading it through
          # 'self$' always yields NULL, which would leave the fast path unused.
          use_fast_config <- FALSE
          ncols <- NULL
          if (length(private$fast_predict_config)) {
            ncols <- private$fast_predict_config$ncols
            use_fast_config <- private$check_can_use_fast_predict_config(
              csr = TRUE
              , rawscore = rawscore
              , predleaf = predleaf
              , predcontrib = predcontrib
              , start_iteration = start_iteration
              , num_iteration = num_iteration
            )
          }
          # without a configuration (or without an 'ncols' entry in it), ask the booster
          if (is.null(ncols)) {
            ncols <- .Call(LGBM_BoosterGetNumFeature_R, private$handle)
          }

          if (length(data) > ncols) {
            stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns."
                         , ncols
                         , length(data)))
          }

          # indices are shifted from 1-based to 0-based before being handed over
          if (use_fast_config) {
            .Call(
              LGBM_BoosterPredictForCSRSingleRowFast_R
              , private$fast_predict_config$handle
              , data@i - 1L
              , data@x
              , preds
            )
          } else {
            .Call(
              LGBM_BoosterPredictForCSRSingleRow_R
              , private$handle
              , data@i - 1L
              , data@x
              , ncols
              , as.integer(rawscore)
              , as.integer(predleaf)
              , as.integer(predcontrib)
              , start_iteration
              , num_iteration
              , private$params
              , preds
            )
          }

        } else if (inherits(data, "dgRMatrix")) {

          ncols <- .Call(LGBM_BoosterGetNumFeature_R, private$handle)
          if (ncol(data) > ncols) {
            stop(sprintf("Model was fitted to data with %d columns, input data has %.0f columns."
                         , ncols
                         , ncol(data)))
          }

          if (nrow(data) == 1L) {

            # Same private-field access as in the sparse-vector branch. 'ncols' keeps
            # the booster's value unless the configuration provides its own.
            use_fast_config <- FALSE
            if (length(private$fast_predict_config)) {
              config_ncols <- private$fast_predict_config$ncols
              if (!is.null(config_ncols)) {
                ncols <- config_ncols
              }
              use_fast_config <- private$check_can_use_fast_predict_config(
                csr = TRUE
                , rawscore = rawscore
                , predleaf = predleaf
                , predcontrib = predcontrib
                , start_iteration = start_iteration
                , num_iteration = num_iteration
              )
            }

            if (use_fast_config) {
              .Call(
                LGBM_BoosterPredictForCSRSingleRowFast_R
                , private$fast_predict_config$handle
                , data@j
                , data@x
                , preds
              )
            } else {
              .Call(
                LGBM_BoosterPredictForCSRSingleRow_R
                , private$handle
                , data@j
                , data@x
                , ncols
                , as.integer(rawscore)
                , as.integer(predleaf)
                , as.integer(predcontrib)
                , start_iteration
                , num_iteration
                , private$params
                , preds
              )
            }

          } else {

            .Call(
              LGBM_BoosterPredictForCSR_R
              , private$handle
              , data@p
              , data@j
              , data@x
              , ncols
              , as.integer(rawscore)
              , as.integer(predleaf)
              , as.integer(predcontrib)
              , start_iteration
              , num_iteration
              , private$params
              , preds
            )

          }

        } else if (methods::is(data, "dgCMatrix")) {
          if (length(data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }
          # Check if data is a dgCMatrix (sparse matrix, column compressed format)
          .Call(
            LGBM_BoosterPredictForCSC_R
            , private$handle
            , data@p
            , data@i
            , data@x
            , length(data@p)
            , length(data@x)
            , nrow(data)
            , as.integer(rawscore)
            , as.integer(predleaf)
            , as.integer(predcontrib)
            , as.integer(start_iteration)
            , as.integer(num_iteration)
            , private$params
            , preds
          )

        } else {

          stop("predict: cannot predict on data of class ", sQuote(class(data)))

        }
      }

      # Check if number of rows is strange (not a multiple of the dataset rows)
      if (length(preds) %% num_row != 0L) {
        stop(
          "predict: prediction length "
          , sQuote(length(preds))
          , " is not a multiple of nrows(data): "
          , sQuote(num_row)
        )
      }

      # Get number of cases per row
      npred_per_case <- length(preds) / num_row

      # Data reshaping
      # (leaf and contribution predictions are always returned as a matrix)
      if (npred_per_case > 1L || predleaf || predcontrib) {
        preds <- matrix(preds, ncol = npred_per_case, byrow = TRUE)
      }

      # Keep row names if possible
      if (NROW(row.names(data)) && NROW(data) == NROW(preds)) {
        if (is.null(dim(preds))) {
          names(preds) <- row.names(data)
        } else {
          row.names(preds) <- row.names(data)
        }
      }

      return(preds)
    }

  ),
  private = list(
    # booster handle used by the native prediction routines
    handle = NULL
    # TRUE only when the handle was created in initialize() from a model file
    , need_free_handle = FALSE
    # parameters as serialized by .params2str() in initialize()
    , params = ""
    # fast single-row prediction configuration; empty list when not configured
    , fast_predict_config = list()

    # Decide whether the stored fast-predict configuration can serve a request.
    #
    # csr : TRUE when the request comes from a sparse (CSR-style) input,
    #       FALSE when it comes from a dense matrix
    # rawscore, predleaf, predcontrib, start_iteration, num_iteration :
    #       the settings of the current predict() call
    #
    # Returns TRUE only when a configuration exists, its handle is not null, it was
    # created for the same input layout, no prediction parameters are set on this
    # predictor, and every setting matches the stored one. A configuration with a
    # null handle produces a warning and FALSE.
    , check_can_use_fast_predict_config = function(csr,
                                                   rawscore,
                                                   predleaf,
                                                   predcontrib,
                                                   start_iteration,
                                                   num_iteration) {

      if (!NROW(private$fast_predict_config)) {
        return(FALSE)
      }

      if (.is_null_handle(private$fast_predict_config$handle)) {
        warning(paste0("Model had fast CSR predict configuration, but it is inactive."
                       , " Try re-generating it through 'lgb.configure_fast_predict'."))
        return(FALSE)
      }

      if (isTRUE(csr) != private$fast_predict_config$csr) {
        return(FALSE)
      }

      return(
        private$params == "" &&
        private$fast_predict_config$rawscore == rawscore &&
        private$fast_predict_config$predleaf == predleaf &&
        private$fast_predict_config$predcontrib == predcontrib &&
        .equal_or_both_null(private$fast_predict_config$start_iteration, start_iteration) &&
        .equal_or_both_null(private$fast_predict_config$num_iteration, num_iteration)
      )
    }

    # finalize() will free up the handles
    # (only a handle created by initialize() from a model file is released; the
    # stored reference is cleared afterwards)
    , finalize = function() {
      if (private$need_free_handle) {
        .Call(
          LGBM_BoosterFree_R
          , private$handle
        )
        private$handle <- NULL
      }
      return(invisible(NULL))
    }
  )
)