# lgb.cv.R
#' @importFrom R6 R6Class
CVBooster <- R6::R6Class(
  classname = "lgb.CVBooster",
  cloneable = FALSE,
  public = list(

    # 1-based index of the best iteration found during training; -1L until set by lgb.cv()
    best_iter = -1L,

    # score at the best iteration; NA until set by lgb.cv()
    best_score = NA,

    # nested list of per-iteration evaluation results, filled by cb.record.evaluation()
    record_evals = list(),

    # list with one entry per CV fold; each entry is a list carrying a 'booster'
    # element (see the construction of bst_folds in lgb.cv())
    boosters = list(),

    # Store the per-fold boosters created by lgb.cv()
    initialize = function(x) {
      self$boosters <- x
    },

    # Push new parameters down to every fold's Booster.
    # NOTE: previously this iterated over the free variable 'boosters' (an
    # object-not-found error at call time) and treated each element as a
    # Booster, while each element is actually a list with a 'booster' member.
    reset_parameter = function(new_params) {
      for (x in self$boosters) {
        x$booster$reset_parameter(new_params)
      }
      return(invisible(self))
    }
  )
)

#' @title Main CV logic for LightGBM
#' @description Cross validation logic used by LightGBM
#' @name lgb.cv
#' @inheritParams lgb_shared_params
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
#' @param label vector of response values. Should be provided only when data is an R-matrix.
#' @param weight vector of observation weights. If not NULL, will be set on the dataset
#' @param obj objective function, can be character or custom objective function. Examples include
#'            \code{regression}, \code{regression_l1}, \code{huber},
#'            \code{binary}, \code{lambdarank}, \code{multiclass}
#' @param eval evaluation function, can be (list of) character or custom eval function
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation
#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified
#'                   by the values of outcome labels.
#' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
#'              (each element must be a vector of test fold's indices). When folds are supplied,
#'              the \code{nfold} and \code{stratified} parameters are ignored.
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature list of str or int
#'                            type int represents index,
#'                            type str represents feature names
#' @param callbacks List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model
#'                   into a predictor model which frees up memory and the original datasets
#' @param ... other parameters, see Parameters.rst for more information. A few key parameters:
#'            \itemize{
#'                \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
#'                \item{num_leaves}{number of leaves in one tree. defaults to 127}
#'                \item{max_depth}{Limit the max depth for tree model. This is used to deal with
#'                                 overfit when #data is small. Tree still grow by leaf-wise.}
#'                \item{num_threads}{Number of threads for LightGBM. For the best speed, set this to
#'                                   the number of real CPU cores, not the number of threads (most
#'                                   CPU using hyper-threading to generate 2 threads per CPU core).}
#'            }
#'
#' @return a trained model \code{lgb.CVBooster}.
#'
#' @examples
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' params <- list(objective = "regression", metric = "l2")
#' model <- lgb.cv(
#'   params = params
#'   , data = dtrain
#'   , nrounds = 10L
#'   , nfold = 3L
#'   , min_data = 1L
#'   , learning_rate = 1.0
#'   , early_stopping_rounds = 5L
#' )
#' @importFrom data.table data.table setorderv
#' @export
lgb.cv <- function(params = list()
                   , data
                   , nrounds = 10L
                   , nfold = 3L
                   , label = NULL
                   , weight = NULL
                   , obj = NULL
                   , eval = NULL
                   , verbose = 1L
                   , record = TRUE
                   , eval_freq = 1L
                   , showsd = TRUE
                   , stratified = TRUE
                   , folds = NULL
                   , init_model = NULL
                   , colnames = NULL
                   , categorical_feature = NULL
                   , early_stopping_rounds = NULL
                   , callbacks = list()
                   , reset_data = FALSE
                   , ...
                   ) {

  # Setup temporary variables: fold '...' into params and normalize objective/eval
  params <- append(params, list(...))
  params$verbose <- verbose
  params <- lgb.check.obj(params, obj)
  params <- lgb.check.eval(params, eval)
  fobj <- NULL
  feval <- NULL

  if (nrounds <= 0L) {
    stop("nrounds should be greater than zero")
  }

  # Check for objective (function or not). A function objective is passed to the
  # boosters as 'fobj' and the internal objective is disabled with "NONE"
  if (is.function(params$objective)) {
    fobj <- params$objective
    params$objective <- "NONE"
  }

  # Check for loss (function or not)
  if (is.function(eval)) {
    feval <- eval
  }

  # Init predictor to empty
  predictor <- NULL

  # Check for boosting from a trained model (file path or in-memory Booster)
  if (is.character(init_model)) {
    predictor <- Predictor$new(init_model)
  } else if (lgb.is.Booster(init_model)) {
    predictor <- init_model$to_predictor()
  }

  # Set the iteration to start from / end to (and check for boosting from a trained model, again)
  begin_iteration <- 1L
  if (!is.null(predictor)) {
    begin_iteration <- predictor$current_iter() + 1L
  }
  # Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one
  n_trees <- .PARAMETER_ALIASES()[["num_iterations"]]
  if (any(names(params) %in% n_trees)) {
    end_iteration <- begin_iteration + params[[which(names(params) %in% n_trees)[1L]]] - 1L
  } else {
    end_iteration <- begin_iteration + nrounds - 1L
  }

  # Check for training dataset type correctness; wrap raw matrices in a Dataset
  if (!lgb.is.Dataset(data)) {
    if (is.null(label)) {
      stop("Labels must be provided for lgb.cv")
    }
    data <- lgb.Dataset(data, label = label)
  }

  # Check for weights
  if (!is.null(weight)) {
    data$setinfo("weight", weight)
  }

  # Update parameters with parsed parameters
  data$update_params(params)

  # Create the predictor set
  data$.__enclos_env__$private$set_predictor(predictor)

  # Write column names
  if (!is.null(colnames)) {
    data$set_colnames(colnames)
  }

  # Write categorical features
  if (!is.null(categorical_feature)) {
    data$set_categorical_feature(categorical_feature)
  }

  # Construct datasets, if needed
  data$construct()

  # Check for folds
  if (!is.null(folds)) {

    # Check for list of folds or for single value
    if (!is.list(folds) || length(folds) < 2L) {
      stop(sQuote("folds"), " must be a list with 2 or more elements that are vectors of indices for each CV-fold")
    }

    # Set number of folds
    nfold <- length(folds)

  } else {

    # Check fold value
    if (nfold <= 1L) {
      stop(sQuote("nfold"), " must be > 1")
    }

    # Create folds
    folds <- generate.cv.folds(
      nfold = nfold
      , nrows = nrow(data)
      , stratified = stratified
      , label = getinfo(data, "label")
      , group = getinfo(data, "group")
      , params = params
    )

  }

  # Add printing log callback
  if (verbose > 0L && eval_freq > 0L) {
    callbacks <- add.cb(callbacks, cb.print.evaluation(eval_freq))
  }

  # Add evaluation log callback
  if (record) {
    callbacks <- add.cb(callbacks, cb.record.evaluation())
  }

  # If early stopping was passed as a parameter in params(), prefer that to keyword argument
  # early_stopping_rounds by overwriting the value in 'early_stopping_rounds'
  early_stop <- .PARAMETER_ALIASES()[["early_stopping_round"]]
  early_stop_param_indx <- names(params) %in% early_stop
  if (any(early_stop_param_indx)) {
    first_early_stop_param <- which(early_stop_param_indx)[[1L]]
    first_early_stop_param_name <- names(params)[[first_early_stop_param]]
    early_stopping_rounds <- params[[first_early_stop_param_name]]
  }

  # Did user pass parameters that indicate they want to use early stopping?
  using_early_stopping_via_args <- !is.null(early_stopping_rounds)

  boosting_param_names <- .PARAMETER_ALIASES()[["boosting"]]
  # vapply (not sapply) so the result type is guaranteed even for empty alias lists
  using_dart <- any(
    vapply(
      X = boosting_param_names
      , FUN = function(param) {
        identical(params[[param]], "dart")
      }
      , FUN.VALUE = logical(1L)
    )
  )

  # Cannot use early stopping with 'dart' boosting
  if (using_dart) {
    warning("Early stopping is not available in 'dart' mode.")
    using_early_stopping_via_args <- FALSE

    # Remove the cb.early.stop() function if it was passed in to callbacks
    callbacks <- Filter(
      f = function(cb_func) {
        !identical(attr(cb_func, "name"), "cb.early.stop")
      }
      , x = callbacks
    )
  }

  # If user supplied early_stopping_rounds, add the early stopping callback
  if (using_early_stopping_via_args) {
    callbacks <- add.cb(
      callbacks
      , cb.early.stop(
        stopping_rounds = early_stopping_rounds
        , verbose = verbose
      )
    )
  }

  # Categorize callbacks
  cb <- categorize.callbacks(callbacks)

  # Construct booster for each fold. The data.table() code below is used to
  # guarantee that indices are sorted while keeping init_score and weight together
  # with the correct indices. Note that it takes advantage of the fact that
  # someDT$some_column returns NULL if 'some_column' does not exist in the data.table
  bst_folds <- lapply(
    X = seq_along(folds)
    , FUN = function(k) {

      # For learning-to-rank, each fold is a named list with two elements:
      #   * `fold` = an integer vector of row indices
      #   * `group` = an integer vector describing which groups are in the fold
      # For classification or regression tasks, it will just be an integer
      # vector of row indices
      folds_have_group <- "group" %in% names(folds[[k]])
      if (folds_have_group) {
        test_indices <- folds[[k]]$fold
        test_group_indices <- folds[[k]]$group
        test_groups <- getinfo(data, "group")[test_group_indices]
        train_groups <- getinfo(data, "group")[-test_group_indices]
      } else {
        test_indices <- folds[[k]]
      }
      train_indices <- seq_len(nrow(data))[-test_indices]

      # set up test set
      indexDT <- data.table::data.table(
        indices = test_indices
        , weight = getinfo(data, "weight")[test_indices]
        , init_score = getinfo(data, "init_score")[test_indices]
      )
      data.table::setorderv(indexDT, "indices", order = 1L)
      dtest <- slice(data, indexDT$indices)
      setinfo(dtest, "weight", indexDT$weight)
      setinfo(dtest, "init_score", indexDT$init_score)

      # set up training set
      indexDT <- data.table::data.table(
        indices = train_indices
        , weight = getinfo(data, "weight")[train_indices]
        , init_score = getinfo(data, "init_score")[train_indices]
      )
      data.table::setorderv(indexDT, "indices", order = 1L)
      dtrain <- slice(data, indexDT$indices)
      setinfo(dtrain, "weight", indexDT$weight)
      setinfo(dtrain, "init_score", indexDT$init_score)

      if (folds_have_group) {
        setinfo(dtest, "group", test_groups)
        setinfo(dtrain, "group", train_groups)
      }

      booster <- Booster$new(params, dtrain)
      booster$add_valid(dtest, "valid")
      return(
        list(booster = booster)
      )
    }
  )

  # Create new booster
  cv_booster <- CVBooster$new(bst_folds)

  # Callback env
  env <- CB_ENV$new()
  env$model <- cv_booster
  env$begin_iteration <- begin_iteration
  env$end_iteration <- end_iteration

  # Start training model using number of iterations to start and end with
  for (i in seq.int(from = begin_iteration, to = end_iteration)) {

    # Overwrite iteration in environment
    env$iteration <- i
    env$eval_list <- list()

    # Loop through "pre_iter" element
    for (f in cb$pre_iter) {
      f(env)
    }

    # Update one boosting iteration
    msg <- lapply(cv_booster$boosters, function(fd) {
      fd$booster$update(fobj = fobj)
      fd$booster$eval_valid(feval = feval)
    })

    # Prepare collection of evaluation results; only compute the per-metric
    # standard deviation when the caller asked to show it
    merged_msg <- lgb.merge.cv.result(msg, showsd)

    # Write evaluation result in environment
    env$eval_list <- merged_msg$eval_list

    # Check for standard deviation requirement
    if (showsd) {
      env$eval_err_list <- merged_msg$eval_err_list
    }

    # Loop through env
    for (f in cb$post_iter) {
      f(env)
    }

    # Check for early stopping and break if needed
    if (env$met_early_stop) break

  }

  # If early stopping did not set a best score, recover best_iter/best_score
  # from the recorded "valid" evaluation history of the first metric
  if (record && is.na(env$best_score)) {
    first_metric_evals <- unlist(cv_booster$record_evals[[2L]][[1L]][[1L]])
    if (isTRUE(env$eval_list[[1L]]$higher_better[1L])) {
      cv_booster$best_iter <- unname(which.max(first_metric_evals))
    } else {
      cv_booster$best_iter <- unname(which.min(first_metric_evals))
    }
    cv_booster$best_score <- cv_booster$record_evals[[2L]][[1L]][[1L]][[cv_booster$best_iter]]
  }

  if (reset_data) {
    lapply(cv_booster$boosters, function(fd) {
      # Store temporarily model data elsewhere
      booster_old <- list(
        best_iter = fd$booster$best_iter
        , best_score = fd$booster$best_score
        , record_evals = fd$booster$record_evals
      )
      # Reload model: round-trip through a model string frees the datasets
      fd$booster <- lgb.load(model_str = fd$booster$save_model_to_string())
      fd$booster$best_iter <- booster_old$best_iter
      fd$booster$best_score <- booster_old$best_score
      fd$booster$record_evals <- booster_old$record_evals
    })
  }

  # Return booster
  return(cv_booster)
}

# Generates random (stratified if needed) CV folds
generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) {

  if (is.null(group)) {

    # No query groups: work on a random permutation of the row indices
    shuffled <- sample.int(nrows)

    # Stratification only applies to classification objectives and requires
    # one label per row
    use_stratified <- isTRUE(stratified) &&
      params$objective %in% c("binary", "multiclass") &&
      length(label) == length(shuffled)

    if (use_stratified) {

      # Stratify on the (shuffled) labels
      folds <- lgb.stratified.folds(factor(label[shuffled]), nfold)

    } else {

      # Simple non-stratified folds: peel consecutive chunks off the
      # shuffled index vector, one chunk per fold
      folds <- vector("list", nfold)
      for (fold_id in seq_len(nfold)) {
        chunk_size <- length(shuffled) %/% (nfold - fold_id + 1L)
        head_idx <- seq_len(chunk_size)
        folds[[fold_id]] <- shuffled[head_idx]
        shuffled <- shuffled[-head_idx]
      }

    }

  } else {

    # With query groups, stratification is not possible (only random selection)
    if (nfold > length(group)) {
      stop("\n\tYou requested too many folds for the number of available groups.\n")
    }

    # Expand the run-length encoded groups: row i gets its group's id
    row_group_id <- inverse.rle(list(lengths = group, values = seq_along(group)))

    # Can't stratify; shuffle the group ids instead
    shuffled_groups <- sample.int(length(group))

    # Assign whole groups to folds, one chunk of group ids per fold
    folds <- vector("list", nfold)
    for (fold_id in seq_len(nfold)) {
      chunk_size <- length(shuffled_groups) %/% (nfold - fold_id + 1L)
      head_idx <- seq_len(chunk_size)
      fold_groups <- shuffled_groups[head_idx]
      folds[[fold_id]] <- list(
        fold = which(row_group_id %in% fold_groups)
        , group = fold_groups
      )
      shuffled_groups <- shuffled_groups[-head_idx]
    }

  }

  return(folds)
}

# Creates CV folds stratified by the values of y.
# Borrowed from caret::createFolds and simplified
# by always returning an unnamed list of fold indices.
#' @importFrom stats quantile
lgb.stratified.folds <- function(y, k = 10L) {

  ## Group the numeric data based on their magnitudes
  ## and sample within those groups.
  ## When the number of samples is low, we may have
  ## issues further slicing the numeric data into
  ## groups. The number of groups will depend on the
  ## ratio of the number of folds to the sample size.
  ## At most, we will use quantiles. If the sample
  ## is too small, we just do regular unstratified CV
  if (is.numeric(y)) {

    cuts <- length(y) %/% k
    if (cuts < 2L) {
      cuts <- 2L
    }
    if (cuts > 5L) {
      cuts <- 5L
    }
    y <- cut(
      y
      , unique(stats::quantile(y, probs = seq.int(0.0, 1.0, length.out = cuts)))
      , include.lowest = TRUE
    )

  }

  if (k < length(y)) {

    ## Reset levels so that the possible levels and
    ## the levels in the vector are the same
    y <- factor(as.character(y))
    numInClass <- table(y)
    foldVector <- vector(mode = "integer", length(y))

    ## For each class, balance the fold allocation as far
    ## as possible, then resample the remainder.
    ## The final assignment of folds is also randomized.
    for (i in seq_along(numInClass)) {

      ## Create a vector of integers from 1:k as many times as possible without
      ## going over the number of samples in the class. Note that if the number
      ## of samples in a class is less than k, nothing is produced here.
      seqVector <- rep(seq_len(k), numInClass[i] %/% k)

      ## Add enough random integers to get length(seqVector) == numInClass[i]
      if (numInClass[i] %% k > 0L) {
        seqVector <- c(seqVector, sample.int(k, numInClass[i] %% k))
      }

      ## Shuffle the integers for fold assignment and assign to this class's data.
      ## names(numInClass) is used instead of dimnames(numInClass)$y, which only
      ## worked because the table() argument happened to be literally named 'y'.
      foldVector[y == names(numInClass)[i]] <- sample(seqVector)

    }

  } else {

    ## Fewer observations than folds: one observation per fold
    foldVector <- seq_along(y)

  }

  # Return an unnamed list of index vectors, one per fold
  out <- split(seq_along(y), foldVector)
  names(out) <- NULL
  out
}

# Merges per-fold evaluation messages into cross-fold means and
# (optionally) population standard deviations, one entry per metric.
lgb.merge.cv.result <- function(msg, showsd = TRUE) {

  # Need at least one fold's results
  if (length(msg) == 0L) {
    stop("lgb.cv: size of cv result error")
  }

  # Number of metrics evaluated on each fold
  eval_len <- length(msg[[1L]])

  # Need at least one metric
  if (eval_len == 0L) {
    stop("lgb.cv: should provide at least one metric for CV")
  }

  # metric_values[[j]] holds metric j's value across all folds
  metric_values <- lapply(
    seq_len(eval_len)
    , function(metric_id) {
      vapply(
        msg
        , function(fold_msg) fold_msg[[metric_id]]$value
        , numeric(1L)
      )
    }
  )

  # Start from fold 1's records and overwrite each value with the cross-fold mean
  ret_eval <- msg[[1L]]
  for (metric_id in seq_len(eval_len)) {
    ret_eval[[metric_id]]$value <- mean(metric_values[[metric_id]])
  }

  # Population standard deviation per metric, sqrt(E[x^2] - E[x]^2), if requested
  ret_eval_err <- NULL
  if (showsd) {
    ret_eval_err <- as.list(
      vapply(
        metric_values
        , function(vals) sqrt(mean(vals ^ 2L) - mean(vals) ^ 2L)
        , numeric(1L)
      )
    )
  }

  # One list of means, one list of standard deviations (or NULL)
  list(
    eval_list = ret_eval
    , eval_err_list = ret_eval_err
  )
}