lgb.cv.R 18.3 KB
Newer Older
James Lamb's avatar
James Lamb committed
1
2
#' @importFrom R6 R6Class
CVBooster <- R6::R6Class(
  classname = "lgb.CVBooster",
  cloneable = FALSE,
  public = list(

    # Best iteration found during training (-1L until set by lgb.cv)
    best_iter = -1L,

    # Best score found during training (NA until set by lgb.cv)
    best_score = NA,

    # Evaluation history recorded by cb.record.evaluation()
    record_evals = list(),

    # List of per-fold entries; each element is a list of the form
    # list(booster = <Booster>) as constructed in lgb.cv()
    boosters = list(),

    # Store the per-fold booster wrappers
    initialize = function(x) {
      self$boosters <- x
    },

    # Push new parameters down to every fold's booster.
    # Fixes two defects in the previous implementation:
    #   * `boosters` is not visible inside an R6 method; fields must be
    #     accessed through `self`
    #   * each element of `self$boosters` is a list(booster = ...), so the
    #     Booster must be reached via `x$booster`
    reset_parameter = function(new_params) {
      for (x in self$boosters) {
        x$booster$reset_parameter(new_params)
      }
      return(invisible(self))
    }
  )
)

20
#' @name lgb.cv
James Lamb's avatar
James Lamb committed
21
#' @title Main CV logic for LightGBM
James Lamb's avatar
James Lamb committed
22
23
#' @description Cross validation logic used by LightGBM
#' @inheritParams lgb_shared_params
24
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
25
#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}
26
#' @param weight vector of sample weights. If not NULL, will be set as the "weight" field of the dataset
27
#' @param obj objective function, can be character or custom objective function. Examples include
28
29
#'            \code{regression}, \code{regression_l1}, \code{huber},
#'            \code{binary}, \code{lambdarank}, \code{multiclass}
Guolin Ke's avatar
Guolin Ke committed
30
#' @param eval evaluation function, can be (list of) character or custom eval function
31
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
Guolin Ke's avatar
Guolin Ke committed
32
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation
33
#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified
34
#'                   by the values of outcome labels.
Guolin Ke's avatar
Guolin Ke committed
35
#' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
36
37
#'              (each element must be a vector of test fold's indices). When folds are supplied,
#'              the \code{nfold} and \code{stratified} parameters are ignored.
Guolin Ke's avatar
Guolin Ke committed
38
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
39
40
41
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
42
43
44
#' @param callbacks List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model
#'                   into a predictor model which frees up memory and the original datasets
James Lamb's avatar
James Lamb committed
45
46
#' @param ... other parameters, see Parameters.rst for more information. A few key parameters:
#'            \itemize{
47
48
49
#'                \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.}
#'                \item{\code{num_leaves}: Maximum number of leaves in one tree.}
#'                \item{\code{max_depth}: Limit the max depth for tree model. This is used to deal with
James Lamb's avatar
James Lamb committed
50
#'                                 overfit when #data is small. Tree still grow by leaf-wise.}
51
#'                \item{\code{num_threads}: Number of threads for LightGBM. For the best speed, set this to
52
#'                                   the number of real CPU cores, not the number of threads (most
James Lamb's avatar
James Lamb committed
53
54
#'                                   CPU using hyper-threading to generate 2 threads per CPU core).}
#'            }
55
#'
56
#' @return a trained model \code{lgb.CVBooster}.
57
#'
Guolin Ke's avatar
Guolin Ke committed
58
#' @examples
59
60
61
62
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' params <- list(objective = "regression", metric = "l2")
63
64
65
#' model <- lgb.cv(
#'   params = params
#'   , data = dtrain
66
#'   , nrounds = 5L
67
68
69
#'   , nfold = 3L
#'   , min_data = 1L
#'   , learning_rate = 1.0
70
#' )
71
#' @importFrom data.table data.table setorderv
Guolin Ke's avatar
Guolin Ke committed
72
#' @export
73
74
75
76
77
78
79
80
81
82
83
lgb.cv <- function(params = list()
                   , data
                   , nrounds = 10L
                   , nfold = 3L
                   , label = NULL
                   , weight = NULL
                   , obj = NULL
                   , eval = NULL
                   , verbose = 1L
                   , record = TRUE
                   , eval_freq = 1L
                   , showsd = TRUE
                   , stratified = TRUE
                   , folds = NULL
                   , init_model = NULL
                   , colnames = NULL
                   , categorical_feature = NULL
                   , early_stopping_rounds = NULL
                   , callbacks = list()
                   , reset_data = FALSE
                   , ...
                   ) {

  # Validate parameters: number of boosting rounds must be positive
  if (nrounds <= 0L) {
    stop("nrounds should be greater than zero")
  }

  # If 'data' is not an lgb.Dataset, try to construct one using 'label'
  if (!lgb.is.Dataset(data)) {
    if (is.null(label)) {
      stop("'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'")
    }
    data <- lgb.Dataset(data, label = label)
  }

  # Fold any extra arguments passed through '...' into the parameter list,
  # then let the keyword arguments 'verbose', 'obj', and 'eval' overwrite /
  # normalize the corresponding entries
  params <- append(params, list(...))
  params$verbose <- verbose
  params <- lgb.check.obj(params, obj)
  params <- lgb.check.eval(params, eval)
  fobj <- NULL
  feval <- NULL

  # A function-valued objective becomes a custom objective (fobj); the
  # built-in objective is disabled by setting it to "NONE"
  if (is.function(params$objective)) {
    fobj <- params$objective
    params$objective <- "NONE"
  }

  # A function-valued 'eval' becomes a custom evaluation function (feval)
  if (is.function(eval)) {
    feval <- eval
  }

  # Init predictor to empty
  predictor <- NULL

  # Check for boosting from a trained model: either a model file path
  # (character) or an in-memory Booster
  if (is.character(init_model)) {
    predictor <- Predictor$new(init_model)
  } else if (lgb.is.Booster(init_model)) {
    predictor <- init_model$to_predictor()
  }

  # Set the iteration to start from / end to (and check for boosting from a trained model, again)
  begin_iteration <- 1L
  if (!is.null(predictor)) {
    begin_iteration <- predictor$current_iter() + 1L
  }
  # Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one.
  # A num_iterations alias in 'params' takes precedence over the 'nrounds' keyword argument.
  n_trees <- .PARAMETER_ALIASES()[["num_iterations"]]
  if (any(names(params) %in% n_trees)) {
    end_iteration <- begin_iteration + params[[which(names(params) %in% n_trees)[1L]]] - 1L
  } else {
    end_iteration <- begin_iteration + nrounds - 1L
  }

  # Attach sample weights to the dataset, if provided
  if (!is.null(weight)) {
    data$setinfo("weight", weight)
  }

  # Update parameters with parsed parameters
  data$update_params(params)

  # Install the continuation predictor on the dataset (private API of
  # lgb.Dataset; NULL means training starts from scratch)
  data$.__enclos_env__$private$set_predictor(predictor)

  # Write column names
  if (!is.null(colnames)) {
    data$set_colnames(colnames)
  }

  # Write categorical features
  if (!is.null(categorical_feature)) {
    data$set_categorical_feature(categorical_feature)
  }

  # Construct datasets, if needed
  data$construct()

  # Check for folds: user-supplied folds take precedence over 'nfold'/'stratified'
  if (!is.null(folds)) {

    # Check for list of folds or for single value
    if (!identical(class(folds), "list") || length(folds) < 2L) {
      stop(sQuote("folds"), " must be a list with 2 or more elements that are vectors of indices for each CV-fold")
    }

    # Set number of folds
    nfold <- length(folds)

  } else {

    # Check fold value
    if (nfold <= 1L) {
      stop(sQuote("nfold"), " must be > 1")
    }

    # Create folds (stratified and/or group-aware depending on the data)
    folds <- generate.cv.folds(
      nfold = nfold
      , nrows = nrow(data)
      , stratified = stratified
      , label = getinfo(data, "label")
      , group = getinfo(data, "group")
      , params = params
    )

  }

  # Add printing log callback
  if (verbose > 0L && eval_freq > 0L) {
    callbacks <- add.cb(callbacks, cb.print.evaluation(eval_freq))
  }

  # Add evaluation log callback
  if (record) {
    callbacks <- add.cb(callbacks, cb.record.evaluation())
  }

  # If early stopping was passed as a parameter in params(), prefer that to keyword argument
  # early_stopping_rounds by overwriting the value in 'early_stopping_rounds'
  early_stop <- .PARAMETER_ALIASES()[["early_stopping_round"]]
  early_stop_param_indx <- names(params) %in% early_stop
  if (any(early_stop_param_indx)) {
    first_early_stop_param <- which(early_stop_param_indx)[[1L]]
    first_early_stop_param_name <- names(params)[[first_early_stop_param]]
    early_stopping_rounds <- params[[first_early_stop_param_name]]
  }

  # Did user pass parameters that indicate they want to use early stopping?
  using_early_stopping_via_args <- !is.null(early_stopping_rounds)

  # Detect 'dart' boosting under any of its parameter aliases
  boosting_param_names <- .PARAMETER_ALIASES()[["boosting"]]
  using_dart <- any(
    sapply(
      X = boosting_param_names
      , FUN = function(param) {
        identical(params[[param]], "dart")
      }
    )
  )

  # Cannot use early stopping with 'dart' boosting
  if (using_dart) {
    warning("Early stopping is not available in 'dart' mode.")
    using_early_stopping_via_args <- FALSE

    # Remove the cb.early.stop() function if it was passed in to callbacks
    callbacks <- Filter(
      f = function(cb_func) {
        !identical(attr(cb_func, "name"), "cb.early.stop")
      }
      , x = callbacks
    )
  }

  # If user supplied early_stopping_rounds, add the early stopping callback
  if (using_early_stopping_via_args) {
    callbacks <- add.cb(
      callbacks
      , cb.early.stop(
        stopping_rounds = early_stopping_rounds
        , verbose = verbose
      )
    )
  }

  # Categorize callbacks into pre- and post-iteration groups
  cb <- categorize.callbacks(callbacks)

  # Construct booster for each fold. The data.table() code below is used to
  # guarantee that indices are sorted while keeping init_score and weight together
  # with the correct indices. Note that it takes advantage of the fact that
  # someDT$some_column returns NULL if 'some_column' does not exist in the data.table
  bst_folds <- lapply(
    X = seq_along(folds)
    , FUN = function(k) {

      # For learning-to-rank, each fold is a named list with two elements:
      #   * `fold` = an integer vector of row indices
      #   * `group` = an integer vector describing which groups are in the fold
      # For classification or regression tasks, it will just be an integer
      # vector of row indices
      folds_have_group <- "group" %in% names(folds[[k]])
      if (folds_have_group) {
        test_indices <- folds[[k]]$fold
        test_group_indices <- folds[[k]]$group
        test_groups <- getinfo(data, "group")[test_group_indices]
        train_groups <- getinfo(data, "group")[-test_group_indices]
      } else {
        test_indices <- folds[[k]]
      }
      # Training rows are everything not in the test fold
      train_indices <- seq_len(nrow(data))[-test_indices]

      # set up test set
      indexDT <- data.table::data.table(
        indices = test_indices
        , weight = getinfo(data, "weight")[test_indices]
        , init_score = getinfo(data, "init_score")[test_indices]
      )
      data.table::setorderv(indexDT, "indices", order = 1L)
      dtest <- slice(data, indexDT$indices)
      setinfo(dtest, "weight", indexDT$weight)
      setinfo(dtest, "init_score", indexDT$init_score)

      # set up training set
      indexDT <- data.table::data.table(
        indices = train_indices
        , weight = getinfo(data, "weight")[train_indices]
        , init_score = getinfo(data, "init_score")[train_indices]
      )
      data.table::setorderv(indexDT, "indices", order = 1L)
      dtrain <- slice(data, indexDT$indices)
      setinfo(dtrain, "weight", indexDT$weight)
      setinfo(dtrain, "init_score", indexDT$init_score)

      # Group info only applies for ranking tasks
      if (folds_have_group) {
        setinfo(dtest, "group", test_groups)
        setinfo(dtrain, "group", train_groups)
      }

      # One booster per fold, validating against that fold's held-out data
      booster <- Booster$new(params, dtrain)
      booster$add_valid(dtest, "valid")
      return(
        list(booster = booster)
      )
    }
  )

  # Wrap the per-fold boosters in a CVBooster
  cv_booster <- CVBooster$new(bst_folds)

  # Callback env shared by all callbacks during training
  env <- CB_ENV$new()
  env$model <- cv_booster
  env$begin_iteration <- begin_iteration
  env$end_iteration <- end_iteration

  # Start training model using number of iterations to start and end with
  for (i in seq.int(from = begin_iteration, to = end_iteration)) {

    # Overwrite iteration in environment
    env$iteration <- i
    env$eval_list <- list()

    # Loop through "pre_iter" element
    for (f in cb$pre_iter) {
      f(env)
    }

    # Update one boosting iteration on every fold; the value of the lapply
    # body is each fold's validation evaluation for this iteration
    msg <- lapply(cv_booster$boosters, function(fd) {
      fd$booster$update(fobj = fobj)
      fd$booster$eval_valid(feval = feval)
    })

    # Merge per-fold results into cross-fold means (and standard deviations).
    # NOTE(review): the standard deviations are computed here even when
    # showsd = FALSE; passing showsd through would skip that work
    merged_msg <- lgb.merge.cv.result(msg)

    # Write evaluation result in environment
    env$eval_list <- merged_msg$eval_list

    # Check for standard deviation requirement
    if (showsd) {
      env$eval_err_list <- merged_msg$eval_err_list
    }

    # Loop through "post_iter" callbacks (printing, recording, early stopping)
    for (f in cb$post_iter) {
      f(env)
    }

    # Check for early stopping and break if needed
    if (env$met_early_stop) break

  }

  # If evaluation was recorded but early stopping never fired, derive the
  # best iteration/score from the recorded history of the first metric.
  # NOTE(review): record_evals[[2L]][[1L]][[1L]] is presumably the first
  # metric of the "valid" data as laid out by cb.record.evaluation() --
  # confirm against that callback if the layout changes
  if (record && is.na(env$best_score)) {
    if (env$eval_list[[1L]]$higher_better[1L] == TRUE) {
      cv_booster$best_iter <- unname(which.max(unlist(cv_booster$record_evals[[2L]][[1L]][[1L]])))
      cv_booster$best_score <- cv_booster$record_evals[[2L]][[1L]][[1L]][[cv_booster$best_iter]]
    } else {
      cv_booster$best_iter <- unname(which.min(unlist(cv_booster$record_evals[[2L]][[1L]][[1L]])))
      cv_booster$best_score <- cv_booster$record_evals[[2L]][[1L]][[1L]][[cv_booster$best_iter]]
    }
  }

  # Optionally free dataset memory by round-tripping each fold's booster
  # through its serialized model string, preserving its metadata
  if (reset_data) {
    lapply(cv_booster$boosters, function(fd) {
      # Store temporarily model data elsewhere
      booster_old <- list(
        best_iter = fd$booster$best_iter
        , best_score = fd$booster$best_score
        , record_evals = fd$booster$record_evals
      )
      # Reload model
      fd$booster <- lgb.load(model_str = fd$booster$save_model_to_string())
      fd$booster$best_iter <- booster_old$best_iter
      fd$booster$best_score <- booster_old$best_score
      fd$booster$record_evals <- booster_old$record_evals
    })
  }

  # Return booster
  return(cv_booster)
}

# Generates random (stratified if needed) CV folds.
#
# nfold      : number of folds to produce
# nrows      : number of rows in the dataset
# stratified : whether to stratify folds by label (binary/multiclass only)
# label      : label vector (used for stratification)
# group      : query/group sizes for ranking data, or NULL
# params     : parameter list (only $objective is read here)
#
# Returns a list of length nfold. Without groups, each element is an integer
# vector of test-row indices; with groups, each element is
# list(fold = <row indices>, group = <group indices>).
generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) {

  # Check for group existence: ranking data must keep whole groups together
  if (is.null(group)) {

    # Shuffle row indices once; folds are carved out of this permutation
    rnd_idx <- sample.int(nrows)

    # Stratification is only attempted for classification objectives and
    # only when the label length matches the row count
    if (isTRUE(stratified) && params$objective %in% c("binary", "multiclass") && length(label) == length(rnd_idx)) {

      y <- label[rnd_idx]
      y <- factor(y)
      folds <- lgb.stratified.folds(y, nfold)

    } else {

      # Make simple non-stratified folds
      folds <- list()

      # Carve fold i out of the remaining shuffled indices; the divisor
      # (nfold - i + 1L) splits what is left evenly over the remaining folds
      for (i in seq_len(nfold)) {
        kstep <- length(rnd_idx) %/% (nfold - i + 1L)
        folds[[i]] <- rnd_idx[seq_len(kstep)]
        rnd_idx <- rnd_idx[-seq_len(kstep)]
      }

    }

  } else {

    # When doing group, stratified is not possible (only random selection);
    # there must be at least one group per fold
    if (nfold > length(group)) {
      stop("\n\tYou requested too many folds for the number of available groups.\n")
    }

    # Expand group sizes into one group id per row
    # (e.g. group = c(2, 3) -> ungrouped = c(1, 1, 2, 2, 2))
    ungrouped <- inverse.rle(list(lengths = group, values = seq_along(group)))

    # Can't stratify, shuffle group indices instead of row indices
    rnd_idx <- sample.int(length(group))

    # Make simple non-stratified folds
    folds <- list()

    # Carve fold i out of the remaining shuffled group indices; 'fold' holds
    # the row indices belonging to the selected groups
    for (i in seq_len(nfold)) {
      kstep <- length(rnd_idx) %/% (nfold - i + 1L)
      folds[[i]] <- list(
        fold = which(ungrouped %in% rnd_idx[seq_len(kstep)])
        , group = rnd_idx[seq_len(kstep)]
      )
      rnd_idx <- rnd_idx[-seq_len(kstep)]
    }

  }

  # Return folds
  return(folds)
}

# Creates CV folds stratified by the values of y.
# It was borrowed from caret (see caret::createFolds) and simplified
# by always returning an unnamed list of fold indices.
#
# y : outcome vector; numeric outcomes are binned into quantile groups first
# k : number of folds
#
# Returns an unnamed list of k integer vectors of indices into y.
#' @importFrom stats quantile
lgb.stratified.folds <- function(y, k = 10L) {

  ## Group the numeric data based on their magnitudes
  ## and sample within those groups.
  ## When the number of samples is low, we may have
  ## issues further slicing the numeric data into
  ## groups. The number of groups will depend on the
  ## ratio of the number of folds to the sample size.
  ## At most, we will use quantiles. If the sample
  ## is too small, we just do regular unstratified CV
  if (is.numeric(y)) {

    # Number of quantile bins, clamped to [2, 5]
    cuts <- length(y) %/% k
    if (cuts < 2L) {
      cuts <- 2L
    }
    if (cuts > 5L) {
      cuts <- 5L
    }
    # unique() guards against duplicated quantile breakpoints, which
    # cut() would reject
    y <- cut(
      y
      , unique(stats::quantile(y, probs = seq.int(0.0, 1.0, length.out = cuts)))
      , include.lowest = TRUE
    )

  }

  if (k < length(y)) {

    ## Reset levels so that the possible levels and
    ## the levels in the vector are the same
    y <- factor(as.character(y))
    numInClass <- table(y)
    foldVector <- vector(mode = "integer", length(y))

    ## For each class, balance the fold allocation as far
    ## as possible, then resample the remainder.
    ## The final assignment of folds is also randomized.
    for (i in seq_along(numInClass)) {

      ## Create a vector of integers from 1:k as many times as possible without
      ## going over the number of samples in the class. Note that if the number
      ## of samples in a class is less than k, nothing is produced here.
      seqVector <- rep(seq_len(k), numInClass[i] %/% k)

      ## Add enough random integers to get  length(seqVector) == numInClass[i]
      if (numInClass[i] %% k > 0L) {
        seqVector <- c(seqVector, sample.int(k, numInClass[i] %% k))
      }

      ## Shuffle the integers for fold assignment and assign to this classes's data.
      ## NOTE: dimnames(numInClass)$y only works because the table above is
      ## built from a variable literally named 'y' -- do not rename it
      foldVector[y == dimnames(numInClass)$y[i]] <- sample(seqVector)

    }

  } else {

    # Fewer observations than folds: every observation gets its own fold
    foldVector <- seq(along = y)

  }

  # Split indices by assigned fold and drop fold names
  out <- split(seq(along = y), foldVector)
  names(out) <- NULL
  out
}

539
# Aggregate per-fold evaluation messages into cross-fold summaries.
#
# msg    : list with one element per fold; each element is a list of metric
#          entries, each entry carrying (at least) a numeric $value
# showsd : when TRUE, also compute the population standard deviation of each
#          metric across folds
#
# Returns list(eval_list = <fold-1 entries with $value replaced by the
# cross-fold mean>, eval_err_list = <list of per-metric standard deviations,
# or NULL when showsd is FALSE>).
lgb.merge.cv.result <- function(msg, showsd = TRUE) {

  # At least one fold is required
  if (length(msg) == 0L) {
    stop("lgb.cv: size of cv result error")
  }

  # At least one metric is required (taken from the first fold)
  n_metrics <- length(msg[[1L]])
  if (n_metrics == 0L) {
    stop("lgb.cv: should provide at least one metric for CV")
  }

  # metric_values[[j]] = numeric vector of metric j's value on every fold
  metric_values <- lapply(
    X = seq_len(n_metrics)
    , FUN = function(metric_idx) {
      vapply(
        X = msg
        , FUN = function(fold_msg) {
          as.numeric(fold_msg[[metric_idx]]$value)
        }
        , FUN.VALUE = numeric(1L)
      )
    }
  )

  # Reuse fold 1's metric entries as the template, replacing each value
  # with its cross-fold mean
  merged_eval <- msg[[1L]]
  for (metric_idx in seq_len(n_metrics)) {
    merged_eval[[metric_idx]]$value <- mean(metric_values[[metric_idx]])
  }

  # Population standard deviation per metric: sqrt(E[x^2] - E[x]^2)
  merged_err <- NULL
  if (showsd) {
    merged_err <- lapply(
      X = metric_values
      , FUN = function(vals) {
        sqrt(mean(vals ^ 2L) - mean(vals) ^ 2L)
      }
    )
  }

  list(
    eval_list = merged_eval
    , eval_err_list = merged_err
  )
}