lgb.cv.R 18.5 KB
Newer Older
James Lamb's avatar
James Lamb committed
1
2
#' @importFrom R6 R6Class
CVBooster <- R6::R6Class(
  classname = "lgb.CVBooster",
  cloneable = FALSE,
  public = list(

    # Best iteration found during training (-1L until set by early stopping
    # or by the post-training search in lgb.cv())
    best_iter = -1L,

    # Score at best_iter (NA until set)
    best_score = NA,

    # Nested list of evaluation results, filled by cb.record.evaluation()
    record_evals = list(),

    # One element per CV fold; each element is a list of the form
    # list(booster = <lgb.Booster>), as built by lgb.cv()
    boosters = list(),

    # @param x list of per-fold wrappers, each a list(booster = <lgb.Booster>)
    initialize = function(x) {
      self$boosters <- x
    },

    # Push new parameters down to every fold's booster.
    #
    # Fixes two defects in the original implementation: it looped over a
    # non-existent variable `boosters` (must be `self$boosters`), and each
    # element is a list(booster = ...) wrapper (see bst_folds in lgb.cv(),
    # which is accessed as fd$booster elsewhere), so the inner Booster must
    # be reached through `$booster`. The original always errored at runtime.
    reset_parameter = function(new_params) {
      for (x in self$boosters) {
        x$booster$reset_parameter(new_params)
      }
      return(invisible(self))
    }
  )
)

20
#' @name lgb.cv
#' @title Main CV logic for LightGBM
#' @description Cross validation logic used by LightGBM
#' @inheritParams lgb_shared_params
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}
#' @param weight vector of sample weights. If not NULL, will be set on the dataset
#' @param obj objective function, can be character or custom objective function. Examples include
#'            \code{regression}, \code{regression_l1}, \code{huber},
#'            \code{binary}, \code{lambdarank}, \code{multiclass}
#' @param eval evaluation function, can be (list of) character or custom eval function
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation
#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified
#'                   by the values of outcome labels.
#' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
#'              (each element must be a vector of test fold's indices). When folds are supplied,
#'              the \code{nfold} and \code{stratified} parameters are ignored.
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param callbacks List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model
#'                   into a predictor model which frees up memory and the original datasets
#' @param ... other parameters, see Parameters.rst for more information. A few key parameters:
#'            \itemize{
#'                \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.}
#'                \item{\code{num_leaves}: Maximum number of leaves in one tree.}
#'                \item{\code{max_depth}: Limit the max depth for tree model. This is used to deal with
#'                                 overfit when #data is small. Tree still grows leaf-wise.}
#'                \item{\code{num_threads}: Number of threads for LightGBM. For the best speed, set this to
#'                                   the number of real CPU cores, not the number of threads (most
#'                                   CPU using hyper-threading to generate 2 threads per CPU core).}
#'            }
#'
#' @return a trained model \code{lgb.CVBooster}.
#'
#' @examples
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' params <- list(objective = "regression", metric = "l2")
#' model <- lgb.cv(
#'   params = params
#'   , data = dtrain
#'   , nrounds = 5L
#'   , nfold = 3L
#'   , min_data = 1L
#'   , learning_rate = 1.0
#' )
#' @importFrom data.table data.table setorderv
#' @export
lgb.cv <- function(params = list()
                   , data
                   , nrounds = 10L
                   , nfold = 3L
                   , label = NULL
                   , weight = NULL
                   , obj = NULL
                   , eval = NULL
                   , verbose = 1L
                   , record = TRUE
                   , eval_freq = 1L
                   , showsd = TRUE
                   , stratified = TRUE
                   , folds = NULL
                   , init_model = NULL
                   , colnames = NULL
                   , categorical_feature = NULL
                   , early_stopping_rounds = NULL
                   , callbacks = list()
                   , reset_data = FALSE
                   , ...
                   ) {

  # validate parameters
  if (nrounds <= 0L) {
    stop("nrounds should be greater than zero")
  }

  # If 'data' is not an lgb.Dataset, try to construct one using 'label'
  if (!lgb.is.Dataset(data)) {
    if (is.null(label)) {
      stop("'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'")
    }
    data <- lgb.Dataset(data, label = label)
  }

  # Merge keyword args passed through '...' into the params list, then
  # resolve objective / eval metric aliases
  params <- append(params, list(...))
  params$verbose <- verbose
  params <- lgb.check.obj(params, obj)
  params <- lgb.check.eval(params, eval)
  fobj <- NULL
  feval <- NULL

  # A function-valued objective becomes a custom objective (fobj);
  # the booster itself is told to use no built-in objective ("NONE")
  if (is.function(params$objective)) {
    fobj <- params$objective
    params$objective <- "NONE"
  }

  # A function-valued eval becomes a custom evaluation function (feval)
  if (is.function(eval)) {
    feval <- eval
  }

  # Init predictor to empty
  predictor <- NULL

  # Check for boosting from a trained model (either a model file path
  # or an existing Booster object)
  if (is.character(init_model)) {
    predictor <- Predictor$new(init_model)
  } else if (lgb.is.Booster(init_model)) {
    predictor <- init_model$to_predictor()
  }

  # Set the iteration to start from / end to (and check for boosting from a trained model, again)
  begin_iteration <- 1L
  if (!is.null(predictor)) {
    begin_iteration <- predictor$current_iter() + 1L
  }
  # Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one
  n_trees <- .PARAMETER_ALIASES()[["num_iterations"]]
  if (any(names(params) %in% n_trees)) {
    end_iteration <- begin_iteration + params[[which(names(params) %in% n_trees)[1L]]] - 1L
  } else {
    end_iteration <- begin_iteration + nrounds - 1L
  }

  # Check for weights
  if (!is.null(weight)) {
    data$setinfo("weight", weight)
  }

  # Update parameters with parsed parameters
  data$update_params(params)

  # Attach the predictor (for continued training) to the dataset's
  # private state
  data$.__enclos_env__$private$set_predictor(predictor)

  # Write column names
  if (!is.null(colnames)) {
    data$set_colnames(colnames)
  }

  # Write categorical features
  if (!is.null(categorical_feature)) {
    data$set_categorical_feature(categorical_feature)
  }

  # Construct datasets, if needed
  data$construct()

  # Check for folds
  if (!is.null(folds)) {

    # User-supplied folds take precedence; 'nfold' and 'stratified'
    # are derived from / ignored in favor of them
    if (!identical(class(folds), "list") || length(folds) < 2L) {
      stop(sQuote("folds"), " must be a list with 2 or more elements that are vectors of indices for each CV-fold")
    }

    # Set number of folds
    nfold <- length(folds)

  } else {

    # Check fold value
    if (nfold <= 1L) {
      stop(sQuote("nfold"), " must be > 1")
    }

    # Create folds
    folds <- generate.cv.folds(
      nfold = nfold
      , nrows = nrow(data)
      , stratified = stratified
      , label = getinfo(data, "label")
      , group = getinfo(data, "group")
      , params = params
    )

  }

  # Add printing log callback
  if (verbose > 0L && eval_freq > 0L) {
    callbacks <- add.cb(callbacks, cb.print.evaluation(eval_freq))
  }

  # Add evaluation log callback
  if (record) {
    callbacks <- add.cb(callbacks, cb.record.evaluation())
  }

  # If early stopping was passed as a parameter in params(), prefer that to keyword argument
  # early_stopping_rounds by overwriting the value in 'early_stopping_rounds'
  early_stop <- .PARAMETER_ALIASES()[["early_stopping_round"]]
  early_stop_param_indx <- names(params) %in% early_stop
  if (any(early_stop_param_indx)) {
    first_early_stop_param <- which(early_stop_param_indx)[[1L]]
    first_early_stop_param_name <- names(params)[[first_early_stop_param]]
    early_stopping_rounds <- params[[first_early_stop_param_name]]
  }

  # Did user pass parameters that indicate they want to use early stopping?
  using_early_stopping_via_args <- !is.null(early_stopping_rounds)

  boosting_param_names <- .PARAMETER_ALIASES()[["boosting"]]
  using_dart <- any(
    sapply(
      X = boosting_param_names
      , FUN = function(param) {
        identical(params[[param]], "dart")
      }
    )
  )

  # Cannot use early stopping with 'dart' boosting
  if (using_dart) {
    warning("Early stopping is not available in 'dart' mode.")
    using_early_stopping_via_args <- FALSE

    # Remove the cb.early.stop() function if it was passed in to callbacks
    callbacks <- Filter(
      f = function(cb_func) {
        !identical(attr(cb_func, "name"), "cb.early.stop")
      }
      , x = callbacks
    )
  }

  # If user supplied early_stopping_rounds, add the early stopping callback
  if (using_early_stopping_via_args) {
    callbacks <- add.cb(
      callbacks
      , cb.early.stop(
        stopping_rounds = early_stopping_rounds
        , verbose = verbose
      )
    )
  }

  # Categorize callbacks
  cb <- categorize.callbacks(callbacks)

  # Construct booster for each fold. The data.table() code below is used to
  # guarantee that indices are sorted while keeping init_score and weight together
  # with the correct indices. Note that it takes advantage of the fact that
  # someDT$some_column returns NULL if 'some_column' does not exist in the data.table
  bst_folds <- lapply(
    X = seq_along(folds)
    , FUN = function(k) {

      # For learning-to-rank, each fold is a named list with two elements:
      #   * `fold` = an integer vector of row indices
      #   * `group` = an integer vector describing which groups are in the fold
      # For classification or regression tasks, it will just be an integer
      # vector of row indices
      folds_have_group <- "group" %in% names(folds[[k]])
      if (folds_have_group) {
        test_indices <- folds[[k]]$fold
        test_group_indices <- folds[[k]]$group
        test_groups <- getinfo(data, "group")[test_group_indices]
        train_groups <- getinfo(data, "group")[-test_group_indices]
      } else {
        test_indices <- folds[[k]]
      }
      train_indices <- seq_len(nrow(data))[-test_indices]

      # set up test set
      indexDT <- data.table::data.table(
        indices = test_indices
        , weight = getinfo(data, "weight")[test_indices]
        , init_score = getinfo(data, "init_score")[test_indices]
      )
      data.table::setorderv(indexDT, "indices", order = 1L)
      dtest <- slice(data, indexDT$indices)
      setinfo(dtest, "weight", indexDT$weight)
      setinfo(dtest, "init_score", indexDT$init_score)

      # set up training set
      indexDT <- data.table::data.table(
        indices = train_indices
        , weight = getinfo(data, "weight")[train_indices]
        , init_score = getinfo(data, "init_score")[train_indices]
      )
      data.table::setorderv(indexDT, "indices", order = 1L)
      dtrain <- slice(data, indexDT$indices)
      setinfo(dtrain, "weight", indexDT$weight)
      setinfo(dtrain, "init_score", indexDT$init_score)

      if (folds_have_group) {
        setinfo(dtest, "group", test_groups)
        setinfo(dtrain, "group", train_groups)
      }

      booster <- Booster$new(params, dtrain)
      booster$add_valid(dtest, "valid")
      return(
        list(booster = booster)
      )
    }
  )

  # Create new booster
  cv_booster <- CVBooster$new(bst_folds)

  # Callback env shared by all callbacks during training
  env <- CB_ENV$new()
  env$model <- cv_booster
  env$begin_iteration <- begin_iteration
  env$end_iteration <- end_iteration

  # Start training model using number of iterations to start and end with
  for (i in seq.int(from = begin_iteration, to = end_iteration)) {

    # Overwrite iteration in environment
    env$iteration <- i
    env$eval_list <- list()

    # Loop through "pre_iter" element
    for (f in cb$pre_iter) {
      f(env)
    }

    # Update one boosting iteration on every fold, collecting per-fold
    # validation results (lapply returns the eval_valid() value)
    msg <- lapply(cv_booster$boosters, function(fd) {
      fd$booster$update(fobj = fobj)
      fd$booster$eval_valid(feval = feval)
    })

    # Aggregate per-fold evaluation results (means, and sds if requested)
    merged_msg <- lgb.merge.cv.result(msg)

    # Write evaluation result in environment
    env$eval_list <- merged_msg$eval_list

    # Check for standard deviation requirement
    if (showsd) {
      env$eval_err_list <- merged_msg$eval_err_list
    }

    # Loop through env
    for (f in cb$post_iter) {
      f(env)
    }

    # Check for early stopping and break if needed
    if (env$met_early_stop) break

  }

  # When early stopping is not activated, we compute the best iteration / score ourselves
  # based on the first metric
  if (record && is.na(env$best_score)) {
    first_metric <- cv_booster$boosters[[1L]][[1L]]$.__enclos_env__$private$eval_names[1L]
    .find_best <- which.min
    if (isTRUE(env$eval_list[[1L]]$higher_better[1L])) {
      .find_best <- which.max
    }
    cv_booster$best_iter <- unname(
      .find_best(
        unlist(
          cv_booster$record_evals[["valid"]][[first_metric]][[.EVAL_KEY()]]
        )
      )
    )
    cv_booster$best_score <- cv_booster$record_evals[["valid"]][[first_metric]][[.EVAL_KEY()]][[cv_booster$best_iter]]
  }

  if (reset_data) {
    # NOTE(review): 'fd' is a local copy of each list element, so the
    # reassignment 'fd$booster <- ...' below may not propagate back into
    # cv_booster$boosters — verify this behaves as intended
    lapply(cv_booster$boosters, function(fd) {
      # Store temporarily model data elsewhere
      booster_old <- list(
        best_iter = fd$booster$best_iter
        , best_score = fd$booster$best_score
        , record_evals = fd$booster$record_evals
      )
      # Reload model from its serialized string, freeing the datasets
      fd$booster <- lgb.load(model_str = fd$booster$save_model_to_string())
      fd$booster$best_iter <- booster_old$best_iter
      fd$booster$best_score <- booster_old$best_score
      fd$booster$record_evals <- booster_old$record_evals
    })
  }

  # Return booster
  return(cv_booster)
}

# Generates random (stratified if needed) CV folds.
#
# @param nfold number of folds to create
# @param nrows number of rows in the dataset (used when `group` is NULL)
# @param stratified whether to stratify folds by `label` (binary/multiclass only)
# @param label outcome labels, used only for stratification
# @param group group sizes (e.g. for learning-to-rank); when non-NULL, whole
#   groups are assigned to folds and stratification is skipped
# @param params parameter list; only `params$objective` is consulted
# @return list of `nfold` folds: integer index vectors, or (grouped case)
#   lists with `fold` (row indices) and `group` (group indices) elements
generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) {

  # Take the next balanced chunk off the front of a shuffled index vector.
  # Chunk size is length %/% folds_left, so remainders spill into later folds.
  pop_chunk <- function(remaining, folds_left) {
    chunk_size <- length(remaining) %/% folds_left
    head_idx <- seq_len(chunk_size)
    list(taken = remaining[head_idx], rest = remaining[-head_idx])
  }

  if (!is.null(group)) {

    # Grouped data: stratification is not possible, whole groups are
    # randomly distributed across folds
    if (nfold > length(group)) {
      stop("\n\tYou requested too many folds for the number of available groups.\n")
    }

    # Expand group sizes into a per-row vector of group indices
    ungrouped <- inverse.rle(list(lengths = group, values = seq_along(group)))

    # Shuffle group indices
    shuffled <- sample.int(length(group))

    folds <- vector("list", nfold)
    for (fold_id in seq_len(nfold)) {
      chunk <- pop_chunk(shuffled, nfold - fold_id + 1L)
      folds[[fold_id]] <- list(
        fold = which(ungrouped %in% chunk$taken)
        , group = chunk$taken
      )
      shuffled <- chunk$rest
    }

    return(folds)
  }

  # Ungrouped data: shuffle row indices
  shuffled <- sample.int(nrows)

  if (isTRUE(stratified) && params$objective %in% c("binary", "multiclass") && length(label) == length(shuffled)) {

    # Stratified folds from the shuffled labels
    folds <- lgb.stratified.folds(factor(label[shuffled]), nfold)

  } else {

    # Simple non-stratified folds: peel balanced chunks off the
    # shuffled indices
    folds <- vector("list", nfold)
    for (fold_id in seq_len(nfold)) {
      chunk <- pop_chunk(shuffled, nfold - fold_id + 1L)
      folds[[fold_id]] <- chunk$taken
      shuffled <- chunk$rest
    }

  }

  return(folds)
}

# Creates CV folds stratified by the values of y.
# It was borrowed from caret::lgb.stratified.folds and simplified
# by always returning an unnamed list of fold indices.
#
# @param y outcome values to stratify on; numeric y is first binned into
#   quantile groups, anything else is treated as class labels via factor()
# @param k number of folds
# @return unnamed list of k (or fewer) integer vectors of indices into y
#' @importFrom stats quantile
lgb.stratified.folds <- function(y, k = 10L) {

  ## Group the numeric data based on their magnitudes
  ## and sample within those groups.
  ## When the number of samples is low, we may have
  ## issues further slicing the numeric data into
  ## groups. The number of groups will depend on the
  ## ratio of the number of folds to the sample size.
  ## At most, we will use quantiles. If the sample
  ## is too small, we just do regular unstratified CV
  if (is.numeric(y)) {

    # Number of quantile bins, clamped to the range [2, 5]
    cuts <- length(y) %/% k
    if (cuts < 2L) {
      cuts <- 2L
    }
    if (cuts > 5L) {
      cuts <- 5L
    }
    # unique() guards against duplicated quantile breakpoints
    y <- cut(
      y
      , unique(stats::quantile(y, probs = seq.int(0.0, 1.0, length.out = cuts)))
      , include.lowest = TRUE
    )

  }

  if (k < length(y)) {

    ## Reset levels so that the possible levels and
    ## the levels in the vector are the same
    y <- factor(as.character(y))
    numInClass <- table(y)
    foldVector <- vector(mode = "integer", length(y))

    ## For each class, balance the fold allocation as far
    ## as possible, then resample the remainder.
    ## The final assignment of folds is also randomized.

    for (i in seq_along(numInClass)) {

      ## Create a vector of integers from 1:k as many times as possible without
      ## going over the number of samples in the class. Note that if the number
      ## of samples in a class is less than k, nothing is produced here.
      seqVector <- rep(seq_len(k), numInClass[i] %/% k)

      ## Add enough random integers to get length(seqVector) == numInClass[i]
      if (numInClass[i] %% k > 0L) {
        seqVector <- c(seqVector, sample.int(k, numInClass[i] %% k))
      }

      ## Shuffle the integers for fold assignment and assign to this class's data
      foldVector[y == dimnames(numInClass)$y[i]] <- sample(seqVector)

    }

  } else {

    # Fewer samples than folds: each sample becomes its own "fold"
    foldVector <- seq(along = y)

  }

  # Split indices by fold assignment; drop the fold-number names so the
  # result is a plain unnamed list
  out <- split(seq(along = y), foldVector)
  names(out) <- NULL
  out
}

547
# Aggregates per-fold evaluation results into per-metric means and,
# optionally, standard deviations across folds.
#
# @param msg list with one element per CV fold; each element is a list of
#   per-metric evaluation results, each carrying a numeric `value`
# @param showsd whether to also compute the population standard deviation
#   of each metric across folds
# @return list with:
#   * eval_list: the first fold's evaluation structures, with each metric's
#     `value` replaced by the mean across folds
#   * eval_err_list: list of per-metric standard deviations, or NULL when
#     `showsd` is FALSE
lgb.merge.cv.result <- function(msg, showsd = TRUE) {

  # Get CV message length
  if (length(msg) == 0L) {
    stop("lgb.cv: size of cv result error")
  }

  # Get evaluation message length
  eval_len <- length(msg[[1L]])

  # Is evaluation message empty?
  if (eval_len == 0L) {
    stop("lgb.cv: should provide at least one metric for CV")
  }

  # For each metric j, collect its value from every fold. vapply is used
  # for a type-stable numeric extraction (instead of as.numeric(lapply(...)))
  eval_result <- lapply(seq_len(eval_len), function(j) {
    vapply(
      X = msg
      , FUN = function(fold_msg) fold_msg[[j]]$value
      , FUN.VALUE = numeric(1L)
    )
  })

  # Reuse the first fold's evaluation structures, overwriting each metric's
  # value with its mean across folds
  ret_eval <- msg[[1L]]
  for (j in seq_len(eval_len)) {
    ret_eval[[j]]$value <- mean(eval_result[[j]])
  }

  # Preinit evaluation error
  ret_eval_err <- NULL

  # Population standard deviation per metric: sqrt(E[x^2] - E[x]^2).
  # Computed with vapply instead of growing a vector with c() in a loop.
  if (showsd) {
    ret_eval_err <- as.list(vapply(
      X = eval_result
      , FUN = function(vals) sqrt(mean(vals ^ 2L) - mean(vals) ^ 2L)
      , FUN.VALUE = numeric(1L)
    ))
  }

  # Return means and errors
  return(list(
    eval_list = ret_eval
    , eval_err_list = ret_eval_err
  ))
}