lgb.cv.R 18.4 KB
Newer Older
James Lamb's avatar
James Lamb committed
1
2
#' @importFrom R6 R6Class
CVBooster <- R6::R6Class(
  classname = "lgb.CVBooster",
  cloneable = FALSE,
  public = list(

    # Best iteration found during training (-1L until it is set)
    best_iter = -1L,

    # Mean score across folds at 'best_iter' (NA until it is set)
    best_score = NA,

    # Evaluation history, filled in by the cb.record.evaluation() callback
    record_evals = list(),

    # Per-fold entries created by lgb.cv(); each element is a list
    # with a 'booster' element holding that fold's Booster
    boosters = list(),

    # Store the per-fold booster list built by lgb.cv()
    initialize = function(x) {
      self$boosters <- x
    },

    # Apply new parameters to every fold's booster.
    # Bug fix: the loop previously read the bare name 'boosters', which is
    # not in scope inside an R6 method (fields must be accessed via 'self'),
    # and called reset_parameter() on the list wrapper instead of on the
    # Booster stored under its 'booster' element — so any call errored.
    reset_parameter = function(new_params) {
      for (fold_entry in self$boosters) {
        fold_entry$booster$reset_parameter(new_params)
      }
      return(invisible(self))
    }
  )
)

20
#' @name lgb.cv
James Lamb's avatar
James Lamb committed
21
#' @title Main CV logic for LightGBM
James Lamb's avatar
James Lamb committed
22
23
#' @description Cross validation logic used by LightGBM
#' @inheritParams lgb_shared_params
24
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
25
#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}
26
#' @param weight vector of observation weights. If not NULL, will be set on the dataset
27
#' @param obj objective function, can be character or custom objective function. Examples include
28
29
#'            \code{regression}, \code{regression_l1}, \code{huber},
#'             \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclassova}
Guolin Ke's avatar
Guolin Ke committed
30
#' @param eval evaluation function, can be (list of) character or custom eval function
31
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
Guolin Ke's avatar
Guolin Ke committed
32
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation
33
#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified
34
#'                   by the values of outcome labels.
Guolin Ke's avatar
Guolin Ke committed
35
#' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
36
37
#'              (each element must be a vector of test fold's indices). When folds are supplied,
#'              the \code{nfold} and \code{stratified} parameters are ignored.
Guolin Ke's avatar
Guolin Ke committed
38
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset
39
40
41
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
42
43
44
#' @param callbacks List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model
#'                   into a predictor model which frees up memory and the original datasets
James Lamb's avatar
James Lamb committed
45
46
#' @param ... other parameters, see Parameters.rst for more information. A few key parameters:
#'            \itemize{
47
48
49
#'                \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.}
#'                \item{\code{num_leaves}: Maximum number of leaves in one tree.}
#'                \item{\code{max_depth}: Limit the max depth for tree model. This is used to deal with
James Lamb's avatar
James Lamb committed
50
#'                                 overfitting when #data is small. Trees still grow leaf-wise.}
51
#'                \item{\code{num_threads}: Number of threads for LightGBM. For the best speed, set this to
52
#'                                   the number of real CPU cores, not the number of threads (most
James Lamb's avatar
James Lamb committed
53
54
#'                                   CPUs use hyper-threading to generate 2 threads per CPU core).}
#'            }
55
#'
56
#' @return a trained model \code{lgb.CVBooster}.
57
#'
Guolin Ke's avatar
Guolin Ke committed
58
#' @examples
59
60
61
62
63
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' params <- list(objective = "regression", metric = "l2")
64
65
66
#' model <- lgb.cv(
#'   params = params
#'   , data = dtrain
67
68
69
70
71
#'   , nrounds = 10L
#'   , nfold = 3L
#'   , min_data = 1L
#'   , learning_rate = 1.0
#'   , early_stopping_rounds = 5L
72
#' )
73
#' @importFrom data.table data.table setorderv
Guolin Ke's avatar
Guolin Ke committed
74
#' @export
75
76
77
78
79
80
81
82
83
84
85
lgb.cv <- function(params = list()
                   , data
                   , nrounds = 10L
                   , nfold = 3L
                   , label = NULL
                   , weight = NULL
                   , obj = NULL
                   , eval = NULL
                   , verbose = 1L
                   , record = TRUE
                   , eval_freq = 1L
                   , showsd = TRUE
                   , stratified = TRUE
                   , folds = NULL
                   , init_model = NULL
                   , colnames = NULL
                   , categorical_feature = NULL
                   , early_stopping_rounds = NULL
                   , callbacks = list()
                   , reset_data = FALSE
                   , ...
                   ) {

  # validate parameters
  if (nrounds <= 0L) {
    stop("nrounds should be greater than zero")
  }

  # If 'data' is not an lgb.Dataset, try to construct one using 'label'
  if (!lgb.is.Dataset(data)) {
    if (is.null(label)) {
      stop("'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'")
    }
    data <- lgb.Dataset(data, label = label)
  }

  # Merge keyword arguments passed through '...' into params, then resolve
  # the objective and evaluation metric(s) (either may be a function)
  params <- append(params, list(...))
  params$verbose <- verbose
  params <- lgb.check.obj(params, obj)
  params <- lgb.check.eval(params, eval)
  fobj <- NULL
  feval <- NULL

  # A function objective is evaluated on the R side; tell the C++ side to
  # skip objective computation by setting it to "NONE"
  if (is.function(params$objective)) {
    fobj <- params$objective
    params$objective <- "NONE"
  }

  # A function metric is evaluated on the R side
  if (is.function(eval)) {
    feval <- eval
  }

  # Init predictor to empty
  predictor <- NULL

  # Check for boosting from a trained model (file path or Booster handle)
  if (is.character(init_model)) {
    predictor <- Predictor$new(init_model)
  } else if (lgb.is.Booster(init_model)) {
    predictor <- init_model$to_predictor()
  }

  # Set the iteration to start from / end to: continued training resumes
  # right after the initial model's last iteration
  begin_iteration <- 1L
  if (!is.null(predictor)) {
    begin_iteration <- predictor$current_iter() + 1L
  }

  # A number of rounds passed inside 'params' (under any alias) takes
  # precedence over the 'nrounds' keyword argument; in case there are
  # multiple matching aliases, take only the first one
  n_trees <- .PARAMETER_ALIASES()[["num_iterations"]]
  if (any(names(params) %in% n_trees)) {
    end_iteration <- begin_iteration + params[[which(names(params) %in% n_trees)[1L]]] - 1L
  } else {
    end_iteration <- begin_iteration + nrounds - 1L
  }

  # Check for weights
  if (!is.null(weight)) {
    data$setinfo("weight", weight)
  }

  # Update parameters with parsed parameters
  data$update_params(params)

  # Create the predictor set
  data$.__enclos_env__$private$set_predictor(predictor)

  # Write column names
  if (!is.null(colnames)) {
    data$set_colnames(colnames)
  }

  # Write categorical features
  if (!is.null(categorical_feature)) {
    data$set_categorical_feature(categorical_feature)
  }

  # Construct datasets, if needed
  data$construct()

  # Check for folds
  if (!is.null(folds)) {

    # User-supplied folds override both 'nfold' and 'stratified'
    if (!identical(class(folds), "list") || length(folds) < 2L) {
      stop(sQuote("folds"), " must be a list with 2 or more elements that are vectors of indices for each CV-fold")
    }
    nfold <- length(folds)

  } else {

    # Check fold value
    if (nfold <= 1L) {
      stop(sQuote("nfold"), " must be > 1")
    }

    # Create folds
    folds <- generate.cv.folds(
      nfold = nfold
      , nrows = nrow(data)
      , stratified = stratified
      , label = getinfo(data, "label")
      , group = getinfo(data, "group")
      , params = params
    )

  }

  # Add printing log callback
  if (verbose > 0L && eval_freq > 0L) {
    callbacks <- add.cb(callbacks, cb.print.evaluation(eval_freq))
  }

  # Add evaluation log callback
  if (record) {
    callbacks <- add.cb(callbacks, cb.record.evaluation())
  }

  # If early stopping was passed as a parameter in params(), prefer that to keyword argument
  # early_stopping_rounds by overwriting the value in 'early_stopping_rounds'
  early_stop <- .PARAMETER_ALIASES()[["early_stopping_round"]]
  early_stop_param_indx <- names(params) %in% early_stop
  if (any(early_stop_param_indx)) {
    first_early_stop_param <- which(early_stop_param_indx)[[1L]]
    first_early_stop_param_name <- names(params)[[first_early_stop_param]]
    early_stopping_rounds <- params[[first_early_stop_param_name]]
  }

  # Did user pass parameters that indicate they want to use early stopping?
  using_early_stopping_via_args <- !is.null(early_stopping_rounds)

  # vapply() instead of sapply(): guarantees a logical vector even when
  # 'boosting_param_names' is empty, so any() always receives logicals
  boosting_param_names <- .PARAMETER_ALIASES()[["boosting"]]
  using_dart <- any(
    vapply(
      X = boosting_param_names
      , FUN = function(param) {
        identical(params[[param]], "dart")
      }
      , FUN.VALUE = logical(1L)
    )
  )

  # Cannot use early stopping with 'dart' boosting
  if (using_dart) {
    warning("Early stopping is not available in 'dart' mode.")
    using_early_stopping_via_args <- FALSE

    # Remove the cb.early.stop() function if it was passed in to callbacks
    callbacks <- Filter(
      f = function(cb_func) {
        !identical(attr(cb_func, "name"), "cb.early.stop")
      }
      , x = callbacks
    )
  }

  # If user supplied early_stopping_rounds, add the early stopping callback
  if (using_early_stopping_via_args) {
    callbacks <- add.cb(
      callbacks
      , cb.early.stop(
        stopping_rounds = early_stopping_rounds
        , verbose = verbose
      )
    )
  }

  # Categorize callbacks
  cb <- categorize.callbacks(callbacks)

  # Construct booster for each fold. The data.table() code below is used to
  # guarantee that indices are sorted while keeping init_score and weight together
  # with the correct indices. Note that it takes advantage of the fact that
  # someDT$some_column returns NULL if 'some_column' does not exist in the data.table
  bst_folds <- lapply(
    X = seq_along(folds)
    , FUN = function(k) {

      # For learning-to-rank, each fold is a named list with two elements:
      #   * `fold` = an integer vector of row indices
      #   * `group` = an integer vector describing which groups are in the fold
      # For classification or regression tasks, it will just be an integer
      # vector of row indices
      folds_have_group <- "group" %in% names(folds[[k]])
      if (folds_have_group) {
        test_indices <- folds[[k]]$fold
        test_group_indices <- folds[[k]]$group
        test_groups <- getinfo(data, "group")[test_group_indices]
        train_groups <- getinfo(data, "group")[-test_group_indices]
      } else {
        test_indices <- folds[[k]]
      }
      train_indices <- seq_len(nrow(data))[-test_indices]

      # set up test set
      indexDT <- data.table::data.table(
        indices = test_indices
        , weight = getinfo(data, "weight")[test_indices]
        , init_score = getinfo(data, "init_score")[test_indices]
      )
      data.table::setorderv(indexDT, "indices", order = 1L)
      dtest <- slice(data, indexDT$indices)
      setinfo(dtest, "weight", indexDT$weight)
      setinfo(dtest, "init_score", indexDT$init_score)

      # set up training set
      indexDT <- data.table::data.table(
        indices = train_indices
        , weight = getinfo(data, "weight")[train_indices]
        , init_score = getinfo(data, "init_score")[train_indices]
      )
      data.table::setorderv(indexDT, "indices", order = 1L)
      dtrain <- slice(data, indexDT$indices)
      setinfo(dtrain, "weight", indexDT$weight)
      setinfo(dtrain, "init_score", indexDT$init_score)

      if (folds_have_group) {
        setinfo(dtest, "group", test_groups)
        setinfo(dtrain, "group", train_groups)
      }

      booster <- Booster$new(params, dtrain)
      booster$add_valid(dtest, "valid")
      return(
        list(booster = booster)
      )
    }
  )

  # Create new booster
  cv_booster <- CVBooster$new(bst_folds)

  # Callback env
  env <- CB_ENV$new()
  env$model <- cv_booster
  env$begin_iteration <- begin_iteration
  env$end_iteration <- end_iteration

  # Start training model using number of iterations to start and end with
  for (i in seq.int(from = begin_iteration, to = end_iteration)) {

    # Overwrite iteration in environment
    env$iteration <- i
    env$eval_list <- list()

    # Loop through "pre_iter" element
    for (f in cb$pre_iter) {
      f(env)
    }

    # Update one boosting iteration on every fold and collect its
    # validation-set evaluation results
    msg <- lapply(cv_booster$boosters, function(fd) {
      fd$booster$update(fobj = fobj)
      fd$booster$eval_valid(feval = feval)
    })

    # Aggregate per-fold evaluation results. Passing 'showsd' through avoids
    # computing standard deviations that would otherwise just be discarded
    # below when showsd = FALSE (previously they were always computed).
    merged_msg <- lgb.merge.cv.result(msg = msg, showsd = showsd)

    # Write evaluation result in environment
    env$eval_list <- merged_msg$eval_list

    # Check for standard deviation requirement
    if (showsd) {
      env$eval_err_list <- merged_msg$eval_err_list
    }

    # Loop through env
    for (f in cb$post_iter) {
      f(env)
    }

    # Check for early stopping and break if needed
    if (env$met_early_stop) break

  }

  # If early stopping never set a best score, derive best_iter/best_score
  # from the recorded history of the first metric on the validation data
  if (record && is.na(env$best_score)) {
    first_metric_evals <- unlist(cv_booster$record_evals[[2L]][[1L]][[1L]])
    if (isTRUE(env$eval_list[[1L]]$higher_better[1L])) {
      cv_booster$best_iter <- unname(which.max(first_metric_evals))
    } else {
      cv_booster$best_iter <- unname(which.min(first_metric_evals))
    }
    cv_booster$best_score <- cv_booster$record_evals[[2L]][[1L]][[1L]][[cv_booster$best_iter]]
  }

  # Optionally free the per-fold training data by round-tripping each booster
  # through its string representation, preserving its summary statistics
  if (reset_data) {
    lapply(cv_booster$boosters, function(fd) {

      # Store temporarily model data elsewhere
      booster_old <- list(
        best_iter = fd$booster$best_iter
        , best_score = fd$booster$best_score
        , record_evals = fd$booster$record_evals
      )

      # Reload model
      fd$booster <- lgb.load(model_str = fd$booster$save_model_to_string())
      fd$booster$best_iter <- booster_old$best_iter
      fd$booster$best_score <- booster_old$best_score
      fd$booster$record_evals <- booster_old$record_evals
    })
  }

  # Return booster
  return(cv_booster)
}

# Generates random (stratified if needed) CV folds.
#
# Returns a list of length 'nfold'. Without 'group', each element is an
# integer vector of test-set row indices. With 'group' (learning-to-rank),
# each element is list(fold = row indices, group = group indices).
generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) {

  # Check for group existence
  if (is.null(group)) {

    # Shuffle
    rnd_idx <- sample.int(nrows)

    # Request stratified folds only for classification objectives with a
    # label of matching length. isTRUE() also guards the '&&' chain against
    # a missing (NULL / zero-length) 'params$objective'.
    if (isTRUE(stratified) &&
        isTRUE(params$objective %in% c("binary", "multiclass")) &&
        length(label) == length(rnd_idx)) {

      y <- factor(label[rnd_idx])

      # Bug fix: lgb.stratified.folds() returns positions into the shuffled
      # vector 'y', not row indices of the original data. Previously those
      # positions were used directly as row indices, which undid the
      # stratification (the shuffle scrambled the label/fold pairing).
      # Mapping back through 'rnd_idx' restores balanced labels per fold.
      folds <- lapply(
        X = lgb.stratified.folds(y, nfold)
        , FUN = function(fold_positions) {
          rnd_idx[fold_positions]
        }
      )

    } else {

      # Make simple non-stratified folds
      folds <- list()

      # Loop through each fold, peeling off an equal share of the remaining
      # shuffled indices each time
      for (i in seq_len(nfold)) {
        kstep <- length(rnd_idx) %/% (nfold - i + 1L)
        folds[[i]] <- rnd_idx[seq_len(kstep)]
        rnd_idx <- rnd_idx[-seq_len(kstep)]
      }

    }

  } else {

    # When doing group, stratified is not possible (only random selection)
    if (nfold > length(group)) {
      stop("\n\tYou requested too many folds for the number of available groups.\n")
    }

    # Degroup the groups: row i gets the index of the group it belongs to
    ungrouped <- inverse.rle(list(lengths = group, values = seq_along(group)))

    # Can't stratify, shuffle group indices instead
    rnd_idx <- sample.int(length(group))

    # Make simple non-stratified folds
    folds <- list()

    # Loop through each fold; a fold holds whole groups, never partial ones
    for (i in seq_len(nfold)) {
      kstep <- length(rnd_idx) %/% (nfold - i + 1L)
      folds[[i]] <- list(
        fold = which(ungrouped %in% rnd_idx[seq_len(kstep)])
        , group = rnd_idx[seq_len(kstep)]
      )
      rnd_idx <- rnd_idx[-seq_len(kstep)]
    }

  }

  # Return folds
  return(folds)
}

# Creates CV folds stratified by the values of y.
# It was borrowed from caret::createFolds and simplified by always
# returning an unnamed list of fold indices (positions into 'y').
#' @importFrom stats quantile
lgb.stratified.folds <- function(y, k = 10L) {

  ## Group the numeric data based on their magnitudes
  ## and sample within those groups.
  ## When the number of samples is low, we may have
  ## issues further slicing the numeric data into
  ## groups. The number of groups will depend on the
  ## ratio of the number of folds to the sample size.
  ## At most, we will use quantiles. If the sample
  ## is too small, we just do regular unstratified CV
  if (is.numeric(y)) {

    cuts <- length(y) %/% k
    if (cuts < 2L) {
      cuts <- 2L
    }
    if (cuts > 5L) {
      cuts <- 5L
    }
    y <- cut(
      y
      , unique(stats::quantile(y, probs = seq.int(0.0, 1.0, length.out = cuts)))
      , include.lowest = TRUE
    )

  }

  if (k < length(y)) {

    ## Reset levels so that the possible levels and
    ## the levels in the vector are the same
    y <- factor(as.character(y))
    numInClass <- table(y)
    foldVector <- vector(mode = "integer", length(y))

    ## For each class, balance the fold allocation as far
    ## as possible, then resample the remainder.
    ## The final assignment of folds is also randomized.
    for (i in seq_along(numInClass)) {

      ## Create a vector of integers from 1:k as many times as possible without
      ## going over the number of samples in the class. Note that if the number
      ## of samples in a class is less than k, nothing is produced here.
      seqVector <- rep(seq_len(k), numInClass[i] %/% k)

      ## Add enough random integers to get length(seqVector) == numInClass[i]
      if (numInClass[i] %% k > 0L) {
        seqVector <- c(seqVector, sample.int(k, numInClass[i] %% k))
      }

      ## Shuffle the integers for fold assignment and assign to this class's
      ## data. names() is used instead of dimnames(...)$y so the lookup does
      ## not depend on the table's dimname label.
      foldVector[y == names(numInClass)[i]] <- sample(seqVector)

    }

  } else {

    ## Fewer samples than folds: every sample gets its own fold
    foldVector <- seq_along(y)

  }

  # Return data: an unnamed list of index vectors, one per fold
  out <- split(seq_along(y), foldVector)
  names(out) <- NULL
  return(out)
}

541
# Merge per-fold evaluation messages into cross-fold means (and, optionally,
# population standard deviations).
#
# msg: list with one element per fold; each element is a list of evaluation
#      results, each result holding at least a numeric 'value' field.
# showsd: if TRUE, also compute each metric's standard deviation across folds.
#
# Returns list(eval_list = per-metric results with mean values,
#              eval_err_list = per-metric SDs, or NULL when showsd = FALSE).
lgb.merge.cv.result <- function(msg, showsd = TRUE) {

  # Must have at least one fold's results
  if (length(msg) == 0L) {
    stop("lgb.cv: size of cv result error")
  }

  # Number of metrics, taken from the first fold
  eval_len <- length(msg[[1L]])

  # Is evaluation message empty?
  if (eval_len == 0L) {
    stop("lgb.cv: should provide at least one metric for CV")
  }

  # One numeric vector per metric, holding that metric's value in each fold.
  # vapply() (instead of as.numeric(lapply(...))) guarantees a numeric
  # vector and fails loudly on malformed input.
  eval_result <- lapply(seq_len(eval_len), function(j) {
    vapply(
      X = seq_along(msg)
      , FUN = function(i) {
        msg[[i]][[j]]$value
      }
      , FUN.VALUE = numeric(1L)
    )
  })

  # Reuse the first fold's result structure, replacing each value with the
  # cross-fold mean
  ret_eval <- msg[[1L]]
  for (j in seq_len(eval_len)) {
    ret_eval[[j]]$value <- mean(eval_result[[j]])
  }

  # Preinit evaluation error
  ret_eval_err <- NULL

  # Population SD via sqrt(E[x^2] - E[x]^2), one entry per metric.
  # lapply replaces the previous grow-with-c() loop + as.list().
  if (showsd) {
    ret_eval_err <- lapply(
      X = eval_result
      , FUN = function(fold_values) {
        sqrt(mean(fold_values ^ 2L) - mean(fold_values) ^ 2L)
      }
    )
  }

  # Return means and errors
  list(
    eval_list = ret_eval
    , eval_err_list = ret_eval_err
  )
}