[R-package] Fix incorrect num_iterations, early_stopping_round in saved models...

[R-package] Fix incorrect num_iterations, early_stopping_round in saved models (fixes #2208, #2468) (#3368) * params to string * add tests * more tests * stop skipping training continuation test (fixes #2468) * remove unnecessary code

[R-package] Fix incorrect num_iterations, early_stopping_round in saved models...
[R-package] Fix incorrect num_iterations, early_stopping_round in saved models (fixes #2208, #2468) (#3368) * params to string * add tests * more tests * stop skipping training continuation test (fixes #2468) * remove unnecessary code
73d65c79 · James Lamb · GitHub · e6048b64 · 73d65c79 · 73d65c79
Unverified Commit 73d65c79 authored Sep 27, 2020 by James Lamb Committed by GitHub Sep 26, 2020
6 changed files
--- a/R-package/R/lgb.cv.R
+++ b/R-package/R/lgb.cv.R
@@ -111,6 +111,22 @@ lgb.cv <- function(params = list()
  fobj <- NULL
  eval_functions <- list(NULL)

+  # set some parameters, resolving the way they were passed in with other parameters
+  # in `params`.
+  # this ensures that the model stored with Booster$save() correctly represents
+  # what was passed in
+  params <- lgb.check.wrapper_param(
+    main_param_name = "num_iterations"
+    , params = params
+    , alternative_kwarg_value = nrounds
+  )
+  params <- lgb.check.wrapper_param(
+    main_param_name = "early_stopping_round"
+    , params = params
+    , alternative_kwarg_value = early_stopping_rounds
+  )
+  early_stopping_rounds <- params[["early_stopping_round"]]
+
  # Check for objective (function or not)
  if (is.function(params$objective)) {
    fobj <- params$objective
@@ -146,13 +162,7 @@ lgb.cv <- function(params = list()
  if (!is.null(predictor)) {
    begin_iteration <- predictor$current_iter() + 1L
  }
-  # Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one
-  n_trees <- .PARAMETER_ALIASES()[["num_iterations"]]
-  if (any(names(params) %in% n_trees)) {
-    end_iteration <- begin_iteration + params[[which(names(params) %in% n_trees)[1L]]] - 1L
-  } else {
-    end_iteration <- begin_iteration + nrounds - 1L
-  }
+  end_iteration <- begin_iteration + params[["num_iterations"]] - 1L

  # Check interaction constraints
  cnames <- NULL
@@ -227,18 +237,8 @@ lgb.cv <- function(params = list()
    callbacks <- add.cb(callbacks, cb.record.evaluation())
  }

-  # If early stopping was passed as a parameter in params(), prefer that to keyword argument
-  # early_stopping_rounds by overwriting the value in 'early_stopping_rounds'
-  early_stop <- .PARAMETER_ALIASES()[["early_stopping_round"]]
-  early_stop_param_indx <- names(params) %in% early_stop
-  if (any(early_stop_param_indx)) {
-    first_early_stop_param <- which(early_stop_param_indx)[[1L]]
-    first_early_stop_param_name <- names(params)[[first_early_stop_param]]
-    early_stopping_rounds <- params[[first_early_stop_param_name]]
-  }
-
  # Did user pass parameters that indicate they want to use early stopping?
-  using_early_stopping_via_args <- !is.null(early_stopping_rounds) && early_stopping_rounds > 0L
+  using_early_stopping <- !is.null(early_stopping_rounds) && early_stopping_rounds > 0L

  boosting_param_names <- .PARAMETER_ALIASES()[["boosting"]]
  using_dart <- any(
@@ -253,7 +253,7 @@ lgb.cv <- function(params = list()
  # Cannot use early stopping with 'dart' boosting
  if (using_dart) {
    warning("Early stopping is not available in 'dart' mode.")
-    using_early_stopping_via_args <- FALSE
+    using_early_stopping <- FALSE

    # Remove the cb.early.stop() function if it was passed in to callbacks
    callbacks <- Filter(
@@ -265,7 +265,7 @@ lgb.cv <- function(params = list()
  }

  # If user supplied early_stopping_rounds, add the early stopping callback
-  if (using_early_stopping_via_args) {
+  if (using_early_stopping) {
    callbacks <- add.cb(
      callbacks
      , cb.early.stop(

--- a/R-package/R/lgb.train.R
+++ b/R-package/R/lgb.train.R
@@ -89,6 +89,22 @@ lgb.train <- function(params = list(),
  fobj <- NULL
  eval_functions <- list(NULL)

+  # set some parameters, resolving the way they were passed in with other parameters
+  # in `params`.
+  # this ensures that the model stored with Booster$save() correctly represents
+  # what was passed in
+  params <- lgb.check.wrapper_param(
+    main_param_name = "num_iterations"
+    , params = params
+    , alternative_kwarg_value = nrounds
+  )
+  params <- lgb.check.wrapper_param(
+    main_param_name = "early_stopping_round"
+    , params = params
+    , alternative_kwarg_value = early_stopping_rounds
+  )
+  early_stopping_rounds <- params[["early_stopping_round"]]
+
  # Check for objective (function or not)
  if (is.function(params$objective)) {
    fobj <- params$objective
@@ -124,13 +140,7 @@ lgb.train <- function(params = list(),
  if (!is.null(predictor)) {
    begin_iteration <- predictor$current_iter() + 1L
  }
-  # Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one
-  n_trees <- .PARAMETER_ALIASES()[["num_iterations"]]
-  if (any(names(params) %in% n_trees)) {
-    end_iteration <- begin_iteration + params[[which(names(params) %in% n_trees)[1L]]] - 1L
-  } else {
-    end_iteration <- begin_iteration + nrounds - 1L
-  }
+  end_iteration <- begin_iteration + params[["num_iterations"]] - 1L

  # Check interaction constraints
  cnames <- NULL
@@ -198,18 +208,8 @@ lgb.train <- function(params = list(),
    callbacks <- add.cb(callbacks, cb.record.evaluation())
  }

-  # If early stopping was passed as a parameter in params(), prefer that to keyword argument
-  # early_stopping_rounds by overwriting the value in 'early_stopping_rounds'
-  early_stop <- .PARAMETER_ALIASES()[["early_stopping_round"]]
-  early_stop_param_indx <- names(params) %in% early_stop
-  if (any(early_stop_param_indx)) {
-    first_early_stop_param <- which(early_stop_param_indx)[[1L]]
-    first_early_stop_param_name <- names(params)[[first_early_stop_param]]
-    early_stopping_rounds <- params[[first_early_stop_param_name]]
-  }
-
  # Did user pass parameters that indicate they want to use early stopping?
-  using_early_stopping_via_args <- !is.null(early_stopping_rounds) && early_stopping_rounds > 0L
+  using_early_stopping <- !is.null(early_stopping_rounds) && early_stopping_rounds > 0L

  boosting_param_names <- .PARAMETER_ALIASES()[["boosting"]]
  using_dart <- any(
@@ -224,7 +224,7 @@ lgb.train <- function(params = list(),
  # Cannot use early stopping with 'dart' boosting
  if (using_dart) {
    warning("Early stopping is not available in 'dart' mode.")
-    using_early_stopping_via_args <- FALSE
+    using_early_stopping <- FALSE

    # Remove the cb.early.stop() function if it was passed in to callbacks
    callbacks <- Filter(
@@ -236,7 +236,7 @@ lgb.train <- function(params = list(),
  }

  # If user supplied early_stopping_rounds, add the early stopping callback
-  if (using_early_stopping_via_args) {
+  if (using_early_stopping) {
    callbacks <- add.cb(
      callbacks
      , cb.early.stop(

--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -349,3 +349,62 @@ lgb.check.eval <- function(params, eval) {

  return(params)
 }
+
+
+# [description]
+#
+#     Resolve differences between passed-in keyword arguments, parameters,
+#     and parameter aliases. This function exists because some functions in the
+#     package take in parameters through their own keyword arguments other than
+#     the `params` list.
+#
+#     If the same underlying parameter is provided multiple
+#     ways, the first item in this list is used:
+#
+#         1. the main (non-alias) parameter found in `params`
+#         2. the first alias of that parameter found in `params`
+#         3. the keyword argument passed in
+#
+#     For example, "num_iterations" can also be provided to lgb.train()
+#     via keyword "nrounds". lgb.train() will choose one value for this parameter
+#     based on the first match in this list:
+#
+#         1. params[["num_iterations]]
+#         2. the first alias of "num_iterations" found in params
+#         3. the nrounds keyword argument
+#
+#     If multiple aliases are found in `params` for the same parameter, they are
+#     all removed before returning `params`.
+#
+# [return]
+#     params with num_iterations set to the chosen value, and other aliases
+#     of num_iterations removed
+lgb.check.wrapper_param <- function(main_param_name, params, alternative_kwarg_value) {
+
+  aliases <- .PARAMETER_ALIASES()[[main_param_name]]
+  aliases_provided <- names(params)[names(params) %in% aliases]
+  aliases_provided <- aliases_provided[aliases_provided != main_param_name]
+
+  # prefer the main parameter
+  if (!is.null(params[[main_param_name]])) {
+    for (param in aliases_provided) {
+      params[[param]] <- NULL
+    }
+    return(params)
+  }
+
+  # if the main parameter wasn't proovided, prefer the first alias
+  if (length(aliases_provided) > 0L) {
+    first_param <- aliases_provided[1L]
+    params[[main_param_name]] <- params[[first_param]]
+    for (param in aliases_provided) {
+      params[[param]] <- NULL
+    }
+    return(params)
+  }
+
+  # if not provided in params at all, use the alternative value provided
+  # through a keyword argument from lgb.train(), lgb.cv(), etc.
+  params[[main_param_name]] <- alternative_kwarg_value
+  return(params)
+}
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -228,7 +228,6 @@ test_that("lightgbm() performs evaluation on validation sets if they are provide
 context("training continuation")

 test_that("training continuation works", {
-  testthat::skip("This test is currently broken. See issue #2468 for details.")
  dtrain <- lgb.Dataset(
    train$data
    , label = train$label
@@ -242,21 +241,19 @@ test_that("training continuation works", {
    , learning_rate = 1.0
  )

-  # for the reference, use 10 iterations at once:
+  # train for 10 consecutive iterations
  bst <- lgb.train(param, dtrain, nrounds = 10L, watchlist)
  err_bst <- lgb.get.eval.result(bst, "train", "binary_logloss", 10L)
-  # first 5 iterations:
+
+  #  train for 5 iterations, save, load, train for 5 more
  bst1 <- lgb.train(param, dtrain, nrounds = 5L, watchlist)
-  # test continuing from a model in file
  model_file <- tempfile(fileext = ".model")
  lgb.save(bst1, model_file)
-  # continue for 5 more:
  bst2 <- lgb.train(param, dtrain, nrounds = 5L, watchlist, init_model = bst1)
  err_bst2 <- lgb.get.eval.result(bst2, "train", "binary_logloss", 10L)
-  expect_lt(abs(err_bst - err_bst2), 0.01)

-  bst2 <- lgb.train(param, dtrain, nrounds = 5L, watchlist, init_model = model_file)
-  err_bst2 <- lgb.get.eval.result(bst2, "train", "binary_logloss", 10L)
+  # evaluation metrics should be nearly identical for the model trained in 10 coonsecutive
+  # iterations and the one trained in 5-then-5.
  expect_lt(abs(err_bst - err_bst2), 0.01)
 })


--- a/R-package/tests/testthat/test_lgb.Booster.R
+++ b/R-package/tests/testthat/test_lgb.Booster.R
@@ -445,3 +445,166 @@ test_that("Saving a model with different feature importance types works", {
        model_string <- bst$save_model_to_string(feature_importance_type = UNSUPPORTED_IMPORTANCE)
    }, "Unknown importance type")
 })
+
+.params_from_model_string <- function(model_str) {
+    file_lines <- strsplit(model_str, "\n")[[1L]]
+    start_indx <- which(grepl("^parameters\\:$", file_lines)) + 1L
+    blank_line_indices <- which(file_lines == "")
+    end_indx <- blank_line_indices[blank_line_indices > start_indx][1L] - 1L
+    params <- file_lines[start_indx: end_indx]
+    return(params)
+}
+
+test_that("all parameters are stored correctly with save_model_to_string()", {
+    dtrain <- lgb.Dataset(
+        data = matrix(rnorm(500L), nrow = 100L)
+        , label = rnorm(100L)
+    )
+    nrounds <- 4L
+    bst <- lgb.train(
+        params = list(
+            objective = "regression"
+            , metric = "l2"
+        )
+        , data = dtrain
+        , nrounds = nrounds
+        , verbose = 0L
+    )
+
+    model_str <- bst$save_model_to_string()
+    params_in_file <- .params_from_model_string(model_str = model_str)
+
+    # parameters should match what was passed from the R package
+    expect_equal(sum(grepl(pattern = "^\\[metric\\:", x = params_in_file)), 1L)
+    expect_equal(sum(params_in_file == "[metric: l2]"), 1L)
+
+    expect_equal(sum(grepl(pattern = "^\\[num_iterations\\:", x = params_in_file)), 1L)
+    expect_equal(sum(params_in_file == "[num_iterations: 4]"), 1L)
+
+    expect_equal(sum(grepl(pattern = "^\\[objective\\:", x = params_in_file)), 1L)
+    expect_equal(sum(params_in_file == "[objective: regression]"), 1L)
+
+    expect_equal(sum(grepl(pattern = "^\\[verbosity\\:", x = params_in_file)), 1L)
+    expect_equal(sum(params_in_file == "[verbosity: 0]"), 1L)
+
+    # early stopping should be off by default
+    expect_equal(sum(grepl(pattern = "^\\[early_stopping_round\\:", x = params_in_file)), 1L)
+    expect_equal(sum(params_in_file == "[early_stopping_round: 0]"), 1L)
+})
+
+test_that("early_stopping, num_iterations are stored correctly in model string even with aliases", {
+    dtrain <- lgb.Dataset(
+        data = matrix(rnorm(500L), nrow = 100L)
+        , label = rnorm(100L)
+    )
+    dvalid <- lgb.Dataset(
+        data = matrix(rnorm(500L), nrow = 100L)
+        , label = rnorm(100L)
+    )
+
+    # num_iterations values (all different)
+    num_iterations <- 4L
+    num_boost_round <- 2L
+    n_iter <- 3L
+    nrounds_kwarg <- 6L
+
+    # early_stopping_round values (all different)
+    early_stopping_round <- 2L
+    early_stopping_round_kwarg <- 3L
+    n_iter_no_change <- 4L
+
+    params <- list(
+        objective = "regression"
+        , metric = "l2"
+        , num_boost_round = num_boost_round
+        , num_iterations = num_iterations
+        , n_iter = n_iter
+        , early_stopping_round = early_stopping_round
+        , n_iter_no_change = n_iter_no_change
+    )
+
+    bst <- lgb.train(
+        params = params
+        , data = dtrain
+        , nrounds = nrounds_kwarg
+        , early_stopping_rounds = early_stopping_round_kwarg
+        , valids = list(
+            "random_valid" = dvalid
+        )
+        , verbose = 0L
+    )
+
+    model_str <- bst$save_model_to_string()
+    params_in_file <- .params_from_model_string(model_str = model_str)
+
+    # parameters should match what was passed from the R package, and the "main" (non-alias)
+    # params values in `params` should be preferred to keyword argumentts or aliases
+    expect_equal(sum(grepl(pattern = "^\\[num_iterations\\:", x = params_in_file)), 1L)
+    expect_equal(sum(params_in_file == sprintf("[num_iterations: %s]", num_iterations)), 1L)
+    expect_equal(sum(grepl(pattern = "^\\[early_stopping_round\\:", x = params_in_file)), 1L)
+    expect_equal(sum(params_in_file == sprintf("[early_stopping_round: %s]", early_stopping_round)), 1L)
+
+    # none of the aliases shouold have been written to the model file
+    expect_equal(sum(grepl(pattern = "^\\[num_boost_round\\:", x = params_in_file)), 0L)
+    expect_equal(sum(grepl(pattern = "^\\[n_iter\\:", x = params_in_file)), 0L)
+    expect_equal(sum(grepl(pattern = "^\\[n_iter_no_change\\:", x = params_in_file)), 0L)
+
+})
+
+# this is almost identical to the test above it, but for lgb.cv(). A lot of code
+# is duplicated between lgb.train() and lgb.cv(), and this will catch cases where
+# one is updated and the other isn't
+test_that("lgb.cv() correctly handles passing through params to the model file", {
+    dtrain <- lgb.Dataset(
+        data = matrix(rnorm(500L), nrow = 100L)
+        , label = rnorm(100L)
+    )
+
+    # num_iterations values (all different)
+    num_iterations <- 4L
+    num_boost_round <- 2L
+    n_iter <- 3L
+    nrounds_kwarg <- 6L
+
+    # early_stopping_round values (all different)
+    early_stopping_round <- 2L
+    early_stopping_round_kwarg <- 3L
+    n_iter_no_change <- 4L
+
+    params <- list(
+        objective = "regression"
+        , metric = "l2"
+        , num_boost_round = num_boost_round
+        , num_iterations = num_iterations
+        , n_iter = n_iter
+        , early_stopping_round = early_stopping_round
+        , n_iter_no_change = n_iter_no_change
+    )
+
+    cv_bst <- lgb.cv(
+        params = params
+        , data = dtrain
+        , nrounds = nrounds_kwarg
+        , early_stopping_rounds = early_stopping_round_kwarg
+        , nfold = 3L
+        , verbose = 0L
+    )
+
+    for (bst in cv_bst$boosters) {
+        model_str <- bst[["booster"]]$save_model_to_string()
+        params_in_file <- .params_from_model_string(model_str = model_str)
+
+        # parameters should match what was passed from the R package, and the "main" (non-alias)
+        # params values in `params` should be preferred to keyword argumentts or aliases
+        expect_equal(sum(grepl(pattern = "^\\[num_iterations\\:", x = params_in_file)), 1L)
+        expect_equal(sum(params_in_file == sprintf("[num_iterations: %s]", num_iterations)), 1L)
+        expect_equal(sum(grepl(pattern = "^\\[early_stopping_round\\:", x = params_in_file)), 1L)
+        expect_equal(sum(params_in_file == sprintf("[early_stopping_round: %s]", early_stopping_round)), 1L)
+
+        # none of the aliases shouold have been written to the model file
+        expect_equal(sum(grepl(pattern = "^\\[num_boost_round\\:", x = params_in_file)), 0L)
+        expect_equal(sum(grepl(pattern = "^\\[n_iter\\:", x = params_in_file)), 0L)
+        expect_equal(sum(grepl(pattern = "^\\[n_iter_no_change\\:", x = params_in_file)), 0L)
+    }
+
+})
--- a/R-package/tests/testthat/test_utils.R
+++ b/R-package/tests/testthat/test_utils.R
@@ -124,3 +124,61 @@ test_that("lgb.check.eval drops duplicate metrics and preserves order", {
    expect_named(params, "metric")
    expect_identical(params[["metric"]], list("l1", "l2", "rmse"))
 })
+
+context("lgb.check.wrapper_param")
+
+test_that("lgb.check.wrapper_param() uses passed-in keyword arg if no alias found in params", {
+    kwarg_val <- sample(seq_len(100L), size = 1L)
+    params <- lgb.check.wrapper_param(
+        main_param_name = "num_iterations"
+        , params = list()
+        , alternative_kwarg_value = kwarg_val
+    )
+    expect_equal(params[["num_iterations"]], kwarg_val)
+})
+
+test_that("lgb.check.wrapper_param() prefers main parameter to alias and keyword arg", {
+    num_iterations <- sample(seq_len(100L), size = 1L)
+    kwarg_val <- sample(seq_len(100L), size = 1L)
+    params <- lgb.check.wrapper_param(
+        main_param_name = "num_iterations"
+        , params = list(
+            num_iterations = num_iterations
+            , num_tree = sample(seq_len(100L), size = 1L)
+            , n_estimators = sample(seq_len(100L), size = 1L)
+        )
+        , alternative_kwarg_value = kwarg_val
+    )
+    expect_equal(params[["num_iterations"]], num_iterations)
+
+    # aliases should be removed
+    expect_identical(params, list(num_iterations = num_iterations))
+})
+
+test_that("lgb.check.wrapper_param() prefers alias to keyword arg", {
+    n_estimators <- sample(seq_len(100L), size = 1L)
+    num_tree <- sample(seq_len(100L), size = 1L)
+    kwarg_val <- sample(seq_len(100L), size = 1L)
+    params <- lgb.check.wrapper_param(
+        main_param_name = "num_iterations"
+        , params = list(
+            num_tree = num_tree
+            , n_estimators = n_estimators
+        )
+        , alternative_kwarg_value = kwarg_val
+    )
+    expect_equal(params[["num_iterations"]], num_tree)
+    expect_identical(params, list(num_iterations = num_tree))
+
+    # switching the order should switch which one is chosen
+    params2 <- lgb.check.wrapper_param(
+        main_param_name = "num_iterations"
+        , params = list(
+            n_estimators = n_estimators
+            , num_tree = num_tree
+        )
+        , alternative_kwarg_value = kwarg_val
+    )
+    expect_equal(params2[["num_iterations"]], n_estimators)
+    expect_identical(params2, list(num_iterations = n_estimators))
+})