"tests/vscode:/vscode.git/clone" did not exist on "f52be9be5091d300078f0b4fa3641a165ed3f905"
Unverified Commit dd16fa99 authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

[R-package] fixed sorting issues in lgb.cv() when using a model with...

[R-package] fixed sorting issues in lgb.cv() when using a model with observation weights (fixes #2572) (#2573)
parent b588cae2
......@@ -70,6 +70,7 @@ CVBooster <- R6::R6Class(
#' , learning_rate = 1.0
#' , early_stopping_rounds = 5L
#' )
#' @importFrom data.table data.table setorderv
#' @export
lgb.cv <- function(params = list()
, data
......@@ -95,8 +96,7 @@ lgb.cv <- function(params = list()
) {
# Setup temporary variables
addiction_params <- list(...)
params <- append(params, addiction_params)
params <- append(params, list(...))
params$verbose <- verbose
params <- lgb.check.obj(params, obj)
params <- lgb.check.eval(params, eval)
......@@ -193,12 +193,12 @@ lgb.cv <- function(params = list()
# Create folds
folds <- generate.cv.folds(
nfold
, nrow(data)
, stratified
, getinfo(data, "label")
, getinfo(data, "group")
, params
nfold = nfold
, nrows = nrow(data)
, stratified = stratified
, label = getinfo(data, "label")
, group = getinfo(data, "group")
, params = params
)
}
......@@ -264,35 +264,64 @@ lgb.cv <- function(params = list()
# Categorize callbacks
cb <- categorize.callbacks(callbacks)
# Construct booster using a list apply, check if requires group or not
if (!is.list(folds[[1L]])) {
bst_folds <- lapply(seq_along(folds), function(k) {
dtest <- slice(data, folds[[k]])
dtrain <- slice(data, seq_len(nrow(data))[-folds[[k]]])
setinfo(dtrain, "weight", getinfo(data, "weight")[-folds[[k]]])
setinfo(dtrain, "init_score", getinfo(data, "init_score")[-folds[[k]]])
setinfo(dtest, "weight", getinfo(data, "weight")[folds[[k]]])
setinfo(dtest, "init_score", getinfo(data, "init_score")[folds[[k]]])
booster <- Booster$new(params, dtrain)
booster$add_valid(dtest, "valid")
list(booster = booster)
})
# Construct booster for each fold. The data.table() code below is used to
# guarantee that indices are sorted while keeping init_score and weight together
# with the correct indices. Note that it takes advantage of the fact that
# someDT$some_column returns NULL if 'some_column' does not exist in the data.table
bst_folds <- lapply(
X = seq_along(folds)
, FUN = function(k) {
# For learning-to-rank, each fold is a named list with two elements:
# * `fold` = an integer vector of row indices
# * `group` = an integer vector describing which groups are in the fold
# For classification or regression tasks, it will just be an integer
# vector of row indices
folds_have_group <- "group" %in% names(folds[[k]])
if (folds_have_group) {
test_indices <- folds[[k]]$fold
test_group_indices <- folds[[k]]$group
test_groups <- getinfo(data, "group")[test_group_indices]
train_groups <- getinfo(data, "group")[-test_group_indices]
} else {
bst_folds <- lapply(seq_along(folds), function(k) {
dtest <- slice(data, folds[[k]]$fold)
dtrain <- slice(data, (seq_len(nrow(data)))[-folds[[k]]$fold])
setinfo(dtrain, "weight", getinfo(data, "weight")[-folds[[k]]$fold])
setinfo(dtrain, "init_score", getinfo(data, "init_score")[-folds[[k]]$fold])
setinfo(dtrain, "group", getinfo(data, "group")[-folds[[k]]$group])
setinfo(dtest, "weight", getinfo(data, "weight")[folds[[k]]$fold])
setinfo(dtest, "init_score", getinfo(data, "init_score")[folds[[k]]$fold])
setinfo(dtest, "group", getinfo(data, "group")[folds[[k]]$group])
test_indices <- folds[[k]]
}
train_indices <- seq_len(nrow(data))[-test_indices]
# set up test set
indexDT <- data.table::data.table(
indices = test_indices
, weight = getinfo(data, "weight")[test_indices]
, init_score = getinfo(data, "init_score")[test_indices]
)
data.table::setorderv(indexDT, "indices", order = 1L)
dtest <- slice(data, indexDT$indices)
setinfo(dtest, "weight", indexDT$weight)
setinfo(dtest, "init_score", indexDT$init_score)
# set up training set
indexDT <- data.table::data.table(
indices = train_indices
, weight = getinfo(data, "weight")[train_indices]
, init_score = getinfo(data, "init_score")[train_indices]
)
data.table::setorderv(indexDT, "indices", order = 1L)
dtrain <- slice(data, indexDT$indices)
setinfo(dtrain, "weight", indexDT$weight)
setinfo(dtrain, "init_score", indexDT$init_score)
if (folds_have_group) {
setinfo(dtest, "group", test_groups)
setinfo(dtrain, "group", train_groups)
}
booster <- Booster$new(params, dtrain)
booster$add_valid(dtest, "valid")
return(
list(booster = booster)
})
)
}
)
# Create new booster
cv_booster <- CVBooster$new(bst_folds)
......
# testthat context grouping the learning-to-rank (lambdarank) tests below
context("Learning to rank")
# numerical tolerance to use when checking metric values
TOLERANCE <- 1e-06
test_that("learning-to-rank with lgb.train() works as expected", {
  set.seed(708L)
  data(agaricus.train, package = "lightgbm")
  # keep only a few features so the fitted model is deliberately imperfect
  ag_train <- agaricus.train
  feature_mat <- ag_train$data[1L:6000L, 1L:20L]
  dtrain <- lgb.Dataset(
    feature_mat
    , label = ag_train$label[1L:6000L]
    , group = rep(150L, 40L)
  )
  ndcg_at <- "1,2,3"
  eval_names <- paste0("ndcg@", strsplit(ndcg_at, ",")[[1L]])
  params <- list(
    objective = "lambdarank"
    , metric = "ndcg"
    , ndcg_at = ndcg_at
    , metric_freq = 1L
    , max_position = 3L
    , learning_rate = 0.001
  )
  model <- lgb.train(
    params = params
    , data = dtrain
    , nrounds = 10L
  )
  expect_true(lgb.is.Booster(model))
  dumped_model <- jsonlite::fromJSON(
    model$dump_model()
  )
  expect_equal(dumped_model[["objective"]], "lambdarank")
  expect_equal(dumped_model[["max_feature_idx"]], ncol(feature_mat) - 1L)
  # every evaluation result should be a sensible nDCG value (0.0 < nDCG < 1.0)
  eval_results <- model$eval_train()
  expect_equal(length(eval_results), length(eval_names))
  for (single_result in eval_results) {
    metric_value <- single_result[["value"]]
    expect_true(metric_value > 0.0 && metric_value < 1.0)
    expect_true(single_result[["higher_better"]])
    expect_identical(single_result[["data_name"]], "training")
  }
  observed_names <- vapply(
    eval_results
    , function(x) x$name
    , character(1L)
  )
  expect_identical(observed_names, eval_names)
  # spot-check metric values at each truncation level
  expect_equal(eval_results[[1L]][["value"]], 0.825)
  expect_true(abs(eval_results[[2L]][["value"]] - 0.795986) < TOLERANCE)
  expect_true(abs(eval_results[[3L]][["value"]] - 0.7734639) < TOLERANCE)
})
test_that("learning-to-rank with lgb.cv() works as expected", {
  set.seed(708L)
  data(agaricus.train, package = "lightgbm")
  # keep only a few features so the fitted model is deliberately imperfect
  ag_train <- agaricus.train
  feature_mat <- ag_train$data[1L:6000L, 1L:20L]
  dtrain <- lgb.Dataset(
    feature_mat
    , label = ag_train$label[1L:6000L]
    , group = rep(150L, 40L)
  )
  ndcg_at <- "1,2,3"
  eval_names <- paste0("ndcg@", strsplit(ndcg_at, ",")[[1L]])
  params <- list(
    objective = "lambdarank"
    , metric = "ndcg"
    , ndcg_at = ndcg_at
    , metric_freq = 1L
    , max_position = 3L
    , label_gain = "0,1,3"
  )
  nfold <- 4L
  nrounds <- 10L
  cv_bst <- lgb.cv(
    params = params
    , data = dtrain
    , nrounds = nrounds
    , nfold = nfold
    , min_data = 1L
    , learning_rate = 0.01
  )
  expect_is(cv_bst, "lgb.CVBooster")
  expect_equal(length(cv_bst$boosters), nfold)
  # "valid" should contain results for each metric
  eval_results <- cv_bst$record_evals[["valid"]]
  eval_names <- c("ndcg@1", "ndcg@2", "ndcg@3")
  expect_identical(names(eval_results), eval_names)
  # best score and iter should make sense (0.0 < nDCG < 1.0)
  best_iter <- cv_bst$best_iter
  best_score <- cv_bst$best_score
  expect_true(best_iter > 0L && best_iter <= nrounds)
  expect_true(best_score > 0.0 && best_score < 1.0)
  expect_true(abs(best_score - 0.775) < TOLERANCE)
  # best_score should be set for the first metric
  first_metric <- eval_names[[1L]]
  expect_equal(best_score, eval_results[[first_metric]][["eval"]][[best_iter]])
  for (metric_name in eval_names) {
    metric_results <- eval_results[[metric_name]]
    # each set of metrics should have eval and eval_err,
    # with one entry per boosting round
    expect_identical(names(metric_results), c("eval", "eval_err"))
    expect_equal(length(metric_results[["eval"]]), nrounds)
    expect_equal(length(metric_results[["eval_err"]]), nrounds)
    # every evaluation result should be a sensible nDCG value
    all_evals <- unlist(metric_results[["eval"]])
    expect_true(all(all_evals > 0.0 & all_evals < 1.0))
  }
  # per-round values of each metric should match the expected sequence
  check_eval_values <- function(metric_name, expected_values) {
    observed_values <- unlist(eval_results[[metric_name]][["eval"]])
    expect_true(all(abs(observed_values - expected_values) < TOLERANCE))
  }
  check_eval_values(
    "ndcg@1"
    , c(0.725, 0.75, 0.75, 0.775, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75)
  )
  check_eval_values(
    "ndcg@2"
    , c(
      0.6863147, 0.720986, 0.7306574, 0.745986, 0.7306574,
      0.720986, 0.7403287, 0.7403287, 0.7403287, 0.7306574
    )
  )
  check_eval_values(
    "ndcg@3"
    , c(
      0.6777939, 0.6984639, 0.711732, 0.7234639, 0.711732,
      0.7101959, 0.719134, 0.719134, 0.725, 0.711732
    )
  )
  # every fold's booster should have been trained on the same task
  for (fold_bst in cv_bst$boosters) {
    dumped_model <- jsonlite::fromJSON(
      fold_bst$booster$dump_model()
    )
    expect_equal(dumped_model[["objective"]], "lambdarank")
    expect_equal(dumped_model[["max_feature_idx"]], ncol(feature_mat) - 1L)
  }
})
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment