[R-package] refactor and improvements to lgb.convert() functions (fixes #2678, #2681) (#3269)

* [R-package] improvements to lgb.convert() functions (fixes #2678, #2681) * more stuff * update docs * remove lgb.convert() * put internal functions back * update index

[R-package] refactor and improvements to lgb.convert() functions (fixes #2678, #2681) (#3269)
* [R-package] improvements to lgb.convert() functions (fixes #2678, #2681) * more stuff * update docs * remove lgb.convert() * put internal functions back * update index
083b02af · James Lamb · GitHub · c454d5f8 · 083b02af · c454d5f8
Unverified Commit 083b02af authored Aug 05, 2020 by James Lamb Committed by GitHub Aug 05, 2020
11 changed files
--- a/R-package/NAMESPACE
+++ b/R-package/NAMESPACE
@@ -14,7 +14,6 @@ export(lgb.Dataset.create.valid)
 export(lgb.Dataset.save)
 export(lgb.Dataset.set.categorical)
 export(lgb.Dataset.set.reference)
-export(lgb.convert)
 export(lgb.convert_with_rules)
 export(lgb.cv)
 export(lgb.dump)

--- a/R-package/R/lgb.convert.R
+++ b/R-package/R/lgb.convert.R
-#' @name lgb.convert
-#' @title Data preparator for LightGBM datasets (integer)
-#' @description Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
-#'              Factors and characters are converted to integer.
-#'              Please use \code{\link{lgb.convert_with_rules}} if you want to apply this transformation to
-#'              other datasets. This is useful if you have a specific need for integer dataset instead
-#'              of numeric dataset.
-#'
-#'              NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare}.
-#' @param data A data.frame or data.table to prepare.
-#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
-#'         for input in \code{lgb.Dataset}.
-#'
-#' @examples
-#' \dontrun{
-#' data(iris)
-#'
-#' str(iris)
-#'
-#' # Convert all factors/chars to integer
-#' str(lgb.convert(data = iris))
-#'
-#' # When lightgbm package is installed, and you do not want to load it
-#' # You can still use the function!
-#' lgb.unloader()
-#' str(lgb.convert(data = iris))
-#' # 'data.frame':	150 obs. of  5 variables:
-#' # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
-#' # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
-#' # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
-#' # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-#' # $ Species     : int  1 1 1 1 1 1 1 1 1 1 ...
-#' }
-#' @export
-lgb.convert <- function(data) {
-  # data.table not behaving like data.frame
-  if (inherits(data, "data.table")) {
-    # Get data classes
-    list_classes <- vapply(data, class, character(1L))
-    # Convert characters to integer
-    is_char <- which(list_classes == "character")
-    if (length(is_char) > 0L) {
-      data[, (is_char) := lapply(.SD, function(x) {as.integer(as.factor(x))}), .SDcols = is_char]
-    }
-    # Convert factors to integer
-    is_fact <- c(which(list_classes == "factor"), is_char)
-    if (length(is_fact) > 0L) {
-      data[, (is_fact) := lapply(.SD, function(x) {as.integer(x)}), .SDcols = is_fact]
-    }
-  } else {
-    # Default routine (data.frame)
-    if (inherits(data, "data.frame")) {
-      # Get data classes
-      list_classes <- vapply(data, class, character(1L))
-      # Convert characters to factors to numeric (integer is more efficient actually)
-      is_char <- which(list_classes == "character")
-      if (length(is_char) > 0L) {
-        data[is_char] <- lapply(data[is_char], function(x) {as.integer(as.factor(x))})
-      }
-      # Convert factors to numeric (integer is more efficient actually)
-      is_fact <- which(list_classes == "factor")
-      if (length(is_fact) > 0L) {
-        data[is_fact] <- lapply(data[is_fact], function(x) {as.integer(x)})
-      }
-    } else {
-      stop(
-        "lgb.convert: you provided "
-        , paste(class(data), collapse = " & ")
-        , " but data should have class data.frame or data.table"
-      )
-    }
-  }
-  return(data)
-}
--- a/R-package/R/lgb.convert_with_rules.R
+++ b/R-package/R/lgb.convert_with_rules.R
+# [description] get all column classes of a data.table or data.frame.
+#               This function collapses the result of class() into a single string
+.get_column_classes <- function(df) {
+    return(
+        vapply(
+            X = df
+            , FUN = function(x) {paste0(class(x), collapse = ",")}
+            , FUN.VALUE = character(1L)
+        )
+    )
+}
+# [description] check a data frame or data table for columns that are any
+#               type other than numeric and integer. This is used by
+#               lgb.convert_with_rules() to warn if more action is needed by users
+#               before a dataset can be converted to a lgb.Dataset.
+.warn_for_unconverted_columns <- function(df, function_name) {
+    column_classes <- .get_column_classes(df)
+    unconverted_columns <- column_classes[!(column_classes %in% c("numeric", "integer"))]
+    if (length(unconverted_columns) > 0L) {
+        col_detail_string <- paste0(
+            paste0(
+                names(unconverted_columns)
+                , " ("
+                , unconverted_columns
+                , ")"
+            )
+            , collapse = ", "
+        )
+        msg <- paste0(
+            function_name
+            , ": "
+            , length(unconverted_columns)
+            , " columns are not numeric or integer. These need to be dropped or converted to "
+            , "be used in an lgb.Dataset object. "
+            , col_detail_string
+        )
+        warning(msg)
+    }
+    return(invisible(NULL))
+}
+.LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA <- function() {return(-1L)}
+.LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA <- function() {return(0L)}
 #' @name lgb.convert_with_rules
 #' @title Data preparator for LightGBM datasets with rules (integer)
 #' @description Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
-#'              Factors and characters are converted to integer.
+#'              Factor, character, and logical columns are converted to integer. Missing values
-#'              In addition, keeps rules created so you can convert other datasets using this converter.
+#'              in factors and characters will be filled with 0L. Missing values in logicals
-#'              This is useful if you have a specific need for integer dataset instead of numeric dataset.
+#'              will be filled with -1L.
+#'
+#'              This function returns and optionally takes in "rules" that describe exactly
+#'              how to convert values in columns.
+#'
+#'              Columns that contain only NA values will be converted by this function but will
+#'              not show up in the returned \code{rules}.
 #'
 #'              NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare_rules2}.
 #' @param data A data.frame or data.table to prepare.
-#' @param rules A set of rules from the data preparator, if already used.
+#' @param rules A set of rules from the data preparator, if already used. This should be an R list,
+#'              where names are column names in \code{data} and values are named integer
+#'              vectors whose names are column values and whose values are new values to
+#'              replace them with.
 #' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
-#'         The data must be converted to a matrix format (\code{as.matrix}) for input in
+#'         Note that the data must be converted to a matrix format (\code{as.matrix}) for input in
 #'         \code{lgb.Dataset}.
 #'
 #' @examples
@@ -18,7 +73,7 @@
 #'
 #' str(iris)
 #'
-#' new_iris <- lgb.convert_with_rules(data = iris) # Autoconverter
+#' new_iris <- lgb.convert_with_rules(data = iris)
 #' str(new_iris$data)
 #'
 #' data(iris) # Erase iris dataset
@@ -54,130 +109,98 @@
 #' @export
 lgb.convert_with_rules <- function(data, rules = NULL) {
-  # data.table not behaving like data.frame
+    column_classes <- .get_column_classes(data)
-  if (inherits(data, "data.table")) {
-    # Must use existing rules
-    if (!is.null(rules)) {
-      # Loop through rules
-      for (i in names(rules)) {
-        data.table::set(data, j = i, value = unname(rules[[i]][data[[i]]]))
+    is_char <- which(column_classes == "character")
-        data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer
+    is_factor <- which(column_classes == "factor")
+    is_logical <- which(column_classes == "logical")
-      }
+    is_data_table <- data.table::is.data.table(data)
+    is_data_frame <- is.data.frame(data)
-    } else {
+    if (!(is_data_table || is_data_frame)) {
+        stop(
-      # Get data classes
+            "lgb.convert_with_rules: you provided "
-      list_classes <- vapply(data, class, character(1L))
+            , paste(class(data), collapse = " & ")
+            , " but data should have class data.frame or data.table"
-      # Map characters/factors
+        )
-      is_fix <- which(list_classes %in% c("character", "factor"))
+    }
-      rules <- list()
-      # Need to create rules?
+    # if user didn't provide rules, create them
-      if (length(is_fix) > 0L) {
+    if (is.null(rules)) {
+        rules <- list()
+        columns_to_fix <- which(column_classes %in% c("character", "factor", "logical"))
-        # Go through all characters/factors
+        for (i in columns_to_fix) {
-        for (i in is_fix) {
-          # Store column elsewhere
+          col_values <- data[[i]]
-          mini_data <- data[[i]]
          # Get unique values
-          if (is.factor(mini_data)) {
+          if (is.factor(col_values)) {
-            mini_unique <- levels(mini_data) # Factor
+              unique_vals <- levels(col_values)
-            mini_numeric <- seq_along(mini_unique) # Respect ordinal if needed
+              unique_vals <- unique_vals[!is.na(unique_vals)]
-          } else {
+              mini_numeric <- seq_along(unique_vals) # respect ordinal
-            mini_unique <- as.factor(unique(mini_data)) # Character
+          } else if (is.character(col_values)) {
-            mini_numeric <- as.integer(mini_unique) # No respect of ordinality
+              unique_vals <- as.factor(unique(col_values))
+              unique_vals <- unique_vals[!is.na(unique_vals)]
+              mini_numeric <- as.integer(unique_vals)  # no respect for ordinal
+          } else if (is.logical(col_values)) {
+              unique_vals <- c(FALSE, TRUE)
+              mini_numeric <- c(0L, 1L)
          }
-          # Create rules
+          # don't add rules for all-NA columns
-          indexed <- colnames(data)[i] # Index value
+          if (length(unique_vals) > 0L) {
-          rules[[indexed]] <- mini_numeric # Numeric content
+              col_name <- names(data)[i]
-          names(rules[[indexed]]) <- mini_unique # Character equivalent
+              rules[[col_name]] <- mini_numeric
+              names(rules[[col_name]]) <- unique_vals
-          # Apply to real data column
+          }
-          data.table::set(data, j = i, value = unname(rules[[indexed]][mini_data]))
        }
-      }
    }
-  } else {
+    for (col_name in names(rules)) {
+        if (column_classes[[col_name]] == "logical") {
-    # Must use existing rules
+            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
-    if (!is.null(rules)) {
+        } else {
+            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
-      # Loop through rules
-      for (i in names(rules)) {
-        data[[i]] <- unname(rules[[i]][data[[i]]])
-        data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer
-      }
-    } else {
-      # Default routine (data.frame)
-      if (inherits(data, "data.frame")) {
-        # Get data classes
-        list_classes <- vapply(data, class, character(1L))
-        # Map characters/factors
-        is_fix <- which(list_classes %in% c("character", "factor"))
-        rules <- list()
-        # Need to create rules?
-        if (length(is_fix) > 0L) {
-          # Go through all characters/factors
-          for (i in is_fix) {
-            # Store column elsewhere
-            mini_data <- data[[i]]
-            # Get unique values
-            if (is.factor(mini_data)) {
-              mini_unique <- levels(mini_data) # Factor
-              mini_numeric <- seq_along(mini_unique) # Respect ordinal if needed
-            } else {
-              mini_unique <- as.factor(unique(mini_data)) # Character
-              mini_numeric <- as.integer(mini_unique) # No respect of ordinality
-            }
-            # Create rules
-            indexed <- colnames(data)[i] # Index value
-            rules[[indexed]] <- mini_numeric # Numeric content
-            names(rules[[indexed]]) <- mini_unique # Character equivalent
-            # Apply to real data column
-            data[[i]] <- unname(rules[[indexed]][mini_data])
-          }
        }
+        if (is_data_table) {
+            data.table::set(
+                data
+                , j = col_name
+                , value = unname(rules[[col_name]][data[[col_name]]])
+            )
+            data[is.na(get(col_name)), (col_name) := default_value_for_na]
+        } else {
+            data[[col_name]] <- unname(rules[[col_name]][data[[col_name]]])
+            data[is.na(data[col_name]), col_name] <- default_value_for_na
+        }
+    }
-      } else {
+    # if any all-NA columns exist, they won't be in rules. Convert them
+    all_na_cols <- which(
-        stop(
+        sapply(
-          "lgb.convert_with_rules: you provided "
+            X = data
-          , paste(class(data), collapse = " & ")
+            , FUN = function(x) {
-          , " but data should have class data.frame"
+                (is.factor(x) || is.character(x) || is.logical(x)) && all(is.na(unique(x)))
+            }
        )
+    )
-      }
+    for (col_name in all_na_cols) {
+        if (column_classes[[col_name]] == "logical") {
+            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
+        } else {
+            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
+        }
+        if (is_data_table) {
+            data[, (col_name) := rep(default_value_for_na, .N)]
+        } else {
+            data[[col_name]] <- default_value_for_na
+        }
    }
-  }
+    .warn_for_unconverted_columns(df = data, function_name = "lgb.convert_with_rules")
-  return(list(data = data, rules = rules))
+    return(list(data = data, rules = rules))
 }
--- a/R-package/R/removed.R
+++ b/R-package/R/removed.R
@@ -4,7 +4,7 @@
 #' @param ... catch-all too match old calls
 #' @export
 lgb.prepare <- function(...) {
-    stop("lgb.prepare() was removed in LightGBM 3.0.0. Please use lgb.convert()")
+    stop("lgb.prepare() was removed in LightGBM 3.0.0. Please use lgb.convert_with_rules()")
 }
 #' @title removed functions
@@ -13,7 +13,7 @@ lgb.prepare <- function(...) {
 #' @param ... catch-all too match old calls
 #' @export
 lgb.prepare2 <- function(...) {
-    stop("lgb.prepare2() was removed in LightGBM 3.0.0. Please use lgb.convert()")
+    stop("lgb.prepare2() was removed in LightGBM 3.0.0. Please use lgb.convert_with_rules()")
 }
 #' @title removed functions

--- a/R-package/demo/00Index
+++ b/R-package/demo/00Index
 basic_walkthrough               Basic feature walkthrough
 boost_from_prediction           Boosting from existing prediction
-categorical_features_prepare     Categorical Feature Preparation
 categorical_features_rules       Categorical Feature Preparation with Rules
 cross_validation                Cross Validation
 early_stopping                  Early Stop in training

--- a/R-package/demo/categorical_features_prepare.R
+++ b/R-package/demo/categorical_features_prepare.R
-# Here we are going to try training a model with categorical features
-# Load libraries
-library(data.table)
-library(lightgbm)
-# Load data and look at the structure
-#
-# Classes 'data.table' and 'data.frame':	4521 obs. of  17 variables:
-# $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
-# $ job      : chr  "unemployed" "services" "management" "management" ...
-# $ marital  : chr  "married" "married" "single" "married" ...
-# $ education: chr  "primary" "secondary" "tertiary" "tertiary" ...
-# $ default  : chr  "no" "no" "no" "no" ...
-# $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
-# $ housing  : chr  "no" "yes" "yes" "yes" ...
-# $ loan     : chr  "no" "yes" "no" "yes" ...
-# $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
-# $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
-# $ month    : chr  "oct" "may" "apr" "jun" ...
-# $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
-# $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
-# $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
-# $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
-# $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
-# $ y        : chr  "no" "no" "no" "no" ...
-data(bank, package = "lightgbm")
-str(bank)
-# We must now transform the data to fit in LightGBM
-# For this task, we use lgb.prepare
-# The function transforms the data into a fittable data
-#
-# Classes 'data.table' and 'data.frame':	4521 obs. of  17 variables:
-# $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
-# $ job      : num  11 8 5 5 2 5 7 10 3 8 ...
-# $ marital  : num  2 2 3 2 2 3 2 2 2 2 ...
-# $ education: num  1 2 3 3 2 3 3 2 3 1 ...
-# $ default  : num  1 1 1 1 1 1 1 1 1 1 ...
-# $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
-# $ housing  : num  1 2 2 2 2 1 2 2 2 2 ...
-# $ loan     : num  1 2 1 2 1 1 1 1 1 2 ...
-# $ contact  : num  1 1 1 3 3 1 1 1 3 1 ...
-# $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
-# $ month    : num  11 9 1 7 9 4 9 9 9 1 ...
-# $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
-# $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
-# $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
-# $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
-# $ poutcome : num  4 1 1 4 4 1 2 4 4 1 ...
-# $ y        : num  1 1 1 1 1 1 1 1 1 1 ...
-bank <- lgb.prepare(data = bank)
-str(bank)
-# Remove 1 to label because it must be between 0 and 1
-bank$y <- bank$y - 1L
-# Data input to LightGBM must be a matrix, without the label
-my_data <- as.matrix(bank[, 1L:16L, with = FALSE])
-# Creating the LightGBM dataset with categorical features
-# The categorical features must be indexed like in R (1-indexed, not 0-indexed)
-lgb_data <- lgb.Dataset(
-    data = my_data
-    , label = bank$y
-    , categorical_feature = c(2L, 3L, 4L, 5L, 7L, 8L, 9L, 11L, 16L)
-)
-# We can now train a model
-params <- list(
-    objective = "binary"
-    , metric = "l2"
-    , min_data = 1L
-    , learning_rate = 0.1
-    , min_data = 0L
-    , min_hessian = 1.0
-    , max_depth = 2L
-)
-model <- lgb.train(
-    params = params
-    , data = lgb_data
-    , nrounds = 100L
-    , valids = list(train = lgb_data)
-)
-# Try to find split_feature: 2
-# If you find it, it means it used a categorical feature in the first tree
-lgb.dump(model, num_iteration = 1L)
--- a/R-package/man/lgb.convert.Rd
+++ b/R-package/man/lgb.convert.Rd
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/lgb.convert.R
-\name{lgb.convert}
-\alias{lgb.convert}
-\title{Data preparator for LightGBM datasets (integer)}
-\usage{
-lgb.convert(data)
-}
-\arguments{
-\item{data}{A data.frame or data.table to prepare.}
-}
-\value{
-The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
-        for input in \code{lgb.Dataset}.
-}
-\description{
-Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
-             Factors and characters are converted to integer.
-             Please use \code{\link{lgb.convert_with_rules}} if you want to apply this transformation to
-             other datasets. This is useful if you have a specific need for integer dataset instead
-             of numeric dataset.
-             NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare}.
-}
-\examples{
-\dontrun{
-data(iris)
-str(iris)
-# Convert all factors/chars to integer
-str(lgb.convert(data = iris))
-# When lightgbm package is installed, and you do not want to load it
-# You can still use the function!
-lgb.unloader()
-str(lgb.convert(data = iris))
-# 'data.frame':	150 obs. of  5 variables:
-# $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
-# $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
-# $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
-# $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-# $ Species     : int  1 1 1 1 1 1 1 1 1 1 ...
-}
-}
--- a/R-package/man/lgb.convert_with_rules.Rd
+++ b/R-package/man/lgb.convert_with_rules.Rd
@@ -9,18 +9,27 @@ lgb.convert_with_rules(data, rules = NULL)
 \arguments{
 \item{data}{A data.frame or data.table to prepare.}
-\item{rules}{A set of rules from the data preparator, if already used.}
+\item{rules}{A set of rules from the data preparator, if already used. This should be an R list,
+where names are column names in \code{data} and values are named integer
+vectors whose names are column values and whose values are new values to
+replace them with.}
 }
 \value{
 A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
-        The data must be converted to a matrix format (\code{as.matrix}) for input in
+        Note that the data must be converted to a matrix format (\code{as.matrix}) for input in
        \code{lgb.Dataset}.
 }
 \description{
 Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
-             Factors and characters are converted to integer.
+             Factor, character, and logical columns are converted to integer. Missing values
-             In addition, keeps rules created so you can convert other datasets using this converter.
+             in factors and characters will be filled with 0L. Missing values in logicals
-             This is useful if you have a specific need for integer dataset instead of numeric dataset.
+             will be filled with -1L.
+             This function returns and optionally takes in "rules" that describe exactly
+             how to convert values in columns.
+             Columns that contain only NA values will be converted by this function but will
+             not show up in the returned \code{rules}.
             NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare_rules2}.
 }
@@ -30,7 +39,7 @@ data(iris)
 str(iris)
-new_iris <- lgb.convert_with_rules(data = iris) # Autoconverter
+new_iris <- lgb.convert_with_rules(data = iris)
 str(new_iris$data)
 data(iris) # Erase iris dataset

--- a/R-package/pkgdown/_pkgdown.yml
+++ b/R-package/pkgdown/_pkgdown.yml
@@ -65,7 +65,6 @@ reference:
    - '`lgb.Dataset.save`'
    - '`lgb.Dataset.set.categorical`'
    - '`lgb.Dataset.set.reference`'
-    - '`lgb.convert`'
    - '`lgb.convert_with_rules`'
  - title: Machine Learning
    desc: Train models with LightGBM

--- a/R-package/tests/testthat/test_lgb.convert.R
+++ b/R-package/tests/testthat/test_lgb.convert.R
-context("lgb.convert()")
-test_that("lgb.convert() rejects inputs that are not a data.table or data.frame", {
-    bad_inputs <- list(
-        matrix(1.0:10.0, 2L, 5L)
-        , TRUE
-        , c("a", "b")
-        , NA
-        , 10L
-        , lgb.Dataset(
-            data = matrix(1.0:10.0, 2L, 5L)
-            , params = list()
-        )
-    )
-    for (bad_input in bad_inputs) {
-        expect_error({
-            converted_dataset <- lgb.convert(bad_input)
-        }, regexp = "lgb.convert: you provided", fixed = TRUE)
-    }
-})
-test_that("lgb.convert() should work correctly for a dataset with only character columns", {
-    testDF <- data.frame(
-        col1 = c("a", "b", "c")
-        , col2 =  c("green", "green", "red")
-        , stringsAsFactors = FALSE
-    )
-    testDT <- data.table::as.data.table(testDF)
-    for (input_data in list(testDF, testDT)) {
-        converted_dataset <- lgb.convert(input_data)
-        expect_identical(class(input_data), class(converted_dataset))
-        expect_identical(class(converted_dataset[["col1"]]), "integer")
-        expect_identical(class(converted_dataset[["col2"]]), "integer")
-        expect_identical(converted_dataset[["col1"]], c(1L, 2L, 3L))
-        expect_identical(converted_dataset[["col2"]], c(1L, 1L, 2L))
-    }
-})
-test_that("lgb.convert() should work correctly for a dataset with only factor columns", {
-    testDF <- data.frame(
-        col1 = as.factor(c("a", "b", "c"))
-        , col2 =  as.factor(c("green", "green", "red"))
-        , stringsAsFactors = FALSE
-    )
-    testDT <- data.table::as.data.table(testDF)
-    for (input_data in list(testDF, testDT)) {
-        converted_dataset <- lgb.convert(input_data)
-        expect_identical(class(input_data), class(converted_dataset))
-        expect_identical(class(converted_dataset[["col1"]]), "integer")
-        expect_identical(class(converted_dataset[["col2"]]), "integer")
-        expect_identical(converted_dataset[["col1"]], c(1L, 2L, 3L))
-        expect_identical(converted_dataset[["col2"]], c(1L, 1L, 2L))
-    }
-})
-test_that("lgb.convert() should not change a dataset with only integer columns", {
-    testDF <- data.frame(
-        col1 = 11L:15L
-        , col2 = 16L:20L
-        , stringsAsFactors = FALSE
-    )
-    testDT <- data.table::as.data.table(testDF)
-    for (input_data in list(testDF, testDT)) {
-        converted_dataset <- lgb.convert(input_data)
-        expect_identical(converted_dataset, input_data)
-    }
-})
-test_that("lgb.convert() should work correctly for a dataset with numeric, factor, and character columns", {
-    testDF <- data.frame(
-        character_col = c("a", "b", "c")
-        , numeric_col = c(1.0, 9.0, 10.0)
-        , factor_col = as.factor(c("n", "n", "y"))
-        , stringsAsFactors = FALSE
-    )
-    testDT <- data.table::as.data.table(testDF)
-    for (input_data in list(testDF, testDT)) {
-        converted_dataset <- lgb.convert(input_data)
-        expect_identical(class(input_data), class(converted_dataset))
-        expect_identical(class(converted_dataset[["character_col"]]), "integer")
-        expect_identical(class(converted_dataset[["factor_col"]]), "integer")
-        expect_identical(converted_dataset[["character_col"]], c(1L, 2L, 3L))
-        expect_identical(converted_dataset[["factor_col"]], c(1L, 1L, 2L))
-        # today, lgb.convert() does  not convert numeric  columns
-        expect_identical(class(converted_dataset[["numeric_col"]]), "numeric")
-        expect_identical(converted_dataset[["numeric_col"]], c(1.0, 9.0, 10.0))
-    }
-})
-test_that("lgb.convert() should work correctly for a dataset with missing values", {
-    testDF <- data.frame(
-        character_col = c("a", NA_character_, "c")
-        , na_col = rep(NA, 3L)
-        , na_real_col = rep(NA_real_, 3L)
-        , na_int_col = rep(NA_integer_,  3L)
-        , na_character_col = rep(NA_character_, 3L)
-        , numeric_col = c(1.0, 9.0, NA_real_)
-        , factor_col = as.factor(c("n", "n", "y"))
-        , integer_col = c(1L, 9L, NA_integer_)
-        , stringsAsFactors = FALSE
-    )
-    testDT <- data.table::as.data.table(testDF)
-    for (input_data in list(testDF, testDT)) {
-        converted_dataset <- lgb.convert(input_data)
-        expect_identical(class(input_data), class(converted_dataset))
-        expect_identical(class(converted_dataset[["character_col"]]), "integer")
-        expect_identical(converted_dataset[["character_col"]], c(1L, NA_integer_, 2L))
-        expect_identical(class(converted_dataset[["integer_col"]]), "integer")
-        expect_identical(converted_dataset[["integer_col"]], c(1L, 9L, NA_integer_))
-        expect_identical(class(converted_dataset[["factor_col"]]), "integer")
-        expect_identical(converted_dataset[["factor_col"]], c(1L, 1L, 2L))
-        # NAs of any type should be converted to numeric
-        for (col in c("na_int_col", "na_character_col")) {
-            expect_identical(class(converted_dataset[[col]]), "integer")
-            expect_identical(converted_dataset[[col]], rep(NA_integer_, nrow(converted_dataset)))
-        }
-        # today, lgb.convert() does not convert logical columns
-        expect_identical(class(converted_dataset[["na_col"]]), "logical")
-        # today, lgb.convert() does not convert numeric columns to integer
-        expect_identical(class(converted_dataset[["na_real_col"]]), "numeric")
-        expect_identical(converted_dataset[["na_real_col"]], rep(NA_real_, nrow(converted_dataset)))
-        expect_identical(class(converted_dataset[["numeric_col"]]), "numeric")
-        expect_identical(converted_dataset[["numeric_col"]], c(1.0, 9.0, NA_real_))
-    }
-})
-test_that("lgb.convert() should modify data.tables in-place", {
-    testDT <- data.table::data.table(
-        character_col = c("a", NA_character_, "c")
-        , na_col = rep(NA, 3L)
-        , na_real_col = rep(NA_real_, 3L)
-        , na_int_col = rep(NA_integer_,  3L)
-        , na_character_col = rep(NA_character_, 3L)
-        , numeric_col = c(1.0, 9.0, NA_real_)
-        , factor_col = as.factor(c("n", "n", "y"))
-        , integer_col = c(1L, 9L, NA_integer_)
-    )
-    resultDT <- lgb.convert(testDT)
-    expect_identical(resultDT, testDT)
-})
--- a/R-package/tests/testthat/test_lgb.convert_with_rules.R
+++ b/R-package/tests/testthat/test_lgb.convert_with_rules.R
@@ -117,7 +117,7 @@ test_that("lgb.convert_with_rules() should work correctly for a dataset with num
    }
 })
-test_that("lgb.convert_with_rules() should work correctly for a dataset with missing values", {
+test_that("lgb.convert_with_rules() should convert missing values to the expected value", {
    testDF <- data.frame(
        character_col = c("a", NA_character_, "c")
        , na_col = rep(NA, 3L)
@@ -137,24 +137,26 @@ test_that("lgb.convert_with_rules() should work correctly for a dataset with mis
        expect_identical(class(input_data), class(converted_dataset))
        expect_identical(class(converted_dataset[["character_col"]]), "integer")
-        expect_identical(converted_dataset[["character_col"]], c(1L, NA_integer_, 2L))
+        expect_identical(converted_dataset[["character_col"]], c(1L, 0L, 2L))
+        # does not try to fill 0s in for already-integer columns
        expect_identical(class(converted_dataset[["integer_col"]]), "integer")
        expect_identical(converted_dataset[["integer_col"]], c(1L, 9L, NA_integer_))
+        expect_identical(class(converted_dataset[["na_int_col"]]), "integer")
+        expect_identical(converted_dataset[["na_int_col"]], rep(NA_integer_, nrow(converted_dataset)))
        expect_identical(class(converted_dataset[["factor_col"]]), "integer")
        expect_identical(converted_dataset[["factor_col"]], c(1L, 1L, 2L))
-        # NAs of any type should be converted to numeric
+        # NAs in character columns should be converted to 0
-        for (col in c("na_int_col", "na_character_col")) {
+        expect_identical(class(converted_dataset[["na_character_col"]]), "integer")
-            expect_identical(class(converted_dataset[[col]]), "integer")
+        expect_identical(converted_dataset[["na_character_col"]], rep(0L, nrow(converted_dataset)))
-            expect_identical(converted_dataset[[col]], rep(NA_integer_, nrow(converted_dataset)))
-        }
-        # today, lgb.convert_with_rules() does not convert logical columns
+        # logical should be converted to integer
-        expect_identical(class(converted_dataset[["na_col"]]), "logical")
+        expect_identical(class(converted_dataset[["na_col"]]), "integer")
+        expect_identical(converted_dataset[["na_col"]], rep(-1L, 3L))
-        # today, lgb.convert_with_rules() does not convert numeric columns to integer
+        # lgb.convert_with_rules() should not convert numeric columns to integer
        expect_identical(class(converted_dataset[["na_real_col"]]), "numeric")
        expect_identical(converted_dataset[["na_real_col"]], rep(NA_real_, nrow(converted_dataset)))
        expect_identical(class(converted_dataset[["numeric_col"]]), "numeric")
@@ -164,9 +166,9 @@ test_that("lgb.convert_with_rules() should work correctly for a dataset with mis
        rules <- conversion_result$rules
        expect_is(rules, "list")
        expect_length(rules, 3L)
-        expect_identical(rules[["character_col"]], stats::setNames(c(1L, NA_integer_, 2L), c("a", NA, "c")))
+        expect_identical(rules[["character_col"]], c("a" = 1L, "c" = 2L))
-        expect_identical(rules[["na_character_col"]], stats::setNames(NA_integer_, NA))
        expect_identical(rules[["factor_col"]], c("n" = 1L, "y" = 2L))
+        expect_identical(rules[["na_col"]], stats::setNames(c(0L, 1L), c(FALSE, TRUE)))
    }
 })
@@ -183,9 +185,11 @@ test_that("lgb.convert_with_rules() should work correctly if you provide your ow
        , stringsAsFactors = FALSE
    )
    testDT <- data.table::as.data.table(testDF)
    # value used by lgb.convert_with_rules() when it encounters a categorical value that
    # is not in the provided rules
    UNKNOWN_FACTOR_VALUE <- 0L
+    UNKNOWN_LOGICAL_VALUE <- -1L
    for (input_data in list(testDF, testDT)) {
        custom_rules <- list(
            "character_col" = c(
@@ -212,11 +216,15 @@ test_that("lgb.convert_with_rules() should work correctly if you provide your ow
        expect_identical(class(converted_dataset[["factor_col"]]), "integer")
        expect_identical(converted_dataset[["factor_col"]], c(65L, 65L, 66L, 66L, 65L, 65L))
-        # columns not specified in rules are not going to be converted
+        # columns not specified in rules are not going to be converted, unless they are all NA
-        for (col in c("na_col", "na_real_col", "na_int_col", "na_character_col", "numeric_col", "integer_col")) {
+        for (col in c("na_real_col", "na_int_col", "numeric_col", "integer_col")) {
            expect_identical(converted_dataset[[col]], input_data[[col]])
        }
+        # non-numeric/integer columns that are all NA should have been filled in
+        expect_identical(converted_dataset[["na_col"]], rep(UNKNOWN_LOGICAL_VALUE, 6L))
+        expect_identical(converted_dataset[["na_character_col"]], rep(UNKNOWN_FACTOR_VALUE, 6L))
        # the rules you passed in should be returned unchanged
        rules <- conversion_result$rules
        expect_identical(rules, custom_rules)