Unverified Commit 083b02af authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

[R-package] refactor and improvements to lgb.convert() functions (fixes #2678, #2681) (#3269)

* [R-package] improvements to lgb.convert() functions (fixes #2678, #2681)

* more stuff

* update docs

* remove lgb.convert()

* put internal functions back

* update index
parent c454d5f8
...@@ -14,7 +14,6 @@ export(lgb.Dataset.create.valid) ...@@ -14,7 +14,6 @@ export(lgb.Dataset.create.valid)
export(lgb.Dataset.save) export(lgb.Dataset.save)
export(lgb.Dataset.set.categorical) export(lgb.Dataset.set.categorical)
export(lgb.Dataset.set.reference) export(lgb.Dataset.set.reference)
export(lgb.convert)
export(lgb.convert_with_rules) export(lgb.convert_with_rules)
export(lgb.cv) export(lgb.cv)
export(lgb.dump) export(lgb.dump)
......
#' @name lgb.convert
#' @title Data preparator for LightGBM datasets (integer)
#' @description Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
#' Factors and characters are converted to integer.
#' Please use \code{\link{lgb.convert_with_rules}} if you want to apply this transformation to
#' other datasets. This is useful if you have a specific need for integer dataset instead
#' of numeric dataset.
#'
#' NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare}.
#' @param data A data.frame or data.table to prepare.
#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
#' for input in \code{lgb.Dataset}.
#'
#' @examples
#' \dontrun{
#' data(iris)
#'
#' str(iris)
#'
#' # Convert all factors/chars to integer
#' str(lgb.convert(data = iris))
#'
#' # When lightgbm package is installed, and you do not want to load it
#' # You can still use the function!
#' lgb.unloader()
#' str(lgb.convert(data = iris))
#' # 'data.frame': 150 obs. of 5 variables:
#' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#' # $ Species : int 1 1 1 1 1 1 1 1 1 1 ...
#' }
#' @export
lgb.convert <- function(data) {

    # Only data.frame and data.table inputs are supported; fail early otherwise
    is_data_table <- inherits(data, "data.table")
    if (!is_data_table && !inherits(data, "data.frame")) {
        stop(
            "lgb.convert: you provided "
            , paste(class(data), collapse = " & ")
            , " but data should have class data.frame or data.table"
        )
    }

    # Detect column types with predicates instead of comparing class() strings.
    # class() can return more than one value (for example c("ordered", "factor")
    # or c("POSIXct", "POSIXt")), which made vapply(data, class, character(1L))
    # raise an error and made ordered factors impossible to match with == "factor".
    is_char <- which(vapply(data, is.character, logical(1L)))
    is_fact <- which(vapply(data, is.factor, logical(1L)))

    if (is_data_table) {

        # data.table: update the columns by reference
        if (length(is_char) > 0L) {
            data[, (is_char) := lapply(.SD, function(x) {as.integer(as.factor(x))}), .SDcols = is_char]
        }
        if (length(is_fact) > 0L) {
            data[, (is_fact) := lapply(.SD, as.integer), .SDcols = is_fact]
        }

    } else {

        # data.frame: characters go through factor so that each distinct value
        # gets an integer code; factors are replaced by their integer codes
        if (length(is_char) > 0L) {
            data[is_char] <- lapply(data[is_char], function(x) {as.integer(as.factor(x))})
        }
        if (length(is_fact) > 0L) {
            data[is_fact] <- lapply(data[is_fact], as.integer)
        }

    }

    # Columns of any other type are returned untouched
    return(data)
}
# [description] return the class of every column in a data.table or data.frame
#               as a character vector with one element per column. When class()
#               gives several values for a column, they are joined with ","
#               so that each column is described by exactly one string.
.get_column_classes <- function(df) {
    collapse_class <- function(column) {
        paste(class(column), collapse = ",")
    }
    vapply(df, collapse_class, character(1L))
}
# [description] check a data.frame or data.table for columns whose class is
#               anything other than "numeric" or "integer", and raise a single
#               warning that lists each such column together with its class.
#               Used by lgb.convert_with_rules() to tell users that more work is
#               needed before the data can be turned into an lgb.Dataset.
#               'function_name' is only used as the prefix of the warning text.
#               Always returns NULL invisibly.
.warn_for_unconverted_columns <- function(df, function_name) {
    classes_by_column <- .get_column_classes(df)
    needs_conversion <- !(classes_by_column %in% c("numeric", "integer"))

    # nothing to report
    if (!any(needs_conversion)) {
        return(invisible(NULL))
    }

    offenders <- classes_by_column[needs_conversion]

    # e.g. "col_a (character), col_b (logical)"
    details <- paste0(names(offenders), " (", offenders, ")", collapse = ", ")

    msg <- paste0(
        function_name
        , ": "
        , length(offenders)
        , " columns are not numeric or integer. These need to be dropped or converted to "
        , "be used in an lgb.Dataset object. "
        , details
    )
    warning(msg)
    return(invisible(NULL))
}
# [description] integer used in place of missing values in logical columns
.LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA <- function() {
    -1L
}
# [description] integer used in place of missing values in non-logical columns
.LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA <- function() {
    0L
}
#' @name lgb.convert_with_rules #' @name lgb.convert_with_rules
#' @title Data preparator for LightGBM datasets with rules (integer) #' @title Data preparator for LightGBM datasets with rules (integer)
#' @description Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. #' @description Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
#' Factors and characters are converted to integer. #' Factor, character, and logical columns are converted to integer. Missing values
#' In addition, keeps rules created so you can convert other datasets using this converter. #' in factors and characters will be filled with 0L. Missing values in logicals
#' This is useful if you have a specific need for integer dataset instead of numeric dataset. #' will be filled with -1L.
#'
#' This function returns and optionally takes in "rules" that describe exactly
#' how to convert values in columns.
#'
#' Columns that contain only NA values will be converted by this function but will
#' not show up in the returned \code{rules}.
#' #'
#' NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare_rules2}. #' NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare_rules2}.
#' @param data A data.frame or data.table to prepare. #' @param data A data.frame or data.table to prepare.
#' @param rules A set of rules from the data preparator, if already used. #' @param rules A set of rules from the data preparator, if already used. This should be an R list,
#' where names are column names in \code{data} and values are named character
#' vectors whose names are column values and whose values are new values to
#' replace them with.
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). #' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
#' The data must be converted to a matrix format (\code{as.matrix}) for input in #' Note that the data must be converted to a matrix format (\code{as.matrix}) for input in
#' \code{lgb.Dataset}. #' \code{lgb.Dataset}.
#' #'
#' @examples #' @examples
...@@ -18,7 +73,7 @@ ...@@ -18,7 +73,7 @@
#' #'
#' str(iris) #' str(iris)
#' #'
#' new_iris <- lgb.convert_with_rules(data = iris) # Autoconverter #' new_iris <- lgb.convert_with_rules(data = iris)
#' str(new_iris$data) #' str(new_iris$data)
#' #'
#' data(iris) # Erase iris dataset #' data(iris) # Erase iris dataset
...@@ -54,130 +109,98 @@ ...@@ -54,130 +109,98 @@
#' @export #' @export
lgb.convert_with_rules <- function(data, rules = NULL) { lgb.convert_with_rules <- function(data, rules = NULL) {
# data.table not behaving like data.frame column_classes <- .get_column_classes(data)
if (inherits(data, "data.table")) {
# Must use existing rules
if (!is.null(rules)) {
# Loop through rules
for (i in names(rules)) {
data.table::set(data, j = i, value = unname(rules[[i]][data[[i]]])) is_char <- which(column_classes == "character")
data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer is_factor <- which(column_classes == "factor")
is_logical <- which(column_classes == "logical")
} is_data_table <- data.table::is.data.table(data)
is_data_frame <- is.data.frame(data)
} else { if (!(is_data_table || is_data_frame)) {
stop(
# Get data classes "lgb.convert_with_rules: you provided "
list_classes <- vapply(data, class, character(1L)) , paste(class(data), collapse = " & ")
, " but data should have class data.frame or data.table"
# Map characters/factors )
is_fix <- which(list_classes %in% c("character", "factor")) }
rules <- list()
# Need to create rules? # if user didn't provide rules, create them
if (length(is_fix) > 0L) { if (is.null(rules)) {
rules <- list()
columns_to_fix <- which(column_classes %in% c("character", "factor", "logical"))
# Go through all characters/factors for (i in columns_to_fix) {
for (i in is_fix) {
# Store column elsewhere col_values <- data[[i]]
mini_data <- data[[i]]
# Get unique values # Get unique values
if (is.factor(mini_data)) { if (is.factor(col_values)) {
mini_unique <- levels(mini_data) # Factor unique_vals <- levels(col_values)
mini_numeric <- seq_along(mini_unique) # Respect ordinal if needed unique_vals <- unique_vals[!is.na(unique_vals)]
} else { mini_numeric <- seq_along(unique_vals) # respect ordinal
mini_unique <- as.factor(unique(mini_data)) # Character } else if (is.character(col_values)) {
mini_numeric <- as.integer(mini_unique) # No respect of ordinality unique_vals <- as.factor(unique(col_values))
unique_vals <- unique_vals[!is.na(unique_vals)]
mini_numeric <- as.integer(unique_vals) # no respect for ordinal
} else if (is.logical(col_values)) {
unique_vals <- c(FALSE, TRUE)
mini_numeric <- c(0L, 1L)
} }
# Create rules # don't add rules for all-NA columns
indexed <- colnames(data)[i] # Index value if (length(unique_vals) > 0L) {
rules[[indexed]] <- mini_numeric # Numeric content col_name <- names(data)[i]
names(rules[[indexed]]) <- mini_unique # Character equivalent rules[[col_name]] <- mini_numeric
names(rules[[col_name]]) <- unique_vals
# Apply to real data column }
data.table::set(data, j = i, value = unname(rules[[indexed]][mini_data]))
} }
}
} }
} else { for (col_name in names(rules)) {
if (column_classes[[col_name]] == "logical") {
# Must use existing rules default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
if (!is.null(rules)) { } else {
default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
# Loop through rules
for (i in names(rules)) {
data[[i]] <- unname(rules[[i]][data[[i]]])
data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer
}
} else {
# Default routine (data.frame)
if (inherits(data, "data.frame")) {
# Get data classes
list_classes <- vapply(data, class, character(1L))
# Map characters/factors
is_fix <- which(list_classes %in% c("character", "factor"))
rules <- list()
# Need to create rules?
if (length(is_fix) > 0L) {
# Go through all characters/factors
for (i in is_fix) {
# Store column elsewhere
mini_data <- data[[i]]
# Get unique values
if (is.factor(mini_data)) {
mini_unique <- levels(mini_data) # Factor
mini_numeric <- seq_along(mini_unique) # Respect ordinal if needed
} else {
mini_unique <- as.factor(unique(mini_data)) # Character
mini_numeric <- as.integer(mini_unique) # No respect of ordinality
}
# Create rules
indexed <- colnames(data)[i] # Index value
rules[[indexed]] <- mini_numeric # Numeric content
names(rules[[indexed]]) <- mini_unique # Character equivalent
# Apply to real data column
data[[i]] <- unname(rules[[indexed]][mini_data])
}
} }
if (is_data_table) {
data.table::set(
data
, j = col_name
, value = unname(rules[[col_name]][data[[col_name]]])
)
data[is.na(get(col_name)), (col_name) := default_value_for_na]
} else {
data[[col_name]] <- unname(rules[[col_name]][data[[col_name]]])
data[is.na(data[col_name]), col_name] <- default_value_for_na
}
}
} else { # if any all-NA columns exist, they won't be in rules. Convert them
all_na_cols <- which(
stop( sapply(
"lgb.convert_with_rules: you provided " X = data
, paste(class(data), collapse = " & ") , FUN = function(x) {
, " but data should have class data.frame" (is.factor(x) || is.character(x) || is.logical(x)) && all(is.na(unique(x)))
}
) )
)
} for (col_name in all_na_cols) {
if (column_classes[[col_name]] == "logical") {
default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
} else {
default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
}
if (is_data_table) {
data[, (col_name) := rep(default_value_for_na, .N)]
} else {
data[[col_name]] <- default_value_for_na
}
} }
} .warn_for_unconverted_columns(df = data, function_name = "lgb.convert_with_rules")
return(list(data = data, rules = rules)) return(list(data = data, rules = rules))
} }
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
#' @param ... catch-all too match old calls #' @param ... catch-all too match old calls
#' @export #' @export
lgb.prepare <- function(...) { lgb.prepare <- function(...) {
stop("lgb.prepare() was removed in LightGBM 3.0.0. Please use lgb.convert()") stop("lgb.prepare() was removed in LightGBM 3.0.0. Please use lgb.convert_with_rules()")
} }
#' @title removed functions #' @title removed functions
...@@ -13,7 +13,7 @@ lgb.prepare <- function(...) { ...@@ -13,7 +13,7 @@ lgb.prepare <- function(...) {
#' @param ... catch-all too match old calls #' @param ... catch-all too match old calls
#' @export #' @export
lgb.prepare2 <- function(...) { lgb.prepare2 <- function(...) {
stop("lgb.prepare2() was removed in LightGBM 3.0.0. Please use lgb.convert()") stop("lgb.prepare2() was removed in LightGBM 3.0.0. Please use lgb.convert_with_rules()")
} }
#' @title removed functions #' @title removed functions
......
basic_walkthrough Basic feature walkthrough basic_walkthrough Basic feature walkthrough
boost_from_prediction Boosting from existing prediction boost_from_prediction Boosting from existing prediction
categorical_features_prepare Categorical Feature Preparation
categorical_features_rules Categorical Feature Preparation with Rules categorical_features_rules Categorical Feature Preparation with Rules
cross_validation Cross Validation cross_validation Cross Validation
early_stopping Early Stop in training early_stopping Early Stop in training
......
# Here we are going to try training a model with categorical features

# Load libraries
library(data.table)
library(lightgbm)

# Load data and look at the structure
#
# Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables:
# $ age : int 30 33 35 30 59 35 36 39 41 43 ...
# $ job : chr "unemployed" "services" "management" "management" ...
# $ marital : chr "married" "married" "single" "married" ...
# $ education: chr "primary" "secondary" "tertiary" "tertiary" ...
# $ default : chr "no" "no" "no" "no" ...
# $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
# $ housing : chr "no" "yes" "yes" "yes" ...
# $ loan : chr "no" "yes" "no" "yes" ...
# $ contact : chr "cellular" "cellular" "cellular" "unknown" ...
# $ day : int 19 11 16 3 5 23 14 6 14 17 ...
# $ month : chr "oct" "may" "apr" "jun" ...
# $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
# $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
# $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
# $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
# $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
# $ y : chr "no" "no" "no" "no" ...
data(bank, package = "lightgbm")
str(bank)

# We must now transform the data to fit in LightGBM
# For this task, we use lgb.convert_with_rules
# (lgb.prepare() was removed and now only raises an error)
# The function converts the character columns to integer codes
#
# Classes 'data.table' and 'data.frame': 4521 obs. of 17 variables:
# $ age : int 30 33 35 30 59 35 36 39 41 43 ...
# $ job : int 11 8 5 5 2 5 7 10 3 8 ...
# $ marital : int 2 2 3 2 2 3 2 2 2 2 ...
# $ education: int 1 2 3 3 2 3 3 2 3 1 ...
# $ default : int 1 1 1 1 1 1 1 1 1 1 ...
# $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
# $ housing : int 1 2 2 2 2 1 2 2 2 2 ...
# $ loan : int 1 2 1 2 1 1 1 1 1 2 ...
# $ contact : int 1 1 1 3 3 1 1 1 3 1 ...
# $ day : int 19 11 16 3 5 23 14 6 14 17 ...
# $ month : int 11 9 1 7 9 4 9 9 9 1 ...
# $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
# $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
# $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
# $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
# $ poutcome : int 4 1 1 4 4 1 2 4 4 1 ...
# $ y : int 1 1 1 1 1 1 1 1 1 1 ...
bank <- lgb.convert_with_rules(data = bank)$data
str(bank)

# Subtract 1 from the label because it must be between 0 and 1
bank$y <- bank$y - 1L

# Data input to LightGBM must be a matrix, without the label
my_data <- as.matrix(bank[, 1L:16L, with = FALSE])

# Creating the LightGBM dataset with categorical features
# The categorical features must be indexed like in R (1-indexed, not 0-indexed)
lgb_data <- lgb.Dataset(
    data = my_data
    , label = bank$y
    , categorical_feature = c(2L, 3L, 4L, 5L, 7L, 8L, 9L, 11L, 16L)
)

# We can now train a model
# NOTE(review): the original list set min_data twice (1L, then 0L); only the
# first entry is kept here - confirm that 1L is the intended value
params <- list(
    objective = "binary"
    , metric = "l2"
    , min_data = 1L
    , learning_rate = 0.1
    , min_hessian = 1.0
    , max_depth = 2L
)
model <- lgb.train(
    params = params
    , data = lgb_data
    , nrounds = 100L
    , valids = list(train = lgb_data)
)

# Try to find split_feature: 2
# If you find it, it means it used a categorical feature in the first tree
lgb.dump(model, num_iteration = 1L)
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lgb.convert.R
\name{lgb.convert}
\alias{lgb.convert}
\title{Data preparator for LightGBM datasets (integer)}
\usage{
lgb.convert(data)
}
\arguments{
\item{data}{A data.frame or data.table to prepare.}
}
\value{
The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
for input in \code{lgb.Dataset}.
}
\description{
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
Factors and characters are converted to integer.
Please use \code{\link{lgb.convert_with_rules}} if you want to apply this transformation to
other datasets. This is useful if you have a specific need for integer dataset instead
of numeric dataset.
NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare}.
}
\examples{
\dontrun{
data(iris)
str(iris)
# Convert all factors/chars to integer
str(lgb.convert(data = iris))
# When lightgbm package is installed, and you do not want to load it
# You can still use the function!
lgb.unloader()
str(lgb.convert(data = iris))
# 'data.frame': 150 obs. of 5 variables:
# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
# $ Species : int 1 1 1 1 1 1 1 1 1 1 ...
}
}
...@@ -9,18 +9,27 @@ lgb.convert_with_rules(data, rules = NULL) ...@@ -9,18 +9,27 @@ lgb.convert_with_rules(data, rules = NULL)
\arguments{ \arguments{
\item{data}{A data.frame or data.table to prepare.} \item{data}{A data.frame or data.table to prepare.}
\item{rules}{A set of rules from the data preparator, if already used.} \item{rules}{A set of rules from the data preparator, if already used. This should be an R list,
where names are column names in \code{data} and values are named character
vectors whose names are column values and whose values are new values to
replace them with.}
} }
\value{ \value{
A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
The data must be converted to a matrix format (\code{as.matrix}) for input in Note that the data must be converted to a matrix format (\code{as.matrix}) for input in
\code{lgb.Dataset}. \code{lgb.Dataset}.
} }
\description{ \description{
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
Factors and characters are converted to integer. Factor, character, and logical columns are converted to integer. Missing values
In addition, keeps rules created so you can convert other datasets using this converter. in factors and characters will be filled with 0L. Missing values in logicals
This is useful if you have a specific need for integer dataset instead of numeric dataset. will be filled with -1L.
This function returns and optionally takes in "rules" that describe exactly
how to convert values in columns.
Columns that contain only NA values will be converted by this function but will
not show up in the returned \code{rules}.
NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare_rules2}. NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare_rules2}.
} }
...@@ -30,7 +39,7 @@ data(iris) ...@@ -30,7 +39,7 @@ data(iris)
str(iris) str(iris)
new_iris <- lgb.convert_with_rules(data = iris) # Autoconverter new_iris <- lgb.convert_with_rules(data = iris)
str(new_iris$data) str(new_iris$data)
data(iris) # Erase iris dataset data(iris) # Erase iris dataset
......
...@@ -65,7 +65,6 @@ reference: ...@@ -65,7 +65,6 @@ reference:
- '`lgb.Dataset.save`' - '`lgb.Dataset.save`'
- '`lgb.Dataset.set.categorical`' - '`lgb.Dataset.set.categorical`'
- '`lgb.Dataset.set.reference`' - '`lgb.Dataset.set.reference`'
- '`lgb.convert`'
- '`lgb.convert_with_rules`' - '`lgb.convert_with_rules`'
- title: Machine Learning - title: Machine Learning
desc: Train models with LightGBM desc: Train models with LightGBM
......
context("lgb.convert()")
test_that("lgb.convert() rejects inputs that are not a data.table or data.frame", {
    # a selection of objects that are neither a data.frame nor a data.table
    invalid_inputs <- list(
        matrix(1.0:10.0, 2L, 5L)
        , TRUE
        , c("a", "b")
        , NA
        , 10L
        , lgb.Dataset(
            data = matrix(1.0:10.0, 2L, 5L)
            , params = list()
        )
    )
    # every one of them must trigger the type error raised by lgb.convert()
    for (candidate in invalid_inputs) {
        expect_error(
            lgb.convert(candidate)
            , regexp = "lgb.convert: you provided"
            , fixed = TRUE
        )
    }
})
test_that("lgb.convert() should work correctly for a dataset with only character columns", {
    # one column with all-distinct values, one with a repeated value
    testDF <- data.frame(
        col1 = c("a", "b", "c")
        , col2 = c("green", "green", "red")
        , stringsAsFactors = FALSE
    )
    testDT <- data.table::as.data.table(testDF)
    datasets <- list(testDF, testDT)
    for (dataset in datasets) {
        result <- lgb.convert(dataset)
        # the container type (data.frame or data.table) is preserved
        expect_identical(class(dataset), class(result))
        first_col <- result[["col1"]]
        second_col <- result[["col2"]]
        # both character columns come back as integer codes
        expect_identical(class(first_col), "integer")
        expect_identical(class(second_col), "integer")
        expect_identical(first_col, c(1L, 2L, 3L))
        expect_identical(second_col, c(1L, 1L, 2L))
    }
})
test_that("lgb.convert() should work correctly for a dataset with only factor columns", {
    # same values as the character-only case, but stored as factors
    testDF <- data.frame(
        col1 = as.factor(c("a", "b", "c"))
        , col2 = as.factor(c("green", "green", "red"))
        , stringsAsFactors = FALSE
    )
    testDT <- data.table::as.data.table(testDF)
    datasets <- list(testDF, testDT)
    for (dataset in datasets) {
        result <- lgb.convert(dataset)
        # the container type (data.frame or data.table) is preserved
        expect_identical(class(dataset), class(result))
        first_col <- result[["col1"]]
        second_col <- result[["col2"]]
        # factors are replaced by their integer codes
        expect_identical(class(first_col), "integer")
        expect_identical(class(second_col), "integer")
        expect_identical(first_col, c(1L, 2L, 3L))
        expect_identical(second_col, c(1L, 1L, 2L))
    }
})
test_that("lgb.convert() should not change a dataset with only integer columns", {
    testDF <- data.frame(
        col1 = 11L:15L
        , col2 = 16L:20L
        , stringsAsFactors = FALSE
    )
    testDT <- data.table::as.data.table(testDF)
    datasets <- list(testDF, testDT)
    for (dataset in datasets) {
        # there is nothing to convert, so the result must equal the input
        result <- lgb.convert(dataset)
        expect_identical(result, dataset)
    }
})
test_that("lgb.convert() should work correctly for a dataset with numeric, factor, and character columns", {
    testDF <- data.frame(
        character_col = c("a", "b", "c")
        , numeric_col = c(1.0, 9.0, 10.0)
        , factor_col = as.factor(c("n", "n", "y"))
        , stringsAsFactors = FALSE
    )
    testDT <- data.table::as.data.table(testDF)
    datasets <- list(testDF, testDT)
    for (dataset in datasets) {
        result <- lgb.convert(dataset)
        # the container type (data.frame or data.table) is preserved
        expect_identical(class(dataset), class(result))

        # character and factor columns become integer codes
        character_values <- result[["character_col"]]
        factor_values <- result[["factor_col"]]
        expect_identical(class(character_values), "integer")
        expect_identical(class(factor_values), "integer")
        expect_identical(character_values, c(1L, 2L, 3L))
        expect_identical(factor_values, c(1L, 1L, 2L))

        # numeric columns are currently left exactly as they were
        numeric_values <- result[["numeric_col"]]
        expect_identical(class(numeric_values), "numeric")
        expect_identical(numeric_values, c(1.0, 9.0, 10.0))
    }
})
test_that("lgb.convert() should work correctly for a dataset with missing values", {
    # mix of partially-missing columns and columns that are entirely NA
    testDF <- data.frame(
        character_col = c("a", NA_character_, "c")
        , na_col = rep(NA, 3L)
        , na_real_col = rep(NA_real_, 3L)
        , na_int_col = rep(NA_integer_, 3L)
        , na_character_col = rep(NA_character_, 3L)
        , numeric_col = c(1.0, 9.0, NA_real_)
        , factor_col = as.factor(c("n", "n", "y"))
        , integer_col = c(1L, 9L, NA_integer_)
        , stringsAsFactors = FALSE
    )
    testDT <- data.table::as.data.table(testDF)
    datasets <- list(testDF, testDT)
    for (dataset in datasets) {
        result <- lgb.convert(dataset)
        num_rows <- nrow(result)
        # the container type (data.frame or data.table) is preserved
        expect_identical(class(dataset), class(result))

        # a missing character value stays missing after conversion
        expect_identical(class(result[["character_col"]]), "integer")
        expect_identical(result[["character_col"]], c(1L, NA_integer_, 2L))

        # integer columns keep their values, including the NA
        expect_identical(class(result[["integer_col"]]), "integer")
        expect_identical(result[["integer_col"]], c(1L, 9L, NA_integer_))

        expect_identical(class(result[["factor_col"]]), "integer")
        expect_identical(result[["factor_col"]], c(1L, 1L, 2L))

        # all-NA integer and character columns come back as all-NA integer columns
        for (na_column in c("na_int_col", "na_character_col")) {
            expect_identical(class(result[[na_column]]), "integer")
            expect_identical(result[[na_column]], rep(NA_integer_, num_rows))
        }

        # today, lgb.convert() does not convert logical columns
        expect_identical(class(result[["na_col"]]), "logical")

        # today, lgb.convert() does not convert numeric columns to integer
        expect_identical(class(result[["na_real_col"]]), "numeric")
        expect_identical(result[["na_real_col"]], rep(NA_real_, num_rows))
        expect_identical(class(result[["numeric_col"]]), "numeric")
        expect_identical(result[["numeric_col"]], c(1.0, 9.0, NA_real_))
    }
})
test_that("lgb.convert() should modify data.tables in-place", {
    testDT <- data.table::data.table(
        character_col = c("a", NA_character_, "c")
        , na_col = rep(NA, 3L)
        , na_real_col = rep(NA_real_, 3L)
        , na_int_col = rep(NA_integer_, 3L)
        , na_character_col = rep(NA_character_, 3L)
        , numeric_col = c(1.0, 9.0, NA_real_)
        , factor_col = as.factor(c("n", "n", "y"))
        , integer_col = c(1L, 9L, NA_integer_)
    )
    # after the call, the input object must be identical to the returned one
    convertedDT <- lgb.convert(testDT)
    expect_identical(convertedDT, testDT)
})
...@@ -117,7 +117,7 @@ test_that("lgb.convert_with_rules() should work correctly for a dataset with num ...@@ -117,7 +117,7 @@ test_that("lgb.convert_with_rules() should work correctly for a dataset with num
} }
}) })
test_that("lgb.convert_with_rules() should work correctly for a dataset with missing values", { test_that("lgb.convert_with_rules() should convert missing values to the expected value", {
testDF <- data.frame( testDF <- data.frame(
character_col = c("a", NA_character_, "c") character_col = c("a", NA_character_, "c")
, na_col = rep(NA, 3L) , na_col = rep(NA, 3L)
...@@ -137,24 +137,26 @@ test_that("lgb.convert_with_rules() should work correctly for a dataset with mis ...@@ -137,24 +137,26 @@ test_that("lgb.convert_with_rules() should work correctly for a dataset with mis
expect_identical(class(input_data), class(converted_dataset)) expect_identical(class(input_data), class(converted_dataset))
expect_identical(class(converted_dataset[["character_col"]]), "integer") expect_identical(class(converted_dataset[["character_col"]]), "integer")
expect_identical(converted_dataset[["character_col"]], c(1L, NA_integer_, 2L)) expect_identical(converted_dataset[["character_col"]], c(1L, 0L, 2L))
# does not try to fill 0s in for already-integer columns
expect_identical(class(converted_dataset[["integer_col"]]), "integer") expect_identical(class(converted_dataset[["integer_col"]]), "integer")
expect_identical(converted_dataset[["integer_col"]], c(1L, 9L, NA_integer_)) expect_identical(converted_dataset[["integer_col"]], c(1L, 9L, NA_integer_))
expect_identical(class(converted_dataset[["na_int_col"]]), "integer")
expect_identical(converted_dataset[["na_int_col"]], rep(NA_integer_, nrow(converted_dataset)))
expect_identical(class(converted_dataset[["factor_col"]]), "integer") expect_identical(class(converted_dataset[["factor_col"]]), "integer")
expect_identical(converted_dataset[["factor_col"]], c(1L, 1L, 2L)) expect_identical(converted_dataset[["factor_col"]], c(1L, 1L, 2L))
# NAs of any type should be converted to numeric # NAs in character columns should be converted to 0
for (col in c("na_int_col", "na_character_col")) { expect_identical(class(converted_dataset[["na_character_col"]]), "integer")
expect_identical(class(converted_dataset[[col]]), "integer") expect_identical(converted_dataset[["na_character_col"]], rep(0L, nrow(converted_dataset)))
expect_identical(converted_dataset[[col]], rep(NA_integer_, nrow(converted_dataset)))
}
# today, lgb.convert_with_rules() does not convert logical columns # logical should be converted to integer
expect_identical(class(converted_dataset[["na_col"]]), "logical") expect_identical(class(converted_dataset[["na_col"]]), "integer")
expect_identical(converted_dataset[["na_col"]], rep(-1L, 3L))
# today, lgb.convert_with_rules() does not convert numeric columns to integer # lgb.convert_with_rules() should not convert numeric columns to integer
expect_identical(class(converted_dataset[["na_real_col"]]), "numeric") expect_identical(class(converted_dataset[["na_real_col"]]), "numeric")
expect_identical(converted_dataset[["na_real_col"]], rep(NA_real_, nrow(converted_dataset))) expect_identical(converted_dataset[["na_real_col"]], rep(NA_real_, nrow(converted_dataset)))
expect_identical(class(converted_dataset[["numeric_col"]]), "numeric") expect_identical(class(converted_dataset[["numeric_col"]]), "numeric")
...@@ -164,9 +166,9 @@ test_that("lgb.convert_with_rules() should work correctly for a dataset with mis ...@@ -164,9 +166,9 @@ test_that("lgb.convert_with_rules() should work correctly for a dataset with mis
rules <- conversion_result$rules rules <- conversion_result$rules
expect_is(rules, "list") expect_is(rules, "list")
expect_length(rules, 3L) expect_length(rules, 3L)
expect_identical(rules[["character_col"]], stats::setNames(c(1L, NA_integer_, 2L), c("a", NA, "c"))) expect_identical(rules[["character_col"]], c("a" = 1L, "c" = 2L))
expect_identical(rules[["na_character_col"]], stats::setNames(NA_integer_, NA))
expect_identical(rules[["factor_col"]], c("n" = 1L, "y" = 2L)) expect_identical(rules[["factor_col"]], c("n" = 1L, "y" = 2L))
expect_identical(rules[["na_col"]], stats::setNames(c(0L, 1L), c(FALSE, TRUE)))
} }
}) })
...@@ -183,9 +185,11 @@ test_that("lgb.convert_with_rules() should work correctly if you provide your ow ...@@ -183,9 +185,11 @@ test_that("lgb.convert_with_rules() should work correctly if you provide your ow
, stringsAsFactors = FALSE , stringsAsFactors = FALSE
) )
testDT <- data.table::as.data.table(testDF) testDT <- data.table::as.data.table(testDF)
# value used by lgb.convert_with_rules() when it encounters a categorical value that # value used by lgb.convert_with_rules() when it encounters a categorical value that
# is not in the provided rules # is not in the provided rules
UNKNOWN_FACTOR_VALUE <- 0L UNKNOWN_FACTOR_VALUE <- 0L
UNKNOWN_LOGICAL_VALUE <- -1L
for (input_data in list(testDF, testDT)) { for (input_data in list(testDF, testDT)) {
custom_rules <- list( custom_rules <- list(
"character_col" = c( "character_col" = c(
...@@ -212,11 +216,15 @@ test_that("lgb.convert_with_rules() should work correctly if you provide your ow ...@@ -212,11 +216,15 @@ test_that("lgb.convert_with_rules() should work correctly if you provide your ow
expect_identical(class(converted_dataset[["factor_col"]]), "integer") expect_identical(class(converted_dataset[["factor_col"]]), "integer")
expect_identical(converted_dataset[["factor_col"]], c(65L, 65L, 66L, 66L, 65L, 65L)) expect_identical(converted_dataset[["factor_col"]], c(65L, 65L, 66L, 66L, 65L, 65L))
# columns not specified in rules are not going to be converted # columns not specified in rules are not going to be converted, unless they are all NA
for (col in c("na_col", "na_real_col", "na_int_col", "na_character_col", "numeric_col", "integer_col")) { for (col in c("na_real_col", "na_int_col", "numeric_col", "integer_col")) {
expect_identical(converted_dataset[[col]], input_data[[col]]) expect_identical(converted_dataset[[col]], input_data[[col]])
} }
# non-numeric/integer columns that are all NA should have been filled in
expect_identical(converted_dataset[["na_col"]], rep(UNKNOWN_LOGICAL_VALUE, 6L))
expect_identical(converted_dataset[["na_character_col"]], rep(UNKNOWN_FACTOR_VALUE, 6L))
# the rules you passed in should be returned unchanged # the rules you passed in should be returned unchanged
rules <- conversion_result$rules rules <- conversion_result$rules
expect_identical(rules, custom_rules) expect_identical(rules, custom_rules)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment