"vscode:/vscode.git/clone" did not exist on "d0d70716ce8351ec82d1ad42d25ff3779ee5a94d"
Unverified Commit fecac8e7 authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

[R-package] deprecate lgb.prepare() and lgb.prepare2() (#3095)



* [R-package] deprecate lgb.prepare() and lgb.prepare2()

* linting

* renaming

* updated docs

* linting

* Apply suggestions from code review
Co-authored-by: default avatarNikita Titov <nekit94-08@mail.ru>

* one more comment fix

* remove comment about int being dangerous

* empty commit
Co-authored-by: default avatarNikita Titov <nekit94-08@mail.ru>
parent fed57520
......@@ -14,6 +14,8 @@ export(lgb.Dataset.create.valid)
export(lgb.Dataset.save)
export(lgb.Dataset.set.categorical)
export(lgb.Dataset.set.reference)
export(lgb.convert)
export(lgb.convert_with_rules)
export(lgb.cv)
export(lgb.dump)
export(lgb.get.eval.result)
......
#' @name lgb.prepare2
#' @name lgb.convert
#' @title Data preparator for LightGBM datasets (integer)
#' @description Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
#' Factors and characters are converted to numeric (specifically: integer).
#' Please use \code{\link{lgb.prepare_rules2}} if you want to apply this transformation to
#' Factors and characters are converted to integer.
#' Please use \code{\link{lgb.convert_with_rules}} if you want to apply this transformation to
#' other datasets. This is useful if you have a specific need for integer dataset instead
#' of numeric dataset. Note that there are programs which do not support integer-only
#' input. Consider this as a half memory technique which is dangerous, especially for LightGBM.
#' of numeric dataset.
#'
#' NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare}.
#' @param data A data.frame or data.table to prepare.
#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
#' for input in \code{lgb.Dataset}.
......@@ -16,13 +17,13 @@
#' str(iris)
#'
#' # Convert all factors/chars to integer
#' str(lgb.prepare2(data = iris))
#' str(lgb.convert(data = iris))
#'
#' \dontrun{
#' # When lightgbm package is installed, and you do not want to load it
#' # You can still use the function!
#' lgb.unloader()
#' str(lightgbm::lgb.prepare2(data = iris))
#' str(lightgbm::lgb.convert(data = iris))
#' # 'data.frame': 150 obs. of 5 variables:
#' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
......@@ -32,7 +33,7 @@
#' }
#'
#' @export
lgb.prepare2 <- function(data) {
lgb.convert <- function(data) {
# data.table not behaving like data.frame
if (inherits(data, "data.table")) {
......@@ -75,7 +76,7 @@ lgb.prepare2 <- function(data) {
} else {
stop(
"lgb.prepare2: you provided "
"lgb.convert: you provided "
, paste(class(data), collapse = " & ")
, " but data should have class data.frame or data.table"
)
......
#' @name lgb.prepare_rules2
#' @name lgb.convert_with_rules
#' @title Data preparator for LightGBM datasets with rules (integer)
#' @description Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
#' Factors and characters are converted to numeric (specifically: integer).
#' Factors and characters are converted to integer.
#' In addition, keeps rules created so you can convert other datasets using this converter.
#' This is useful if you have a specific need for integer dataset instead of numeric dataset.
#' Note that there are programs which do not support integer-only input.
#' Consider this as a half memory technique which is dangerous, especially for LightGBM.
#'
#' NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare_rules2}.
#' @param data A data.frame or data.table to prepare.
#' @param rules A set of rules from the data preparator, if already used.
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
......@@ -17,7 +17,7 @@
#'
#' str(iris)
#'
#' new_iris <- lgb.prepare_rules2(data = iris) # Autoconverter
#' new_iris <- lgb.convert_with_rules(data = iris) # Autoconverter
#' str(new_iris$data)
#'
#' data(iris) # Erase iris dataset
......@@ -25,7 +25,7 @@
#'
#' # Use conversion using known rules
#' # Unknown factors become 0, excellent for sparse datasets
#' newer_iris <- lgb.prepare_rules2(data = iris, rules = new_iris$rules)
#' newer_iris <- lgb.convert_with_rules(data = iris, rules = new_iris$rules)
#'
#' # Unknown factor is now zero, perfect for sparse datasets
#' newer_iris$data[1L, ] # Species became 0 as it is an unknown factor
......@@ -46,12 +46,12 @@
#' , "virginica" = 1L
#' )
#' )
#' newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules)
#' newest_iris <- lgb.convert_with_rules(data = iris, rules = personal_rules)
#' str(newest_iris$data) # SUCCESS!
#'
#' @importFrom data.table set
#' @export
lgb.prepare_rules2 <- function(data, rules = NULL) {
lgb.convert_with_rules <- function(data, rules = NULL) {
# data.table not behaving like data.frame
if (inherits(data, "data.table")) {
......@@ -166,7 +166,7 @@ lgb.prepare_rules2 <- function(data, rules = NULL) {
} else {
stop(
"lgb.prepare_rules2: you provided "
"lgb.convert_with_rules: you provided "
, paste(class(data), collapse = " & ")
, " but data should have class data.frame"
)
......
#' @name lgb.prepare
#' @title Data preparator for LightGBM datasets (numeric)
#' @description Attempts to prepare a clean dataset that can be put in a \code{lgb.Dataset}.
#' Factors and characters are converted to numeric (double, not integer) codes. Please use
#' \code{\link{lgb.prepare_rules}} if you want to apply this transformation to other datasets.
#' @param data A data.frame or data.table to prepare.
#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
#' for input in \code{lgb.Dataset}.
#'
#' @examples
#' data(iris)
#'
#' str(iris)
#'
#' str(lgb.prepare(data = iris)) # Convert all factors/chars to numeric
#'
#' \dontrun{
#' # When lightgbm package is installed, and you do not want to load it
#' # You can still use the function!
#' lgb.unloader()
#' str(lightgbm::lgb.prepare(data = iris))
#' # 'data.frame': 150 obs. of 5 variables:
#' # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#' # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#' # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#' # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#' # $ Species : num 1 1 1 1 1 1 1 1 1 1 ...
#' }
#'
#' @export
lgb.prepare <- function(data) {

  # A data.table also inherits from data.frame, so a single check is enough to
  # reject every unsupported input before any work is done
  if (!inherits(data, "data.frame")) {
    stop(
      "lgb.prepare: you provided "
      , paste(class(data), collapse = " & ")
      , " but data should have class data.frame or data.table"
    )
  }

  # Detect columns with type predicates rather than by comparing class() strings:
  # class() can return several values for one column (e.g. ordered factors),
  # which made the previous sapply()-based detection unreliable
  is_char <- vapply(data, is.character, logical(1L))
  is_fact <- vapply(data, is.factor, logical(1L))
  to_convert <- which(is_char | is_fact)

  # Nothing to convert: hand the input back untouched
  if (length(to_convert) == 0L) {
    return(data)
  }

  # as.factor() returns factors unchanged, so the same converter yields the level
  # codes of a factor and the (sorted-level) codes of a character vector
  to_numeric <- function(x) {
    as.numeric(as.factor(x))
  }

  if (inherits(data, "data.table")) {
    # data.table not behaving like data.frame: columns are replaced by reference
    data[, (to_convert) := lapply(.SD, to_numeric), .SDcols = to_convert]
  } else {
    # Default routine (data.frame)
    data[to_convert] <- lapply(data[to_convert], to_numeric)
  }

  return(data)
}
#' @name lgb.prepare_rules
#' @title Data preparator for LightGBM datasets with rules (numeric)
#' @description Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
#' Factors and characters are converted to numeric. In addition, keeps rules created
#' so you can convert other datasets using this converter.
#' @param data A data.frame or data.table to prepare.
#' @param rules A set of rules from the data preparator, if already used.
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
#' The data must be converted to a matrix format (\code{as.matrix}) for input
#' in \code{lgb.Dataset}.
#'
#' @examples
#' data(iris)
#'
#' str(iris)
#'
#' new_iris <- lgb.prepare_rules(data = iris) # Autoconverter
#' str(new_iris$data)
#'
#' data(iris) # Erase iris dataset
#' iris$Species[1L] <- "NEW FACTOR" # Introduce junk factor (NA)
#'
#' # Use conversion using known rules
#' # Unknown factors become 0, excellent for sparse datasets
#' newer_iris <- lgb.prepare_rules(data = iris, rules = new_iris$rules)
#'
#' # Unknown factor is now zero, perfect for sparse datasets
#' newer_iris$data[1L, ] # Species became 0 as it is an unknown factor
#'
#' newer_iris$data[1L, 5L] <- 1.0 # Put back real initial value
#'
#' # Is the newly created dataset equal? YES!
#' all.equal(new_iris$data, newer_iris$data)
#'
#' # Can we test our own rules?
#' data(iris) # Erase iris dataset
#'
#' # We remapped values differently
#' personal_rules <- list(
#' Species = c(
#' "setosa" = 3L
#' , "versicolor" = 2L
#' , "virginica" = 1L
#' )
#' )
#' newest_iris <- lgb.prepare_rules(data = iris, rules = personal_rules)
#' str(newest_iris$data) # SUCCESS!
#'
#' @importFrom data.table set
#' @export
lgb.prepare_rules <- function(data, rules = NULL) {

  # Build the lookup vector (values = numeric codes, names = original labels)
  # for one character or factor column
  build_rule <- function(column) {
    if (is.factor(column)) {
      # Factor: codes follow the level order, which respects ordinality
      labels <- levels(column)
      codes <- as.numeric(seq_along(labels))
    } else {
      # Character: codes follow the sorted unique values, no ordinality
      uniques <- as.factor(unique(column))
      labels <- as.character(uniques)
      codes <- as.numeric(uniques)
    }
    names(codes) <- labels
    return(codes)
  }

  # Translate one column with an existing rule; anything unknown becomes 0
  apply_rule <- function(column, rule) {
    if (is.factor(column) && !is.null(names(rule))) {
      # Indexing with a factor would use its integer codes, not its labels,
      # giving wrong results whenever the level order differs from the order
      # of the rule names. Look the labels up explicitly instead.
      mapped <- rule[match(as.character(column), names(rule))]
    } else {
      mapped <- rule[column]
    }
    mapped <- unname(mapped)
    mapped[is.na(mapped)] <- 0L # Overwrite NAs by 0s
    return(mapped)
  }

  # data.table not behaving like data.frame: its columns are set by reference
  is_data_table <- inherits(data, "data.table")

  # Must use existing rules
  if (!is.null(rules)) {
    for (col_name in names(rules)) {
      converted <- apply_rule(data[[col_name]], rules[[col_name]])
      if (is_data_table) {
        # Write the final values once so the table modified by reference and
        # the returned table hold the same content
        data.table::set(data, j = col_name, value = converted)
      } else {
        data[[col_name]] <- converted
      }
    }
    return(list(data = data, rules = rules))
  }

  # Rules have to be created: only data.frame-like inputs are supported
  if (!inherits(data, "data.frame")) {
    stop(
      "lgb.prepare_rules: you provided "
      , paste(class(data), collapse = " & ")
      , " but data should have class data.frame"
    )
  }

  # Map characters/factors. Type predicates are used because class() may return
  # several values for one column (ordered factors, date-times, ...), which
  # vapply(data, class, character(1L)) cannot handle
  is_fix <- which(
    vapply(
      data
      , function(x) {
        is.character(x) || is.factor(x)
      }
      , logical(1L)
    )
  )
  rules <- list()

  # Go through all characters/factors
  for (j in is_fix) {
    column <- data[[j]]
    col_name <- colnames(data)[j]

    # Create rules
    rules[[col_name]] <- build_rule(column)

    # Apply to real data column: characters are looked up by name, factors by
    # level code (both agree with the rule that was just built)
    converted <- unname(rules[[col_name]][column])
    if (is_data_table) {
      data.table::set(data, j = j, value = converted)
    } else {
      data[[j]] <- converted
    }
  }

  return(list(data = data, rules = rules))
}
#' @title removed functions
#' @name lgb.prepare
#' @description Placeholder for a removed function. Calling it always raises an
#'              error that names the replacement, \code{lgb.convert()}.
#' @param ... catch-all to match old calls
#' @export
lgb.prepare <- function(...) {
  # The arguments are accepted only so that old call sites reach this message
  removal_notice <- "lgb.prepare() was removed in LightGBM 3.0.0. Please use lgb.convert()"
  stop(removal_notice)
}
#' @title removed functions
#' @name lgb.prepare2
#' @description Placeholder for a removed function. Calling it always raises an
#'              error that names the replacement, \code{lgb.convert()}.
#' @param ... catch-all to match old calls
#' @export
lgb.prepare2 <- function(...) {
  # The arguments are accepted only so that old call sites reach this message
  removal_notice <- "lgb.prepare2() was removed in LightGBM 3.0.0. Please use lgb.convert()"
  stop(removal_notice)
}
#' @title removed functions
#' @name lgb.prepare_rules
#' @description Placeholder for a removed function. Calling it always raises an
#'              error that names the replacement, \code{lgb.convert_with_rules()}.
#' @param ... catch-all to match old calls
#' @export
lgb.prepare_rules <- function(...) {
  # The arguments are accepted only so that old call sites reach this message
  removal_notice <- "lgb.prepare_rules() was removed in LightGBM 3.0.0. Please use lgb.convert_with_rules()"
  stop(removal_notice)
}
#' @title removed functions
#' @name lgb.prepare_rules2
#' @description Placeholder for a removed function. Calling it always raises an
#'              error that names the replacement, \code{lgb.convert_with_rules()}.
#' @param ... catch-all to match old calls
#' @export
lgb.prepare_rules2 <- function(...) {
  # The arguments are accepted only so that old call sites reach this message
  removal_notice <- "lgb.prepare_rules2() was removed in LightGBM 3.0.0. Please use lgb.convert_with_rules()"
  stop(removal_notice)
}
......@@ -32,7 +32,7 @@ bank_train <- bank[1L:4000L, ]
bank_test <- bank[4001L:4521L, ]
# We must now transform the data to fit in LightGBM
# For this task, we use lgb.prepare
# For this task, we use lgb.convert_with_rules
# The function transforms the data into a fittable data
#
# Classes 'data.table' and 'data.frame': 521 obs. of 17 variables:
......@@ -53,9 +53,9 @@ bank_test <- bank[4001L:4521L, ]
# $ previous : int 1 0 0 0 0 2 0 0 0 1 ...
# $ poutcome : num 1 4 4 4 4 1 4 4 4 3 ...
# $ y : num 1 1 1 1 1 1 1 1 1 2 ...
bank_rules <- lgb.prepare_rules(data = bank_train)
bank_rules <- lgb.convert_with_rules(data = bank_train)
bank_train <- bank_rules$data
bank_test <- lgb.prepare_rules(data = bank_test, rules = bank_rules$rules)$data
bank_test <- lgb.convert_with_rules(data = bank_test, rules = bank_rules$rules)$data
str(bank_test)
# Remove 1 to label because it must be between 0 and 1
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lgb.convert.R
\name{lgb.convert}
\alias{lgb.convert}
\title{Data preparator for LightGBM datasets (integer)}
\usage{
lgb.convert(data)
}
\arguments{
\item{data}{A data.frame or data.table to prepare.}
}
\value{
The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
for input in \code{lgb.Dataset}.
}
\description{
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
Factors and characters are converted to integer.
Please use \code{\link{lgb.convert_with_rules}} if you want to apply this transformation to
other datasets. This is useful if you have a specific need for integer dataset instead
of numeric dataset.
NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare}.
}
\examples{
data(iris)
str(iris)
# Convert all factors/chars to integer
str(lgb.convert(data = iris))
\dontrun{
# When lightgbm package is installed, and you do not want to load it
# You can still use the function!
lgb.unloader()
str(lightgbm::lgb.convert(data = iris))
# 'data.frame': 150 obs. of 5 variables:
# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
# $ Species : int 1 1 1 1 1 1 1 1 1 1 ...
}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lgb.convert_with_rules.R
\name{lgb.convert_with_rules}
\alias{lgb.convert_with_rules}
\title{Data preparator for LightGBM datasets with rules (integer)}
\usage{
lgb.convert_with_rules(data, rules = NULL)
}
\arguments{
\item{data}{A data.frame or data.table to prepare.}
\item{rules}{A set of rules from the data preparator, if already used.}
}
\value{
A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
The data must be converted to a matrix format (\code{as.matrix}) for input in
\code{lgb.Dataset}.
}
\description{
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
Factors and characters are converted to integer.
In addition, keeps rules created so you can convert other datasets using this converter.
This is useful if you have a specific need for integer dataset instead of numeric dataset.
NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare_rules2}.
}
\examples{
data(iris)
str(iris)
new_iris <- lgb.convert_with_rules(data = iris) # Autoconverter
str(new_iris$data)
data(iris) # Erase iris dataset
iris$Species[1L] <- "NEW FACTOR" # Introduce junk factor (NA)
# Use conversion using known rules
# Unknown factors become 0, excellent for sparse datasets
newer_iris <- lgb.convert_with_rules(data = iris, rules = new_iris$rules)
# Unknown factor is now zero, perfect for sparse datasets
newer_iris$data[1L, ] # Species became 0 as it is an unknown factor
newer_iris$data[1L, 5L] <- 1.0 # Put back real initial value
# Is the newly created dataset equal? YES!
all.equal(new_iris$data, newer_iris$data)
# Can we test our own rules?
data(iris) # Erase iris dataset
# We remapped values differently
personal_rules <- list(
Species = c(
"setosa" = 3L
, "versicolor" = 2L
, "virginica" = 1L
)
)
newest_iris <- lgb.convert_with_rules(data = iris, rules = personal_rules)
str(newest_iris$data) # SUCCESS!
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lgb.prepare.R
% Please edit documentation in R/removed.R
\name{lgb.prepare}
\alias{lgb.prepare}
\title{Data preparator for LightGBM datasets (numeric)}
\title{removed functions}
\usage{
lgb.prepare(data)
lgb.prepare(...)
}
\arguments{
\item{data}{A data.frame or data.table to prepare.}
}
\value{
The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
for input in \code{lgb.Dataset}.
\item{...}{catch-all to match old calls}
}
\description{
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
Factors and characters are converted to numeric without integers. Please use
\code{\link{lgb.prepare_rules}} if you want to apply this transformation to other datasets.
}
\examples{
data(iris)
str(iris)
str(lgb.prepare(data = iris)) # Convert all factors/chars to numeric
\dontrun{
# When lightgbm package is installed, and you do not want to load it
# You can still use the function!
lgb.unloader()
str(lightgbm::lgb.prepare(data = iris))
# 'data.frame': 150 obs. of 5 variables:
# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
# $ Species : num 1 1 1 1 1 1 1 1 1 1 ...
}
removed functions
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lgb.prepare2.R
% Please edit documentation in R/removed.R
\name{lgb.prepare2}
\alias{lgb.prepare2}
\title{Data preparator for LightGBM datasets (integer)}
\title{removed functions}
\usage{
lgb.prepare2(data)
lgb.prepare2(...)
}
\arguments{
\item{data}{A data.frame or data.table to prepare.}
}
\value{
The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
for input in \code{lgb.Dataset}.
\item{...}{catch-all to match old calls}
}
\description{
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
Factors and characters are converted to numeric (specifically: integer).
Please use \code{\link{lgb.prepare_rules2}} if you want to apply this transformation to
other datasets. This is useful if you have a specific need for integer dataset instead
of numeric dataset. Note that there are programs which do not support integer-only
input. Consider this as a half memory technique which is dangerous, especially for LightGBM.
}
\examples{
data(iris)
str(iris)
# Convert all factors/chars to integer
str(lgb.prepare2(data = iris))
\dontrun{
# When lightgbm package is installed, and you do not want to load it
# You can still use the function!
lgb.unloader()
str(lightgbm::lgb.prepare2(data = iris))
# 'data.frame': 150 obs. of 5 variables:
# $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
# $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
# $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
# $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
# $ Species : int 1 1 1 1 1 1 1 1 1 1 ...
}
removed functions
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lgb.prepare_rules.R
% Please edit documentation in R/removed.R
\name{lgb.prepare_rules}
\alias{lgb.prepare_rules}
\title{Data preparator for LightGBM datasets with rules (numeric)}
\title{removed functions}
\usage{
lgb.prepare_rules(data, rules = NULL)
lgb.prepare_rules(...)
}
\arguments{
\item{data}{A data.frame or data.table to prepare.}
\item{rules}{A set of rules from the data preparator, if already used.}
}
\value{
A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
The data must be converted to a matrix format (\code{as.matrix}) for input
in \code{lgb.Dataset}.
\item{...}{catch-all to match old calls}
}
\description{
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
Factors and characters are converted to numeric. In addition, keeps rules created
so you can convert other datasets using this converter.
}
\examples{
data(iris)
str(iris)
new_iris <- lgb.prepare_rules(data = iris) # Autoconverter
str(new_iris$data)
data(iris) # Erase iris dataset
iris$Species[1L] <- "NEW FACTOR" # Introduce junk factor (NA)
# Use conversion using known rules
# Unknown factors become 0, excellent for sparse datasets
newer_iris <- lgb.prepare_rules(data = iris, rules = new_iris$rules)
# Unknown factor is now zero, perfect for sparse datasets
newer_iris$data[1L, ] # Species became 0 as it is an unknown factor
newer_iris$data[1L, 5L] <- 1.0 # Put back real initial value
# Is the newly created dataset equal? YES!
all.equal(new_iris$data, newer_iris$data)
# Can we test our own rules?
data(iris) # Erase iris dataset
# We remapped values differently
personal_rules <- list(
Species = c(
"setosa" = 3L
, "versicolor" = 2L
, "virginica" = 1L
)
)
newest_iris <- lgb.prepare_rules(data = iris, rules = personal_rules)
str(newest_iris$data) # SUCCESS!
removed functions
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lgb.prepare_rules2.R
% Please edit documentation in R/removed.R
\name{lgb.prepare_rules2}
\alias{lgb.prepare_rules2}
\title{Data preparator for LightGBM datasets with rules (integer)}
\title{removed functions}
\usage{
lgb.prepare_rules2(data, rules = NULL)
lgb.prepare_rules2(...)
}
\arguments{
\item{data}{A data.frame or data.table to prepare.}
\item{rules}{A set of rules from the data preparator, if already used.}
}
\value{
A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
The data must be converted to a matrix format (\code{as.matrix}) for input in
\code{lgb.Dataset}.
\item{...}{catch-all to match old calls}
}
\description{
Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
Factors and characters are converted to numeric (specifically: integer).
In addition, keeps rules created so you can convert other datasets using this converter.
This is useful if you have a specific need for integer dataset instead of numeric dataset.
Note that there are programs which do not support integer-only input.
Consider this as a half memory technique which is dangerous, especially for LightGBM.
}
\examples{
data(iris)
str(iris)
new_iris <- lgb.prepare_rules2(data = iris) # Autoconverter
str(new_iris$data)
data(iris) # Erase iris dataset
iris$Species[1L] <- "NEW FACTOR" # Introduce junk factor (NA)
# Use conversion using known rules
# Unknown factors become 0, excellent for sparse datasets
newer_iris <- lgb.prepare_rules2(data = iris, rules = new_iris$rules)
# Unknown factor is now zero, perfect for sparse datasets
newer_iris$data[1L, ] # Species became 0 as it is an unknown factor
newer_iris$data[1L, 5L] <- 1.0 # Put back real initial value
# Is the newly created dataset equal? YES!
all.equal(new_iris$data, newer_iris$data)
# Can we test our own rules?
data(iris) # Erase iris dataset
# We remapped values differently
personal_rules <- list(
Species = c(
"setosa" = 3L
, "versicolor" = 2L
, "virginica" = 1L
)
)
newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules)
str(newest_iris$data) # SUCCESS!
removed functions
}
......@@ -65,13 +65,11 @@ reference:
- '`lgb.Dataset.save`'
- '`lgb.Dataset.set.categorical`'
- '`lgb.Dataset.set.reference`'
- '`lgb.convert`'
- '`lgb.convert_with_rules`'
- title: Machine Learning
desc: Train models with LightGBM
contents:
- '`lgb.prepare`'
- '`lgb.prepare2`'
- '`lgb.prepare_rules`'
- '`lgb.prepare_rules2`'
- '`lightgbm`'
- '`lgb.train`'
- '`lgb.cv`'
......
context("lgb.prepare2()")
context("lgb.convert()")
test_that("lgb.prepare2() rejects inputs that are not a data.table or data.frame", {
test_that("lgb.convert() rejects inputs that are not a data.table or data.frame", {
bad_inputs <- list(
matrix(1.0:10.0, 2L, 5L)
, TRUE
......@@ -14,12 +14,12 @@ test_that("lgb.prepare2() rejects inputs that are not a data.table or data.frame
)
for (bad_input in bad_inputs) {
expect_error({
converted_dataset <- lgb.prepare2(bad_input)
}, regexp = "lgb.prepare2: you provided", fixed = TRUE)
converted_dataset <- lgb.convert(bad_input)
}, regexp = "lgb.convert: you provided", fixed = TRUE)
}
})
test_that("lgb.prepare2() should work correctly for a dataset with only character columns", {
test_that("lgb.convert() should work correctly for a dataset with only character columns", {
testDF <- data.frame(
col1 = c("a", "b", "c")
, col2 = c("green", "green", "red")
......@@ -27,7 +27,7 @@ test_that("lgb.prepare2() should work correctly for a dataset with only characte
)
testDT <- data.table::as.data.table(testDF)
for (input_data in list(testDF, testDT)) {
converted_dataset <- lgb.prepare2(input_data)
converted_dataset <- lgb.convert(input_data)
expect_identical(class(input_data), class(converted_dataset))
expect_identical(class(converted_dataset[["col1"]]), "integer")
expect_identical(class(converted_dataset[["col2"]]), "integer")
......@@ -36,7 +36,7 @@ test_that("lgb.prepare2() should work correctly for a dataset with only characte
}
})
test_that("lgb.prepare2() should work correctly for a dataset with only factor columns", {
test_that("lgb.convert() should work correctly for a dataset with only factor columns", {
testDF <- data.frame(
col1 = as.factor(c("a", "b", "c"))
, col2 = as.factor(c("green", "green", "red"))
......@@ -44,7 +44,7 @@ test_that("lgb.prepare2() should work correctly for a dataset with only factor c
)
testDT <- data.table::as.data.table(testDF)
for (input_data in list(testDF, testDT)) {
converted_dataset <- lgb.prepare2(input_data)
converted_dataset <- lgb.convert(input_data)
expect_identical(class(input_data), class(converted_dataset))
expect_identical(class(converted_dataset[["col1"]]), "integer")
expect_identical(class(converted_dataset[["col2"]]), "integer")
......@@ -53,7 +53,7 @@ test_that("lgb.prepare2() should work correctly for a dataset with only factor c
}
})
test_that("lgb.prepare2() should not change a dataset with only integer columns", {
test_that("lgb.convert() should not change a dataset with only integer columns", {
testDF <- data.frame(
col1 = 11L:15L
, col2 = 16L:20L
......@@ -61,12 +61,12 @@ test_that("lgb.prepare2() should not change a dataset with only integer columns"
)
testDT <- data.table::as.data.table(testDF)
for (input_data in list(testDF, testDT)) {
converted_dataset <- lgb.prepare2(input_data)
converted_dataset <- lgb.convert(input_data)
expect_identical(converted_dataset, input_data)
}
})
test_that("lgb.prepare2() should work correctly for a dataset with numeric, factor, and character columns", {
test_that("lgb.convert() should work correctly for a dataset with numeric, factor, and character columns", {
testDF <- data.frame(
character_col = c("a", "b", "c")
, numeric_col = c(1.0, 9.0, 10.0)
......@@ -75,20 +75,20 @@ test_that("lgb.prepare2() should work correctly for a dataset with numeric, fact
)
testDT <- data.table::as.data.table(testDF)
for (input_data in list(testDF, testDT)) {
converted_dataset <- lgb.prepare2(input_data)
converted_dataset <- lgb.convert(input_data)
expect_identical(class(input_data), class(converted_dataset))
expect_identical(class(converted_dataset[["character_col"]]), "integer")
expect_identical(class(converted_dataset[["factor_col"]]), "integer")
expect_identical(converted_dataset[["character_col"]], c(1L, 2L, 3L))
expect_identical(converted_dataset[["factor_col"]], c(1L, 1L, 2L))
# today, lgb.prepare2() does not convert numeric columns
# today, lgb.convert() does not convert numeric columns
expect_identical(class(converted_dataset[["numeric_col"]]), "numeric")
expect_identical(converted_dataset[["numeric_col"]], c(1.0, 9.0, 10.0))
}
})
test_that("lgb.prepare2() should work correctly for a dataset with missing values", {
test_that("lgb.convert() should work correctly for a dataset with missing values", {
testDF <- data.frame(
character_col = c("a", NA_character_, "c")
, na_col = rep(NA, 3L)
......@@ -102,7 +102,7 @@ test_that("lgb.prepare2() should work correctly for a dataset with missing value
)
testDT <- data.table::as.data.table(testDF)
for (input_data in list(testDF, testDT)) {
converted_dataset <- lgb.prepare2(input_data)
converted_dataset <- lgb.convert(input_data)
expect_identical(class(input_data), class(converted_dataset))
expect_identical(class(converted_dataset[["character_col"]]), "integer")
......@@ -120,10 +120,10 @@ test_that("lgb.prepare2() should work correctly for a dataset with missing value
expect_identical(converted_dataset[[col]], rep(NA_integer_, nrow(converted_dataset)))
}
# today, lgb.prepare2() does not convert logical columns
# today, lgb.convert() does not convert logical columns
expect_identical(class(converted_dataset[["na_col"]]), "logical")
# today, lgb.prepare2() does not convert numeric columns to integer
# today, lgb.convert() does not convert numeric columns to integer
expect_identical(class(converted_dataset[["na_real_col"]]), "numeric")
expect_identical(converted_dataset[["na_real_col"]], rep(NA_real_, nrow(converted_dataset)))
expect_identical(class(converted_dataset[["numeric_col"]]), "numeric")
......@@ -131,7 +131,7 @@ test_that("lgb.prepare2() should work correctly for a dataset with missing value
}
})
test_that("lgb.prepare2() should modify data.tables in-place", {
test_that("lgb.convert() should modify data.tables in-place", {
testDT <- data.table::data.table(
character_col = c("a", NA_character_, "c")
, na_col = rep(NA, 3L)
......@@ -142,6 +142,6 @@ test_that("lgb.prepare2() should modify data.tables in-place", {
, factor_col = as.factor(c("n", "n", "y"))
, integer_col = c(1L, 9L, NA_integer_)
)
resultDT <- lgb.prepare2(testDT)
resultDT <- lgb.convert(testDT)
expect_identical(resultDT, testDT)
})
context("lgb.prepare_rules2()")
context("lgb.convert_with_rules()")
test_that("lgb.prepare_rules2() rejects inputs that are not a data.table or data.frame", {
test_that("lgb.convert_with_rules() rejects inputs that are not a data.table or data.frame", {
bad_inputs <- list(
matrix(1.0:10.0, 2L, 5L)
, TRUE
......@@ -14,12 +14,12 @@ test_that("lgb.prepare_rules2() rejects inputs that are not a data.table or data
)
for (bad_input in bad_inputs) {
expect_error({
conversion_result <- lgb.prepare_rules2(bad_input)
}, regexp = "lgb.prepare_rules2: you provided", fixed = TRUE)
conversion_result <- lgb.convert_with_rules(bad_input)
}, regexp = "lgb.convert_with_rules: you provided", fixed = TRUE)
}
})
test_that("lgb.prepare_rules2() should work correctly for a dataset with only character columns", {
test_that("lgb.convert_with_rules() should work correctly for a dataset with only character columns", {
testDF <- data.frame(
col1 = c("a", "b", "c")
, col2 = c("green", "green", "red")
......@@ -27,7 +27,7 @@ test_that("lgb.prepare_rules2() should work correctly for a dataset with only ch
)
testDT <- data.table::as.data.table(testDF)
for (input_data in list(testDF, testDT)) {
conversion_result <- lgb.prepare_rules2(input_data)
conversion_result <- lgb.convert_with_rules(input_data)
# dataset should have been converted to integer
converted_dataset <- conversion_result[["data"]]
expect_identical(class(input_data), class(converted_dataset))
......@@ -44,7 +44,7 @@ test_that("lgb.prepare_rules2() should work correctly for a dataset with only ch
}
})
test_that("lgb.prepare_rules2() should work correctly for a dataset with only factor columns", {
test_that("lgb.convert_with_rules() should work correctly for a dataset with only factor columns", {
testDF <- data.frame(
col1 = as.factor(c("a", "b", "c"))
, col2 = as.factor(c("green", "green", "red"))
......@@ -52,7 +52,7 @@ test_that("lgb.prepare_rules2() should work correctly for a dataset with only fa
)
testDT <- data.table::as.data.table(testDF)
for (input_data in list(testDF, testDT)) {
conversion_result <- lgb.prepare_rules2(input_data)
conversion_result <- lgb.convert_with_rules(input_data)
# dataset should have been converted to integer
converted_dataset <- conversion_result[["data"]]
expect_identical(class(input_data), class(converted_dataset))
......@@ -69,7 +69,7 @@ test_that("lgb.prepare_rules2() should work correctly for a dataset with only fa
}
})
test_that("lgb.prepare_rules2() should not change a dataset with only integer columns", {
test_that("lgb.convert_with_rules() should not change a dataset with only integer columns", {
testDF <- data.frame(
col1 = 11L:15L
, col2 = 16L:20L
......@@ -77,7 +77,7 @@ test_that("lgb.prepare_rules2() should not change a dataset with only integer co
)
testDT <- data.table::as.data.table(testDF)
for (input_data in list(testDF, testDT)) {
conversion_result <- lgb.prepare_rules2(input_data)
conversion_result <- lgb.convert_with_rules(input_data)
# dataset should have been converted to integer
converted_dataset <- conversion_result[["data"]]
expect_identical(converted_dataset, input_data)
......@@ -87,7 +87,7 @@ test_that("lgb.prepare_rules2() should not change a dataset with only integer co
}
})
test_that("lgb.prepare_rules2() should work correctly for a dataset with numeric, factor, and character columns", {
test_that("lgb.convert_with_rules() should work correctly for a dataset with numeric, factor, and character columns", {
testDF <- data.frame(
character_col = c("a", "b", "c")
, numeric_col = c(1.0, 9.0, 10.0)
......@@ -96,7 +96,7 @@ test_that("lgb.prepare_rules2() should work correctly for a dataset with numeric
)
testDT <- data.table::as.data.table(testDF)
for (input_data in list(testDF, testDT)) {
conversion_result <- lgb.prepare_rules2(input_data)
conversion_result <- lgb.convert_with_rules(input_data)
# dataset should have been converted to numeric
converted_dataset <- conversion_result[["data"]]
expect_identical(class(input_data), class(converted_dataset))
......@@ -111,13 +111,13 @@ test_that("lgb.prepare_rules2() should work correctly for a dataset with numeric
expect_identical(rules[["character_col"]], c("a" = 1L, "b" = 2L, "c" = 3L))
expect_identical(rules[["factor_col"]], c("n" = 1L, "y" = 2L))
# today, lgb.prepare2() does not convert numeric columns
# today, lgb.convert_with_rules() does not convert numeric columns
expect_identical(class(converted_dataset[["numeric_col"]]), "numeric")
expect_identical(converted_dataset[["numeric_col"]], c(1.0, 9.0, 10.0))
}
})
test_that("lgb.prepare_rules2() should work correctly for a dataset with missing values", {
test_that("lgb.convert_with_rules() should work correctly for a dataset with missing values", {
testDF <- data.frame(
character_col = c("a", NA_character_, "c")
, na_col = rep(NA, 3L)
......@@ -131,7 +131,7 @@ test_that("lgb.prepare_rules2() should work correctly for a dataset with missing
)
testDT <- data.table::as.data.table(testDF)
for (input_data in list(testDF, testDT)) {
conversion_result <- lgb.prepare_rules2(input_data)
conversion_result <- lgb.convert_with_rules(input_data)
# dataset should have been converted to integer
converted_dataset <- conversion_result[["data"]]
expect_identical(class(input_data), class(converted_dataset))
......@@ -151,10 +151,10 @@ test_that("lgb.prepare_rules2() should work correctly for a dataset with missing
expect_identical(converted_dataset[[col]], rep(NA_integer_, nrow(converted_dataset)))
}
# today, lgb.prepare2() does not convert logical columns
# today, lgb.convert_with_rules() does not convert logical columns
expect_identical(class(converted_dataset[["na_col"]]), "logical")
# today, lgb.prepare2() does not convert numeric columns to integer
# today, lgb.convert_with_rules() does not convert numeric columns to integer
expect_identical(class(converted_dataset[["na_real_col"]]), "numeric")
expect_identical(converted_dataset[["na_real_col"]], rep(NA_real_, nrow(converted_dataset)))
expect_identical(class(converted_dataset[["numeric_col"]]), "numeric")
......@@ -170,7 +170,7 @@ test_that("lgb.prepare_rules2() should work correctly for a dataset with missing
}
})
test_that("lgb.prepare_rules2() should work correctly if you provide your own well-formed rules", {
test_that("lgb.convert_with_rules() should work correctly if you provide your own well-formed rules", {
testDF <- data.frame(
character_col = c("a", NA_character_, "c", "a", "a", "c")
, na_col = rep(NA, 6L)
......@@ -183,7 +183,7 @@ test_that("lgb.prepare_rules2() should work correctly if you provide your own we
, stringsAsFactors = FALSE
)
testDT <- data.table::as.data.table(testDF)
# value used by lgb.prepare_rules2() when it encounters a categorical value that
# value used by lgb.convert_with_rules() when it encounters a categorical value that
# is not in the provided rules
UNKNOWN_FACTOR_VALUE <- 0L
for (input_data in list(testDF, testDT)) {
......@@ -197,7 +197,7 @@ test_that("lgb.prepare_rules2() should work correctly if you provide your own we
, "y" = 66L
)
)
conversion_result <- lgb.prepare_rules2(
conversion_result <- lgb.convert_with_rules(
data = input_data
, rules = custom_rules
)
......@@ -223,7 +223,7 @@ test_that("lgb.prepare_rules2() should work correctly if you provide your own we
}
})
test_that("lgb.prepare_rules2() should modify data.tables in-place", {
test_that("lgb.convert_with_rules() should modify data.tables in-place", {
testDT <- data.table::data.table(
character_col = c("a", NA_character_, "c")
, na_col = rep(NA, 3L)
......@@ -234,7 +234,7 @@ test_that("lgb.prepare_rules2() should modify data.tables in-place", {
, factor_col = as.factor(c("n", "n", "y"))
, integer_col = c(1L, 9L, NA_integer_)
)
conversion_result <- lgb.prepare_rules2(testDT)
conversion_result <- lgb.convert_with_rules(testDT)
resultDT <- conversion_result[["data"]]
expect_identical(resultDT, testDT)
})
# Groups the test_that() blocks that follow, all of which exercise lgb.prepare().
context("lgb.prepare()")
# Every object that is neither a data.frame nor a data.table must make
# lgb.prepare() raise its input-validation error.
test_that("lgb.prepare() rejects inputs that are not a data.table or data.frame", {
    unsupported_inputs <- list(
        matrix(1.0:10.0, 2L, 5L)
        , TRUE
        , c("a", "b")
        , NA
        , 10L
        , lgb.Dataset(
            data = matrix(1.0:10.0, 2L, 5L)
            , params = list()
        )
    )
    for (candidate in unsupported_inputs) {
        # the message is matched literally (fixed = TRUE), not as a regex
        expect_error(
            lgb.prepare(candidate)
            , regexp = "lgb.prepare: you provided"
            , fixed = TRUE
        )
    }
})
# Character columns are expected to come back as numeric codes, for both
# data.frame and data.table inputs.
test_that("lgb.prepare() should work correctly for a dataset with only character columns", {
    char_df <- data.frame(
        col1 = c("a", "b", "c")
        , col2 = c("green", "green", "red")
        , stringsAsFactors = FALSE
    )
    char_dt <- data.table::as.data.table(char_df)
    # values each column is expected to hold after conversion
    expected_values <- list(
        col1 = c(1.0, 2.0, 3.0)
        , col2 = c(1.0, 1.0, 2.0)
    )
    for (original in list(char_df, char_dt)) {
        prepared <- lgb.prepare(original)
        # the container class (data.frame vs. data.table) must be preserved
        expect_identical(class(original), class(prepared))
        for (col_name in names(expected_values)) {
            expect_identical(class(prepared[[col_name]]), "numeric")
        }
        for (col_name in names(expected_values)) {
            expect_identical(prepared[[col_name]], expected_values[[col_name]])
        }
    }
})
# Factor columns are expected to come back as numeric codes, for both
# data.frame and data.table inputs.
test_that("lgb.prepare() should work correctly for a dataset with only factor columns", {
    factor_df <- data.frame(
        col1 = as.factor(c("a", "b", "c"))
        , col2 = as.factor(c("green", "green", "red"))
        , stringsAsFactors = FALSE
    )
    factor_dt <- data.table::as.data.table(factor_df)
    # values each column is expected to hold after conversion
    expected_values <- list(
        col1 = c(1.0, 2.0, 3.0)
        , col2 = c(1.0, 1.0, 2.0)
    )
    for (original in list(factor_df, factor_dt)) {
        prepared <- lgb.prepare(original)
        # the container class (data.frame vs. data.table) must be preserved
        expect_identical(class(original), class(prepared))
        for (col_name in names(expected_values)) {
            expect_identical(class(prepared[[col_name]]), "numeric")
        }
        for (col_name in names(expected_values)) {
            expect_identical(prepared[[col_name]], expected_values[[col_name]])
        }
    }
})
# A dataset whose columns are all doubles must pass through lgb.prepare()
# untouched, for both data.frame and data.table inputs.
test_that("lgb.prepare() should not change a dataset with only numeric columns", {
    # NOTE: the values are spelled out on purpose. `11.0:15.0` yields an
    # *integer* vector, because `:` returns integers whenever its endpoints
    # are whole numbers, so it would not build the numeric (double) columns
    # this test is about.
    testDF <- data.frame(
        col1 = c(11.0, 12.0, 13.0, 14.0, 15.0)
        , col2 = c(16.0, 17.0, 18.0, 19.0, 20.0)
        , stringsAsFactors = FALSE
    )
    testDT <- data.table::as.data.table(testDF)
    for (input_data in list(testDF, testDT)) {
        # guard against the fixture silently turning into integer columns
        expect_identical(class(input_data[["col1"]]), "numeric")
        expect_identical(class(input_data[["col2"]]), "numeric")
        converted_dataset <- lgb.prepare(input_data)
        expect_identical(converted_dataset, input_data)
    }
})
# With character, numeric and factor columns side by side, every column is
# expected to end up numeric, and the numeric column keeps its values.
test_that("lgb.prepare() should work correctly for a dataset with numeric, factor, and character columns", {
    mixed_df <- data.frame(
        character_col = c("a", "b", "c")
        , numeric_col = c(1.0, 9.0, 10.0)
        , factor_col = as.factor(c("n", "n", "y"))
        , stringsAsFactors = FALSE
    )
    mixed_dt <- data.table::as.data.table(mixed_df)
    # values each column is expected to hold after conversion
    expected_values <- list(
        character_col = c(1.0, 2.0, 3.0)
        , numeric_col = c(1.0, 9.0, 10.0)
        , factor_col = c(1.0, 1.0, 2.0)
    )
    for (original in list(mixed_df, mixed_dt)) {
        prepared <- lgb.prepare(original)
        # the container class (data.frame vs. data.table) must be preserved
        expect_identical(class(original), class(prepared))
        for (col_name in names(expected_values)) {
            expect_identical(class(prepared[[col_name]]), "numeric")
        }
        for (col_name in names(expected_values)) {
            expect_identical(prepared[[col_name]], expected_values[[col_name]])
        }
    }
})
# Checks how lgb.prepare() treats missing values: character and factor columns
# become numeric (NA stays NA), while logical and integer columns are expected
# to be left exactly as they were.
test_that("lgb.prepare() should work correctly for a dataset with missing values", {
    na_df <- data.frame(
        character_col = c("a", NA_character_, "c")
        , na_col = rep(NA, 3L)
        , na_real_col = rep(NA_real_, 3L)
        , na_int_col = rep(NA_integer_, 3L)
        , na_character_col = rep(NA_character_, 3L)
        , numeric_col = c(1.0, 9.0, NA_real_)
        , factor_col = as.factor(c("n", "n", "y"))
        , integer_col = c(1L, 9L, NA_integer_)
        , stringsAsFactors = FALSE
    )
    na_dt <- data.table::as.data.table(na_df)
    # columns that are expected to be numeric after conversion, with their values
    expected_numeric <- list(
        character_col = c(1.0, NA_real_, 2.0)
        , numeric_col = c(1.0, 9.0, NA_real_)
        , factor_col = c(1.0, 1.0, 2.0)
    )
    for (original in list(na_df, na_dt)) {
        prepared <- lgb.prepare(original)
        expect_identical(class(original), class(prepared))

        for (col_name in names(expected_numeric)) {
            expect_identical(class(prepared[[col_name]]), "numeric")
            expect_identical(prepared[[col_name]], expected_numeric[[col_name]])
        }

        # all-NA double and all-NA character columns should come back as
        # all-NA numeric columns
        all_na_real <- rep(NA_real_, nrow(prepared))
        for (na_col_name in c("na_real_col", "na_character_col")) {
            expect_identical(class(prepared[[na_col_name]]), "numeric")
            expect_identical(prepared[[na_col_name]], all_na_real)
        }

        # today, lgb.prepare() does not convert logical columns
        expect_identical(class(prepared[["na_col"]]), "logical")

        # today, lgb.prepare() does not convert integer columns to numeric
        expect_identical(class(prepared[["na_int_col"]]), "integer")
        expect_identical(prepared[["na_int_col"]], rep(NA_integer_, nrow(prepared)))
        expect_identical(class(prepared[["integer_col"]]), "integer")
        expect_identical(prepared[["integer_col"]], c(1L, 9L, NA_integer_))
    }
})
# lgb.prepare() is expected to convert a data.table by reference: after the
# call, the object that was passed in must itself hold the converted columns.
test_that("lgb.prepare() should modify data.tables in-place", {
    testDT <- data.table::data.table(
        character_col = c("a", NA_character_, "c")
        , na_col = rep(NA, 3L)
        , na_real_col = rep(NA_real_, 3L)
        , na_int_col = rep(NA_integer_, 3L)
        , na_character_col = rep(NA_character_, 3L)
        , numeric_col = c(1.0, 9.0, NA_real_)
        , factor_col = as.factor(c("n", "n", "y"))
        , integer_col = c(1L, 9L, NA_integer_)
    )
    resultDT <- lgb.prepare(testDT)
    # the returned table and the input must hold the same data
    expect_identical(resultDT, testDT)
    # The comparison above alone would also pass if lgb.prepare() returned its
    # input untouched, so additionally check that the *input* was converted.
    expect_identical(class(testDT[["character_col"]]), "numeric")
    expect_identical(testDT[["character_col"]], c(1.0, NA_real_, 2.0))
    expect_identical(class(testDT[["factor_col"]]), "numeric")
    expect_identical(testDT[["factor_col"]], c(1.0, 1.0, 2.0))
})
# Groups the test_that() blocks that follow, all of which exercise lgb.prepare_rules().
context("lgb.prepare_rules()")
# Every object that is neither a data.frame nor a data.table must make
# lgb.prepare_rules() raise its input-validation error.
test_that("lgb.prepare_rules() rejects inputs that are not a data.table or data.frame", {
    unsupported_inputs <- list(
        matrix(1.0:10.0, 2L, 5L)
        , TRUE
        , c("a", "b")
        , NA
        , 10L
        , lgb.Dataset(
            data = matrix(1.0:10.0, 2L, 5L)
            , params = list()
        )
    )
    for (candidate in unsupported_inputs) {
        # the message is matched literally (fixed = TRUE), not as a regex
        expect_error(
            lgb.prepare_rules(candidate)
            , regexp = "lgb.prepare_rules: you provided"
            , fixed = TRUE
        )
    }
})
# Character columns are expected to come back as numeric codes, together with
# one named rule vector per column that maps each string to its code.
test_that("lgb.prepare_rules() should work correctly for a dataset with only character columns", {
    char_df <- data.frame(
        col1 = c("a", "b", "c")
        , col2 = c("green", "green", "red")
        , stringsAsFactors = FALSE
    )
    char_dt <- data.table::as.data.table(char_df)
    expected_values <- list(
        col1 = c(1.0, 2.0, 3.0)
        , col2 = c(1.0, 1.0, 2.0)
    )
    expected_rules <- list(
        col1 = c("a" = 1.0, "b" = 2.0, "c" = 3.0)
        , col2 = c("green" = 1.0, "red" = 2.0)
    )
    for (original in list(char_df, char_dt)) {
        outcome <- lgb.prepare_rules(original)

        # the data should have been converted to numeric
        prepared <- outcome[["data"]]
        expect_identical(class(original), class(prepared))
        for (col_name in names(expected_values)) {
            expect_identical(class(prepared[[col_name]]), "numeric")
        }
        for (col_name in names(expected_values)) {
            expect_identical(prepared[[col_name]], expected_values[[col_name]])
        }

        # one rule per input column should be returned, with the right mapping
        learned_rules <- outcome[["rules"]]
        expect_is(learned_rules, "list")
        expect_length(learned_rules, ncol(original))
        for (col_name in names(expected_rules)) {
            expect_identical(learned_rules[[col_name]], expected_rules[[col_name]])
        }
    }
})
# Factor columns are expected to come back as numeric codes, together with
# one named rule vector per column that maps each level to its code.
test_that("lgb.prepare_rules() should work correctly for a dataset with only factor columns", {
    factor_df <- data.frame(
        col1 = as.factor(c("a", "b", "c"))
        , col2 = as.factor(c("green", "green", "red"))
        , stringsAsFactors = FALSE
    )
    factor_dt <- data.table::as.data.table(factor_df)
    expected_values <- list(
        col1 = c(1.0, 2.0, 3.0)
        , col2 = c(1.0, 1.0, 2.0)
    )
    expected_rules <- list(
        col1 = c("a" = 1.0, "b" = 2.0, "c" = 3.0)
        , col2 = c("green" = 1.0, "red" = 2.0)
    )
    for (original in list(factor_df, factor_dt)) {
        outcome <- lgb.prepare_rules(original)

        # the data should have been converted to numeric
        prepared <- outcome[["data"]]
        expect_identical(class(original), class(prepared))
        for (col_name in names(expected_values)) {
            expect_identical(class(prepared[[col_name]]), "numeric")
        }
        for (col_name in names(expected_values)) {
            expect_identical(prepared[[col_name]], expected_values[[col_name]])
        }

        # one rule per input column should be returned, with the right mapping
        learned_rules <- outcome[["rules"]]
        expect_is(learned_rules, "list")
        expect_length(learned_rules, ncol(original))
        for (col_name in names(expected_rules)) {
            expect_identical(learned_rules[[col_name]], expected_rules[[col_name]])
        }
    }
})
# A dataset whose columns are all doubles must pass through
# lgb.prepare_rules() untouched, and no conversion rules should be produced.
test_that("lgb.prepare_rules() should not change a dataset with only numeric columns", {
    # NOTE: the values are spelled out on purpose. `11.0:15.0` yields an
    # *integer* vector, because `:` returns integers whenever its endpoints
    # are whole numbers, so it would not build the numeric (double) columns
    # this test is about.
    testDF <- data.frame(
        col1 = c(11.0, 12.0, 13.0, 14.0, 15.0)
        , col2 = c(16.0, 17.0, 18.0, 19.0, 20.0)
        , stringsAsFactors = FALSE
    )
    testDT <- data.table::as.data.table(testDF)
    for (input_data in list(testDF, testDT)) {
        # guard against the fixture silently turning into integer columns
        expect_identical(class(input_data[["col1"]]), "numeric")
        expect_identical(class(input_data[["col2"]]), "numeric")
        conversion_result <- lgb.prepare_rules(input_data)
        # dataset should come back unchanged
        converted_dataset <- conversion_result[["data"]]
        expect_identical(converted_dataset, input_data)
        # no column needed converting, so no rules should have been created
        rules <- conversion_result$rules
        expect_identical(rules, list())
    }
})
# With character, numeric and factor columns side by side, every column is
# expected to end up numeric, and rules are expected only for the character
# and factor columns.
test_that("lgb.prepare_rules() should work correctly for a dataset with numeric, factor, and character columns", {
    mixed_df <- data.frame(
        character_col = c("a", "b", "c")
        , numeric_col = c(1.0, 9.0, 10.0)
        , factor_col = as.factor(c("n", "n", "y"))
        , stringsAsFactors = FALSE
    )
    mixed_dt <- data.table::as.data.table(mixed_df)
    expected_values <- list(
        character_col = c(1.0, 2.0, 3.0)
        , numeric_col = c(1.0, 9.0, 10.0)
        , factor_col = c(1.0, 1.0, 2.0)
    )
    expected_rules <- list(
        character_col = c("a" = 1.0, "b" = 2.0, "c" = 3.0)
        , factor_col = c("n" = 1.0, "y" = 2.0)
    )
    for (original in list(mixed_df, mixed_dt)) {
        outcome <- lgb.prepare_rules(original)

        # the data should have been converted to numeric
        prepared <- outcome[["data"]]
        expect_identical(class(original), class(prepared))
        for (col_name in names(expected_values)) {
            expect_identical(class(prepared[[col_name]]), "numeric")
        }
        for (col_name in names(expected_values)) {
            expect_identical(prepared[[col_name]], expected_values[[col_name]])
        }

        # exactly two rules are expected: character_col and factor_col
        learned_rules <- outcome[["rules"]]
        expect_is(learned_rules, "list")
        expect_length(learned_rules, 2L)
        for (col_name in names(expected_rules)) {
            expect_identical(learned_rules[[col_name]], expected_rules[[col_name]])
        }
    }
})
# Checks how lgb.prepare_rules() treats missing values: character and factor
# columns become numeric (NA stays NA) and get a rule each, while logical and
# integer columns are expected to be left exactly as they were.
test_that("lgb.prepare_rules() should work correctly for a dataset with missing values", {
    na_df <- data.frame(
        character_col = c("a", NA_character_, "c")
        , na_col = rep(NA, 3L)
        , na_real_col = rep(NA_real_, 3L)
        , na_int_col = rep(NA_integer_, 3L)
        , na_character_col = rep(NA_character_, 3L)
        , numeric_col = c(1.0, 9.0, NA_real_)
        , factor_col = as.factor(c("n", "n", "y"))
        , integer_col = c(1L, 9L, NA_integer_)
        , stringsAsFactors = FALSE
    )
    na_dt <- data.table::as.data.table(na_df)
    # columns that are expected to be numeric after conversion, with their values
    expected_numeric <- list(
        character_col = c(1.0, NA_real_, 2.0)
        , numeric_col = c(1.0, 9.0, NA_real_)
        , factor_col = c(1.0, 1.0, 2.0)
    )
    for (original in list(na_df, na_dt)) {
        outcome <- lgb.prepare_rules(original)

        prepared <- outcome[["data"]]
        expect_identical(class(original), class(prepared))

        for (col_name in names(expected_numeric)) {
            expect_identical(class(prepared[[col_name]]), "numeric")
            expect_identical(prepared[[col_name]], expected_numeric[[col_name]])
        }

        # all-NA double and all-NA character columns should come back as
        # all-NA numeric columns
        all_na_real <- rep(NA_real_, nrow(prepared))
        for (na_col_name in c("na_real_col", "na_character_col")) {
            expect_identical(class(prepared[[na_col_name]]), "numeric")
            expect_identical(prepared[[na_col_name]], all_na_real)
        }

        # today, lgb.prepare_rules() does not convert logical columns
        expect_identical(class(prepared[["na_col"]]), "logical")

        # today, lgb.prepare_rules() does not convert integer columns to numeric
        expect_identical(class(prepared[["na_int_col"]]), "integer")
        expect_identical(prepared[["na_int_col"]], rep(NA_integer_, nrow(prepared)))
        expect_identical(class(prepared[["integer_col"]]), "integer")
        expect_identical(prepared[["integer_col"]], c(1L, 9L, NA_integer_))

        # rules are expected for the two character columns and the factor
        # column; a missing value shows up as an NA-named, NA-valued entry
        learned_rules <- outcome[["rules"]]
        expect_is(learned_rules, "list")
        expect_length(learned_rules, 3L)
        expect_identical(
            learned_rules[["character_col"]]
            , stats::setNames(c(1.0, NA_real_, 2.0), c("a", NA, "c"))
        )
        expect_identical(
            learned_rules[["na_character_col"]]
            , stats::setNames(NA_real_, NA)
        )
        expect_identical(learned_rules[["factor_col"]], c("n" = 1.0, "y" = 2.0))
    }
})
# When the caller supplies rules, only the columns named in those rules are
# expected to be converted, using exactly the supplied codes, and the rules
# are expected to be handed back unchanged.
test_that("lgb.prepare_rules() should work correctly if you provide your own well-formed rules", {
    source_df <- data.frame(
        character_col = c("a", NA_character_, "c", "a", "a", "c")
        , na_col = rep(NA, 6L)
        , na_real_col = rep(NA_real_, 6L)
        , na_int_col = rep(NA_integer_, 6L)
        , na_character_col = rep(NA_character_, 6L)
        , numeric_col = c(1.0, 9.0, NA_real_, 10.0, 11.0, 12.0)
        , factor_col = as.factor(c("n", "n", "y", "y", "n", "n"))
        , integer_col = c(1L, 9L, NA_integer_, 1L, 1L, 1L)
        , stringsAsFactors = FALSE
    )
    source_dt <- data.table::as.data.table(source_df)

    # value used by lgb.prepare_rules() when it encounters a categorical value that
    # is not in the provided rules
    UNKNOWN_FACTOR_VALUE <- 0.0

    # user-supplied mappings for the two categorical columns
    custom_rules <- list(
        "character_col" = c(
            "a" = 5.0
            , "c" = -10.2
        )
        , "factor_col" = c(
            "n" = 65.0
            , "y" = 65.01
        )
    )
    # columns that have no rule and therefore should not be touched
    untouched_cols <- c(
        "na_col"
        , "na_real_col"
        , "na_int_col"
        , "na_character_col"
        , "numeric_col"
        , "integer_col"
    )

    for (original in list(source_df, source_dt)) {
        outcome <- lgb.prepare_rules(
            data = original
            , rules = custom_rules
        )

        # columns covered by the rules should have been converted to numeric
        prepared <- outcome[["data"]]
        expect_identical(class(original), class(prepared))
        expect_identical(class(prepared[["character_col"]]), "numeric")
        expect_identical(
            prepared[["character_col"]]
            , c(5.0, UNKNOWN_FACTOR_VALUE, -10.2, 5.0, 5.0, -10.2)
        )
        expect_identical(class(prepared[["factor_col"]]), "numeric")
        expect_identical(
            prepared[["factor_col"]]
            , c(65.0, 65.0, 65.01, 65.01, 65.0, 65.0)
        )

        # columns not specified in rules are not going to be converted
        for (col_name in untouched_cols) {
            expect_identical(prepared[[col_name]], original[[col_name]])
        }

        # the rules you passed in should be returned unchanged
        expect_identical(outcome[["rules"]], custom_rules)
    }
})
# lgb.prepare_rules() is expected to convert a data.table by reference: after
# the call, the object that was passed in must itself hold the converted columns.
test_that("lgb.prepare_rules() should modify data.tables in-place", {
    testDT <- data.table::data.table(
        character_col = c("a", NA_character_, "c")
        , na_col = rep(NA, 3L)
        , na_real_col = rep(NA_real_, 3L)
        , na_int_col = rep(NA_integer_, 3L)
        , na_character_col = rep(NA_character_, 3L)
        , numeric_col = c(1.0, 9.0, NA_real_)
        , factor_col = as.factor(c("n", "n", "y"))
        , integer_col = c(1L, 9L, NA_integer_)
    )
    conversion_result <- lgb.prepare_rules(testDT)
    resultDT <- conversion_result[["data"]]
    # the returned table and the input must hold the same data
    expect_identical(resultDT, testDT)
    # The comparison above alone would also pass if lgb.prepare_rules() returned
    # its input untouched, so additionally check that the *input* was converted.
    expect_identical(class(testDT[["character_col"]]), "numeric")
    expect_identical(testDT[["character_col"]], c(1.0, NA_real_, 2.0))
    expect_identical(class(testDT[["factor_col"]]), "numeric")
    expect_identical(testDT[["factor_col"]], c(1.0, 1.0, 2.0))
})
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment