Unverified commit 4ee6399d, authored by James Lamb, committed by GitHub
Browse files

[R-package] deprecate the use of 'info' in Dataset (#4573)



* [R-package] introduce keyword arguments for Dataset properties

* adding examples

* trying to fix loading issues

* add check on use of info and fix examples

* eliminate one '...'

* Update R-package/R/lgb.Dataset.R
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>

* Update R-package/R/lgb.Dataset.R

* Dataset get_params() is public
Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent 39421265
#' @name lgb_shared_dataset_params
#' @title Shared Dataset parameter docs
#' @description Parameter docs for fields used in \code{lgb.Dataset} construction
#' @param label vector of labels to use as the target variable
#' @param weight numeric vector of sample weights
#' @param init_score initial score is the base prediction lightgbm will boost from
#' @param group used for learning-to-rank tasks. An integer vector describing how to
#' group rows together as ordered results from the same set of candidate results
#' to be ranked. For example, if you have a 100-document dataset with
#' \code{group = c(10, 20, 40, 10, 10, 10)}, that means that you have 6 groups,
#' where the first 10 records are in the first group, records 11-30 are in the
#' second group, etc.
#' @param info a list of information of the \code{lgb.Dataset} object. NOTE: use of \code{info}
#' is deprecated as of v3.3.0. Use keyword arguments (e.g. \code{init_score = init_score})
#' directly.
#' @keywords internal
NULL
# [description] List of valid keys for "info" arguments in lgb.Dataset. # [description] List of valid keys for "info" arguments in lgb.Dataset.
# Wrapped in a function to take advantage of lazy evaluation # Wrapped in a function to take advantage of lazy evaluation
# (so it doesn't matter what order R sources files during installation). # (so it doesn't matter what order R sources files during installation).
...@@ -35,7 +53,10 @@ Dataset <- R6::R6Class( ...@@ -35,7 +53,10 @@ Dataset <- R6::R6Class(
free_raw_data = TRUE, free_raw_data = TRUE,
used_indices = NULL, used_indices = NULL,
info = list(), info = list(),
...) { label = NULL,
weight = NULL,
group = NULL,
init_score = NULL) {
# validate inputs early to avoid unnecessary computation # validate inputs early to avoid unnecessary computation
if (!(is.null(reference) || lgb.is.Dataset(reference))) { if (!(is.null(reference) || lgb.is.Dataset(reference))) {
...@@ -45,25 +66,25 @@ Dataset <- R6::R6Class( ...@@ -45,25 +66,25 @@ Dataset <- R6::R6Class(
stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor")) stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
} }
# Check for additional parameters if (length(info) > 0L) {
additional_params <- list(...) warning(paste0(
"lgb.Dataset: found fields passed through 'info'. "
# Check if attribute key is in the known attribute list , "As of v3.3.0, this behavior is deprecated, and support for it will be removed in a future release. "
for (key in names(additional_params)) { , "To suppress this warning, use keyword arguments 'label', 'weight', 'group', or 'init_score' directly"
))
# Key existing }
if (key %in% .INFO_KEYS()) {
# Store as info
info[[key]] <- additional_params[[key]]
} else {
# Store as param
params[[key]] <- additional_params[[key]]
}
if (!is.null(label)) {
info[["label"]] <- label
}
if (!is.null(weight)) {
info[["weight"]] <- weight
}
if (!is.null(group)) {
info[["group"]] <- group
}
if (!is.null(init_score)) {
info[["init_score"]] <- init_score
} }
# Check for matrix format # Check for matrix format
...@@ -93,12 +114,34 @@ Dataset <- R6::R6Class( ...@@ -93,12 +114,34 @@ Dataset <- R6::R6Class(
create_valid = function(data, create_valid = function(data,
info = list(), info = list(),
label = NULL,
weight = NULL,
group = NULL,
init_score = NULL,
params = list(),
...) { ...) {
additional_params <- list(...)
if (length(additional_params) > 0L) {
warning(paste0(
"Dataset$create_valid(): Found the following passed through '...': "
, paste(names(additional_params), collapse = ", ")
, ". These will be used, but in future releases of lightgbm, this warning will become an error. "
, "Add these to 'params' instead. "
, "See ?lgb.Dataset.create.valid for documentation on how to call this function."
))
}
# anything passed into '...' should be overwritten by things passed to 'params'
params <- modifyList(additional_params, params)
# the Dataset's existing parameters should be overwritten by any passed in to this call
params <- modifyList(self$get_params(), params)
# Create new dataset # Create new dataset
ret <- Dataset$new( ret <- Dataset$new(
data = data data = data
, params = private$params , params = params
, reference = self , reference = self
, colnames = private$colnames , colnames = private$colnames
, categorical_feature = private$categorical_feature , categorical_feature = private$categorical_feature
...@@ -106,7 +149,10 @@ Dataset <- R6::R6Class( ...@@ -106,7 +149,10 @@ Dataset <- R6::R6Class(
, free_raw_data = private$free_raw_data , free_raw_data = private$free_raw_data
, used_indices = NULL , used_indices = NULL
, info = info , info = info
, ... , label = label
, weight = weight
, group = group
, init_score = init_score
) )
return(invisible(ret)) return(invisible(ret))
...@@ -711,6 +757,7 @@ Dataset <- R6::R6Class( ...@@ -711,6 +757,7 @@ Dataset <- R6::R6Class(
#' @title Construct \code{lgb.Dataset} object #' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix #' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#' or local file (that was created previously by saving an \code{lgb.Dataset}). #' or local file (that was created previously by saving an \code{lgb.Dataset}).
#' @inheritParams lgb_shared_dataset_params
#' @param data a \code{matrix} object, a \code{dgCMatrix} object, #' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#' a character representing a path to a text file (CSV, TSV, or LibSVM), #' a character representing a path to a text file (CSV, TSV, or LibSVM),
#' or a character representing a path to a binary \code{lgb.Dataset} file #' or a character representing a path to a binary \code{lgb.Dataset} file
...@@ -730,8 +777,7 @@ Dataset <- R6::R6Class( ...@@ -730,8 +777,7 @@ Dataset <- R6::R6Class(
#' This reduces LightGBM's memory consumption, but it means that the Dataset object #' This reduces LightGBM's memory consumption, but it means that the Dataset object
#' cannot be changed after it has been constructed. If you'd prefer to be able to #' cannot be changed after it has been constructed. If you'd prefer to be able to
#' change the Dataset object after construction, set \code{free_raw_data = FALSE}. #' change the Dataset object after construction, set \code{free_raw_data = FALSE}.
#' @param info a list of information of the \code{lgb.Dataset} object #' @param ... other parameters passed to \code{params}
#' @param ... other information to pass to \code{info} or parameters pass to \code{params}
#' #'
#' @return constructed dataset #' @return constructed dataset
#' #'
...@@ -753,8 +799,24 @@ lgb.Dataset <- function(data, ...@@ -753,8 +799,24 @@ lgb.Dataset <- function(data,
categorical_feature = NULL, categorical_feature = NULL,
free_raw_data = TRUE, free_raw_data = TRUE,
info = list(), info = list(),
label = NULL,
weight = NULL,
group = NULL,
init_score = NULL,
...) { ...) {
additional_params <- list(...)
params <- modifyList(params, additional_params)
if (length(additional_params) > 0L) {
warning(paste0(
"lgb.Dataset: Found the following passed through '...': "
, paste(names(additional_params), collapse = ", ")
, ". These will be used, but in future releases of lightgbm, this warning will become an error. "
, "Add these to 'params' instead. See ?lgb.Dataset for documentation on how to call this function."
))
}
# Create new dataset # Create new dataset
return( return(
invisible(Dataset$new( invisible(Dataset$new(
...@@ -767,7 +829,10 @@ lgb.Dataset <- function(data, ...@@ -767,7 +829,10 @@ lgb.Dataset <- function(data,
, free_raw_data = free_raw_data , free_raw_data = free_raw_data
, used_indices = NULL , used_indices = NULL
, info = info , info = info
, ... , label = label
, weight = weight
, group = group
, init_score = init_score
)) ))
) )
...@@ -776,12 +841,18 @@ lgb.Dataset <- function(data, ...@@ -776,12 +841,18 @@ lgb.Dataset <- function(data,
#' @name lgb.Dataset.create.valid #' @name lgb.Dataset.create.valid
#' @title Construct validation data #' @title Construct validation data
#' @description Construct validation data according to training data #' @description Construct validation data according to training data
#' @inheritParams lgb_shared_dataset_params
#' @param dataset \code{lgb.Dataset} object, training data #' @param dataset \code{lgb.Dataset} object, training data
#' @param data a \code{matrix} object, a \code{dgCMatrix} object, #' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#' a character representing a path to a text file (CSV, TSV, or LibSVM), #' a character representing a path to a text file (CSV, TSV, or LibSVM),
#' or a character representing a path to a binary \code{Dataset} file #' or a character representing a path to a binary \code{Dataset} file
#' @param info a list of information of the \code{lgb.Dataset} object #' @param params a list of parameters. See
#' @param ... other information to pass to \code{info}. #' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#' The "Dataset Parameters" section of the documentation} for a list of parameters
#' and valid values. If this is an empty list (the default), the validation Dataset
#' will have the same parameters as the Dataset passed to argument \code{dataset}.
#' @param ... additional \code{lgb.Dataset} parameters.
#' NOTE: As of v3.3.0, use of \code{...} is deprecated. Add parameters to \code{params} directly.
#' #'
#' @return constructed dataset #' @return constructed dataset
#' #'
...@@ -793,17 +864,81 @@ lgb.Dataset <- function(data, ...@@ -793,17 +864,81 @@ lgb.Dataset <- function(data,
#' data(agaricus.test, package = "lightgbm") #' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test #' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#'
#' # parameters can be changed between the training data and validation set,
#' # for example to account for training data in a text file with a header row
#' # and validation data in a text file without it
#' train_file <- tempfile(pattern = "train_", fileext = ".csv")
#' write.table(
#' data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#' , file = train_file
#' , sep = ","
#' , col.names = TRUE
#' , row.names = FALSE
#' , quote = FALSE
#' )
#'
#' valid_file <- tempfile(pattern = "valid_", fileext = ".csv")
#' write.table(
#' data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#' , file = valid_file
#' , sep = ","
#' , col.names = FALSE
#' , row.names = FALSE
#' , quote = FALSE
#' )
#'
#' dtrain <- lgb.Dataset(
#' data = train_file
#' , params = list(has_header = TRUE)
#' )
#' dtrain$construct()
#'
#' dvalid <- lgb.Dataset(
#' data = valid_file
#' , params = list(has_header = FALSE)
#' )
#' dvalid$construct()
#' } #' }
#' @importFrom utils modifyList
#' @export #' @export
lgb.Dataset.create.valid <- function(dataset, data, info = list(), ...) { lgb.Dataset.create.valid <- function(dataset,
data,
info = list(),
label = NULL,
weight = NULL,
group = NULL,
init_score = NULL,
params = list(),
...) {
# Check if dataset is not a dataset # Check if dataset is not a dataset
if (!lgb.is.Dataset(x = dataset)) { if (!lgb.is.Dataset(x = dataset)) {
stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object") stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
} }
additional_params <- list(...)
if (length(additional_params) > 0L) {
warning(paste0(
"lgb.Dataset.create.valid: Found the following passed through '...': "
, paste(names(additional_params), collapse = ", ")
, ". These will be used, but in future releases of lightgbm, this warning will become an error. "
, "Add these to 'params' instead. See ?lgb.Dataset.create.valid for documentation on how to call this function."
))
}
# Create validation dataset # Create validation dataset
return(invisible(dataset$create_valid(data = data, info = info, ...))) return(invisible(
dataset$create_valid(
data = data
, info = info
, label = label
, weight = weight
, group = group
, init_score = init_score
, params = utils::modifyList(params, additional_params)
)
))
} }
......
...@@ -12,6 +12,10 @@ lgb.Dataset( ...@@ -12,6 +12,10 @@ lgb.Dataset(
categorical_feature = NULL, categorical_feature = NULL,
free_raw_data = TRUE, free_raw_data = TRUE,
info = list(), info = list(),
label = NULL,
weight = NULL,
group = NULL,
init_score = NULL,
... ...
) )
} }
...@@ -41,9 +45,24 @@ This reduces LightGBM's memory consumption, but it means that the Dataset object ...@@ -41,9 +45,24 @@ This reduces LightGBM's memory consumption, but it means that the Dataset object
cannot be changed after it has been constructed. If you'd prefer to be able to cannot be changed after it has been constructed. If you'd prefer to be able to
change the Dataset object after construction, set \code{free_raw_data = FALSE}.} change the Dataset object after construction, set \code{free_raw_data = FALSE}.}
\item{info}{a list of information of the \code{lgb.Dataset} object} \item{info}{a list of information of the \code{lgb.Dataset} object. NOTE: use of \code{info}
is deprecated as of v3.3.0. Use keyword arguments (e.g. \code{init_score = init_score})
directly.}
\item{...}{other information to pass to \code{info} or parameters pass to \code{params}} \item{label}{vector of labels to use as the target variable}
\item{weight}{numeric vector of sample weights}
\item{group}{used for learning-to-rank tasks. An integer vector describing how to
group rows together as ordered results from the same set of candidate results
to be ranked. For example, if you have a 100-document dataset with
\code{group = c(10, 20, 40, 10, 10, 10)}, that means that you have 6 groups,
where the first 10 records are in the first group, records 11-30 are in the
second group, etc.}
\item{init_score}{initial score is the base prediction lightgbm will boost from}
\item{...}{other parameters passed to \code{params}}
} }
\value{ \value{
constructed dataset constructed dataset
......
...@@ -4,7 +4,17 @@ ...@@ -4,7 +4,17 @@
\alias{lgb.Dataset.create.valid} \alias{lgb.Dataset.create.valid}
\title{Construct validation data} \title{Construct validation data}
\usage{ \usage{
lgb.Dataset.create.valid(dataset, data, info = list(), ...) lgb.Dataset.create.valid(
dataset,
data,
info = list(),
label = NULL,
weight = NULL,
group = NULL,
init_score = NULL,
params = list(),
...
)
} }
\arguments{ \arguments{
\item{dataset}{\code{lgb.Dataset} object, training data} \item{dataset}{\code{lgb.Dataset} object, training data}
...@@ -13,9 +23,31 @@ lgb.Dataset.create.valid(dataset, data, info = list(), ...) ...@@ -13,9 +23,31 @@ lgb.Dataset.create.valid(dataset, data, info = list(), ...)
a character representing a path to a text file (CSV, TSV, or LibSVM), a character representing a path to a text file (CSV, TSV, or LibSVM),
or a character representing a path to a binary \code{Dataset} file} or a character representing a path to a binary \code{Dataset} file}
\item{info}{a list of information of the \code{lgb.Dataset} object} \item{info}{a list of information of the \code{lgb.Dataset} object. NOTE: use of \code{info}
is deprecated as of v3.3.0. Use keyword arguments (e.g. \code{init_score = init_score})
directly.}
\item{...}{other information to pass to \code{info}.} \item{label}{vector of labels to use as the target variable}
\item{weight}{numeric vector of sample weights}
\item{group}{used for learning-to-rank tasks. An integer vector describing how to
group rows together as ordered results from the same set of candidate results
to be ranked. For example, if you have a 100-document dataset with
\code{group = c(10, 20, 40, 10, 10, 10)}, that means that you have 6 groups,
where the first 10 records are in the first group, records 11-30 are in the
second group, etc.}
\item{init_score}{initial score is the base prediction lightgbm will boost from}
\item{params}{a list of parameters. See
\href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
The "Dataset Parameters" section of the documentation} for a list of parameters
and valid values. If this is an empty list (the default), the validation Dataset
will have the same parameters as the Dataset passed to argument \code{dataset}.}
\item{...}{additional \code{lgb.Dataset} parameters.
NOTE: As of v3.3.0, use of \code{...} is deprecated. Add parameters to \code{params} directly.}
} }
\value{ \value{
constructed dataset constructed dataset
...@@ -31,5 +63,40 @@ dtrain <- lgb.Dataset(train$data, label = train$label) ...@@ -31,5 +63,40 @@ dtrain <- lgb.Dataset(train$data, label = train$label)
data(agaricus.test, package = "lightgbm") data(agaricus.test, package = "lightgbm")
test <- agaricus.test test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
# parameters can be changed between the training data and validation set,
# for example to account for training data in a text file with a header row
# and validation data in a text file without it
train_file <- tempfile(pattern = "train_", fileext = ".csv")
write.table(
data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
, file = train_file
, sep = ","
, col.names = TRUE
, row.names = FALSE
, quote = FALSE
)
valid_file <- tempfile(pattern = "valid_", fileext = ".csv")
write.table(
data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
, file = valid_file
, sep = ","
, col.names = FALSE
, row.names = FALSE
, quote = FALSE
)
dtrain <- lgb.Dataset(
data = train_file
, params = list(has_header = TRUE)
)
dtrain$construct()
dvalid <- lgb.Dataset(
data = valid_file
, params = list(has_header = FALSE)
)
dvalid$construct()
} }
} }
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lgb.Dataset.R
\name{lgb_shared_dataset_params}
\alias{lgb_shared_dataset_params}
\title{Shared Dataset parameter docs}
\arguments{
\item{label}{vector of labels to use as the target variable}
\item{weight}{numeric vector of sample weights}
\item{init_score}{initial score is the base prediction lightgbm will boost from}
\item{group}{used for learning-to-rank tasks. An integer vector describing how to
group rows together as ordered results from the same set of candidate results
to be ranked. For example, if you have a 100-document dataset with
\code{group = c(10, 20, 40, 10, 10, 10)}, that means that you have 6 groups,
where the first 10 records are in the first group, records 11-30 are in the
second group, etc.}
\item{info}{a list of information of the \code{lgb.Dataset} object. NOTE: use of \code{info}
is deprecated as of v3.3.0. Use keyword arguments (e.g. \code{init_score = init_score})
directly.}
}
\description{
Parameter docs for fields used in \code{lgb.Dataset} construction
}
\keyword{internal}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment