Unverified Commit a70e8327 authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

[R-package] [python-package] deprecate Dataset arguments to cv() and train() (#6446)

parent ae55f32b
...@@ -25,8 +25,8 @@ CVBooster <- R6::R6Class( ...@@ -25,8 +25,8 @@ CVBooster <- R6::R6Class(
#' @description Cross validation logic used by LightGBM #' @description Cross validation logic used by LightGBM
#' @inheritParams lgb_shared_params #' @inheritParams lgb_shared_params
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples. #' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}} #' @param label Deprecated. See "Deprecated Arguments" section below.
#' @param weight vector of response values. If not NULL, will set to dataset #' @param weight Deprecated. See "Deprecated Arguments" section below.
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals} #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation. #' @param showsd \code{boolean}, whether to show standard deviation of cross validation.
#' This parameter defaults to \code{TRUE}. Setting it to \code{FALSE} can lead to a #' This parameter defaults to \code{TRUE}. Setting it to \code{FALSE} can lead to a
...@@ -36,10 +36,8 @@ CVBooster <- R6::R6Class( ...@@ -36,10 +36,8 @@ CVBooster <- R6::R6Class(
#' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds #' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
#' (each element must be a vector of test fold's indices). When folds are supplied, #' (each element must be a vector of test fold's indices). When folds are supplied,
#' the \code{nfold} and \code{stratified} parameters are ignored. #' the \code{nfold} and \code{stratified} parameters are ignored.
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset #' @param colnames Deprecated. See "Deprecated Arguments" section below.
#' @param categorical_feature categorical features. This can either be a character vector of feature #' @param categorical_feature Deprecated. See "Deprecated Arguments" section below.
#' names or an integer vector with the indices of the features (e.g.
#' \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param callbacks List of callback functions that are applied at each iteration. #' @param callbacks List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model #' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model
#' into a predictor model which frees up memory and the original datasets #' into a predictor model which frees up memory and the original datasets
...@@ -70,6 +68,13 @@ CVBooster <- R6::R6Class( ...@@ -70,6 +68,13 @@ CVBooster <- R6::R6Class(
#' , nfold = 3L #' , nfold = 3L
#' ) #' )
#' } #' }
#'
#' @section Deprecated Arguments:
#'
#' A future release of \code{lightgbm} will require passing an \code{lgb.Dataset}
#' to argument \code{'data'}. It will also remove support for passing arguments
#' \code{'categorical_feature'}, \code{'colnames'}, \code{'label'}, and \code{'weight'}.
#'
#' @importFrom data.table data.table setorderv #' @importFrom data.table data.table setorderv
#' @export #' @export
lgb.cv <- function(params = list() lgb.cv <- function(params = list()
...@@ -102,12 +107,32 @@ lgb.cv <- function(params = list() ...@@ -102,12 +107,32 @@ lgb.cv <- function(params = list()
# If 'data' is not an lgb.Dataset, try to construct one using 'label' # If 'data' is not an lgb.Dataset, try to construct one using 'label'
if (!.is_Dataset(x = data)) { if (!.is_Dataset(x = data)) {
warning(paste0(
"Passing anything other than an lgb.Dataset object to lgb.cv() is deprecated. "
, "Either pass an lgb.Dataset object, or use lightgbm()."
))
if (is.null(label)) { if (is.null(label)) {
stop("'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'") stop("'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'")
} }
data <- lgb.Dataset(data = data, label = label) data <- lgb.Dataset(data = data, label = label)
} }
# raise deprecation warnings if necessary
# ref: https://github.com/microsoft/LightGBM/issues/6435
args <- names(match.call())
if ("categorical_feature" %in% args) {
.emit_dataset_kwarg_warning("categorical_feature", "lgb.cv")
}
if ("colnames" %in% args) {
.emit_dataset_kwarg_warning("colnames", "lgb.cv")
}
if ("label" %in% args) {
.emit_dataset_kwarg_warning("label", "lgb.cv")
}
if ("weight" %in% args) {
.emit_dataset_kwarg_warning("weight", "lgb.cv")
}
# set some parameters, resolving the way they were passed in with other parameters # set some parameters, resolving the way they were passed in with other parameters
# in `params`. # in `params`.
# this ensures that the model stored with Booster$save() correctly represents # this ensures that the model stored with Booster$save() correctly represents
......
...@@ -6,10 +6,8 @@ ...@@ -6,10 +6,8 @@
#' @inheritParams lgb_shared_params #' @inheritParams lgb_shared_params
#' @param valids a list of \code{lgb.Dataset} objects, used for validation #' @param valids a list of \code{lgb.Dataset} objects, used for validation
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals} #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset #' @param colnames Deprecated. See "Deprecated Arguments" section below.
#' @param categorical_feature categorical features. This can either be a character vector of feature #' @param categorical_feature Deprecated. See "Deprecated Arguments" section below.
#' names or an integer vector with the indices of the features (e.g.
#' \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param callbacks List of callback functions that are applied at each iteration. #' @param callbacks List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the #' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the
#' booster model into a predictor model which frees up memory and the #' booster model into a predictor model which frees up memory and the
...@@ -43,6 +41,13 @@ ...@@ -43,6 +41,13 @@
#' , early_stopping_rounds = 3L #' , early_stopping_rounds = 3L
#' ) #' )
#' } #' }
#'
#' @section Deprecated Arguments:
#'
#' A future release of \code{lightgbm} will remove support for passing arguments
#' \code{'categorical_feature'} and \code{'colnames'}. Pass those things to
#' \code{lgb.Dataset} instead.
#'
#' @export #' @export
lgb.train <- function(params = list(), lgb.train <- function(params = list(),
data, data,
...@@ -78,6 +83,16 @@ lgb.train <- function(params = list(), ...@@ -78,6 +83,16 @@ lgb.train <- function(params = list(),
} }
} }
# raise deprecation warnings if necessary
# ref: https://github.com/microsoft/LightGBM/issues/6435
args <- names(match.call())
if ("categorical_feature" %in% args) {
.emit_dataset_kwarg_warning("categorical_feature", "lgb.train")
}
if ("colnames" %in% args) {
.emit_dataset_kwarg_warning("colnames", "lgb.train")
}
# set some parameters, resolving the way they were passed in with other parameters # set some parameters, resolving the way they were passed in with other parameters
# in `params`. # in `params`.
# this ensures that the model stored with Booster$save() correctly represents # this ensures that the model stored with Booster$save() correctly represents
......
...@@ -144,6 +144,12 @@ NULL ...@@ -144,6 +144,12 @@ NULL
#' #'
#' \emph{New in version 4.0.0} #' \emph{New in version 4.0.0}
#' #'
#' @param colnames Character vector of features. Only used if \code{data} is not an \code{\link{lgb.Dataset}}.
#' @param categorical_feature categorical features. This can either be a character vector of feature
#' names or an integer vector with the indices of the features (e.g.
#' \code{c(1L, 10L)} to say "the first and tenth columns").
#' Only used if \code{data} is not an \code{\link{lgb.Dataset}}.
#'
#' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example #' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example
#' \itemize{ #' \itemize{
#' \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation} #' \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation}
...@@ -152,10 +158,6 @@ NULL ...@@ -152,10 +158,6 @@ NULL
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}} #' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
#' \item{\code{eval}: evaluation function, can be (a list of) character or custom eval function} #' \item{\code{eval}: evaluation function, can be (a list of) character or custom eval function}
#' \item{\code{record}: Boolean, TRUE will record iteration message to \code{booster$record_evals}} #' \item{\code{record}: Boolean, TRUE will record iteration message to \code{booster$record_evals}}
#' \item{\code{colnames}: feature names, if not null, will use this to overwrite the names in dataset}
#' \item{\code{categorical_feature}: categorical features. This can either be a character vector of feature
#' names or an integer vector with the indices of the features (e.g. \code{c(1L, 10L)} to
#' say "the first and tenth columns").}
#' \item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model #' \item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model
#' into a predictor model which frees up memory and the original datasets} #' into a predictor model which frees up memory and the original datasets}
#' } #' }
...@@ -176,6 +178,8 @@ lightgbm <- function(data, ...@@ -176,6 +178,8 @@ lightgbm <- function(data,
objective = "auto", objective = "auto",
init_score = NULL, init_score = NULL,
num_threads = NULL, num_threads = NULL,
colnames = NULL,
categorical_feature = NULL,
...) { ...) {
# validate inputs early to avoid unnecessary computation # validate inputs early to avoid unnecessary computation
...@@ -221,7 +225,14 @@ lightgbm <- function(data, ...@@ -221,7 +225,14 @@ lightgbm <- function(data,
# Check whether data is lgb.Dataset, if not then create lgb.Dataset manually # Check whether data is lgb.Dataset, if not then create lgb.Dataset manually
if (!.is_Dataset(x = dtrain)) { if (!.is_Dataset(x = dtrain)) {
dtrain <- lgb.Dataset(data = data, label = label, weight = weights, init_score = init_score) dtrain <- lgb.Dataset(
data = data
, label = label
, weight = weights
, init_score = init_score
, categorical_feature = categorical_feature
, colnames = colnames
)
} }
train_args <- list( train_args <- list(
......
...@@ -260,3 +260,19 @@ ...@@ -260,3 +260,19 @@
return(a == b) return(a == b)
} }
} }
# Emit a deprecation warning for Dataset-related keyword arguments that were
# passed directly to a training function (lgb.cv(), lgb.train()) instead of
# being set on the lgb.Dataset itself.
#
# NOTE: parameter order is (argname, calling_function) because call sites pass
# the argument name first, e.g. .emit_dataset_kwarg_warning("colnames", "lgb.cv").
# With the arguments reversed, the message would read
# "Argument 'lgb.cv' to colnames() is deprecated...".
#
# ref: https://github.com/microsoft/LightGBM/issues/6435
.emit_dataset_kwarg_warning <- function(argname, calling_function) {
  msg <- sprintf(
    paste0(
      "Argument '%s' to %s() is deprecated and will be removed in a future release. "
      , "Set '%s' with lgb.Dataset() instead. "
      , "See https://github.com/microsoft/LightGBM/issues/6435."
    )
    , argname
    , calling_function
    , argname
  )
  warning(msg)
  return(invisible(NULL))
}
...@@ -41,9 +41,9 @@ may allow you to pass other types of data like \code{matrix} and then separately ...@@ -41,9 +41,9 @@ may allow you to pass other types of data like \code{matrix} and then separately
\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.} \item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}
\item{label}{Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}} \item{label}{Deprecated. See "Deprecated Arguments" section below.}
\item{weight}{vector of response values. If not NULL, will set to dataset} \item{weight}{Deprecated. See "Deprecated Arguments" section below.}
\item{obj}{objective function, can be character or custom objective function. Examples include \item{obj}{objective function, can be character or custom objective function. Examples include
\code{regression}, \code{regression_l1}, \code{huber}, \code{regression}, \code{regression_l1}, \code{huber},
...@@ -103,11 +103,9 @@ the \code{nfold} and \code{stratified} parameters are ignored.} ...@@ -103,11 +103,9 @@ the \code{nfold} and \code{stratified} parameters are ignored.}
\item{init_model}{path of model file or \code{lgb.Booster} object, will continue training from this model} \item{init_model}{path of model file or \code{lgb.Booster} object, will continue training from this model}
\item{colnames}{feature names, if not null, will use this to overwrite the names in dataset} \item{colnames}{Deprecated. See "Deprecated Arguments" section below.}
\item{categorical_feature}{categorical features. This can either be a character vector of feature \item{categorical_feature}{Deprecated. See "Deprecated Arguments" section below.}
names or an integer vector with the indices of the features (e.g.
\code{c(1L, 10L)} to say "the first and tenth columns").}
\item{early_stopping_rounds}{int. Activates early stopping. When this parameter is non-null, \item{early_stopping_rounds}{int. Activates early stopping. When this parameter is non-null,
training will stop if the evaluation of any metric on any validation set training will stop if the evaluation of any metric on any validation set
...@@ -133,6 +131,14 @@ a trained model \code{lgb.CVBooster}. ...@@ -133,6 +131,14 @@ a trained model \code{lgb.CVBooster}.
\description{ \description{
Cross validation logic used by LightGBM Cross validation logic used by LightGBM
} }
\section{Deprecated Arguments}{
A future release of \code{lightgbm} will require passing an \code{lgb.Dataset}
to argument \code{'data'}. It will also remove support for passing arguments
\code{'categorical_feature'}, \code{'colnames'}, \code{'label'}, and \code{'weight'}.
}
\section{Early Stopping}{ \section{Early Stopping}{
...@@ -171,4 +177,5 @@ model <- lgb.cv( ...@@ -171,4 +177,5 @@ model <- lgb.cv(
, nfold = 3L , nfold = 3L
) )
} }
} }
...@@ -82,11 +82,9 @@ printing of evaluation during training} ...@@ -82,11 +82,9 @@ printing of evaluation during training}
\item{init_model}{path of model file or \code{lgb.Booster} object, will continue training from this model} \item{init_model}{path of model file or \code{lgb.Booster} object, will continue training from this model}
\item{colnames}{feature names, if not null, will use this to overwrite the names in dataset} \item{colnames}{Deprecated. See "Deprecated Arguments" section below.}
\item{categorical_feature}{categorical features. This can either be a character vector of feature \item{categorical_feature}{Deprecated. See "Deprecated Arguments" section below.}
names or an integer vector with the indices of the features (e.g.
\code{c(1L, 10L)} to say "the first and tenth columns").}
\item{early_stopping_rounds}{int. Activates early stopping. When this parameter is non-null, \item{early_stopping_rounds}{int. Activates early stopping. When this parameter is non-null,
training will stop if the evaluation of any metric on any validation set training will stop if the evaluation of any metric on any validation set
...@@ -111,6 +109,14 @@ Low-level R interface to train a LightGBM model. Unlike \code{\link{lightgbm}}, ...@@ -111,6 +109,14 @@ Low-level R interface to train a LightGBM model. Unlike \code{\link{lightgbm}},
this function is focused on performance (e.g. speed, memory efficiency). It is also this function is focused on performance (e.g. speed, memory efficiency). It is also
less likely to have breaking API changes in new releases than \code{\link{lightgbm}}. less likely to have breaking API changes in new releases than \code{\link{lightgbm}}.
} }
\section{Deprecated Arguments}{
A future release of \code{lightgbm} will remove support for passing arguments
\code{'categorical_feature'} and \code{'colnames'}. Pass those things to
\code{lgb.Dataset} instead.
}
\section{Early Stopping}{ \section{Early Stopping}{
...@@ -154,4 +160,5 @@ model <- lgb.train( ...@@ -154,4 +160,5 @@ model <- lgb.train(
, early_stopping_rounds = 3L , early_stopping_rounds = 3L
) )
} }
} }
...@@ -19,6 +19,8 @@ lightgbm( ...@@ -19,6 +19,8 @@ lightgbm(
objective = "auto", objective = "auto",
init_score = NULL, init_score = NULL,
num_threads = NULL, num_threads = NULL,
colnames = NULL,
categorical_feature = NULL,
... ...
) )
} }
...@@ -96,6 +98,13 @@ set to the iteration number of the best iteration.} ...@@ -96,6 +98,13 @@ set to the iteration number of the best iteration.}
\emph{New in version 4.0.0}} \emph{New in version 4.0.0}}
\item{colnames}{Character vector of features. Only used if \code{data} is not an \code{\link{lgb.Dataset}}.}
\item{categorical_feature}{categorical features. This can either be a character vector of feature
names or an integer vector with the indices of the features (e.g.
\code{c(1L, 10L)} to say "the first and tenth columns").
Only used if \code{data} is not an \code{\link{lgb.Dataset}}.}
\item{...}{Additional arguments passed to \code{\link{lgb.train}}. For example \item{...}{Additional arguments passed to \code{\link{lgb.train}}. For example
\itemize{ \itemize{
\item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation} \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation}
...@@ -104,10 +113,6 @@ set to the iteration number of the best iteration.} ...@@ -104,10 +113,6 @@ set to the iteration number of the best iteration.}
\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}} \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
\item{\code{eval}: evaluation function, can be (a list of) character or custom eval function} \item{\code{eval}: evaluation function, can be (a list of) character or custom eval function}
\item{\code{record}: Boolean, TRUE will record iteration message to \code{booster$record_evals}} \item{\code{record}: Boolean, TRUE will record iteration message to \code{booster$record_evals}}
\item{\code{colnames}: feature names, if not null, will use this to overwrite the names in dataset}
\item{\code{categorical_feature}: categorical features. This can either be a character vector of feature
names or an integer vector with the indices of the features (e.g. \code{c(1L, 10L)} to
say "the first and tenth columns").}
\item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model \item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model
into a predictor model which frees up memory and the original datasets} into a predictor model which frees up memory and the original datasets}
}} }}
......
...@@ -433,7 +433,7 @@ test_that("lgb.cv() rejects negative or 0 value passed to nrounds", { ...@@ -433,7 +433,7 @@ test_that("lgb.cv() rejects negative or 0 value passed to nrounds", {
} }
}) })
test_that("lgb.cv() throws an informative error is 'data' is not an lgb.Dataset and labels are not given", { test_that("lgb.cv() throws an informative error if 'data' is not an lgb.Dataset and labels are not given", {
bad_values <- list( bad_values <- list(
4L 4L
, "hello" , "hello"
...@@ -1788,11 +1788,6 @@ test_that("lgb.train() works with early stopping for regression with a metric th ...@@ -1788,11 +1788,6 @@ test_that("lgb.train() works with early stopping for regression with a metric th
test_that("lgb.train() supports non-ASCII feature names", { test_that("lgb.train() supports non-ASCII feature names", {
dtrain <- lgb.Dataset(
data = matrix(rnorm(400L), ncol = 4L)
, label = rnorm(100L)
, params = list(num_threads = .LGB_MAX_THREADS)
)
# content below is equivalent to # content below is equivalent to
# #
# feature_names <- c("F_零", "F_一", "F_二", "F_三") # feature_names <- c("F_零", "F_一", "F_二", "F_三")
...@@ -1805,6 +1800,12 @@ test_that("lgb.train() supports non-ASCII feature names", { ...@@ -1805,6 +1800,12 @@ test_that("lgb.train() supports non-ASCII feature names", {
, rawToChar(as.raw(c(0x46, 0x5f, 0xe4, 0xba, 0x8c))) , rawToChar(as.raw(c(0x46, 0x5f, 0xe4, 0xba, 0x8c)))
, rawToChar(as.raw(c(0x46, 0x5f, 0xe4, 0xb8, 0x89))) , rawToChar(as.raw(c(0x46, 0x5f, 0xe4, 0xb8, 0x89)))
) )
dtrain <- lgb.Dataset(
data = matrix(rnorm(400L), ncol = 4L)
, label = rnorm(100L)
, params = list(num_threads = .LGB_MAX_THREADS)
, colnames = feature_names
)
bst <- lgb.train( bst <- lgb.train(
data = dtrain data = dtrain
, nrounds = 5L , nrounds = 5L
...@@ -1814,7 +1815,6 @@ test_that("lgb.train() supports non-ASCII feature names", { ...@@ -1814,7 +1815,6 @@ test_that("lgb.train() supports non-ASCII feature names", {
, verbose = .LGB_VERBOSITY , verbose = .LGB_VERBOSITY
, num_threads = .LGB_MAX_THREADS , num_threads = .LGB_MAX_THREADS
) )
, colnames = feature_names
) )
expect_true(.is_Booster(bst)) expect_true(.is_Booster(bst))
dumped_model <- jsonlite::fromJSON(bst$dump_model()) dumped_model <- jsonlite::fromJSON(bst$dump_model())
...@@ -2838,7 +2838,11 @@ test_that(paste0("lgb.train() gives same result when interaction_constraints is ...@@ -2838,7 +2838,11 @@ test_that(paste0("lgb.train() gives same result when interaction_constraints is
test_that(paste0("lgb.train() gives same results when using interaction_constraints and specifying colnames"), { test_that(paste0("lgb.train() gives same results when using interaction_constraints and specifying colnames"), {
set.seed(1L) set.seed(1L)
dtrain <- lgb.Dataset(train$data, label = train$label, params = list(num_threads = .LGB_MAX_THREADS)) dtrain <- lgb.Dataset(
train$data
, label = train$label
, params = list(num_threads = .LGB_MAX_THREADS)
)
params <- list( params <- list(
objective = "regression" objective = "regression"
...@@ -2854,6 +2858,7 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai ...@@ -2854,6 +2858,7 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai
pred1 <- bst$predict(test$data) pred1 <- bst$predict(test$data)
new_colnames <- paste0(colnames(train$data), "_x") new_colnames <- paste0(colnames(train$data), "_x")
dtrain$set_colnames(new_colnames)
params <- list( params <- list(
objective = "regression" objective = "regression"
, interaction_constraints = list(c(new_colnames[1L], new_colnames[2L]), new_colnames[3L]) , interaction_constraints = list(c(new_colnames[1L], new_colnames[2L]), new_colnames[3L])
...@@ -2864,7 +2869,6 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai ...@@ -2864,7 +2869,6 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai
data = dtrain data = dtrain
, params = params , params = params
, nrounds = 2L , nrounds = 2L
, colnames = new_colnames
) )
pred2 <- bst$predict(test$data) pred2 <- bst$predict(test$data)
......
...@@ -25,9 +25,14 @@ X_test = df_test.drop(0, axis=1) ...@@ -25,9 +25,14 @@ X_test = df_test.drop(0, axis=1)
num_train, num_feature = X_train.shape num_train, num_feature = X_train.shape
# generate feature names
feature_name = [f"feature_{col}" for col in range(num_feature)]
# create dataset for lightgbm # create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False # if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train, weight=W_train, free_raw_data=False) lgb_train = lgb.Dataset(
X_train, y_train, weight=W_train, feature_name=feature_name, categorical_feature=[21], free_raw_data=False
)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, weight=W_test, free_raw_data=False) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, weight=W_test, free_raw_data=False)
# specify your configurations as a dict # specify your configurations as a dict
...@@ -43,9 +48,6 @@ params = { ...@@ -43,9 +48,6 @@ params = {
"verbose": 0, "verbose": 0,
} }
# generate feature names
feature_name = [f"feature_{col}" for col in range(num_feature)]
print("Starting training...") print("Starting training...")
# feature_name and categorical_feature # feature_name and categorical_feature
gbm = lgb.train( gbm = lgb.train(
...@@ -53,8 +55,6 @@ gbm = lgb.train( ...@@ -53,8 +55,6 @@ gbm = lgb.train(
lgb_train, lgb_train,
num_boost_round=10, num_boost_round=10,
valid_sets=lgb_train, # eval training data valid_sets=lgb_train, # eval training data
feature_name=feature_name,
categorical_feature=[21],
) )
print("Finished first 10 rounds...") print("Finished first 10 rounds...")
......
...@@ -78,7 +78,12 @@ ...@@ -78,7 +78,12 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"lgb_train = lgb.Dataset(X_train, y_train)\n", "lgb_train = lgb.Dataset(\n",
" X_train,\n",
" y_train,\n",
" feature_name=[f\"f{i + 1}\" for i in range(X_train.shape[-1])],\n",
" categorical_feature=[21],\n",
")\n",
"lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)" "lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)"
] ]
}, },
...@@ -144,8 +149,6 @@ ...@@ -144,8 +149,6 @@
" lgb_train,\n", " lgb_train,\n",
" num_boost_round=100,\n", " num_boost_round=100,\n",
" valid_sets=[lgb_train, lgb_test],\n", " valid_sets=[lgb_train, lgb_test],\n",
" feature_name=[f\"f{i + 1}\" for i in range(X_train.shape[-1])],\n",
" categorical_feature=[21],\n",
" callbacks=[lgb.log_evaluation(10), lgb.record_evaluation(evals_result)],\n", " callbacks=[lgb.log_evaluation(10), lgb.record_evaluation(evals_result)],\n",
")" ")"
] ]
......
...@@ -22,7 +22,12 @@ X_train = df_train.drop(0, axis=1) ...@@ -22,7 +22,12 @@ X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1) X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm # create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train) lgb_train = lgb.Dataset(
X_train,
y_train,
feature_name=[f"f{i + 1}" for i in range(X_train.shape[-1])],
categorical_feature=[21],
)
lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train) lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)
# specify your configurations as a dict # specify your configurations as a dict
...@@ -37,8 +42,6 @@ gbm = lgb.train( ...@@ -37,8 +42,6 @@ gbm = lgb.train(
lgb_train, lgb_train,
num_boost_round=100, num_boost_round=100,
valid_sets=[lgb_train, lgb_test], valid_sets=[lgb_train, lgb_test],
feature_name=[f"f{i + 1}" for i in range(X_train.shape[-1])],
categorical_feature=[21],
callbacks=[lgb.log_evaluation(10), lgb.record_evaluation(evals_result)], callbacks=[lgb.log_evaluation(10), lgb.record_evaluation(evals_result)],
) )
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
"""Library with training routines of LightGBM.""" """Library with training routines of LightGBM."""
import copy import copy
import json import json
import warnings
from collections import OrderedDict, defaultdict from collections import OrderedDict, defaultdict
from operator import attrgetter from operator import attrgetter
from pathlib import Path from pathlib import Path
...@@ -13,6 +14,7 @@ from . import callback ...@@ -13,6 +14,7 @@ from . import callback
from .basic import ( from .basic import (
Booster, Booster,
Dataset, Dataset,
LGBMDeprecationWarning,
LightGBMError, LightGBMError,
_choose_param_value, _choose_param_value,
_ConfigAliases, _ConfigAliases,
...@@ -51,6 +53,15 @@ _LGBM_PreprocFunction = Callable[ ...@@ -51,6 +53,15 @@ _LGBM_PreprocFunction = Callable[
] ]
def _emit_dataset_kwarg_warning(calling_function: str, argname: str) -> None:
    """Warn that a Dataset-related keyword argument passed to a training entry point is deprecated.

    Parameters
    ----------
    calling_function : str
        Name of the public function the user called (e.g. ``"train"`` or ``"cv"``).
    argname : str
        Name of the deprecated keyword argument (e.g. ``"feature_name"``).
    """
    # Build the message from a template so both argument references stay in sync.
    template = (
        "Argument '{arg}' to {func}() is deprecated and will be removed in "
        "a future release. Set '{arg}' when calling lightgbm.Dataset() instead. "
        "See https://github.com/microsoft/LightGBM/issues/6435."
    )
    # stacklevel=2 attributes the warning to the user's call site, not this helper.
    warnings.warn(
        template.format(arg=argname, func=calling_function),
        category=LGBMDeprecationWarning,
        stacklevel=2,
    )
def train( def train(
params: Dict[str, Any], params: Dict[str, Any],
train_set: Dataset, train_set: Dataset,
...@@ -103,9 +114,11 @@ def train( ...@@ -103,9 +114,11 @@ def train(
init_model : str, pathlib.Path, Booster or None, optional (default=None) init_model : str, pathlib.Path, Booster or None, optional (default=None)
Filename of LightGBM model or Booster instance used for continue training. Filename of LightGBM model or Booster instance used for continue training.
feature_name : list of str, or 'auto', optional (default="auto") feature_name : list of str, or 'auto', optional (default="auto")
**Deprecated.** Set ``feature_name`` on ``train_set`` instead.
Feature names. Feature names.
If 'auto' and data is pandas DataFrame, data columns names are used. If 'auto' and data is pandas DataFrame, data columns names are used.
categorical_feature : list of str or int, or 'auto', optional (default="auto") categorical_feature : list of str or int, or 'auto', optional (default="auto")
**Deprecated.** Set ``categorical_feature`` on ``train_set`` instead.
Categorical features. Categorical features.
If list of int, interpreted as indices. If list of int, interpreted as indices.
If list of str, interpreted as feature names (need to specify ``feature_name`` as well). If list of str, interpreted as feature names (need to specify ``feature_name`` as well).
...@@ -166,6 +179,13 @@ def train( ...@@ -166,6 +179,13 @@ def train(
f"Item {i} has type '{type(valid_item).__name__}'." f"Item {i} has type '{type(valid_item).__name__}'."
) )
# raise deprecation warnings if necessary
# ref: https://github.com/microsoft/LightGBM/issues/6435
if categorical_feature != "auto":
_emit_dataset_kwarg_warning("train", "categorical_feature")
if feature_name != "auto":
_emit_dataset_kwarg_warning("train", "feature_name")
# create predictor first # create predictor first
params = copy.deepcopy(params) params = copy.deepcopy(params)
params = _choose_param_value( params = _choose_param_value(
...@@ -625,9 +645,11 @@ def cv( ...@@ -625,9 +645,11 @@ def cv(
init_model : str, pathlib.Path, Booster or None, optional (default=None) init_model : str, pathlib.Path, Booster or None, optional (default=None)
Filename of LightGBM model or Booster instance used for continue training. Filename of LightGBM model or Booster instance used for continue training.
feature_name : list of str, or 'auto', optional (default="auto") feature_name : list of str, or 'auto', optional (default="auto")
**Deprecated.** Set ``feature_name`` on ``train_set`` instead.
Feature names. Feature names.
If 'auto' and data is pandas DataFrame, data columns names are used. If 'auto' and data is pandas DataFrame, data columns names are used.
categorical_feature : list of str or int, or 'auto', optional (default="auto") categorical_feature : list of str or int, or 'auto', optional (default="auto")
**Deprecated.** Set ``categorical_feature`` on ``train_set`` instead.
Categorical features. Categorical features.
If list of int, interpreted as indices. If list of int, interpreted as indices.
If list of str, interpreted as feature names (need to specify ``feature_name`` as well). If list of str, interpreted as feature names (need to specify ``feature_name`` as well).
...@@ -693,6 +715,13 @@ def cv( ...@@ -693,6 +715,13 @@ def cv(
if num_boost_round <= 0: if num_boost_round <= 0:
raise ValueError(f"num_boost_round must be greater than 0. Got {num_boost_round}.") raise ValueError(f"num_boost_round must be greater than 0. Got {num_boost_round}.")
# raise deprecation warnings if necessary
# ref: https://github.com/microsoft/LightGBM/issues/6435
if categorical_feature != "auto":
_emit_dataset_kwarg_warning("cv", "categorical_feature")
if feature_name != "auto":
_emit_dataset_kwarg_warning("cv", "feature_name")
params = copy.deepcopy(params) params = copy.deepcopy(params)
params = _choose_param_value( params = _choose_param_value(
main_param_name="objective", main_param_name="objective",
......
...@@ -862,6 +862,7 @@ class LGBMModel(_LGBMModelBase): ...@@ -862,6 +862,7 @@ class LGBMModel(_LGBMModelBase):
group=group, group=group,
init_score=init_score, init_score=init_score,
categorical_feature=categorical_feature, categorical_feature=categorical_feature,
feature_name=feature_name,
params=params, params=params,
) )
...@@ -928,7 +929,6 @@ class LGBMModel(_LGBMModelBase): ...@@ -928,7 +929,6 @@ class LGBMModel(_LGBMModelBase):
valid_names=eval_names, valid_names=eval_names,
feval=eval_metrics_callable, # type: ignore[arg-type] feval=eval_metrics_callable, # type: ignore[arg-type]
init_model=init_model, init_model=init_model,
feature_name=feature_name,
callbacks=callbacks, callbacks=callbacks,
) )
......
...@@ -1421,13 +1421,14 @@ def test_cvbooster_picklable(serializer): ...@@ -1421,13 +1421,14 @@ def test_cvbooster_picklable(serializer):
def test_feature_name():
    """Feature names supplied via the Dataset are reflected by the trained Booster.

    Post-commit form of this test: ``feature_name`` is passed to ``lgb.Dataset``
    (or set with ``set_feature_name``) instead of the now-deprecated
    ``feature_name`` argument of ``lgb.train``.
    """
    X_train, y_train = make_synthetic_regression()
    params = {"verbose": -1}
    feature_names = [f"f_{i}" for i in range(X_train.shape[-1])]
    # feature_name now lives on the Dataset, not on train()
    lgb_train = lgb.Dataset(X_train, y_train, feature_name=feature_names)
    gbm = lgb.train(params, lgb_train, num_boost_round=5)
    assert feature_names == gbm.feature_name()
    # test feature_names with whitespaces
    feature_names_with_space = [f"f {i}" for i in range(X_train.shape[-1])]
    lgb_train.set_feature_name(feature_names_with_space)
    gbm = lgb.train(params, lgb_train, num_boost_round=5)
    # NOTE(review): intentionally asserts the underscore names, not the
    # whitespace ones — presumably LightGBM normalizes whitespace in feature
    # names to underscores; confirm against LightGBM's Dataset handling.
    assert feature_names == gbm.feature_name()
...@@ -1437,9 +1438,9 @@ def test_feature_name_with_non_ascii(): ...@@ -1437,9 +1438,9 @@ def test_feature_name_with_non_ascii():
# This has non-ascii strings. # This has non-ascii strings.
feature_names = ["F_零", "F_一", "F_二", "F_三"] feature_names = ["F_零", "F_一", "F_二", "F_三"]
params = {"verbose": -1} params = {"verbose": -1}
lgb_train = lgb.Dataset(X_train, y_train) lgb_train = lgb.Dataset(X_train, y_train, feature_name=feature_names)
gbm = lgb.train(params, lgb_train, num_boost_round=5, feature_name=feature_names) gbm = lgb.train(params, lgb_train, num_boost_round=5)
assert feature_names == gbm.feature_name() assert feature_names == gbm.feature_name()
gbm.save_model("lgb.model") gbm.save_model("lgb.model")
......
...@@ -25,8 +25,8 @@ def test_register_logger(tmp_path): ...@@ -25,8 +25,8 @@ def test_register_logger(tmp_path):
X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32) X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32)
y = np.array([0, 1, 1, 0]) y = np.array([0, 1, 1, 0])
lgb_train = lgb.Dataset(X, y) lgb_train = lgb.Dataset(X, y, categorical_feature=[1])
lgb_valid = lgb.Dataset(X, y) # different object for early-stopping lgb_valid = lgb.Dataset(X, y, categorical_feature=[1]) # different object for early-stopping
eval_records = {} eval_records = {}
callbacks = [lgb.record_evaluation(eval_records), lgb.log_evaluation(2), lgb.early_stopping(10)] callbacks = [lgb.record_evaluation(eval_records), lgb.log_evaluation(2), lgb.early_stopping(10)]
...@@ -36,7 +36,6 @@ def test_register_logger(tmp_path): ...@@ -36,7 +36,6 @@ def test_register_logger(tmp_path):
num_boost_round=10, num_boost_round=10,
feval=dummy_metric, feval=dummy_metric,
valid_sets=[lgb_valid], valid_sets=[lgb_valid],
categorical_feature=[1],
callbacks=callbacks, callbacks=callbacks,
) )
...@@ -151,12 +150,11 @@ def test_register_custom_logger(): ...@@ -151,12 +150,11 @@ def test_register_custom_logger():
logged_messages = [] logged_messages = []
X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32) X = np.array([[1, 2, 3], [1, 2, 4], [1, 2, 4], [1, 2, 3]], dtype=np.float32)
y = np.array([0, 1, 1, 0]) y = np.array([0, 1, 1, 0])
lgb_data = lgb.Dataset(X, y) lgb_data = lgb.Dataset(X, y, categorical_feature=[1])
lgb.train( lgb.train(
{"objective": "binary", "metric": "auc"}, {"objective": "binary", "metric": "auc"},
lgb_data, lgb_data,
num_boost_round=10, num_boost_round=10,
valid_sets=[lgb_data], valid_sets=[lgb_data],
categorical_feature=[1],
) )
assert logged_messages, "custom logger was not called" assert logged_messages, "custom logger was not called"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment