Unverified Commit fc991c9d authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

[R-package] added R linting and changed R code to comma-first (fixes #2373) (#2437)

parent b4bb38d9
# lint_r_code.R
#
# Lint every R source file under a directory with a fixed set of linters.
#
# Usage: Rscript lint_r_code.R <SOURCE_DIR>
#
# Exits with status equal to the number of lint issues found, so a CI job
# running this script fails whenever any issue is present.
library(lintr)

args <- commandArgs(
    trailingOnly = TRUE
)
SOURCE_DIR <- args[[1]]

# Find all *.r / *.R files (ignore.case covers both extensions),
# recursively and including hidden files, as absolute paths.
FILES_TO_LINT <- list.files(
    path = SOURCE_DIR
    , pattern = "\\.r$"
    , all.files = TRUE
    , ignore.case = TRUE
    , full.names = TRUE
    , recursive = TRUE
    , include.dirs = FALSE
)

# Some linters from the lintr package have not made it to CRAN yet.
# We build lintr from source to address that.
LINTERS_TO_USE <- list(
    "closed_curly" = lintr::closed_curly_linter
    , "infix_spaces" = lintr::infix_spaces_linter
    , "long_lines" = lintr::line_length_linter(length = 120)
    , "tabs" = lintr::no_tab_linter
    , "open_curly" = lintr::open_curly_linter
    , "spaces_inside" = lintr::spaces_inside_linter
    , "spaces_left_parens" = lintr::spaces_left_parentheses_linter
    , "trailing_blank" = lintr::trailing_blank_lines_linter
    , "trailing_white" = lintr::trailing_whitespace_linter
)

cat(sprintf("Found %i R files to lint\n", length(FILES_TO_LINT)))

# Lint each file once, reporting a per-file count as we go. Collecting
# per-file results in a list and concatenating once afterwards avoids
# growing a vector with c() inside the loop (O(n^2) copies).
results_per_file <- lapply(
    FILES_TO_LINT
    , function(r_file) {
        this_result <- lintr::lint(
            filename = r_file
            , linters = LINTERS_TO_USE
            , cache = FALSE
        )
        cat(sprintf(
            "Found %i linting errors in %s\n"
            , length(this_result)
            , r_file
        ))
        this_result
    }
)
# do.call(c, ...) yields NULL for zero files, matching the original
# `results <- c()` starting value (length 0 either way).
results <- do.call(c, results_per_file)

issues_found <- length(results)

if (issues_found > 0) {
    cat("\n")
    print(results)
}

# Nonzero status (the issue count) makes the calling CI step fail.
quit(save = "no", status = issues_found)
...@@ -51,10 +51,17 @@ if [[ $TRAVIS == "true" ]] && [[ $TASK == "check-docs" ]]; then ...@@ -51,10 +51,17 @@ if [[ $TRAVIS == "true" ]] && [[ $TASK == "check-docs" ]]; then
fi fi
if [[ $TASK == "lint" ]]; then if [[ $TASK == "lint" ]]; then
conda install -q -y -n $CONDA_ENV pycodestyle pydocstyle conda install -q -y -n $CONDA_ENV \
pycodestyle \
pydocstyle \
r-lintr
pip install --user cpplint pip install --user cpplint
echo "Linting Python code"
pycodestyle --ignore=E501,W503 --exclude=./compute,./.nuget . || exit -1 pycodestyle --ignore=E501,W503 --exclude=./compute,./.nuget . || exit -1
pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^compute|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1 pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^compute|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1
echo "Linting R code"
Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1
echo "Linting C++ code"
cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include || exit 0 cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include || exit 0
exit 0 exit 0
fi fi
......
...@@ -39,7 +39,11 @@ cb.reset.parameters <- function(new_params) { ...@@ -39,7 +39,11 @@ cb.reset.parameters <- function(new_params) {
# since changing them would simply wreck some chaos # since changing them would simply wreck some chaos
not_allowed <- c("num_class", "metric", "boosting_type") not_allowed <- c("num_class", "metric", "boosting_type")
if (any(pnames %in% not_allowed)) { if (any(pnames %in% not_allowed)) {
stop("Parameters ", paste0(pnames[pnames %in% not_allowed], collapse = ", "), " cannot be changed during boosting") stop(
"Parameters "
, paste0(pnames[pnames %in% not_allowed], collapse = ", ")
, " cannot be changed during boosting"
)
} }
# Check parameter names # Check parameter names
...@@ -166,7 +170,7 @@ cb.print.evaluation <- function(period = 1) { ...@@ -166,7 +170,7 @@ cb.print.evaluation <- function(period = 1) {
i <- env$iteration i <- env$iteration
# Check if iteration matches moduo # Check if iteration matches moduo
if ((i - 1) %% period == 0 || is.element(i, c(env$begin_iteration, env$end_iteration ))) { if ( (i - 1) %% period == 0 || is.element(i, c(env$begin_iteration, env$end_iteration))) {
# Merge evaluation string # Merge evaluation string
msg <- merge.eval.string(env) msg <- merge.eval.string(env)
...@@ -244,8 +248,14 @@ cb.record.evaluation <- function() { ...@@ -244,8 +248,14 @@ cb.record.evaluation <- function() {
name <- eval_res$name name <- eval_res$name
# Store evaluation data # Store evaluation data
env$model$record_evals[[data_name]][[name]]$eval <- c(env$model$record_evals[[data_name]][[name]]$eval, eval_res$value) env$model$record_evals[[data_name]][[name]]$eval <- c(
env$model$record_evals[[data_name]][[name]]$eval_err <- c(env$model$record_evals[[data_name]][[name]]$eval_err, eval_err) env$model$record_evals[[data_name]][[name]]$eval
, eval_res$value
)
env$model$record_evals[[data_name]][[name]]$eval_err <- c(
env$model$record_evals[[data_name]][[name]]$eval_err
, eval_err
)
} }
...@@ -391,7 +401,9 @@ cb.early.stop <- function(stopping_rounds, verbose = TRUE) { ...@@ -391,7 +401,9 @@ cb.early.stop <- function(stopping_rounds, verbose = TRUE) {
} }
# Extract callback names from the list of callbacks # Extract callback names from the list of callbacks
callback.names <- function(cb_list) { unlist(lapply(cb_list, attr, "name")) } callback.names <- function(cb_list) {
unlist(lapply(cb_list, attr, "name"))
}
add.cb <- function(cb_list, cb) { add.cb <- function(cb_list, cb) {
......
...@@ -46,7 +46,12 @@ Booster <- R6::R6Class( ...@@ -46,7 +46,12 @@ Booster <- R6::R6Class(
} }
# Store booster handle # Store booster handle
handle <- lgb.call("LGBM_BoosterCreate_R", ret = handle, train_set$.__enclos_env__$private$get_handle(), params_str) handle <- lgb.call(
"LGBM_BoosterCreate_R"
, ret = handle
, train_set$.__enclos_env__$private$get_handle()
, params_str
)
# Create private booster information # Create private booster information
private$train_set <- train_set private$train_set <- train_set
...@@ -57,10 +62,12 @@ Booster <- R6::R6Class( ...@@ -57,10 +62,12 @@ Booster <- R6::R6Class(
if (!is.null(private$init_predictor)) { if (!is.null(private$init_predictor)) {
# Merge booster # Merge booster
lgb.call("LGBM_BoosterMerge_R", lgb.call(
ret = NULL, "LGBM_BoosterMerge_R"
handle, , ret = NULL
private$init_predictor$.__enclos_env__$private$handle) , handle
, private$init_predictor$.__enclos_env__$private$handle
)
} }
...@@ -75,9 +82,11 @@ Booster <- R6::R6Class( ...@@ -75,9 +82,11 @@ Booster <- R6::R6Class(
} }
# Create booster from model # Create booster from model
handle <- lgb.call("LGBM_BoosterCreateFromModelfile_R", handle <- lgb.call(
ret = handle, "LGBM_BoosterCreateFromModelfile_R"
lgb.c_str(modelfile)) , ret = handle
, lgb.c_str(modelfile)
)
} else if (!is.null(model_str)) { } else if (!is.null(model_str)) {
...@@ -87,14 +96,19 @@ Booster <- R6::R6Class( ...@@ -87,14 +96,19 @@ Booster <- R6::R6Class(
} }
# Create booster from model # Create booster from model
handle <- lgb.call("LGBM_BoosterLoadModelFromString_R", handle <- lgb.call(
ret = handle, "LGBM_BoosterLoadModelFromString_R"
lgb.c_str(model_str)) , ret = handle
, lgb.c_str(model_str)
)
} else { } else {
# Booster non existent # Booster non existent
stop("lgb.Booster: Need at least either training dataset, model file, or model_str to create booster instance") stop(
"lgb.Booster: Need at least either training dataset, "
, "model file, or model_str to create booster instance"
)
} }
...@@ -111,9 +125,11 @@ Booster <- R6::R6Class( ...@@ -111,9 +125,11 @@ Booster <- R6::R6Class(
class(handle) <- "lgb.Booster.handle" class(handle) <- "lgb.Booster.handle"
private$handle <- handle private$handle <- handle
private$num_class <- 1L private$num_class <- 1L
private$num_class <- lgb.call("LGBM_BoosterGetNumClasses_R", private$num_class <- lgb.call(
ret = private$num_class, "LGBM_BoosterGetNumClasses_R"
private$handle) , ret = private$num_class
, private$handle
)
} }
...@@ -138,7 +154,10 @@ Booster <- R6::R6Class( ...@@ -138,7 +154,10 @@ Booster <- R6::R6Class(
# Check if predictors are identical # Check if predictors are identical
if (!identical(data$.__enclos_env__$private$predictor, private$init_predictor)) { if (!identical(data$.__enclos_env__$private$predictor, private$init_predictor)) {
stop("lgb.Booster.add_valid: Failed to add validation data; you should use the same predictor for these data") stop(
"lgb.Booster.add_valid: Failed to add validation data; "
, "you should use the same predictor for these data"
)
} }
# Check if names are character # Check if names are character
...@@ -147,10 +166,12 @@ Booster <- R6::R6Class( ...@@ -147,10 +166,12 @@ Booster <- R6::R6Class(
} }
# Add validation data to booster # Add validation data to booster
lgb.call("LGBM_BoosterAddValidData_R", lgb.call(
ret = NULL, "LGBM_BoosterAddValidData_R"
private$handle, , ret = NULL
data$.__enclos_env__$private$get_handle()) , private$handle
, data$.__enclos_env__$private$get_handle()
)
# Store private information # Store private information
private$valid_sets <- c(private$valid_sets, data) private$valid_sets <- c(private$valid_sets, data)
...@@ -171,10 +192,12 @@ Booster <- R6::R6Class( ...@@ -171,10 +192,12 @@ Booster <- R6::R6Class(
params_str <- lgb.params2str(params) params_str <- lgb.params2str(params)
# Reset parameters # Reset parameters
lgb.call("LGBM_BoosterResetParameter_R", lgb.call(
ret = NULL, "LGBM_BoosterResetParameter_R"
private$handle, , ret = NULL
params_str) , private$handle
, params_str
)
# Return self # Return self
return(invisible(self)) return(invisible(self))
...@@ -198,10 +221,12 @@ Booster <- R6::R6Class( ...@@ -198,10 +221,12 @@ Booster <- R6::R6Class(
} }
# Reset training data on booster # Reset training data on booster
lgb.call("LGBM_BoosterResetTrainingData_R", lgb.call(
ret = NULL, "LGBM_BoosterResetTrainingData_R"
private$handle, , ret = NULL
train_set$.__enclos_env__$private$get_handle()) , private$handle
, train_set$.__enclos_env__$private$get_handle()
)
# Store private train set # Store private train set
private$train_set = train_set private$train_set = train_set
...@@ -230,18 +255,20 @@ Booster <- R6::R6Class( ...@@ -230,18 +255,20 @@ Booster <- R6::R6Class(
gpair <- fobj(private$inner_predict(1), private$train_set) gpair <- fobj(private$inner_predict(1), private$train_set)
# Check for gradient and hessian as list # Check for gradient and hessian as list
if(is.null(gpair$grad) || is.null(gpair$hess)){ if (is.null(gpair$grad) || is.null(gpair$hess)){
stop("lgb.Booster.update: custom objective should stop("lgb.Booster.update: custom objective should
return a list with attributes (hess, grad)") return a list with attributes (hess, grad)")
} }
# Return custom boosting gradient/hessian # Return custom boosting gradient/hessian
ret <- lgb.call("LGBM_BoosterUpdateOneIterCustom_R", ret <- lgb.call(
ret = NULL, "LGBM_BoosterUpdateOneIterCustom_R"
private$handle, , ret = NULL
gpair$grad, , private$handle
gpair$hess, , gpair$grad
length(gpair$grad)) , gpair$hess
, length(gpair$grad)
)
} }
...@@ -258,9 +285,11 @@ Booster <- R6::R6Class( ...@@ -258,9 +285,11 @@ Booster <- R6::R6Class(
rollback_one_iter = function() { rollback_one_iter = function() {
# Return one iteration behind # Return one iteration behind
lgb.call("LGBM_BoosterRollbackOneIter_R", lgb.call(
ret = NULL, "LGBM_BoosterRollbackOneIter_R"
private$handle) , ret = NULL
, private$handle
)
# Loop through each iteration # Loop through each iteration
for (i in seq_along(private$is_predicted_cur_iter)) { for (i in seq_along(private$is_predicted_cur_iter)) {
...@@ -276,9 +305,11 @@ Booster <- R6::R6Class( ...@@ -276,9 +305,11 @@ Booster <- R6::R6Class(
current_iter = function() { current_iter = function() {
cur_iter <- 0L cur_iter <- 0L
lgb.call("LGBM_BoosterGetCurrentIteration_R", lgb.call(
ret = cur_iter, "LGBM_BoosterGetCurrentIteration_R"
private$handle) , ret = cur_iter
, private$handle
)
}, },
...@@ -349,7 +380,10 @@ Booster <- R6::R6Class( ...@@ -349,7 +380,10 @@ Booster <- R6::R6Class(
# Loop through each validation set # Loop through each validation set
for (i in seq_along(private$valid_sets)) { for (i in seq_along(private$valid_sets)) {
ret <- append(ret, private$inner_eval(private$name_valid_sets[[i]], i + 1, feval)) ret <- append(
x = ret
, values = private$inner_eval(private$name_valid_sets[[i]], i + 1, feval)
)
} }
# Return ret # Return ret
...@@ -366,11 +400,13 @@ Booster <- R6::R6Class( ...@@ -366,11 +400,13 @@ Booster <- R6::R6Class(
} }
# Save booster model # Save booster model
lgb.call("LGBM_BoosterSaveModel_R", lgb.call(
ret = NULL, "LGBM_BoosterSaveModel_R"
private$handle, , ret = NULL
as.integer(num_iteration), , private$handle
lgb.c_str(filename)) , as.integer(num_iteration)
, lgb.c_str(filename)
)
# Return self # Return self
return(invisible(self)) return(invisible(self))
...@@ -385,9 +421,11 @@ Booster <- R6::R6Class( ...@@ -385,9 +421,11 @@ Booster <- R6::R6Class(
} }
# Return model string # Return model string
return(lgb.call.return.str("LGBM_BoosterSaveModelToString_R", return(lgb.call.return.str(
private$handle, "LGBM_BoosterSaveModelToString_R"
as.integer(num_iteration))) , private$handle
, as.integer(num_iteration)
))
}, },
...@@ -400,9 +438,11 @@ Booster <- R6::R6Class( ...@@ -400,9 +438,11 @@ Booster <- R6::R6Class(
} }
# Return dumped model # Return dumped model
lgb.call.return.str("LGBM_BoosterDumpModel_R", lgb.call.return.str(
private$handle, "LGBM_BoosterDumpModel_R"
as.integer(num_iteration)) , private$handle
, as.integer(num_iteration)
)
}, },
...@@ -478,10 +518,12 @@ Booster <- R6::R6Class( ...@@ -478,10 +518,12 @@ Booster <- R6::R6Class(
# Store predictions # Store predictions
npred <- 0L npred <- 0L
npred <- lgb.call("LGBM_BoosterGetNumPredict_R", npred <- lgb.call(
ret = npred, "LGBM_BoosterGetNumPredict_R"
private$handle, , ret = npred
as.integer(idx - 1)) , private$handle
, as.integer(idx - 1)
)
private$predict_buffer[[data_name]] <- numeric(npred) private$predict_buffer[[data_name]] <- numeric(npred)
} }
...@@ -490,10 +532,12 @@ Booster <- R6::R6Class( ...@@ -490,10 +532,12 @@ Booster <- R6::R6Class(
if (!private$is_predicted_cur_iter[[idx]]) { if (!private$is_predicted_cur_iter[[idx]]) {
# Use buffer # Use buffer
private$predict_buffer[[data_name]] <- lgb.call("LGBM_BoosterGetPredict_R", private$predict_buffer[[data_name]] <- lgb.call(
ret = private$predict_buffer[[data_name]], "LGBM_BoosterGetPredict_R"
private$handle, , ret = private$predict_buffer[[data_name]]
as.integer(idx - 1)) , private$handle
, as.integer(idx - 1)
)
private$is_predicted_cur_iter[[idx]] <- TRUE private$is_predicted_cur_iter[[idx]] <- TRUE
} }
...@@ -508,8 +552,10 @@ Booster <- R6::R6Class( ...@@ -508,8 +552,10 @@ Booster <- R6::R6Class(
if (is.null(private$eval_names)) { if (is.null(private$eval_names)) {
# Get evaluation names # Get evaluation names
names <- lgb.call.return.str("LGBM_BoosterGetEvalNames_R", names <- lgb.call.return.str(
private$handle) "LGBM_BoosterGetEvalNames_R"
, private$handle
)
# Check names' length # Check names' length
if (nchar(names) > 0) { if (nchar(names) > 0) {
...@@ -547,10 +593,12 @@ Booster <- R6::R6Class( ...@@ -547,10 +593,12 @@ Booster <- R6::R6Class(
# Create evaluation values # Create evaluation values
tmp_vals <- numeric(length(private$eval_names)) tmp_vals <- numeric(length(private$eval_names))
tmp_vals <- lgb.call("LGBM_BoosterGetEval_R", tmp_vals <- lgb.call(
ret = tmp_vals, "LGBM_BoosterGetEval_R"
private$handle, , ret = tmp_vals
as.integer(data_idx - 1)) , private$handle
, as.integer(data_idx - 1)
)
# Loop through all evaluation names # Loop through all evaluation names
for (i in seq_along(private$eval_names)) { for (i in seq_along(private$eval_names)) {
...@@ -587,7 +635,7 @@ Booster <- R6::R6Class( ...@@ -587,7 +635,7 @@ Booster <- R6::R6Class(
res <- feval(private$inner_predict(data_idx), data) res <- feval(private$inner_predict(data_idx), data)
# Check for name correctness # Check for name correctness
if(is.null(res$name) || is.null(res$value) || is.null(res$higher_better)) { if (is.null(res$name) || is.null(res$value) || is.null(res$higher_better)) {
stop("lgb.Booster.eval: custom eval function should return a stop("lgb.Booster.eval: custom eval function should return a
list with attribute (name, value, higher_better)"); list with attribute (name, value, higher_better)");
} }
...@@ -614,13 +662,13 @@ Booster <- R6::R6Class( ...@@ -614,13 +662,13 @@ Booster <- R6::R6Class(
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename #' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
#' @param num_iteration number of iteration want to predict with, NULL or <= 0 means use best iteration #' @param num_iteration number of iteration want to predict with, NULL or <= 0 means use best iteration
#' @param rawscore whether the prediction should be returned in the for of original untransformed #' @param rawscore whether the prediction should be returned in the for of original untransformed
#' sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE} for #' sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE}
#' logistic regression would result in predictions for log-odds instead of probabilities. #' for logistic regression would result in predictions for log-odds instead of probabilities.
#' @param predleaf whether predict leaf index instead. #' @param predleaf whether predict leaf index instead.
#' @param predcontrib return per-feature contributions for each record. #' @param predcontrib return per-feature contributions for each record.
#' @param header only used for prediction for text file. True if text file has header #' @param header only used for prediction for text file. True if text file has header
#' @param reshape whether to reshape the vector of predictions to a matrix form when there are several #' @param reshape whether to reshape the vector of predictions to a matrix form when there are several
#' prediction outputs per case. #' prediction outputs per case.
#' @param ... Additional named arguments passed to the \code{predict()} method of #' @param ... Additional named arguments passed to the \code{predict()} method of
#' the \code{lgb.Booster} object passed to \code{object}. #' the \code{lgb.Booster} object passed to \code{object}.
#' @return #' @return
...@@ -642,13 +690,15 @@ Booster <- R6::R6Class( ...@@ -642,13 +690,15 @@ Booster <- R6::R6Class(
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2") #' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest) #' valids <- list(test = dtest)
#' model <- lgb.train(params, #' model <- lgb.train(
#' dtrain, #' params = params
#' 10, #' , data = dtrain
#' valids, #' , nrounds = 10
#' min_data = 1, #' , valids = valids
#' learning_rate = 1, #' , min_data = 1
#' early_stopping_rounds = 5) #' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' preds <- predict(model, test$data) #' preds <- predict(model, test$data)
#' #'
#' @rdname predict.lgb.Booster #' @rdname predict.lgb.Booster
...@@ -669,13 +719,16 @@ predict.lgb.Booster <- function(object, ...@@ -669,13 +719,16 @@ predict.lgb.Booster <- function(object,
} }
# Return booster predictions # Return booster predictions
object$predict(data, object$predict(
num_iteration, data
rawscore, , num_iteration
predleaf, , rawscore
predcontrib, , predleaf
header, , predcontrib
reshape, ...) , header
, reshape
, ...
)
} }
#' Load LightGBM model #' Load LightGBM model
...@@ -699,13 +752,15 @@ predict.lgb.Booster <- function(object, ...@@ -699,13 +752,15 @@ predict.lgb.Booster <- function(object,
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2") #' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest) #' valids <- list(test = dtest)
#' model <- lgb.train(params, #' model <- lgb.train(
#' dtrain, #' params = params
#' 10, #' , data = dtrain
#' valids, #' , nrounds = 10
#' min_data = 1, #' , valids = valids
#' learning_rate = 1, #' , min_data = 1
#' early_stopping_rounds = 5) #' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' lgb.save(model, "model.txt") #' lgb.save(model, "model.txt")
#' load_booster <- lgb.load(filename = "model.txt") #' load_booster <- lgb.load(filename = "model.txt")
#' model_string <- model$save_model_to_string(NULL) # saves best iteration #' model_string <- model$save_model_to_string(NULL) # saves best iteration
...@@ -757,13 +812,15 @@ lgb.load <- function(filename = NULL, model_str = NULL){ ...@@ -757,13 +812,15 @@ lgb.load <- function(filename = NULL, model_str = NULL){
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2") #' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest) #' valids <- list(test = dtest)
#' model <- lgb.train(params, #' model <- lgb.train(
#' dtrain, #' params = params
#' 10, #' , data = dtrain
#' valids, #' , nrounds = 10
#' min_data = 1, #' , valids = valids
#' learning_rate = 1, #' , min_data = 1
#' early_stopping_rounds = 5) #' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' lgb.save(model, "model.txt") #' lgb.save(model, "model.txt")
#' #'
#' @rdname lgb.save #' @rdname lgb.save
...@@ -804,13 +861,15 @@ lgb.save <- function(booster, filename, num_iteration = NULL){ ...@@ -804,13 +861,15 @@ lgb.save <- function(booster, filename, num_iteration = NULL){
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2") #' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest) #' valids <- list(test = dtest)
#' model <- lgb.train(params, #' model <- lgb.train(
#' dtrain, #' params = params
#' 10, #' , data = dtrain
#' valids, #' , nrounds = 10
#' min_data = 1, #' , valids = valids
#' learning_rate = 1, #' , min_data = 1
#' early_stopping_rounds = 5) #' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' json_model <- lgb.dump(model) #' json_model <- lgb.dump(model)
#' #'
#' @rdname lgb.dump #' @rdname lgb.dump
...@@ -848,13 +907,15 @@ lgb.dump <- function(booster, num_iteration = NULL){ ...@@ -848,13 +907,15 @@ lgb.dump <- function(booster, num_iteration = NULL){
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2") #' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest) #' valids <- list(test = dtest)
#' model <- lgb.train(params, #' model <- lgb.train(
#' dtrain, #' params = params
#' 10, #' , data = dtrain
#' valids, #' , nrounds = 10
#' min_data = 1, #' , valids = valids
#' learning_rate = 1, #' , min_data = 1
#' early_stopping_rounds = 5) #' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' lgb.get.eval.result(model, "test", "l2") #' lgb.get.eval.result(model, "test", "l2")
#' @rdname lgb.get.eval.result #' @rdname lgb.get.eval.result
#' @export #' @export
......
...@@ -97,16 +97,18 @@ Dataset <- R6::R6Class( ...@@ -97,16 +97,18 @@ Dataset <- R6::R6Class(
...) { ...) {
# Create new dataset # Create new dataset
ret <- Dataset$new(data, ret <- Dataset$new(
private$params, data = data
self, , params = private$params
private$colnames, , reference = self
private$categorical_feature, , colnames = private$colnames
private$predictor, , categorical_feature = private$categorical_feature
private$free_raw_data, , predictor = private$predictor
NULL, , free_raw_data = private$free_raw_data
info, , used_indices = NULL
...) , info = info
, ...
)
# Return ret # Return ret
return(invisible(ret)) return(invisible(ret))
...@@ -142,14 +144,23 @@ Dataset <- R6::R6Class( ...@@ -142,14 +144,23 @@ Dataset <- R6::R6Class(
# Provided indices, but some indices are not existing? # Provided indices, but some indices are not existing?
if (sum(is.na(cate_indices)) > 0) { if (sum(is.na(cate_indices)) > 0) {
stop("lgb.self.get.handle: supplied an unknown feature in categorical_feature: ", sQuote(private$categorical_feature[is.na(cate_indices)])) stop(
"lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
, sQuote(private$categorical_feature[is.na(cate_indices)])
)
} }
} else { } else {
# Check if more categorical features were output over the feature space # Check if more categorical features were output over the feature space
if (max(private$categorical_feature) > length(private$colnames)) { if (max(private$categorical_feature) > length(private$colnames)) {
stop("lgb.self.get.handle: supplied a too large value in categorical_feature: ", max(private$categorical_feature), " but only ", length(private$colnames), " features") stop(
"lgb.self.get.handle: supplied a too large value in categorical_feature: "
, max(private$categorical_feature)
, " but only "
, length(private$colnames)
, " features"
)
} }
# Store indices as [0, n-1] indexed instead of [1, n] indexed # Store indices as [0, n-1] indexed instead of [1, n] indexed
...@@ -165,7 +176,9 @@ Dataset <- R6::R6Class( ...@@ -165,7 +176,9 @@ Dataset <- R6::R6Class(
# Check has header or not # Check has header or not
has_header <- FALSE has_header <- FALSE
if (!is.null(private$params$has_header) || !is.null(private$params$header)) { if (!is.null(private$params$has_header) || !is.null(private$params$header)) {
if (tolower(as.character(private$params$has_header)) == "true" || tolower(as.character(private$params$header)) == "true") { params_has_header <- tolower(as.character(private$params$has_header)) == "true"
params_header <- tolower(as.character(private$params$header)) == "true"
if (params_has_header || params_header) {
has_header <- TRUE has_header <- TRUE
} }
} }
...@@ -186,43 +199,52 @@ Dataset <- R6::R6Class( ...@@ -186,43 +199,52 @@ Dataset <- R6::R6Class(
# Are we using a data file? # Are we using a data file?
if (is.character(private$raw_data)) { if (is.character(private$raw_data)) {
handle <- lgb.call("LGBM_DatasetCreateFromFile_R", handle <- lgb.call(
ret = handle, "LGBM_DatasetCreateFromFile_R"
lgb.c_str(private$raw_data), , ret = handle
params_str, , lgb.c_str(private$raw_data)
ref_handle) , params_str
, ref_handle
)
} else if (is.matrix(private$raw_data)) { } else if (is.matrix(private$raw_data)) {
# Are we using a matrix? # Are we using a matrix?
handle <- lgb.call("LGBM_DatasetCreateFromMat_R", handle <- lgb.call(
ret = handle, "LGBM_DatasetCreateFromMat_R"
private$raw_data, , ret = handle
nrow(private$raw_data), , private$raw_data
ncol(private$raw_data), , nrow(private$raw_data)
params_str, , ncol(private$raw_data)
ref_handle) , params_str
, ref_handle
)
} else if (methods::is(private$raw_data, "dgCMatrix")) { } else if (methods::is(private$raw_data, "dgCMatrix")) {
if (length(private$raw_data@p) > 2147483647) { if (length(private$raw_data@p) > 2147483647) {
stop("Cannot support large CSC matrix") stop("Cannot support large CSC matrix")
} }
# Are we using a dgCMatrix (sparsed matrix column compressed) # Are we using a dgCMatrix (sparsed matrix column compressed)
handle <- lgb.call("LGBM_DatasetCreateFromCSC_R", handle <- lgb.call(
ret = handle, "LGBM_DatasetCreateFromCSC_R"
private$raw_data@p, , ret = handle
private$raw_data@i, , private$raw_data@p
private$raw_data@x, , private$raw_data@i
length(private$raw_data@p), , private$raw_data@x
length(private$raw_data@x), , length(private$raw_data@p)
nrow(private$raw_data), , length(private$raw_data@x)
params_str, , nrow(private$raw_data)
ref_handle) , params_str
, ref_handle
)
} else { } else {
# Unknown data type # Unknown data type
stop("lgb.Dataset.construct: does not support constructing from ", sQuote(class(private$raw_data))) stop(
"lgb.Dataset.construct: does not support constructing from "
, sQuote(class(private$raw_data))
)
} }
...@@ -234,12 +256,14 @@ Dataset <- R6::R6Class( ...@@ -234,12 +256,14 @@ Dataset <- R6::R6Class(
} }
# Construct subset # Construct subset
handle <- lgb.call("LGBM_DatasetGetSubset_R", handle <- lgb.call(
ret = handle, "LGBM_DatasetGetSubset_R"
ref_handle, , ret = handle
c(private$used_indices), # Adding c() fixes issue in R v3.5 , ref_handle
length(private$used_indices), , c(private$used_indices) # Adding c() fixes issue in R v3.5
params_str) , length(private$used_indices)
, params_str
)
} }
if (lgb.is.null.handle(handle)) { if (lgb.is.null.handle(handle)) {
...@@ -258,7 +282,11 @@ Dataset <- R6::R6Class( ...@@ -258,7 +282,11 @@ Dataset <- R6::R6Class(
if (!is.null(private$predictor) && is.null(private$used_indices)) { if (!is.null(private$predictor) && is.null(private$used_indices)) {
# Setup initial scores # Setup initial scores
init_score <- private$predictor$predict(private$raw_data, rawscore = TRUE, reshape = TRUE) init_score <- private$predictor$predict(
private$raw_data
, rawscore = TRUE
, reshape = TRUE
)
# Not needed to transpose, for is col_marjor # Not needed to transpose, for is col_marjor
init_score <- as.vector(init_score) init_score <- as.vector(init_score)
...@@ -316,7 +344,10 @@ Dataset <- R6::R6Class( ...@@ -316,7 +344,10 @@ Dataset <- R6::R6Class(
} else { } else {
# Trying to work with unknown dimensions is not possible # Trying to work with unknown dimensions is not possible
stop("dim: cannot get dimensions before dataset has been constructed, please call lgb.Dataset.construct explicitly") stop(
"dim: cannot get dimensions before dataset has been constructed, "
, "please call lgb.Dataset.construct explicitly"
)
} }
...@@ -341,7 +372,10 @@ Dataset <- R6::R6Class( ...@@ -341,7 +372,10 @@ Dataset <- R6::R6Class(
} else { } else {
# Trying to work with unknown dimensions is not possible # Trying to work with unknown dimensions is not possible
stop("dim: cannot get dimensions before dataset has been constructed, please call lgb.Dataset.construct explicitly") stop(
"dim: cannot get dimensions before dataset has been constructed, please call "
, "lgb.Dataset.construct explicitly"
)
} }
...@@ -367,10 +401,12 @@ Dataset <- R6::R6Class( ...@@ -367,10 +401,12 @@ Dataset <- R6::R6Class(
# Merge names with tab separation # Merge names with tab separation
merged_name <- paste0(as.list(private$colnames), collapse = "\t") merged_name <- paste0(as.list(private$colnames), collapse = "\t")
lgb.call("LGBM_DatasetSetFeatureNames_R", lgb.call(
ret = NULL, "LGBM_DatasetSetFeatureNames_R"
private$handle, , ret = NULL
lgb.c_str(merged_name)) , private$handle
, lgb.c_str(merged_name)
)
} }
...@@ -399,10 +435,12 @@ Dataset <- R6::R6Class( ...@@ -399,10 +435,12 @@ Dataset <- R6::R6Class(
# Get field size of info # Get field size of info
info_len <- 0L info_len <- 0L
info_len <- lgb.call("LGBM_DatasetGetFieldSize_R", info_len <- lgb.call(
ret = info_len, "LGBM_DatasetGetFieldSize_R"
private$handle, , ret = info_len
lgb.c_str(name)) , private$handle
, lgb.c_str(name)
)
# Check if info is not empty # Check if info is not empty
if (info_len > 0) { if (info_len > 0) {
...@@ -415,10 +453,12 @@ Dataset <- R6::R6Class( ...@@ -415,10 +453,12 @@ Dataset <- R6::R6Class(
numeric(info_len) # Numeric numeric(info_len) # Numeric
} }
ret <- lgb.call("LGBM_DatasetGetField_R", ret <- lgb.call(
ret = ret, "LGBM_DatasetGetField_R"
private$handle, , ret = ret
lgb.c_str(name)) , private$handle
, lgb.c_str(name)
)
private$info[[name]] <- ret private$info[[name]] <- ret
...@@ -454,12 +494,14 @@ Dataset <- R6::R6Class( ...@@ -454,12 +494,14 @@ Dataset <- R6::R6Class(
if (length(info) > 0) { if (length(info) > 0) {
lgb.call("LGBM_DatasetSetField_R", lgb.call(
ret = NULL, "LGBM_DatasetSetField_R"
private$handle, , ret = NULL
lgb.c_str(name), , private$handle
info, , lgb.c_str(name)
length(info)) , info
, length(info)
)
} }
...@@ -474,16 +516,18 @@ Dataset <- R6::R6Class( ...@@ -474,16 +516,18 @@ Dataset <- R6::R6Class(
slice = function(idxset, ...) { slice = function(idxset, ...) {
# Perform slicing # Perform slicing
Dataset$new(NULL, Dataset$new(
private$params, data = NULL
self, , params = private$params
private$colnames, , reference = self
private$categorical_feature, , colnames = private$colnames
private$predictor, , categorical_feature = private$categorical_feature
private$free_raw_data, , predictor = private$predictor
sort(idxset, decreasing = FALSE), , free_raw_data = private$free_raw_data
NULL, , used_indices = sort(idxset, decreasing = FALSE)
...) , info = NULL
, ...
)
}, },
...@@ -492,7 +536,12 @@ Dataset <- R6::R6Class( ...@@ -492,7 +536,12 @@ Dataset <- R6::R6Class(
# Parameter updating # Parameter updating
if (!lgb.is.null.handle(private$handle)) { if (!lgb.is.null.handle(private$handle)) {
lgb.call("LGBM_DatasetUpdateParam_R", ret = NULL, private$handle, lgb.params2str(params)) lgb.call(
"LGBM_DatasetUpdateParam_R"
, ret = NULL
, private$handle
, lgb.params2str(params)
)
return(invisible(self)) return(invisible(self))
} }
private$params <- modifyList(private$params, params) private$params <- modifyList(private$params, params)
...@@ -568,10 +617,12 @@ Dataset <- R6::R6Class( ...@@ -568,10 +617,12 @@ Dataset <- R6::R6Class(
# Store binary data # Store binary data
self$construct() self$construct()
lgb.call("LGBM_DatasetSaveBinary_R", lgb.call(
ret = NULL, "LGBM_DatasetSaveBinary_R"
private$handle, , ret = NULL
lgb.c_str(fname)) , private$handle
, lgb.c_str(fname)
)
return(invisible(self)) return(invisible(self))
} }
...@@ -671,16 +722,18 @@ lgb.Dataset <- function(data, ...@@ -671,16 +722,18 @@ lgb.Dataset <- function(data,
...) { ...) {
# Create new dataset # Create new dataset
invisible(Dataset$new(data, invisible(Dataset$new(
params, data = data
reference, , params = params
colnames, , reference = reference
categorical_feature, , colnames = colnames
NULL, , categorical_feature = categorical_feature
free_raw_data, , predictor = NULL
NULL, , free_raw_data = free_raw_data
info, , used_indices = NULL
...)) , info = info
, ...
))
} }
...@@ -784,7 +837,7 @@ dim.lgb.Dataset <- function(x, ...) { ...@@ -784,7 +837,7 @@ dim.lgb.Dataset <- function(x, ...) {
#' #'
#' @param x object of class \code{lgb.Dataset} #' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one is ignored #' @param value a list of two elements: the first one is ignored
#' and the second one is column names #' and the second one is column names
#' #'
#' @details #' @details
#' Generic \code{dimnames} methods are used by \code{colnames}. #' Generic \code{dimnames} methods are used by \code{colnames}.
...@@ -840,7 +893,13 @@ dimnames.lgb.Dataset <- function(x) { ...@@ -840,7 +893,13 @@ dimnames.lgb.Dataset <- function(x) {
# Check for unmatching column size # Check for unmatching column size
if (ncol(x) != length(value[[2]])) { if (ncol(x) != length(value[[2]])) {
stop("can't assign ", sQuote(length(value[[2]])), " colnames to an lgb.Dataset with ", sQuote(ncol(x)), " columns") stop(
"can't assign "
, sQuote(length(value[[2]]))
, " colnames to an lgb.Dataset with "
, sQuote(ncol(x))
, " columns"
)
} }
# Set column names properly, and return # Set column names properly, and return
......
...@@ -13,7 +13,11 @@ Predictor <- R6::R6Class( ...@@ -13,7 +13,11 @@ Predictor <- R6::R6Class(
if (private$need_free_handle && !lgb.is.null.handle(private$handle)) { if (private$need_free_handle && !lgb.is.null.handle(private$handle)) {
# Freeing up handle # Freeing up handle
lgb.call("LGBM_BoosterFree_R", ret = NULL, private$handle) lgb.call(
"LGBM_BoosterFree_R"
, ret = NULL
, private$handle
)
private$handle <- NULL private$handle <- NULL
} }
...@@ -31,7 +35,11 @@ Predictor <- R6::R6Class( ...@@ -31,7 +35,11 @@ Predictor <- R6::R6Class(
if (is.character(modelfile)) { if (is.character(modelfile)) {
# Create handle on it # Create handle on it
handle <- lgb.call("LGBM_BoosterCreateFromModelfile_R", ret = handle, lgb.c_str(modelfile)) handle <- lgb.call(
"LGBM_BoosterCreateFromModelfile_R"
, ret = handle
, lgb.c_str(modelfile)
)
private$need_free_handle <- TRUE private$need_free_handle <- TRUE
} else if (methods::is(modelfile, "lgb.Booster.handle")) { } else if (methods::is(modelfile, "lgb.Booster.handle")) {
...@@ -57,7 +65,11 @@ Predictor <- R6::R6Class( ...@@ -57,7 +65,11 @@ Predictor <- R6::R6Class(
current_iter = function() { current_iter = function() {
cur_iter <- 0L cur_iter <- 0L
lgb.call("LGBM_BoosterGetCurrentIteration_R", ret = cur_iter, private$handle) lgb.call(
"LGBM_BoosterGetCurrentIteration_R"
, ret = cur_iter
, private$handle
)
}, },
...@@ -86,14 +98,19 @@ Predictor <- R6::R6Class( ...@@ -86,14 +98,19 @@ Predictor <- R6::R6Class(
on.exit(unlink(tmp_filename), add = TRUE) on.exit(unlink(tmp_filename), add = TRUE)
# Predict from temporary file # Predict from temporary file
lgb.call("LGBM_BoosterPredictForFile_R", ret = NULL, private$handle, data, lgb.call(
as.integer(header), "LGBM_BoosterPredictForFile_R"
as.integer(rawscore), , ret = NULL
as.integer(predleaf), , private$handle
as.integer(predcontrib), , data
as.integer(num_iteration), , as.integer(header)
private$params, , as.integer(rawscore)
lgb.c_str(tmp_filename)) , as.integer(predleaf)
, as.integer(predcontrib)
, as.integer(num_iteration)
, private$params
, lgb.c_str(tmp_filename)
)
# Get predictions from file # Get predictions from file
preds <- read.delim(tmp_filename, header = FALSE, sep = "\t") preds <- read.delim(tmp_filename, header = FALSE, sep = "\t")
...@@ -108,51 +125,57 @@ Predictor <- R6::R6Class( ...@@ -108,51 +125,57 @@ Predictor <- R6::R6Class(
npred <- 0L npred <- 0L
# Check number of predictions to do # Check number of predictions to do
npred <- lgb.call("LGBM_BoosterCalcNumPredict_R", npred <- lgb.call(
ret = npred, "LGBM_BoosterCalcNumPredict_R"
private$handle, , ret = npred
as.integer(num_row), , private$handle
as.integer(rawscore), , as.integer(num_row)
as.integer(predleaf), , as.integer(rawscore)
as.integer(predcontrib), , as.integer(predleaf)
as.integer(num_iteration)) , as.integer(predcontrib)
, as.integer(num_iteration)
)
# Pre-allocate empty vector # Pre-allocate empty vector
preds <- numeric(npred) preds <- numeric(npred)
# Check if data is a matrix # Check if data is a matrix
if (is.matrix(data)) { if (is.matrix(data)) {
preds <- lgb.call("LGBM_BoosterPredictForMat_R", preds <- lgb.call(
ret = preds, "LGBM_BoosterPredictForMat_R"
private$handle, , ret = preds
data, , private$handle
as.integer(nrow(data)), , data
as.integer(ncol(data)), , as.integer(nrow(data))
as.integer(rawscore), , as.integer(ncol(data))
as.integer(predleaf), , as.integer(rawscore)
as.integer(predcontrib), , as.integer(predleaf)
as.integer(num_iteration), , as.integer(predcontrib)
private$params) , as.integer(num_iteration)
, private$params
)
} else if (methods::is(data, "dgCMatrix")) { } else if (methods::is(data, "dgCMatrix")) {
if (length(data@p) > 2147483647) { if (length(data@p) > 2147483647) {
stop("Cannot support large CSC matrix") stop("Cannot support large CSC matrix")
} }
# Check if data is a dgCMatrix (sparse matrix, column compressed format) # Check if data is a dgCMatrix (sparse matrix, column compressed format)
preds <- lgb.call("LGBM_BoosterPredictForCSC_R", preds <- lgb.call(
ret = preds, "LGBM_BoosterPredictForCSC_R"
private$handle, , ret = preds
data@p, , private$handle
data@i, , data@p
data@x, , data@i
length(data@p), , data@x
length(data@x), , length(data@p)
nrow(data), , length(data@x)
as.integer(rawscore), , nrow(data)
as.integer(predleaf), , as.integer(rawscore)
as.integer(predcontrib), , as.integer(predleaf)
as.integer(num_iteration), , as.integer(predcontrib)
private$params) , as.integer(num_iteration)
, private$params
)
} else { } else {
...@@ -165,7 +188,12 @@ Predictor <- R6::R6Class( ...@@ -165,7 +188,12 @@ Predictor <- R6::R6Class(
# Check if number of rows is strange (not a multiple of the dataset rows) # Check if number of rows is strange (not a multiple of the dataset rows)
if (length(preds) %% num_row != 0) { if (length(preds) %% num_row != 0) {
stop("predict: prediction length ", sQuote(length(preds))," is not a multiple of nrows(data): ", sQuote(num_row)) stop(
"predict: prediction length "
, sQuote(length(preds))
," is not a multiple of nrows(data): "
, sQuote(num_row)
)
} }
# Get number of cases per row # Get number of cases per row
...@@ -192,7 +220,9 @@ Predictor <- R6::R6Class( ...@@ -192,7 +220,9 @@ Predictor <- R6::R6Class(
} }
), ),
private = list(handle = NULL, private = list(
need_free_handle = FALSE, handle = NULL
params = "") , need_free_handle = FALSE
, params = ""
)
) )
...@@ -25,23 +25,23 @@ CVBooster <- R6::R6Class( ...@@ -25,23 +25,23 @@ CVBooster <- R6::R6Class(
#' @param label vector of response values. Should be provided only when data is an R-matrix. #' @param label vector of response values. Should be provided only when data is an R-matrix.
#' @param weight vector of response values. If not NULL, will set to dataset #' @param weight vector of response values. If not NULL, will set to dataset
#' @param obj objective function, can be character or custom objective function. Examples include #' @param obj objective function, can be character or custom objective function. Examples include
#' \code{regression}, \code{regression_l1}, \code{huber}, #' \code{regression}, \code{regression_l1}, \code{huber},
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass} #' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}
#' @param eval evaluation function, can be (list of) character or custom eval function #' @param eval evaluation function, can be (list of) character or custom eval function
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals} #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation #' @param showsd \code{boolean}, whether to show standard deviation of cross validation
#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified #' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified
#' by the values of outcome labels. #' by the values of outcome labels.
#' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds #' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
#' (each element must be a vector of test fold's indices). When folds are supplied, #' (each element must be a vector of test fold's indices). When folds are supplied,
#' the \code{nfold} and \code{stratified} parameters are ignored. #' the \code{nfold} and \code{stratified} parameters are ignored.
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset #' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature list of str or int #' @param categorical_feature list of str or int
#' type int represents index, #' type int represents index,
#' type str represents feature names #' type str represents feature names
#' @param callbacks list of callback functions #' @param callbacks List of callback functions that are applied at each iteration.
#' List of callback functions that are applied at each iteration. #' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets #' into a predictor model which frees up memory and the original datasets
#' @param ... other parameters, see Parameters.rst for more information. A few key parameters: #' @param ... other parameters, see Parameters.rst for more information. A few key parameters:
#' \itemize{ #' \itemize{
#' \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}} #' \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
...@@ -61,13 +61,15 @@ CVBooster <- R6::R6Class( ...@@ -61,13 +61,15 @@ CVBooster <- R6::R6Class(
#' train <- agaricus.train #' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label) #' dtrain <- lgb.Dataset(train$data, label = train$label)
#' params <- list(objective = "regression", metric = "l2") #' params <- list(objective = "regression", metric = "l2")
#' model <- lgb.cv(params, #' model <- lgb.cv(
#' dtrain, #' params = params
#' 10, #' , data = dtrain
#' nfold = 3, #' , nrounds = 10
#' min_data = 1, #' , nfold = 3
#' learning_rate = 1, #' , min_data = 1
#' early_stopping_rounds = 5) #' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' @export #' @export
lgb.cv <- function(params = list(), lgb.cv <- function(params = list(),
data, data,
...@@ -134,7 +136,17 @@ lgb.cv <- function(params = list(), ...@@ -134,7 +136,17 @@ lgb.cv <- function(params = list(),
begin_iteration <- predictor$current_iter() + 1 begin_iteration <- predictor$current_iter() + 1
} }
# Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one # Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one
n_trees <- c("num_iterations", "num_iteration", "n_iter", "num_tree", "num_trees", "num_round", "num_rounds", "num_boost_round", "n_estimators") n_trees <- c(
"num_iterations"
, "num_iteration"
, "n_iter"
, "num_tree"
, "num_trees"
, "num_round"
, "num_rounds"
, "num_boost_round"
, "n_estimators"
)
if (any(names(params) %in% n_trees)) { if (any(names(params) %in% n_trees)) {
end_iteration <- begin_iteration + params[[which(names(params) %in% n_trees)[1]]] - 1 end_iteration <- begin_iteration + params[[which(names(params) %in% n_trees)[1]]] - 1
} else { } else {
...@@ -192,12 +204,14 @@ lgb.cv <- function(params = list(), ...@@ -192,12 +204,14 @@ lgb.cv <- function(params = list(),
} }
# Create folds # Create folds
folds <- generate.cv.folds(nfold, folds <- generate.cv.folds(
nrow(data), nfold
stratified, , nrow(data)
getinfo(data, "label"), , stratified
getinfo(data, "group"), , getinfo(data, "label")
params) , getinfo(data, "group")
, params
)
} }
...@@ -215,12 +229,24 @@ lgb.cv <- function(params = list(), ...@@ -215,12 +229,24 @@ lgb.cv <- function(params = list(),
early_stop <- c("early_stopping_round", "early_stopping_rounds", "early_stopping", "n_iter_no_change") early_stop <- c("early_stopping_round", "early_stopping_rounds", "early_stopping", "n_iter_no_change")
if (any(names(params) %in% early_stop)) { if (any(names(params) %in% early_stop)) {
if (params[[which(names(params) %in% early_stop)[1]]] > 0) { if (params[[which(names(params) %in% early_stop)[1]]] > 0) {
callbacks <- add.cb(callbacks, cb.early.stop(params[[which(names(params) %in% early_stop)[1]]], verbose = verbose)) callbacks <- add.cb(
callbacks
, cb.early.stop(
params[[which(names(params) %in% early_stop)[1]]]
, verbose = verbose
)
)
} }
} else { } else {
if (!is.null(early_stopping_rounds)) { if (!is.null(early_stopping_rounds)) {
if (early_stopping_rounds > 0) { if (early_stopping_rounds > 0) {
callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, verbose = verbose)) callbacks <- add.cb(
callbacks
, cb.early.stop(
early_stopping_rounds
, verbose = verbose
)
)
} }
} }
} }
...@@ -292,7 +318,7 @@ lgb.cv <- function(params = list(), ...@@ -292,7 +318,7 @@ lgb.cv <- function(params = list(),
env$eval_list <- merged_msg$eval_list env$eval_list <- merged_msg$eval_list
# Check for standard deviation requirement # Check for standard deviation requirement
if(showsd) { if (showsd) {
env$eval_err_list <- merged_msg$eval_err_list env$eval_err_list <- merged_msg$eval_err_list
} }
...@@ -319,9 +345,11 @@ lgb.cv <- function(params = list(), ...@@ -319,9 +345,11 @@ lgb.cv <- function(params = list(),
if (reset_data) { if (reset_data) {
lapply(cv_booster$boosters, function(fd) { lapply(cv_booster$boosters, function(fd) {
# Store temporarily model data elsewhere # Store temporarily model data elsewhere
booster_old <- list(best_iter = fd$booster$best_iter, booster_old <- list(
best_score = fd$booster$best_score, best_iter = fd$booster$best_iter
record_evals = fd$booster$record_evals) , best_score = fd$booster$best_score,
, record_evals = fd$booster$record_evals
)
# Reload model # Reload model
fd$booster <- lgb.load(model_str = fd$booster$save_model_to_string()) fd$booster <- lgb.load(model_str = fd$booster$save_model_to_string())
fd$booster$best_iter <- booster_old$best_iter fd$booster$best_iter <- booster_old$best_iter
...@@ -384,8 +412,10 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) { ...@@ -384,8 +412,10 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) {
# Loop through each fold # Loop through each fold
for (i in seq_len(nfold)) { for (i in seq_len(nfold)) {
kstep <- length(rnd_idx) %/% (nfold - i + 1) kstep <- length(rnd_idx) %/% (nfold - i + 1)
folds[[i]] <- list(fold = which(ungrouped %in% rnd_idx[seq_len(kstep)]), folds[[i]] <- list(
group = rnd_idx[seq_len(kstep)]) fold = which(ungrouped %in% rnd_idx[seq_len(kstep)])
, group = rnd_idx[seq_len(kstep)]
)
rnd_idx <- rnd_idx[-seq_len(kstep)] rnd_idx <- rnd_idx[-seq_len(kstep)]
} }
...@@ -413,11 +443,17 @@ lgb.stratified.folds <- function(y, k = 10) { ...@@ -413,11 +443,17 @@ lgb.stratified.folds <- function(y, k = 10) {
if (is.numeric(y)) { if (is.numeric(y)) {
cuts <- length(y) %/% k cuts <- length(y) %/% k
if (cuts < 2) { cuts <- 2 } if (cuts < 2) {
if (cuts > 5) { cuts <- 5 } cuts <- 2
y <- cut(y, }
unique(stats::quantile(y, probs = seq.int(0, 1, length.out = cuts))), if (cuts > 5) {
include.lowest = TRUE) cuts <- 5
}
y <- cut(
y
, unique(stats::quantile(y, probs = seq.int(0, 1, length.out = cuts)))
, include.lowest = TRUE
)
} }
...@@ -499,8 +535,10 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) { ...@@ -499,8 +535,10 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) {
# Parse standard deviation # Parse standard deviation
for (j in seq_len(eval_len)) { for (j in seq_len(eval_len)) {
ret_eval_err <- c(ret_eval_err, ret_eval_err <- c(
sqrt(mean(eval_result[[j]] ^ 2) - mean(eval_result[[j]]) ^ 2)) ret_eval_err
, sqrt(mean(eval_result[[j]] ^ 2) - mean(eval_result[[j]]) ^ 2)
)
} }
# Convert to list # Convert to list
...@@ -509,7 +547,9 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) { ...@@ -509,7 +547,9 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) {
} }
# Return errors # Return errors
list(eval_list = ret_eval, list(
eval_err_list = ret_eval_err) eval_list = ret_eval
, eval_err_list = ret_eval_err
)
} }
...@@ -21,9 +21,14 @@ ...@@ -21,9 +21,14 @@
#' train <- agaricus.train #' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label) #' dtrain <- lgb.Dataset(train$data, label = train$label)
#' #'
#' params <- list(objective = "binary", #' params <- list(
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1, #' objective = "binary"
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) #' , learning_rate = 0.01
#' , num_leaves = 63
#' , max_depth = -1
#' , min_data_in_leaf = 1
#' , min_sum_hessian_in_leaf = 1
#' )
#' model <- lgb.train(params, dtrain, 10) #' model <- lgb.train(params, dtrain, 10)
#' #'
#' tree_imp1 <- lgb.importance(model, percentage = TRUE) #' tree_imp1 <- lgb.importance(model, percentage = TRUE)
...@@ -63,9 +68,11 @@ lgb.importance <- function(model, percentage = TRUE) { ...@@ -63,9 +68,11 @@ lgb.importance <- function(model, percentage = TRUE) {
# Check if relative values are requested # Check if relative values are requested
if (percentage) { if (percentage) {
tree_imp_dt[, ":="(Gain = Gain / sum(Gain), tree_imp_dt[, `:=`(
Cover = Cover / sum(Cover), Gain = Gain / sum(Gain)
Frequency = Frequency / sum(Frequency))] , Cover = Cover / sum(Cover)
, Frequency = Frequency / sum(Frequency)
)]
} }
# Return importance table # Return importance table
......
...@@ -69,12 +69,21 @@ lgb.interprete <- function(model, ...@@ -69,12 +69,21 @@ lgb.interprete <- function(model,
) )
# Get list of trees # Get list of trees
tree_index_mat_list <- lapply(leaf_index_mat_list, tree_index_mat_list <- lapply(
FUN = function(x) matrix(seq_len(length(x)) - 1, ncol = num_class, byrow = TRUE)) X = leaf_index_mat_list
, FUN = function(x){
matrix(seq_len(length(x)) - 1, ncol = num_class, byrow = TRUE)
}
)
# Sequence over idxset # Sequence over idxset
for (i in seq_along(idxset)) { for (i in seq_along(idxset)) {
tree_interpretation_dt_list[[i]] <- single.row.interprete(tree_dt, num_class, tree_index_mat_list[[i]], leaf_index_mat_list[[i]]) tree_interpretation_dt_list[[i]] <- single.row.interprete(
tree_dt
, num_class
, tree_index_mat_list[[i]]
, leaf_index_mat_list[[i]]
)
} }
# Return interpretation list # Return interpretation list
...@@ -122,7 +131,10 @@ single.tree.interprete <- function(tree_dt, ...@@ -122,7 +131,10 @@ single.tree.interprete <- function(tree_dt,
leaf_to_root(leaf_dt[["leaf_parent"]], leaf_dt[["leaf_value"]]) leaf_to_root(leaf_dt[["leaf_parent"]], leaf_dt[["leaf_value"]])
# Return formatted data.table # Return formatted data.table
data.table::data.table(Feature = feature_seq, Contribution = diff.default(value_seq)) data.table::data.table(
Feature = feature_seq
, Contribution = diff.default(value_seq)
)
} }
...@@ -198,16 +210,22 @@ single.row.interprete <- function(tree_dt, num_class, tree_index_mat, leaf_index ...@@ -198,16 +210,22 @@ single.row.interprete <- function(tree_dt, num_class, tree_index_mat, leaf_index
} else { } else {
# Full interpretation elements # Full interpretation elements
tree_interpretation_dt <- Reduce(f = function(x, y) merge(x, y, by = "Feature", all = TRUE), tree_interpretation_dt <- Reduce(
x = tree_interpretation) f = function(x, y){
merge(x, y, by = "Feature", all = TRUE)
}
, x = tree_interpretation
)
# Loop throughout each tree # Loop throughout each tree
for (j in 2:ncol(tree_interpretation_dt)) { for (j in 2:ncol(tree_interpretation_dt)) {
data.table::set(tree_interpretation_dt, data.table::set(
i = which(is.na(tree_interpretation_dt[[j]])), tree_interpretation_dt
j = j, , i = which(is.na(tree_interpretation_dt[[j]]))
value = 0) , j = j
, value = 0
)
} }
......
...@@ -35,9 +35,14 @@ ...@@ -35,9 +35,14 @@
#' train <- agaricus.train #' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label) #' dtrain <- lgb.Dataset(train$data, label = train$label)
#' #'
#' params <- list(objective = "binary", #' params <- list(
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1, #' objective = "binary"
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) #' , learning_rate = 0.01
#' , num_leaves = 63
#' , max_depth = -1
#' , min_data_in_leaf = 1
#' , min_sum_hessian_in_leaf = 1
#' )
#' model <- lgb.train(params, dtrain, 10) #' model <- lgb.train(params, dtrain, 10)
#' #'
#' tree_dt <- lgb.model.dt.tree(model) #' tree_dt <- lgb.model.dt.tree(model)
...@@ -51,11 +56,13 @@ lgb.model.dt.tree <- function(model, num_iteration = NULL) { ...@@ -51,11 +56,13 @@ lgb.model.dt.tree <- function(model, num_iteration = NULL) {
json_model <- lgb.dump(model, num_iteration = num_iteration) json_model <- lgb.dump(model, num_iteration = num_iteration)
# Parse json model second # Parse json model second
parsed_json_model <- jsonlite::fromJSON(json_model, parsed_json_model <- jsonlite::fromJSON(
simplifyVector = TRUE, json_model
simplifyDataFrame = FALSE, , simplifyVector = TRUE
simplifyMatrix = FALSE, , simplifyDataFrame = FALSE
flatten = FALSE) , simplifyMatrix = FALSE
, flatten = FALSE
)
# Parse tree model third # Parse tree model third
tree_list <- lapply(parsed_json_model$tree_info, single.tree.parse) tree_list <- lapply(parsed_json_model$tree_info, single.tree.parse)
...@@ -89,21 +96,23 @@ single.tree.parse <- function(lgb_tree) { ...@@ -89,21 +96,23 @@ single.tree.parse <- function(lgb_tree) {
if (is.null(env)) { if (is.null(env)) {
# Setup initial default data.table with default types # Setup initial default data.table with default types
env <- new.env(parent = emptyenv()) env <- new.env(parent = emptyenv())
env$single_tree_dt <- data.table::data.table(tree_index = integer(0), env$single_tree_dt <- data.table::data.table(
depth = integer(0), tree_index = integer(0)
split_index = integer(0), , depth = integer(0)
split_feature = integer(0), , split_index = integer(0)
node_parent = integer(0), , split_feature = integer(0)
leaf_index = integer(0), , node_parent = integer(0)
leaf_parent = integer(0), , leaf_index = integer(0)
split_gain = numeric(0), , leaf_parent = integer(0)
threshold = numeric(0), , split_gain = numeric(0)
decision_type = character(0), , threshold = numeric(0)
default_left = character(0), , decision_type = character(0)
internal_value = integer(0), , default_left = character(0)
internal_count = integer(0), , internal_value = integer(0)
leaf_value = integer(0), , internal_count = integer(0)
leaf_count = integer(0)) , leaf_value = integer(0)
, leaf_count = integer(0)
)
# start tree traversal # start tree traversal
pre_order_traversal(env, tree_node_leaf, current_depth, parent_index) pre_order_traversal(env, tree_node_leaf, current_depth, parent_index)
} else { } else {
...@@ -127,14 +136,18 @@ single.tree.parse <- function(lgb_tree) { ...@@ -127,14 +136,18 @@ single.tree.parse <- function(lgb_tree) {
fill = TRUE) fill = TRUE)
# Traverse tree again both left and right # Traverse tree again both left and right
pre_order_traversal(env, pre_order_traversal(
tree_node_leaf$left_child, env
current_depth = current_depth + 1L, , tree_node_leaf$left_child
parent_index = tree_node_leaf$split_index) , current_depth = current_depth + 1L
pre_order_traversal(env, , parent_index = tree_node_leaf$split_index
tree_node_leaf$right_child, )
current_depth = current_depth + 1L, pre_order_traversal(
parent_index = tree_node_leaf$split_index) env
, tree_node_leaf$right_child
, current_depth = current_depth + 1L
, parent_index = tree_node_leaf$split_index
)
} else if (!is.null(tree_node_leaf$leaf_index)) { } else if (!is.null(tree_node_leaf$leaf_index)) {
......
...@@ -43,7 +43,11 @@ lgb.plot.importance <- function(tree_imp, ...@@ -43,7 +43,11 @@ lgb.plot.importance <- function(tree_imp,
cex = NULL) { cex = NULL) {
# Check for measurement (column names) correctness # Check for measurement (column names) correctness
measure <- match.arg(measure, choices = c("Gain", "Cover", "Frequency"), several.ok = FALSE) measure <- match.arg(
measure
, choices = c("Gain", "Cover", "Frequency")
, several.ok = FALSE
)
# Get top N importance (defaults to 10) # Get top N importance (defaults to 10)
top_n <- min(top_n, nrow(tree_imp)) top_n <- min(top_n, nrow(tree_imp))
...@@ -72,14 +76,14 @@ lgb.plot.importance <- function(tree_imp, ...@@ -72,14 +76,14 @@ lgb.plot.importance <- function(tree_imp,
# Do plot # Do plot
tree_imp[.N:1, tree_imp[.N:1,
graphics::barplot( graphics::barplot(
height = get(measure), height = get(measure)
names.arg = Feature, , names.arg = Feature
horiz = TRUE, , horiz = TRUE
border = NA, , border = NA
main = "Feature Importance", , main = "Feature Importance"
xlab = measure, , xlab = measure
cex.names = cex, , cex.names = cex
las = 1 , las = 1
)] )]
# Return invisibly # Return invisibly
......
...@@ -9,8 +9,8 @@ ...@@ -9,8 +9,8 @@
#' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}. #' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}.
#' #'
#' @details #' @details
#' The graph represents each feature as a horizontal bar of length proportional to the defined contribution of a feature. #' The graph represents each feature as a horizontal bar of length proportional to the defined
#' Features are shown ranked in a decreasing contribution order. #' contribution of a feature. Features are shown ranked in a decreasing contribution order.
#' #'
#' @return #' @return
#' The \code{lgb.plot.interpretation} function creates a \code{barplot}. #' The \code{lgb.plot.interpretation} function creates a \code{barplot}.
...@@ -26,9 +26,14 @@ ...@@ -26,9 +26,14 @@
#' data(agaricus.test, package = "lightgbm") #' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test #' test <- agaricus.test
#' #'
#' params <- list(objective = "binary", #' params <- list(
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1, #' objective = "binary"
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1) #' , learning_rate = 0.01
#' , num_leaves = 63
#' , max_depth = -1
#' , min_data_in_leaf = 1
#' , min_sum_hessian_in_leaf = 1
#' )
#' model <- lgb.train(params, dtrain, 10) #' model <- lgb.train(params, dtrain, 10)
#' #'
#' tree_interpretation <- lgb.interprete(model, test$data, 1:5) #' tree_interpretation <- lgb.interprete(model, test$data, 1:5)
...@@ -67,16 +72,21 @@ lgb.plot.interpretation <- function(tree_interpretation_dt, ...@@ -67,16 +72,21 @@ lgb.plot.interpretation <- function(tree_interpretation_dt,
if (num_class == 1) { if (num_class == 1) {
# Only one class, plot straight away # Only one class, plot straight away
multiple.tree.plot.interpretation(tree_interpretation_dt, multiple.tree.plot.interpretation(
top_n = top_n, tree_interpretation_dt
title = NULL, , top_n = top_n
cex = cex) , title = NULL
, cex = cex
)
} else { } else {
# More than one class, shape data first # More than one class, shape data first
layout_mat <- matrix(seq.int(to = cols * ceiling(num_class / cols)), layout_mat <- matrix(
ncol = cols, nrow = ceiling(num_class / cols)) seq.int(to = cols * ceiling(num_class / cols))
, ncol = cols
, nrow = ceiling(num_class / cols)
)
# Shape output # Shape output
graphics::par(mfcol = c(nrow(layout_mat), ncol(layout_mat))) graphics::par(mfcol = c(nrow(layout_mat), ncol(layout_mat)))
...@@ -119,14 +129,14 @@ multiple.tree.plot.interpretation <- function(tree_interpretation, ...@@ -119,14 +129,14 @@ multiple.tree.plot.interpretation <- function(tree_interpretation,
# Do plot # Do plot
tree_interpretation[.N:1, tree_interpretation[.N:1,
graphics::barplot( graphics::barplot(
height = Contribution, height = Contribution
names.arg = Feature, , names.arg = Feature
horiz = TRUE, , horiz = TRUE
col = ifelse(Contribution > 0, "firebrick", "steelblue"), , col = ifelse(Contribution > 0, "firebrick", "steelblue")
border = NA, , border = NA
main = title, , main = title
cex.names = cex, , cex.names = cex
las = 1 , las = 1
)] )]
# Return invisibly # Return invisibly
......
#' Data preparator for LightGBM datasets (numeric) #' Data preparator for LightGBM datasets (numeric)
#' #'
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric without integers. Please use \code{lgb.prepare_rules} if you want to apply this transformation to other datasets. #' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
#' Factors and characters are converted to numeric without integers. Please use
#' \code{lgb.prepare_rules} if you want to apply this transformation to other datasets.
#' #'
#' @param data A data.frame or data.table to prepare. #' @param data A data.frame or data.table to prepare.
#' #'
#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}. #' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
#' for input in \code{lgb.Dataset}.
#' #'
#' @examples #' @examples
#' library(lightgbm) #' library(lightgbm)
...@@ -71,7 +74,11 @@ lgb.prepare <- function(data) { ...@@ -71,7 +74,11 @@ lgb.prepare <- function(data) {
} else { } else {
# What do you think you are doing here? Throw error. # What do you think you are doing here? Throw error.
stop("lgb.prepare2: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame") stop(
"lgb.prepare: you provided "
, paste(class(data), collapse = " & ")
, " but data should have class data.frame"
)
} }
......
#' Data preparator for LightGBM datasets (integer) #' Data preparator for LightGBM datasets (integer)
#' #'
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric (specifically: integer). Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM. #' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
#' Factors and characters are converted to numeric (specifically: integer).
#' Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets.
#' This is useful if you have a specific need for integer dataset instead of numeric dataset.
#' Note that there are programs which do not support integer-only input. Consider this as a half
#' memory technique which is dangerous, especially for LightGBM.
#' #'
#' @param data A data.frame or data.table to prepare. #' @param data A data.frame or data.table to prepare.
#' #'
#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}. #' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
#' for input in \code{lgb.Dataset}.
#' #'
#' @examples #' @examples
#' library(lightgbm) #' library(lightgbm)
......
#' Data preparator for LightGBM datasets with rules (numeric) #' Data preparator for LightGBM datasets with rules (numeric)
#' #'
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric. In addition, keeps rules created so you can convert other datasets using this converter. #' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
#' Factors and characters are converted to numeric. In addition, keeps rules created
#' so you can convert other datasets using this converter.
#' #'
#' @param data A data.frame or data.table to prepare. #' @param data A data.frame or data.table to prepare.
#' @param rules A set of rules from the data preparator, if already used. #' @param rules A set of rules from the data preparator, if already used.
#' #'
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}. #' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
#' The data must be converted to a matrix format (\code{as.matrix}) for input
#' in \code{lgb.Dataset}.
#' #'
#' @examples #' @examples
#' library(lightgbm) #' library(lightgbm)
...@@ -160,7 +164,11 @@ lgb.prepare_rules <- function(data, rules = NULL) { ...@@ -160,7 +164,11 @@ lgb.prepare_rules <- function(data, rules = NULL) {
} else { } else {
# What do you think you are doing here? Throw error. # What do you think you are doing here? Throw error.
stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame") stop(
"lgb.prepare_rules: you provided "
, paste(class(data), collapse = " & ")
, " but data should have class data.frame"
)
} }
......
#' Data preparator for LightGBM datasets with rules (integer) #' Data preparator for LightGBM datasets with rules (integer)
#' #'
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric (specifically: integer). In addition, keeps rules created so you can convert other datasets using this converter. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM. #' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
#' Factors and characters are converted to numeric (specifically: integer).
#' In addition, keeps rules created so you can convert other datasets using this converter.
#' This is useful if you have a specific need for integer dataset instead of numeric dataset.
#' Note that there are programs which do not support integer-only input.
#' Consider this as a half memory technique which is dangerous, especially for LightGBM.
#' #'
#' @param data A data.frame or data.table to prepare. #' @param data A data.frame or data.table to prepare.
#' @param rules A set of rules from the data preparator, if already used. #' @param rules A set of rules from the data preparator, if already used.
#' #'
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}. #' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
#' The data must be converted to a matrix format (\code{as.matrix}) for input in
#' \code{lgb.Dataset}.
#' #'
#' @examples #' @examples
#' library(lightgbm) #' library(lightgbm)
...@@ -35,9 +42,13 @@ ...@@ -35,9 +42,13 @@
#' data(iris) # Erase iris dataset #' data(iris) # Erase iris dataset
#' #'
#' # We remapped values differently #' # We remapped values differently
#' personal_rules <- list(Species = c("setosa" = 3L, #' personal_rules <- list(
#' "versicolor" = 2L, #' Species = c(
#' "virginica" = 1L)) #' "setosa" = 3L
#' , "versicolor" = 2L
#' , "virginica" = 1L
#' )
#' )
#' newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules) #' newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules)
#' str(newest_iris$data) # SUCCESS! #' str(newest_iris$data) # SUCCESS!
#' #'
...@@ -158,7 +169,11 @@ lgb.prepare_rules2 <- function(data, rules = NULL) { ...@@ -158,7 +169,11 @@ lgb.prepare_rules2 <- function(data, rules = NULL) {
} else { } else {
# What do you think you are doing here? Throw error. # What do you think you are doing here? Throw error.
stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame") stop(
"lgb.prepare_rules2: you provided "
, paste(class(data), collapse = " & ")
, " but data should have class data.frame"
)
} }
......
...@@ -4,17 +4,18 @@ ...@@ -4,17 +4,18 @@
#' @inheritParams lgb_shared_params #' @inheritParams lgb_shared_params
#' @param valids a list of \code{lgb.Dataset} objects, used for validation #' @param valids a list of \code{lgb.Dataset} objects, used for validation
#' @param obj objective function, can be character or custom objective function. Examples include #' @param obj objective function, can be character or custom objective function. Examples include
#' \code{regression}, \code{regression_l1}, \code{huber}, #' \code{regression}, \code{regression_l1}, \code{huber},
#' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass} #' \code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}
#' @param eval evaluation function, can be (a list of) character or custom eval function #' @param eval evaluation function, can be (a list of) character or custom eval function
#' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals} #' @param record Boolean, TRUE will record iteration message to \code{booster$record_evals}
#' @param colnames feature names, if not null, will use this to overwrite the names in dataset #' @param colnames feature names, if not null, will use this to overwrite the names in dataset
#' @param categorical_feature list of str or int #' @param categorical_feature list of str or int
#' type int represents index, #' type int represents index,
#' type str represents feature names #' type str represents feature names
#' @param callbacks list of callback functions #' @param callbacks List of callback functions that are applied at each iteration.
#' List of callback functions that are applied at each iteration. #' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets #' booster model into a predictor model which frees up memory and the
#' original datasets
#' @param ... other parameters, see Parameters.rst for more information. A few key parameters: #' @param ... other parameters, see Parameters.rst for more information. A few key parameters:
#' \itemize{ #' \itemize{
#' \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}} #' \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
...@@ -37,13 +38,15 @@ ...@@ -37,13 +38,15 @@
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2") #' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest) #' valids <- list(test = dtest)
#' model <- lgb.train(params, #' model <- lgb.train(
#' dtrain, #' params = params
#' 10, #' , data = dtrain
#' valids, #' , nrounds = 10
#' min_data = 1, #' , valids = valids
#' learning_rate = 1, #' , min_data = 1
#' early_stopping_rounds = 5) #' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' @export #' @export
lgb.train <- function(params = list(), lgb.train <- function(params = list(),
data, data,
...@@ -105,7 +108,17 @@ lgb.train <- function(params = list(), ...@@ -105,7 +108,17 @@ lgb.train <- function(params = list(),
begin_iteration <- predictor$current_iter() + 1 begin_iteration <- predictor$current_iter() + 1
} }
# Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one # Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one
n_rounds <- c("num_iterations", "num_iteration", "n_iter", "num_tree", "num_trees", "num_round", "num_rounds", "num_boost_round", "n_estimators") n_rounds <- c(
"num_iterations"
, "num_iteration"
, "n_iter"
, "num_tree"
, "num_trees"
, "num_round"
, "num_rounds"
, "num_boost_round"
, "n_estimators"
)
if (any(names(params) %in% n_rounds)) { if (any(names(params) %in% n_rounds)) {
end_iteration <- begin_iteration + params[[which(names(params) %in% n_rounds)[1]]] - 1 end_iteration <- begin_iteration + params[[which(names(params) %in% n_rounds)[1]]] - 1
} else { } else {
...@@ -198,12 +211,24 @@ lgb.train <- function(params = list(), ...@@ -198,12 +211,24 @@ lgb.train <- function(params = list(),
early_stop <- c("early_stopping_round", "early_stopping_rounds", "early_stopping", "n_iter_no_change") early_stop <- c("early_stopping_round", "early_stopping_rounds", "early_stopping", "n_iter_no_change")
if (any(names(params) %in% early_stop)) { if (any(names(params) %in% early_stop)) {
if (params[[which(names(params) %in% early_stop)[1]]] > 0) { if (params[[which(names(params) %in% early_stop)[1]]] > 0) {
callbacks <- add.cb(callbacks, cb.early.stop(params[[which(names(params) %in% early_stop)[1]]], verbose = verbose)) callbacks <- add.cb(
callbacks
, cb.early.stop(
params[[which(names(params) %in% early_stop)[1]]]
, verbose = verbose
)
)
} }
} else { } else {
if (!is.null(early_stopping_rounds)) { if (!is.null(early_stopping_rounds)) {
if (early_stopping_rounds > 0) { if (early_stopping_rounds > 0) {
callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, verbose = verbose)) callbacks <- add.cb(
callbacks
, cb.early.stop(
early_stopping_rounds
, verbose = verbose
)
)
} }
} }
} }
...@@ -267,7 +292,8 @@ lgb.train <- function(params = list(), ...@@ -267,7 +292,8 @@ lgb.train <- function(params = list(),
} }
# When early stopping is not activated, we compute the best iteration / score ourselves by selecting the first metric and the first dataset # When early stopping is not activated, we compute the best iteration / score ourselves by
# selecting the first metric and the first dataset
if (record && length(valids) > 0 && is.na(env$best_score)) { if (record && length(valids) > 0 && is.na(env$best_score)) {
if (env$eval_list[[1]]$higher_better[1] == TRUE) { if (env$eval_list[[1]]$higher_better[1] == TRUE) {
booster$best_iter <- unname(which.max(unlist(booster$record_evals[[2]][[1]][[1]]))) booster$best_iter <- unname(which.max(unlist(booster$record_evals[[2]][[1]][[1]])))
...@@ -282,9 +308,11 @@ lgb.train <- function(params = list(), ...@@ -282,9 +308,11 @@ lgb.train <- function(params = list(),
if (reset_data) { if (reset_data) {
# Store temporarily model data elsewhere # Store temporarily model data elsewhere
booster_old <- list(best_iter = booster$best_iter, booster_old <- list(
best_score = booster$best_score, best_iter = booster$best_iter
record_evals = booster$record_evals) , best_score = booster$best_score
, record_evals = booster$record_evals
)
# Reload model # Reload model
booster <- lgb.load(model_str = booster$save_model_to_string()) booster <- lgb.load(model_str = booster$save_model_to_string())
......
...@@ -2,9 +2,13 @@ ...@@ -2,9 +2,13 @@
#' #'
#' Attempts to unload LightGBM packages so you can remove objects cleanly without having to restart R. This is useful for instance if an object becomes stuck for no apparent reason and you do not want to restart R to fix the lost object. #' Attempts to unload LightGBM packages so you can remove objects cleanly without having to restart R. This is useful for instance if an object becomes stuck for no apparent reason and you do not want to restart R to fix the lost object.
#' #'
#' @param restore Whether to reload \code{LightGBM} immediately after detaching from R. Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once unloading is performed. #' @param restore Whether to reload \code{LightGBM} immediately after detaching from R.
#' @param wipe Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global environment. Defaults to \code{FALSE} which means to not remove them. #' Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once
#' @param envir The environment to perform wiping on if \code{wipe == TRUE}. Defaults to \code{.GlobalEnv} which is the global environment. #' unloading is performed.
#' @param wipe Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global
#' environment. Defaults to \code{FALSE} which means to not remove them.
#' @param envir The environment to perform wiping on if \code{wipe == TRUE}. Defaults to
#' \code{.GlobalEnv} which is the global environment.
#' #'
#' @return NULL invisibly. #' @return NULL invisibly.
#' #'
...@@ -18,13 +22,15 @@ ...@@ -18,13 +22,15 @@
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2") #' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest) #' valids <- list(test = dtest)
#' model <- lgb.train(params, #' model <- lgb.train(
#' dtrain, #' params = params
#' 10, #' , data = dtrain
#' valids, #' , nrounds = 10
#' min_data = 1, #' , valids = valids
#' learning_rate = 1, #' , min_data = 1
#' early_stopping_rounds = 5) #' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' #'
#' \dontrun{ #' \dontrun{
#' lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv) #' lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv)
...@@ -43,8 +49,18 @@ lgb.unloader <- function(restore = TRUE, wipe = FALSE, envir = .GlobalEnv) { ...@@ -43,8 +49,18 @@ lgb.unloader <- function(restore = TRUE, wipe = FALSE, envir = .GlobalEnv) {
# Should we wipe variables? (lgb.Booster, lgb.Dataset) # Should we wipe variables? (lgb.Booster, lgb.Dataset)
if (wipe) { if (wipe) {
boosters <- Filter(function(x) inherits(get(x, envir = envir), "lgb.Booster"), ls(envir = envir)) boosters <- Filter(
datasets <- Filter(function(x) inherits(get(x, envir = envir), "lgb.Dataset"), ls(envir = envir)) f = function(x){
inherits(get(x, envir = envir), "lgb.Booster")
}
, x = ls(envir = envir)
)
datasets <- Filter(
f = function(x){
inherits(get(x, envir = envir), "lgb.Dataset")
}
, x = ls(envir = envir)
)
rm(list = c(boosters, datasets), envir = envir) rm(list = c(boosters, datasets), envir = envir)
gc(verbose = FALSE) gc(verbose = FALSE)
} }
......
...@@ -4,12 +4,10 @@ ...@@ -4,12 +4,10 @@
#' @param callbacks list of callback functions #' @param callbacks list of callback functions
#' List of callback functions that are applied at each iteration. #' List of callback functions that are applied at each iteration.
#' @param data a \code{lgb.Dataset} object, used for training #' @param data a \code{lgb.Dataset} object, used for training
#' @param early_stopping_rounds int #' @param early_stopping_rounds int. Activates early stopping. Requires at least one validation data
#' Activates early stopping. #' and one metric. If there's more than one, will check all of them
#' Requires at least one validation data and one metric #' except the training data. Returns the model with (best_iter + early_stopping_rounds).
#' If there's more than one, will check all of them except the training data #' If early stopping occurs, the model will have 'best_iter' field.
#' Returns the model with (best_iter + early_stopping_rounds)
#' If early stopping occurs, the model will have 'best_iter' field
#' @param eval_freq evaluation output frequency, only effect when verbose > 0 #' @param eval_freq evaluation output frequency, only effect when verbose > 0
#' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model #' @param init_model path of model file of \code{lgb.Booster} object, will continue training from this model
#' @param nrounds number of training rounds #' @param nrounds number of training rounds
...@@ -76,9 +74,18 @@ lightgbm <- function(data, ...@@ -76,9 +74,18 @@ lightgbm <- function(data,
} }
# Train a model using the regular way # Train a model using the regular way
bst <- lgb.train(params, dtrain, nrounds, valids, verbose = verbose, eval_freq = eval_freq, bst <- lgb.train(
early_stopping_rounds = early_stopping_rounds, params = params
init_model = init_model, callbacks = callbacks, ...) , data = dtrain
, nrounds = nrounds
, valids = valids
, verbose = verbose
, eval_freq = eval_freq
, early_stopping_rounds = early_stopping_rounds
, init_model = init_model
, callbacks = callbacks
, ...
)
# Store model under a specific name # Store model under a specific name
bst$save_model(save_name) bst$save_model(save_name)
......
...@@ -17,13 +17,15 @@ ...@@ -17,13 +17,15 @@
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) #' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2") #' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest) #' valids <- list(test = dtest)
#' model <- lgb.train(params, #' model <- lgb.train(
#' dtrain, #' params = params
#' 10, #' , data = dtrain
#' valids, #' , nrounds = 10
#' min_data = 1, #' , valids = valids
#' learning_rate = 1, #' , min_data = 1
#' early_stopping_rounds = 5) #' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' saveRDS.lgb.Booster(model, "model.rds") #' saveRDS.lgb.Booster(model, "model.rds")
#' new_model <- readRDS.lgb.Booster("model.rds") #' new_model <- readRDS.lgb.Booster("model.rds")
#' #'
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment