Unverified commit fc991c9d authored by James Lamb, committed by GitHub

[R-package] added R linting and changed R code to comma-first (fixes #2373) (#2437)

parent b4bb38d9
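The comma-first style referenced in the title places each continuation comma at the start of the line rather than the end, so adding or removing an argument touches only one line in a diff. A minimal sketch of the convention (hypothetical function and arguments):

some_function(
first_arg = 1
, second_arg = 2
, third_arg = 3
)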
library(lintr)
args <- commandArgs(
trailingOnly = TRUE
)
SOURCE_DIR <- args[[1]]
FILES_TO_LINT <- list.files(
path = SOURCE_DIR
, pattern = "\\.r$"
, all.files = TRUE
, ignore.case = TRUE
, full.names = TRUE
, recursive = TRUE
, include.dirs = FALSE
)
# Some linters from the lintr package have not made it to CRAN yet
# We build lintr from source to address that.
LINTERS_TO_USE <- list(
"closed_curly" = lintr::closed_curly_linter
, "infix_spaces" = lintr::infix_spaces_linter
, "long_lines" = lintr::line_length_linter(length = 120)
, "tabs" = lintr::no_tab_linter
, "open_curly" = lintr::open_curly_linter
, "spaces_inside" = lintr::spaces_inside_linter
, "spaces_left_parens" = lintr::spaces_left_parentheses_linter
, "trailing_blank" = lintr::trailing_blank_lines_linter
, "trailing_white" = lintr::trailing_whitespace_linter
)
cat(sprintf("Found %i R files to lint\n", length(FILES_TO_LINT)))
results <- c()
for (r_file in FILES_TO_LINT) {
this_result <- lintr::lint(
filename = r_file
, linters = LINTERS_TO_USE
, cache = FALSE
)
cat(sprintf(
"Found %i linting errors in %s\n"
, length(this_result)
, r_file
))
results <- c(results, this_result)
}
issues_found <- length(results)
if (issues_found > 0) {
cat("\n")
print(results)
}
quit(save = "no", status = issues_found)
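# Aside (not part of this PR): a single file can be spot-checked interactively
# with the same machinery; the file path here is hypothetical.
library(lintr)
lintr::lint(
filename = "R/lgb.Booster.R"
, linters = list("long_lines" = lintr::line_length_linter(length = 120))
, cache = FALSE
)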
......@@ -51,10 +51,17 @@ if [[ $TRAVIS == "true" ]] && [[ $TASK == "check-docs" ]]; then
fi
if [[ $TASK == "lint" ]]; then
conda install -q -y -n $CONDA_ENV pycodestyle pydocstyle
conda install -q -y -n $CONDA_ENV \
pycodestyle \
pydocstyle \
r-lintr
pip install --user cpplint
echo "Linting Python code"
pycodestyle --ignore=E501,W503 --exclude=./compute,./.nuget . || exit -1
pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^compute|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1
echo "Linting R code"
Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1
echo "Linting C++ code"
cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include || exit 0
exit 0
fi
......
......@@ -39,7 +39,11 @@ cb.reset.parameters <- function(new_params) {
# since changing them would simply wreak chaos
not_allowed <- c("num_class", "metric", "boosting_type")
if (any(pnames %in% not_allowed)) {
stop("Parameters ", paste0(pnames[pnames %in% not_allowed], collapse = ", "), " cannot be changed during boosting")
stop(
"Parameters "
, paste0(pnames[pnames %in% not_allowed], collapse = ", ")
, " cannot be changed during boosting"
)
}
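# Aside (illustration only, made-up input): the guard above fires whenever any
# supplied parameter name is in the immutable set.
pnames <- c("learning_rate", "metric")
any(pnames %in% not_allowed)  # TRUE, so the stop() above would be reached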
# Check parameter names
......@@ -166,7 +170,7 @@ cb.print.evaluation <- function(period = 1) {
i <- env$iteration
# Check if iteration matches modulo
if ((i - 1) %% period == 0 || is.element(i, c(env$begin_iteration, env$end_iteration ))) {
if ( (i - 1) %% period == 0 || is.element(i, c(env$begin_iteration, env$end_iteration))) {
# Merge evaluation string
msg <- merge.eval.string(env)
......@@ -244,8 +248,14 @@ cb.record.evaluation <- function() {
name <- eval_res$name
# Store evaluation data
env$model$record_evals[[data_name]][[name]]$eval <- c(env$model$record_evals[[data_name]][[name]]$eval, eval_res$value)
env$model$record_evals[[data_name]][[name]]$eval_err <- c(env$model$record_evals[[data_name]][[name]]$eval_err, eval_err)
env$model$record_evals[[data_name]][[name]]$eval <- c(
env$model$record_evals[[data_name]][[name]]$eval
, eval_res$value
)
env$model$record_evals[[data_name]][[name]]$eval_err <- c(
env$model$record_evals[[data_name]][[name]]$eval_err
, eval_err
)
}
......@@ -391,7 +401,9 @@ cb.early.stop <- function(stopping_rounds, verbose = TRUE) {
}
# Extract callback names from the list of callbacks
callback.names <- function(cb_list) { unlist(lapply(cb_list, attr, "name")) }
callback.names <- function(cb_list) {
unlist(lapply(cb_list, attr, "name"))
}
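# Aside (illustration only): each callback stores its name as an attribute, so
# extraction reduces to lapply() + attr(). A hypothetical toy callback:
cb <- function(env) NULL
attr(cb, "name") <- "my_callback"
callback.names(list(cb))  # "my_callback"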
add.cb <- function(cb_list, cb) {
......
......@@ -46,7 +46,12 @@ Booster <- R6::R6Class(
}
# Store booster handle
handle <- lgb.call("LGBM_BoosterCreate_R", ret = handle, train_set$.__enclos_env__$private$get_handle(), params_str)
handle <- lgb.call(
"LGBM_BoosterCreate_R"
, ret = handle
, train_set$.__enclos_env__$private$get_handle()
, params_str
)
# Create private booster information
private$train_set <- train_set
......@@ -57,10 +62,12 @@ Booster <- R6::R6Class(
if (!is.null(private$init_predictor)) {
# Merge booster
lgb.call("LGBM_BoosterMerge_R",
ret = NULL,
handle,
private$init_predictor$.__enclos_env__$private$handle)
lgb.call(
"LGBM_BoosterMerge_R"
, ret = NULL
, handle
, private$init_predictor$.__enclos_env__$private$handle
)
}
......@@ -75,9 +82,11 @@ Booster <- R6::R6Class(
}
# Create booster from model
handle <- lgb.call("LGBM_BoosterCreateFromModelfile_R",
ret = handle,
lgb.c_str(modelfile))
handle <- lgb.call(
"LGBM_BoosterCreateFromModelfile_R"
, ret = handle
, lgb.c_str(modelfile)
)
} else if (!is.null(model_str)) {
......@@ -87,14 +96,19 @@ Booster <- R6::R6Class(
}
# Create booster from model
handle <- lgb.call("LGBM_BoosterLoadModelFromString_R",
ret = handle,
lgb.c_str(model_str))
handle <- lgb.call(
"LGBM_BoosterLoadModelFromString_R"
, ret = handle
, lgb.c_str(model_str)
)
} else {
# No booster source provided
stop("lgb.Booster: Need at least either training dataset, model file, or model_str to create booster instance")
stop(
"lgb.Booster: Need at least either training dataset, "
, "model file, or model_str to create booster instance"
)
}
......@@ -111,9 +125,11 @@ Booster <- R6::R6Class(
class(handle) <- "lgb.Booster.handle"
private$handle <- handle
private$num_class <- 1L
private$num_class <- lgb.call("LGBM_BoosterGetNumClasses_R",
ret = private$num_class,
private$handle)
private$num_class <- lgb.call(
"LGBM_BoosterGetNumClasses_R"
, ret = private$num_class
, private$handle
)
}
......@@ -138,7 +154,10 @@ Booster <- R6::R6Class(
# Check if predictors are identical
if (!identical(data$.__enclos_env__$private$predictor, private$init_predictor)) {
stop("lgb.Booster.add_valid: Failed to add validation data; you should use the same predictor for these data")
stop(
"lgb.Booster.add_valid: Failed to add validation data; "
, "you should use the same predictor for these data"
)
}
# Check if names are character
......@@ -147,10 +166,12 @@ Booster <- R6::R6Class(
}
# Add validation data to booster
lgb.call("LGBM_BoosterAddValidData_R",
ret = NULL,
private$handle,
data$.__enclos_env__$private$get_handle())
lgb.call(
"LGBM_BoosterAddValidData_R"
, ret = NULL
, private$handle
, data$.__enclos_env__$private$get_handle()
)
# Store private information
private$valid_sets <- c(private$valid_sets, data)
......@@ -171,10 +192,12 @@ Booster <- R6::R6Class(
params_str <- lgb.params2str(params)
# Reset parameters
lgb.call("LGBM_BoosterResetParameter_R",
ret = NULL,
private$handle,
params_str)
lgb.call(
"LGBM_BoosterResetParameter_R"
, ret = NULL
, private$handle
, params_str
)
# Return self
return(invisible(self))
......@@ -198,10 +221,12 @@ Booster <- R6::R6Class(
}
# Reset training data on booster
lgb.call("LGBM_BoosterResetTrainingData_R",
ret = NULL,
private$handle,
train_set$.__enclos_env__$private$get_handle())
lgb.call(
"LGBM_BoosterResetTrainingData_R"
, ret = NULL
, private$handle
, train_set$.__enclos_env__$private$get_handle()
)
# Store private train set
private$train_set <- train_set
......@@ -230,18 +255,20 @@ Booster <- R6::R6Class(
gpair <- fobj(private$inner_predict(1), private$train_set)
# Check for gradient and hessian as list
if(is.null(gpair$grad) || is.null(gpair$hess)){
if (is.null(gpair$grad) || is.null(gpair$hess)) {
stop("lgb.Booster.update: custom objective should
return a list with attributes (hess, grad)")
}
# Return custom boosting gradient/hessian
ret <- lgb.call("LGBM_BoosterUpdateOneIterCustom_R",
ret = NULL,
private$handle,
gpair$grad,
gpair$hess,
length(gpair$grad))
ret <- lgb.call(
"LGBM_BoosterUpdateOneIterCustom_R"
, ret = NULL
, private$handle
, gpair$grad
, gpair$hess
, length(gpair$grad)
)
}
......@@ -258,9 +285,11 @@ Booster <- R6::R6Class(
rollback_one_iter = function() {
# Return one iteration behind
lgb.call("LGBM_BoosterRollbackOneIter_R",
ret = NULL,
private$handle)
lgb.call(
"LGBM_BoosterRollbackOneIter_R"
, ret = NULL
, private$handle
)
# Loop through each iteration
for (i in seq_along(private$is_predicted_cur_iter)) {
......@@ -276,9 +305,11 @@ Booster <- R6::R6Class(
current_iter = function() {
cur_iter <- 0L
lgb.call("LGBM_BoosterGetCurrentIteration_R",
ret = cur_iter,
private$handle)
lgb.call(
"LGBM_BoosterGetCurrentIteration_R"
, ret = cur_iter
, private$handle
)
},
......@@ -349,7 +380,10 @@ Booster <- R6::R6Class(
# Loop through each validation set
for (i in seq_along(private$valid_sets)) {
ret <- append(ret, private$inner_eval(private$name_valid_sets[[i]], i + 1, feval))
ret <- append(
x = ret
, values = private$inner_eval(private$name_valid_sets[[i]], i + 1, feval)
)
}
# Return ret
......@@ -366,11 +400,13 @@ Booster <- R6::R6Class(
}
# Save booster model
lgb.call("LGBM_BoosterSaveModel_R",
ret = NULL,
private$handle,
as.integer(num_iteration),
lgb.c_str(filename))
lgb.call(
"LGBM_BoosterSaveModel_R"
, ret = NULL
, private$handle
, as.integer(num_iteration)
, lgb.c_str(filename)
)
# Return self
return(invisible(self))
......@@ -385,9 +421,11 @@ Booster <- R6::R6Class(
}
# Return model string
return(lgb.call.return.str("LGBM_BoosterSaveModelToString_R",
private$handle,
as.integer(num_iteration)))
return(lgb.call.return.str(
"LGBM_BoosterSaveModelToString_R"
, private$handle
, as.integer(num_iteration)
))
},
......@@ -400,9 +438,11 @@ Booster <- R6::R6Class(
}
# Return dumped model
lgb.call.return.str("LGBM_BoosterDumpModel_R",
private$handle,
as.integer(num_iteration))
lgb.call.return.str(
"LGBM_BoosterDumpModel_R"
, private$handle
, as.integer(num_iteration)
)
},
......@@ -478,10 +518,12 @@ Booster <- R6::R6Class(
# Store predictions
npred <- 0L
npred <- lgb.call("LGBM_BoosterGetNumPredict_R",
ret = npred,
private$handle,
as.integer(idx - 1))
npred <- lgb.call(
"LGBM_BoosterGetNumPredict_R"
, ret = npred
, private$handle
, as.integer(idx - 1)
)
private$predict_buffer[[data_name]] <- numeric(npred)
}
......@@ -490,10 +532,12 @@ Booster <- R6::R6Class(
if (!private$is_predicted_cur_iter[[idx]]) {
# Use buffer
private$predict_buffer[[data_name]] <- lgb.call("LGBM_BoosterGetPredict_R",
ret = private$predict_buffer[[data_name]],
private$handle,
as.integer(idx - 1))
private$predict_buffer[[data_name]] <- lgb.call(
"LGBM_BoosterGetPredict_R"
, ret = private$predict_buffer[[data_name]]
, private$handle
, as.integer(idx - 1)
)
private$is_predicted_cur_iter[[idx]] <- TRUE
}
......@@ -508,8 +552,10 @@ Booster <- R6::R6Class(
if (is.null(private$eval_names)) {
# Get evaluation names
names <- lgb.call.return.str("LGBM_BoosterGetEvalNames_R",
private$handle)
names <- lgb.call.return.str(
"LGBM_BoosterGetEvalNames_R"
, private$handle
)
# Check names' length
if (nchar(names) > 0) {
......@@ -547,10 +593,12 @@ Booster <- R6::R6Class(
# Create evaluation values
tmp_vals <- numeric(length(private$eval_names))
tmp_vals <- lgb.call("LGBM_BoosterGetEval_R",
ret = tmp_vals,
private$handle,
as.integer(data_idx - 1))
tmp_vals <- lgb.call(
"LGBM_BoosterGetEval_R"
, ret = tmp_vals
, private$handle
, as.integer(data_idx - 1)
)
# Loop through all evaluation names
for (i in seq_along(private$eval_names)) {
......@@ -587,7 +635,7 @@ Booster <- R6::R6Class(
res <- feval(private$inner_predict(data_idx), data)
# Check for name correctness
if(is.null(res$name) || is.null(res$value) || is.null(res$higher_better)) {
if (is.null(res$name) || is.null(res$value) || is.null(res$higher_better)) {
stop("lgb.Booster.eval: custom eval function should return a
list with attribute (name, value, higher_better)");
}
......@@ -614,8 +662,8 @@ Booster <- R6::R6Class(
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
#' @param num_iteration number of iterations to predict with, NULL or <= 0 means use best iteration
#' @param rawscore whether the prediction should be returned in the form of original untransformed
#' sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE} for
#' logistic regression would result in predictions for log-odds instead of probabilities.
#' sum of predictions from boosting iterations' results. E.g., setting \code{rawscore=TRUE}
#' for logistic regression would result in predictions for log-odds instead of probabilities.
#' @param predleaf whether to predict leaf index instead.
#' @param predcontrib return per-feature contributions for each record.
#' @param header only used for prediction from a text file. TRUE if the text file has a header
......@@ -642,13 +690,15 @@ Booster <- R6::R6Class(
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#' dtrain,
#' 10,
#' valids,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' preds <- predict(model, test$data)
#'
#' @rdname predict.lgb.Booster
......@@ -669,13 +719,16 @@ predict.lgb.Booster <- function(object,
}
# Return booster predictions
object$predict(data,
num_iteration,
rawscore,
predleaf,
predcontrib,
header,
reshape, ...)
object$predict(
data
, num_iteration
, rawscore
, predleaf
, predcontrib
, header
, reshape
, ...
)
}
#' Load LightGBM model
......@@ -699,13 +752,15 @@ predict.lgb.Booster <- function(object,
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#' dtrain,
#' 10,
#' valids,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' lgb.save(model, "model.txt")
#' load_booster <- lgb.load(filename = "model.txt")
#' model_string <- model$save_model_to_string(NULL) # saves best iteration
......@@ -757,13 +812,15 @@ lgb.load <- function(filename = NULL, model_str = NULL){
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#' dtrain,
#' 10,
#' valids,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' lgb.save(model, "model.txt")
#'
#' @rdname lgb.save
......@@ -804,13 +861,15 @@ lgb.save <- function(booster, filename, num_iteration = NULL){
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#' dtrain,
#' 10,
#' valids,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' json_model <- lgb.dump(model)
#'
#' @rdname lgb.dump
......@@ -848,13 +907,15 @@ lgb.dump <- function(booster, num_iteration = NULL){
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#' dtrain,
#' 10,
#' valids,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' lgb.get.eval.result(model, "test", "l2")
#' @rdname lgb.get.eval.result
#' @export
......
......@@ -97,16 +97,18 @@ Dataset <- R6::R6Class(
...) {
# Create new dataset
ret <- Dataset$new(data,
private$params,
self,
private$colnames,
private$categorical_feature,
private$predictor,
private$free_raw_data,
NULL,
info,
...)
ret <- Dataset$new(
data = data
, params = private$params
, reference = self
, colnames = private$colnames
, categorical_feature = private$categorical_feature
, predictor = private$predictor
, free_raw_data = private$free_raw_data
, used_indices = NULL
, info = info
, ...
)
# Return ret
return(invisible(ret))
......@@ -142,14 +144,23 @@ Dataset <- R6::R6Class(
# Provided indices, but some indices do not exist?
if (sum(is.na(cate_indices)) > 0) {
stop("lgb.self.get.handle: supplied an unknown feature in categorical_feature: ", sQuote(private$categorical_feature[is.na(cate_indices)]))
stop(
"lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
, sQuote(private$categorical_feature[is.na(cate_indices)])
)
}
} else {
# Check if a categorical feature index exceeds the number of features
if (max(private$categorical_feature) > length(private$colnames)) {
stop("lgb.self.get.handle: supplied a too large value in categorical_feature: ", max(private$categorical_feature), " but only ", length(private$colnames), " features")
stop(
"lgb.self.get.handle: supplied a too large value in categorical_feature: "
, max(private$categorical_feature)
, " but only "
, length(private$colnames)
, " features"
)
}
# Store indices as [0, n-1] indexed instead of [1, n] indexed
......@@ -165,7 +176,9 @@ Dataset <- R6::R6Class(
# Check whether the data has a header or not
has_header <- FALSE
if (!is.null(private$params$has_header) || !is.null(private$params$header)) {
if (tolower(as.character(private$params$has_header)) == "true" || tolower(as.character(private$params$header)) == "true") {
params_has_header <- tolower(as.character(private$params$has_header)) == "true"
params_header <- tolower(as.character(private$params$header)) == "true"
if (params_has_header || params_header) {
has_header <- TRUE
}
}
......@@ -186,43 +199,52 @@ Dataset <- R6::R6Class(
# Are we using a data file?
if (is.character(private$raw_data)) {
handle <- lgb.call("LGBM_DatasetCreateFromFile_R",
ret = handle,
lgb.c_str(private$raw_data),
params_str,
ref_handle)
handle <- lgb.call(
"LGBM_DatasetCreateFromFile_R"
, ret = handle
, lgb.c_str(private$raw_data)
, params_str
, ref_handle
)
} else if (is.matrix(private$raw_data)) {
# Are we using a matrix?
handle <- lgb.call("LGBM_DatasetCreateFromMat_R",
ret = handle,
private$raw_data,
nrow(private$raw_data),
ncol(private$raw_data),
params_str,
ref_handle)
handle <- lgb.call(
"LGBM_DatasetCreateFromMat_R"
, ret = handle
, private$raw_data
, nrow(private$raw_data)
, ncol(private$raw_data)
, params_str
, ref_handle
)
} else if (methods::is(private$raw_data, "dgCMatrix")) {
if (length(private$raw_data@p) > 2147483647) {
stop("Cannot support large CSC matrix")
}
# Are we using a dgCMatrix (sparsed matrix column compressed)
handle <- lgb.call("LGBM_DatasetCreateFromCSC_R",
ret = handle,
private$raw_data@p,
private$raw_data@i,
private$raw_data@x,
length(private$raw_data@p),
length(private$raw_data@x),
nrow(private$raw_data),
params_str,
ref_handle)
handle <- lgb.call(
"LGBM_DatasetCreateFromCSC_R"
, ret = handle
, private$raw_data@p
, private$raw_data@i
, private$raw_data@x
, length(private$raw_data@p)
, length(private$raw_data@x)
, nrow(private$raw_data)
, params_str
, ref_handle
)
} else {
# Unknown data type
stop("lgb.Dataset.construct: does not support constructing from ", sQuote(class(private$raw_data)))
stop(
"lgb.Dataset.construct: does not support constructing from "
, sQuote(class(private$raw_data))
)
}
......@@ -234,12 +256,14 @@ Dataset <- R6::R6Class(
}
# Construct subset
handle <- lgb.call("LGBM_DatasetGetSubset_R",
ret = handle,
ref_handle,
c(private$used_indices), # Adding c() fixes issue in R v3.5
length(private$used_indices),
params_str)
handle <- lgb.call(
"LGBM_DatasetGetSubset_R"
, ret = handle
, ref_handle
, c(private$used_indices) # Adding c() fixes issue in R v3.5
, length(private$used_indices)
, params_str
)
}
if (lgb.is.null.handle(handle)) {
......@@ -258,7 +282,11 @@ Dataset <- R6::R6Class(
if (!is.null(private$predictor) && is.null(private$used_indices)) {
# Setup initial scores
init_score <- private$predictor$predict(private$raw_data, rawscore = TRUE, reshape = TRUE)
init_score <- private$predictor$predict(
private$raw_data
, rawscore = TRUE
, reshape = TRUE
)
# No need to transpose, since the data is column-major
init_score <- as.vector(init_score)
......@@ -316,7 +344,10 @@ Dataset <- R6::R6Class(
} else {
# Trying to work with unknown dimensions is not possible
stop("dim: cannot get dimensions before dataset has been constructed, please call lgb.Dataset.construct explicitly")
stop(
"dim: cannot get dimensions before dataset has been constructed, "
, "please call lgb.Dataset.construct explicitly"
)
}
......@@ -341,7 +372,10 @@ Dataset <- R6::R6Class(
} else {
# Trying to work with unknown dimensions is not possible
stop("dim: cannot get dimensions before dataset has been constructed, please call lgb.Dataset.construct explicitly")
stop(
"dim: cannot get dimensions before dataset has been constructed, please call "
, "lgb.Dataset.construct explicitly"
)
}
......@@ -367,10 +401,12 @@ Dataset <- R6::R6Class(
# Merge names with tab separation
merged_name <- paste0(as.list(private$colnames), collapse = "\t")
lgb.call("LGBM_DatasetSetFeatureNames_R",
ret = NULL,
private$handle,
lgb.c_str(merged_name))
lgb.call(
"LGBM_DatasetSetFeatureNames_R"
, ret = NULL
, private$handle
, lgb.c_str(merged_name)
)
}
......@@ -399,10 +435,12 @@ Dataset <- R6::R6Class(
# Get field size of info
info_len <- 0L
info_len <- lgb.call("LGBM_DatasetGetFieldSize_R",
ret = info_len,
private$handle,
lgb.c_str(name))
info_len <- lgb.call(
"LGBM_DatasetGetFieldSize_R"
, ret = info_len
, private$handle
, lgb.c_str(name)
)
# Check if info is not empty
if (info_len > 0) {
......@@ -415,10 +453,12 @@ Dataset <- R6::R6Class(
numeric(info_len) # Numeric
}
ret <- lgb.call("LGBM_DatasetGetField_R",
ret = ret,
private$handle,
lgb.c_str(name))
ret <- lgb.call(
"LGBM_DatasetGetField_R"
, ret = ret
, private$handle
, lgb.c_str(name)
)
private$info[[name]] <- ret
......@@ -454,12 +494,14 @@ Dataset <- R6::R6Class(
if (length(info) > 0) {
lgb.call("LGBM_DatasetSetField_R",
ret = NULL,
private$handle,
lgb.c_str(name),
info,
length(info))
lgb.call(
"LGBM_DatasetSetField_R"
, ret = NULL
, private$handle
, lgb.c_str(name)
, info
, length(info)
)
}
......@@ -474,16 +516,18 @@ Dataset <- R6::R6Class(
slice = function(idxset, ...) {
# Perform slicing
Dataset$new(NULL,
private$params,
self,
private$colnames,
private$categorical_feature,
private$predictor,
private$free_raw_data,
sort(idxset, decreasing = FALSE),
NULL,
...)
Dataset$new(
data = NULL
, params = private$params
, reference = self
, colnames = private$colnames
, categorical_feature = private$categorical_feature
, predictor = private$predictor
, free_raw_data = private$free_raw_data
, used_indices = sort(idxset, decreasing = FALSE)
, info = NULL
, ...
)
},
......@@ -492,7 +536,12 @@ Dataset <- R6::R6Class(
# Parameter updating
if (!lgb.is.null.handle(private$handle)) {
lgb.call("LGBM_DatasetUpdateParam_R", ret = NULL, private$handle, lgb.params2str(params))
lgb.call(
"LGBM_DatasetUpdateParam_R"
, ret = NULL
, private$handle
, lgb.params2str(params)
)
return(invisible(self))
}
private$params <- modifyList(private$params, params)
......@@ -568,10 +617,12 @@ Dataset <- R6::R6Class(
# Store binary data
self$construct()
lgb.call("LGBM_DatasetSaveBinary_R",
ret = NULL,
private$handle,
lgb.c_str(fname))
lgb.call(
"LGBM_DatasetSaveBinary_R"
, ret = NULL
, private$handle
, lgb.c_str(fname)
)
return(invisible(self))
}
......@@ -671,16 +722,18 @@ lgb.Dataset <- function(data,
...) {
# Create new dataset
invisible(Dataset$new(data,
params,
reference,
colnames,
categorical_feature,
NULL,
free_raw_data,
NULL,
info,
...))
invisible(Dataset$new(
data = data
, params = params
, reference = reference
, colnames = colnames
, categorical_feature = categorical_feature
, predictor = NULL
, free_raw_data = free_raw_data
, used_indices = NULL
, info = info
, ...
))
}
......@@ -840,7 +893,13 @@ dimnames.lgb.Dataset <- function(x) {
# Check for unmatching column size
if (ncol(x) != length(value[[2]])) {
stop("can't assign ", sQuote(length(value[[2]])), " colnames to an lgb.Dataset with ", sQuote(ncol(x)), " columns")
stop(
"can't assign "
, sQuote(length(value[[2]]))
, " colnames to an lgb.Dataset with "
, sQuote(ncol(x))
, " columns"
)
}
# Set column names properly, and return
......
......@@ -13,7 +13,11 @@ Predictor <- R6::R6Class(
if (private$need_free_handle && !lgb.is.null.handle(private$handle)) {
# Freeing up handle
lgb.call("LGBM_BoosterFree_R", ret = NULL, private$handle)
lgb.call(
"LGBM_BoosterFree_R"
, ret = NULL
, private$handle
)
private$handle <- NULL
}
......@@ -31,7 +35,11 @@ Predictor <- R6::R6Class(
if (is.character(modelfile)) {
# Create handle on it
handle <- lgb.call("LGBM_BoosterCreateFromModelfile_R", ret = handle, lgb.c_str(modelfile))
handle <- lgb.call(
"LGBM_BoosterCreateFromModelfile_R"
, ret = handle
, lgb.c_str(modelfile)
)
private$need_free_handle <- TRUE
} else if (methods::is(modelfile, "lgb.Booster.handle")) {
......@@ -57,7 +65,11 @@ Predictor <- R6::R6Class(
current_iter = function() {
cur_iter <- 0L
lgb.call("LGBM_BoosterGetCurrentIteration_R", ret = cur_iter, private$handle)
lgb.call(
"LGBM_BoosterGetCurrentIteration_R"
, ret = cur_iter
, private$handle
)
},
......@@ -86,14 +98,19 @@ Predictor <- R6::R6Class(
on.exit(unlink(tmp_filename), add = TRUE)
# Predict from temporary file
lgb.call("LGBM_BoosterPredictForFile_R", ret = NULL, private$handle, data,
as.integer(header),
as.integer(rawscore),
as.integer(predleaf),
as.integer(predcontrib),
as.integer(num_iteration),
private$params,
lgb.c_str(tmp_filename))
lgb.call(
"LGBM_BoosterPredictForFile_R"
, ret = NULL
, private$handle
, data
, as.integer(header)
, as.integer(rawscore)
, as.integer(predleaf)
, as.integer(predcontrib)
, as.integer(num_iteration)
, private$params
, lgb.c_str(tmp_filename)
)
# Get predictions from file
preds <- read.delim(tmp_filename, header = FALSE, sep = "\t")
......@@ -108,51 +125,57 @@ Predictor <- R6::R6Class(
npred <- 0L
# Check number of predictions to do
npred <- lgb.call("LGBM_BoosterCalcNumPredict_R",
ret = npred,
private$handle,
as.integer(num_row),
as.integer(rawscore),
as.integer(predleaf),
as.integer(predcontrib),
as.integer(num_iteration))
npred <- lgb.call(
"LGBM_BoosterCalcNumPredict_R"
, ret = npred
, private$handle
, as.integer(num_row)
, as.integer(rawscore)
, as.integer(predleaf)
, as.integer(predcontrib)
, as.integer(num_iteration)
)
# Pre-allocate empty vector
preds <- numeric(npred)
# Check if data is a matrix
if (is.matrix(data)) {
preds <- lgb.call("LGBM_BoosterPredictForMat_R",
ret = preds,
private$handle,
data,
as.integer(nrow(data)),
as.integer(ncol(data)),
as.integer(rawscore),
as.integer(predleaf),
as.integer(predcontrib),
as.integer(num_iteration),
private$params)
preds <- lgb.call(
"LGBM_BoosterPredictForMat_R"
, ret = preds
, private$handle
, data
, as.integer(nrow(data))
, as.integer(ncol(data))
, as.integer(rawscore)
, as.integer(predleaf)
, as.integer(predcontrib)
, as.integer(num_iteration)
, private$params
)
} else if (methods::is(data, "dgCMatrix")) {
if (length(data@p) > 2147483647) {
stop("Cannot support large CSC matrix")
}
# Check if data is a dgCMatrix (sparse matrix, column compressed format)
preds <- lgb.call("LGBM_BoosterPredictForCSC_R",
ret = preds,
private$handle,
data@p,
data@i,
data@x,
length(data@p),
length(data@x),
nrow(data),
as.integer(rawscore),
as.integer(predleaf),
as.integer(predcontrib),
as.integer(num_iteration),
private$params)
preds <- lgb.call(
"LGBM_BoosterPredictForCSC_R"
, ret = preds
, private$handle
, data@p
, data@i
, data@x
, length(data@p)
, length(data@x)
, nrow(data)
, as.integer(rawscore)
, as.integer(predleaf)
, as.integer(predcontrib)
, as.integer(num_iteration)
, private$params
)
} else {
......@@ -165,7 +188,12 @@ Predictor <- R6::R6Class(
# Check if number of rows is strange (not a multiple of the dataset rows)
if (length(preds) %% num_row != 0) {
stop("predict: prediction length ", sQuote(length(preds))," is not a multiple of nrows(data): ", sQuote(num_row))
stop(
"predict: prediction length "
, sQuote(length(preds))
," is not a multiple of nrows(data): "
, sQuote(num_row)
)
}
# Get number of cases per row
......@@ -192,7 +220,9 @@ Predictor <- R6::R6Class(
}
),
private = list(handle = NULL,
need_free_handle = FALSE,
params = "")
private = list(
handle = NULL
, need_free_handle = FALSE
, params = ""
)
)
......@@ -39,9 +39,9 @@ CVBooster <- R6::R6Class(
#' @param categorical_feature list of str or int
#' type int represents index,
#' type str represents feature names
#' @param callbacks list of callback functions
#' List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets
#' @param callbacks List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model
#' into a predictor model which frees up memory and the original datasets
#' @param ... other parameters, see Parameters.rst for more information. A few key parameters:
#' \itemize{
#' \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
......@@ -61,13 +61,15 @@ CVBooster <- R6::R6Class(
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' params <- list(objective = "regression", metric = "l2")
#' model <- lgb.cv(params,
#' dtrain,
#' 10,
#' nfold = 3,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.cv(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , nfold = 3
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' @export
lgb.cv <- function(params = list(),
data,
......@@ -134,7 +136,17 @@ lgb.cv <- function(params = list(),
begin_iteration <- predictor$current_iter() + 1
}
# Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one
n_trees <- c("num_iterations", "num_iteration", "n_iter", "num_tree", "num_trees", "num_round", "num_rounds", "num_boost_round", "n_estimators")
n_trees <- c(
"num_iterations"
, "num_iteration"
, "n_iter"
, "num_tree"
, "num_trees"
, "num_round"
, "num_rounds"
, "num_boost_round"
, "n_estimators"
)
if (any(names(params) %in% n_trees)) {
end_iteration <- begin_iteration + params[[which(names(params) %in% n_trees)[1]]] - 1
} else {
......@@ -192,12 +204,14 @@ lgb.cv <- function(params = list(),
}
# Create folds
folds <- generate.cv.folds(nfold,
nrow(data),
stratified,
getinfo(data, "label"),
getinfo(data, "group"),
params)
folds <- generate.cv.folds(
nfold
, nrow(data)
, stratified
, getinfo(data, "label")
, getinfo(data, "group")
, params
)
}
......@@ -215,12 +229,24 @@ lgb.cv <- function(params = list(),
early_stop <- c("early_stopping_round", "early_stopping_rounds", "early_stopping", "n_iter_no_change")
if (any(names(params) %in% early_stop)) {
if (params[[which(names(params) %in% early_stop)[1]]] > 0) {
callbacks <- add.cb(callbacks, cb.early.stop(params[[which(names(params) %in% early_stop)[1]]], verbose = verbose))
callbacks <- add.cb(
callbacks
, cb.early.stop(
params[[which(names(params) %in% early_stop)[1]]]
, verbose = verbose
)
)
}
} else {
if (!is.null(early_stopping_rounds)) {
if (early_stopping_rounds > 0) {
callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, verbose = verbose))
callbacks <- add.cb(
callbacks
, cb.early.stop(
early_stopping_rounds
, verbose = verbose
)
)
}
}
}
......@@ -292,7 +318,7 @@ lgb.cv <- function(params = list(),
env$eval_list <- merged_msg$eval_list
# Check for standard deviation requirement
if(showsd) {
if (showsd) {
env$eval_err_list <- merged_msg$eval_err_list
}
......@@ -319,9 +345,11 @@ lgb.cv <- function(params = list(),
if (reset_data) {
lapply(cv_booster$boosters, function(fd) {
# Store temporarily model data elsewhere
booster_old <- list(best_iter = fd$booster$best_iter,
best_score = fd$booster$best_score,
record_evals = fd$booster$record_evals)
booster_old <- list(
best_iter = fd$booster$best_iter
, best_score = fd$booster$best_score
, record_evals = fd$booster$record_evals
)
# Reload model
fd$booster <- lgb.load(model_str = fd$booster$save_model_to_string())
fd$booster$best_iter <- booster_old$best_iter
......@@ -384,8 +412,10 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) {
# Loop through each fold
for (i in seq_len(nfold)) {
kstep <- length(rnd_idx) %/% (nfold - i + 1)
folds[[i]] <- list(fold = which(ungrouped %in% rnd_idx[seq_len(kstep)]),
group = rnd_idx[seq_len(kstep)])
folds[[i]] <- list(
fold = which(ungrouped %in% rnd_idx[seq_len(kstep)])
, group = rnd_idx[seq_len(kstep)]
)
rnd_idx <- rnd_idx[-seq_len(kstep)]
}
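# Aside (illustration only, made-up numbers): how the loop above carves a
# shuffled index vector into near-equal folds.
rnd_idx <- sample(10)
nfold <- 3
for (i in seq_len(nfold)) {
kstep <- length(rnd_idx) %/% (nfold - i + 1)
cat("fold", i, ":", rnd_idx[seq_len(kstep)], "\n")  # sizes 3, 3, 4
rnd_idx <- rnd_idx[-seq_len(kstep)]
}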
......@@ -413,11 +443,17 @@ lgb.stratified.folds <- function(y, k = 10) {
if (is.numeric(y)) {
cuts <- length(y) %/% k
if (cuts < 2) { cuts <- 2 }
if (cuts > 5) { cuts <- 5 }
y <- cut(y,
unique(stats::quantile(y, probs = seq.int(0, 1, length.out = cuts))),
include.lowest = TRUE)
if (cuts < 2) {
cuts <- 2
}
if (cuts > 5) {
cuts <- 5
}
y <- cut(
y
, unique(stats::quantile(y, probs = seq.int(0, 1, length.out = cuts)))
, include.lowest = TRUE
)
}
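# Aside (illustration only, made-up target): the cut() + quantile() step above
# bins a numeric response into roughly equal strata before folds are drawn.
y <- runif(100)
cuts <- 5
groups <- cut(
y
, unique(stats::quantile(y, probs = seq.int(0, 1, length.out = cuts)))
, include.lowest = TRUE
)
table(groups)  # roughly 25 observations per stratum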
......@@ -499,8 +535,10 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) {
# Parse standard deviation
for (j in seq_len(eval_len)) {
ret_eval_err <- c(ret_eval_err,
sqrt(mean(eval_result[[j]] ^ 2) - mean(eval_result[[j]]) ^ 2))
ret_eval_err <- c(
ret_eval_err
, sqrt(mean(eval_result[[j]] ^ 2) - mean(eval_result[[j]]) ^ 2)
)
}
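# Aside (illustration only): the expression above is the population standard
# deviation. For example, with x <- c(1, 2, 3, 4):
#   sqrt(mean(x ^ 2) - mean(x) ^ 2)   # 1.118034
#   sd(x) * sqrt(3 / 4)               # same value, since sd() divides by n - 1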
# Convert to list
......@@ -509,7 +547,9 @@ lgb.merge.cv.result <- function(msg, showsd = TRUE) {
}
# Return errors
list(eval_list = ret_eval,
eval_err_list = ret_eval_err)
list(
eval_list = ret_eval
, eval_err_list = ret_eval_err
)
}
......@@ -21,9 +21,14 @@
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' params <- list(objective = "binary",
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
#' params <- list(
#' objective = "binary"
#' , learning_rate = 0.01
#' , num_leaves = 63
#' , max_depth = -1
#' , min_data_in_leaf = 1
#' , min_sum_hessian_in_leaf = 1
#' )
#' model <- lgb.train(params, dtrain, 10)
#'
#' tree_imp1 <- lgb.importance(model, percentage = TRUE)
......@@ -63,9 +68,11 @@ lgb.importance <- function(model, percentage = TRUE) {
# Check if relative values are requested
if (percentage) {
tree_imp_dt[, ":="(Gain = Gain / sum(Gain),
Cover = Cover / sum(Cover),
Frequency = Frequency / sum(Frequency))]
tree_imp_dt[, `:=`(
Gain = Gain / sum(Gain)
, Cover = Cover / sum(Cover)
, Frequency = Frequency / sum(Frequency)
)]
}
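# Aside (illustration only, toy values): the `:=` call above normalizes each
# column in place so that it sums to 1.
library(data.table)
dt <- data.table(Gain = c(3, 1), Cover = c(2, 2), Frequency = c(1, 3))
dt[, `:=`(
Gain = Gain / sum(Gain)
, Cover = Cover / sum(Cover)
, Frequency = Frequency / sum(Frequency)
)]
dt  # Gain = 0.75/0.25, Cover = 0.5/0.5, Frequency = 0.25/0.75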
# Return importance table
......
......@@ -69,12 +69,21 @@ lgb.interprete <- function(model,
)
# Get list of trees
tree_index_mat_list <- lapply(leaf_index_mat_list,
FUN = function(x) matrix(seq_len(length(x)) - 1, ncol = num_class, byrow = TRUE))
tree_index_mat_list <- lapply(
X = leaf_index_mat_list
, FUN = function(x) {
matrix(seq_len(length(x)) - 1, ncol = num_class, byrow = TRUE)
}
)
# Sequence over idxset
for (i in seq_along(idxset)) {
tree_interpretation_dt_list[[i]] <- single.row.interprete(tree_dt, num_class, tree_index_mat_list[[i]], leaf_index_mat_list[[i]])
tree_interpretation_dt_list[[i]] <- single.row.interprete(
tree_dt
, num_class
, tree_index_mat_list[[i]]
, leaf_index_mat_list[[i]]
)
}
# Return interpretation list
......@@ -122,7 +131,10 @@ single.tree.interprete <- function(tree_dt,
leaf_to_root(leaf_dt[["leaf_parent"]], leaf_dt[["leaf_value"]])
# Return formatted data.table
data.table::data.table(Feature = feature_seq, Contribution = diff.default(value_seq))
data.table::data.table(
Feature = feature_seq
, Contribution = diff.default(value_seq)
)
}
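# Aside (illustration only, made-up values): contributions are the successive
# differences of node values along the root-to-leaf path.
value_seq <- c(0.00, 0.30, 0.25, 0.60)
diff.default(value_seq)  # 0.30 -0.05 0.35, one contribution per traversed split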
......@@ -198,16 +210,22 @@ single.row.interprete <- function(tree_dt, num_class, tree_index_mat, leaf_index
} else {
# Full interpretation elements
tree_interpretation_dt <- Reduce(f = function(x, y) merge(x, y, by = "Feature", all = TRUE),
x = tree_interpretation)
tree_interpretation_dt <- Reduce(
f = function(x, y) {
merge(x, y, by = "Feature", all = TRUE)
}
, x = tree_interpretation
)
# Loop throughout each tree
for (j in 2:ncol(tree_interpretation_dt)) {
data.table::set(tree_interpretation_dt,
i = which(is.na(tree_interpretation_dt[[j]])),
j = j,
value = 0)
data.table::set(
tree_interpretation_dt
, i = which(is.na(tree_interpretation_dt[[j]]))
, j = j
, value = 0
)
}
......
......@@ -35,9 +35,14 @@
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' params <- list(objective = "binary",
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
#' params <- list(
#' objective = "binary"
#' , learning_rate = 0.01
#' , num_leaves = 63
#' , max_depth = -1
#' , min_data_in_leaf = 1
#' , min_sum_hessian_in_leaf = 1
#' )
#' model <- lgb.train(params, dtrain, 10)
#'
#' tree_dt <- lgb.model.dt.tree(model)
......@@ -51,11 +56,13 @@ lgb.model.dt.tree <- function(model, num_iteration = NULL) {
json_model <- lgb.dump(model, num_iteration = num_iteration)
# Parse json model second
parsed_json_model <- jsonlite::fromJSON(json_model,
simplifyVector = TRUE,
simplifyDataFrame = FALSE,
simplifyMatrix = FALSE,
flatten = FALSE)
parsed_json_model <- jsonlite::fromJSON(
json_model
, simplifyVector = TRUE
, simplifyDataFrame = FALSE
, simplifyMatrix = FALSE
, flatten = FALSE
)
# Parse tree model third
tree_list <- lapply(parsed_json_model$tree_info, single.tree.parse)
......@@ -89,21 +96,23 @@ single.tree.parse <- function(lgb_tree) {
if (is.null(env)) {
# Setup initial default data.table with default types
env <- new.env(parent = emptyenv())
env$single_tree_dt <- data.table::data.table(tree_index = integer(0),
depth = integer(0),
split_index = integer(0),
split_feature = integer(0),
node_parent = integer(0),
leaf_index = integer(0),
leaf_parent = integer(0),
split_gain = numeric(0),
threshold = numeric(0),
decision_type = character(0),
default_left = character(0),
internal_value = integer(0),
internal_count = integer(0),
leaf_value = integer(0),
leaf_count = integer(0))
env$single_tree_dt <- data.table::data.table(
tree_index = integer(0)
, depth = integer(0)
, split_index = integer(0)
, split_feature = integer(0)
, node_parent = integer(0)
, leaf_index = integer(0)
, leaf_parent = integer(0)
, split_gain = numeric(0)
, threshold = numeric(0)
, decision_type = character(0)
, default_left = character(0)
, internal_value = integer(0)
, internal_count = integer(0)
, leaf_value = integer(0)
, leaf_count = integer(0)
)
# start tree traversal
pre_order_traversal(env, tree_node_leaf, current_depth, parent_index)
} else {
......@@ -127,14 +136,18 @@ single.tree.parse <- function(lgb_tree) {
fill = TRUE)
# Traverse tree again both left and right
pre_order_traversal(env,
tree_node_leaf$left_child,
current_depth = current_depth + 1L,
parent_index = tree_node_leaf$split_index)
pre_order_traversal(env,
tree_node_leaf$right_child,
current_depth = current_depth + 1L,
parent_index = tree_node_leaf$split_index)
pre_order_traversal(
env
, tree_node_leaf$left_child
, current_depth = current_depth + 1L
, parent_index = tree_node_leaf$split_index
)
pre_order_traversal(
env
, tree_node_leaf$right_child
, current_depth = current_depth + 1L
, parent_index = tree_node_leaf$split_index
)
} else if (!is.null(tree_node_leaf$leaf_index)) {
......
......@@ -43,7 +43,11 @@ lgb.plot.importance <- function(tree_imp,
cex = NULL) {
# Check for measurement (column names) correctness
measure <- match.arg(measure, choices = c("Gain", "Cover", "Frequency"), several.ok = FALSE)
measure <- match.arg(
measure
, choices = c("Gain", "Cover", "Frequency")
, several.ok = FALSE
)
# Get top N importance (defaults to 10)
top_n <- min(top_n, nrow(tree_imp))
......@@ -72,14 +76,14 @@ lgb.plot.importance <- function(tree_imp,
# Do plot
tree_imp[.N:1,
graphics::barplot(
height = get(measure),
names.arg = Feature,
horiz = TRUE,
border = NA,
main = "Feature Importance",
xlab = measure,
cex.names = cex,
las = 1
height = get(measure)
, names.arg = Feature
, horiz = TRUE
, border = NA
, main = "Feature Importance"
, xlab = measure
, cex.names = cex
, las = 1
)]
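# Aside (illustration only): tree_imp[.N:1, ...] runs barplot() inside
# data.table's j argument with the rows reversed, so the most important
# feature ends up at the top of the horizontal bar chart.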
# Return invisibly
......
......@@ -9,8 +9,8 @@
#' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}.
#'
#' @details
#' The graph represents each feature as a horizontal bar of length proportional to the defined contribution of a feature.
#' Features are shown ranked in a decreasing contribution order.
#' The graph represents each feature as a horizontal bar of length proportional to the defined
#' contribution of a feature. Features are shown ranked in a decreasing contribution order.
#'
#' @return
#' The \code{lgb.plot.interpretation} function creates a \code{barplot}.
......@@ -26,9 +26,14 @@
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#'
#' params <- list(objective = "binary",
#' learning_rate = 0.01, num_leaves = 63, max_depth = -1,
#' min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
#' params <- list(
#' objective = "binary"
#' , learning_rate = 0.01
#' , num_leaves = 63
#' , max_depth = -1
#' , min_data_in_leaf = 1
#' , min_sum_hessian_in_leaf = 1
#' )
#' model <- lgb.train(params, dtrain, 10)
#'
#' tree_interpretation <- lgb.interprete(model, test$data, 1:5)
......@@ -67,16 +72,21 @@ lgb.plot.interpretation <- function(tree_interpretation_dt,
if (num_class == 1) {
# Only one class, plot straight away
multiple.tree.plot.interpretation(tree_interpretation_dt,
top_n = top_n,
title = NULL,
cex = cex)
multiple.tree.plot.interpretation(
tree_interpretation_dt
, top_n = top_n
, title = NULL
, cex = cex
)
} else {
# More than one class, shape data first
layout_mat <- matrix(seq.int(to = cols * ceiling(num_class / cols)),
ncol = cols, nrow = ceiling(num_class / cols))
layout_mat <- matrix(
seq.int(to = cols * ceiling(num_class / cols))
, ncol = cols
, nrow = ceiling(num_class / cols)
)
# Shape output
graphics::par(mfcol = c(nrow(layout_mat), ncol(layout_mat)))
......@@ -119,14 +129,14 @@ multiple.tree.plot.interpretation <- function(tree_interpretation,
# Do plot
tree_interpretation[.N:1,
graphics::barplot(
height = Contribution,
names.arg = Feature,
horiz = TRUE,
col = ifelse(Contribution > 0, "firebrick", "steelblue"),
border = NA,
main = title,
cex.names = cex,
las = 1
height = Contribution
, names.arg = Feature
, horiz = TRUE
, col = ifelse(Contribution > 0, "firebrick", "steelblue")
, border = NA
, main = title
, cex.names = cex
, las = 1
)]
# Return invisibly
......
#' Data preparator for LightGBM datasets (numeric)
#'
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric without integers. Please use \code{lgb.prepare_rules} if you want to apply this transformation to other datasets.
#' Attempts to prepare a clean dataset to put into a \code{lgb.Dataset}.
#' Factors and characters are converted to numeric (not integer). Please use
#' \code{lgb.prepare_rules} if you want to apply this transformation to other datasets.
#'
#' @param data A data.frame or data.table to prepare.
#'
#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
#' for input in \code{lgb.Dataset}.
#'
#' @examples
#' library(lightgbm)
......@@ -71,7 +74,11 @@ lgb.prepare <- function(data) {
} else {
# What do you think you are doing here? Throw error.
stop("lgb.prepare2: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
stop(
"lgb.prepare2: you provided "
, paste(class(data), collapse = " & ")
, " but data should have class data.frame"
)
}
......
#' Data preparator for LightGBM datasets (integer)
#'
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric (specifically: integer). Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM.
#' Attempts to prepare a clean dataset to put into a \code{lgb.Dataset}.
#' Factors and characters are converted to numeric (specifically: integer).
#' Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets.
#' This is useful if you have a specific need for integer dataset instead of numeric dataset.
#' Note that there are programs which do not support integer-only input. Consider this as a half
#' memory technique which is dangerous, especially for LightGBM.
#'
#' @param data A data.frame or data.table to prepare.
#'
#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
#' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix})
#' for input in \code{lgb.Dataset}.
#'
#' @examples
#' library(lightgbm)
......
#' Data preparator for LightGBM datasets with rules (numeric)
#'
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric. In addition, keeps rules created so you can convert other datasets using this converter.
#' Attempts to prepare a clean dataset to put into a \code{lgb.Dataset}.
#' Factors and characters are converted to numeric. In addition, keeps rules created
#' so you can convert other datasets using this converter.
#'
#' @param data A data.frame or data.table to prepare.
#' @param rules A set of rules from the data preparator, if already used.
#'
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
#' The data must be converted to a matrix format (\code{as.matrix}) for input
#' in \code{lgb.Dataset}.
#'
#' @examples
#' library(lightgbm)
......@@ -160,7 +164,11 @@ lgb.prepare_rules <- function(data, rules = NULL) {
} else {
# What do you think you are doing here? Throw error.
stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
stop(
"lgb.prepare: you provided "
, paste(class(data), collapse = " & ")
, " but data should have class data.frame"
)
}
......
#' Data preparator for LightGBM datasets with rules (integer)
#'
#' Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}. Factors and characters are converted to numeric (specifically: integer). In addition, keeps rules created so you can convert other datasets using this converter. This is useful if you have a specific need for integer dataset instead of numeric dataset. Note that there are programs which do not support integer-only input. Consider this as a half memory technique which is dangerous, especially for LightGBM.
#' Attempts to prepare a clean dataset to put into a \code{lgb.Dataset}.
#' Factors and characters are converted to numeric (specifically: integer).
#' In addition, keeps rules created so you can convert other datasets using this converter.
#' This is useful if you have a specific need for integer dataset instead of numeric dataset.
#' Note that there are programs which do not support integer-only input.
#' Consider this as a half memory technique which is dangerous, especially for LightGBM.
#'
#' @param data A data.frame or data.table to prepare.
#' @param rules A set of rules from the data preparator, if already used.
#'
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in \code{lgb.Dataset}.
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
#' The data must be converted to a matrix format (\code{as.matrix}) for input in
#' \code{lgb.Dataset}.
#'
#' @examples
#' library(lightgbm)
......@@ -35,9 +42,13 @@
#' data(iris) # Erase iris dataset
#'
#' # We remapped values differently
#' personal_rules <- list(Species = c("setosa" = 3L,
#' "versicolor" = 2L,
#' "virginica" = 1L))
#' personal_rules <- list(
#' Species = c(
#' "setosa" = 3L
#' , "versicolor" = 2L
#' , virginica" = 1L
#' )
#' )
#' newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules)
#' str(newest_iris$data) # SUCCESS!
#'
......@@ -158,7 +169,11 @@ lgb.prepare_rules2 <- function(data, rules = NULL) {
} else {
# What do you think you are doing here? Throw error.
stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
stop(
"lgb.prepare: you provided "
, paste(class(data), collapse = " & ")
, " but data should have class data.frame"
)
}
......
......@@ -12,9 +12,10 @@
#' @param categorical_feature list of str or int
#' type int represents index,
#' type str represents feature names
#' @param callbacks list of callback functions
#' List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets
#' @param callbacks List of callback functions that are applied at each iteration.
#' @param reset_data Boolean, setting it to TRUE (not the default value) will transform the
#' booster model into a predictor model which frees up memory and the
#' original datasets
#' @param ... other parameters, see Parameters.rst for more information. A few key parameters:
#' \itemize{
#' \item{boosting}{Boosting type. \code{"gbdt"} or \code{"dart"}}
......@@ -37,13 +38,15 @@
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#' dtrain,
#' 10,
#' valids,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' @export
lgb.train <- function(params = list(),
data,
......@@ -105,7 +108,17 @@ lgb.train <- function(params = list(),
begin_iteration <- predictor$current_iter() + 1
}
# Check for number of rounds passed as parameter - in case there are multiple ones, take only the first one
n_rounds <- c("num_iterations", "num_iteration", "n_iter", "num_tree", "num_trees", "num_round", "num_rounds", "num_boost_round", "n_estimators")
n_rounds <- c(
"num_iterations"
, "num_iteration"
, "n_iter"
, "num_tree"
, "num_trees"
, "num_round"
, "num_rounds"
, "num_boost_round"
, "n_estimators"
)
if (any(names(params) %in% n_rounds)) {
end_iteration <- begin_iteration + params[[which(names(params) %in% n_rounds)[1]]] - 1
} else {
......@@ -198,12 +211,24 @@ lgb.train <- function(params = list(),
early_stop <- c("early_stopping_round", "early_stopping_rounds", "early_stopping", "n_iter_no_change")
if (any(names(params) %in% early_stop)) {
if (params[[which(names(params) %in% early_stop)[1]]] > 0) {
callbacks <- add.cb(callbacks, cb.early.stop(params[[which(names(params) %in% early_stop)[1]]], verbose = verbose))
callbacks <- add.cb(
callbacks
, cb.early.stop(
params[[which(names(params) %in% early_stop)[1]]]
, verbose = verbose
)
)
}
} else {
if (!is.null(early_stopping_rounds)) {
if (early_stopping_rounds > 0) {
callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, verbose = verbose))
callbacks <- add.cb(
callbacks
, cb.early.stop(
early_stopping_rounds
, verbose = verbose
)
)
}
}
}
......@@ -267,7 +292,8 @@ lgb.train <- function(params = list(),
}
# When early stopping is not activated, we compute the best iteration / score ourselves by selecting the first metric and the first dataset
# When early stopping is not activated, we compute the best iteration / score ourselves by
# selecting the first metric and the first dataset
if (record && length(valids) > 0 && is.na(env$best_score)) {
if (env$eval_list[[1]]$higher_better[1] == TRUE) {
booster$best_iter <- unname(which.max(unlist(booster$record_evals[[2]][[1]][[1]])))
......@@ -282,9 +308,11 @@ lgb.train <- function(params = list(),
if (reset_data) {
# Store temporarily model data elsewhere
booster_old <- list(best_iter = booster$best_iter,
best_score = booster$best_score,
record_evals = booster$record_evals)
booster_old <- list(
best_iter = booster$best_iter
, best_score = booster$best_score
, record_evals = booster$record_evals
)
# Reload model
booster <- lgb.load(model_str = booster$save_model_to_string())
......
......@@ -2,9 +2,13 @@
#'
#' Attempts to unload LightGBM packages so you can remove objects cleanly without having to restart R.
#' This is useful, for instance, if an object becomes stuck for no apparent reason and you do not
#' want to restart R to fix the lost object.
#'
#' @param restore Whether to reload \code{LightGBM} immediately after detaching from R. Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once unloading is performed.
#' @param wipe Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global environment. Defaults to \code{FALSE} which means to not remove them.
#' @param envir The environment to perform wiping on if \code{wipe == TRUE}. Defaults to \code{.GlobalEnv} which is the global environment.
#' @param restore Whether to reload \code{LightGBM} immediately after detaching from R.
#' Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once
#' unloading is performed.
#' @param wipe Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global
#' environment. Defaults to \code{FALSE} which means to not remove them.
#' @param envir The environment to perform wiping on if \code{wipe == TRUE}. Defaults to
#' \code{.GlobalEnv} which is the global environment.
#'
#' @return NULL invisibly.
#'
......@@ -18,13 +22,15 @@
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#' dtrain,
#' 10,
#' valids,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#'
#' \dontrun{
#' lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv)
......@@ -43,8 +49,18 @@ lgb.unloader <- function(restore = TRUE, wipe = FALSE, envir = .GlobalEnv) {
# Should we wipe variables? (lgb.Booster, lgb.Dataset)
if (wipe) {
boosters <- Filter(function(x) inherits(get(x, envir = envir), "lgb.Booster"), ls(envir = envir))
datasets <- Filter(function(x) inherits(get(x, envir = envir), "lgb.Dataset"), ls(envir = envir))
boosters <- Filter(
f = function(x) {
inherits(get(x, envir = envir), "lgb.Booster")
}
, x = ls(envir = envir)
)
datasets <- Filter(
f = function(x) {
inherits(get(x, envir = envir), "lgb.Dataset")
}
, x = ls(envir = envir)
)
rm(list = c(boosters, datasets), envir = envir)
gc(verbose = FALSE)
}
......
......@@ -4,12 +4,10 @@
#' @param callbacks list of callback functions
#' List of callback functions that are applied at each iteration.
#' @param data a \code{lgb.Dataset} object, used for training
#' @param early_stopping_rounds int
#' Activates early stopping.
#' Requires at least one validation data and one metric
#' If there's more than one, will check all of them except the training data
#' Returns the model with (best_iter + early_stopping_rounds)
#' If early stopping occurs, the model will have 'best_iter' field
#' @param early_stopping_rounds int. Activates early stopping. Requires at least one validation data
#' and one metric. If there's more than one, will check all of them
#' except the training data. Returns the model with (best_iter + early_stopping_rounds).
#' If early stopping occurs, the model will have 'best_iter' field.
#' @param eval_freq evaluation output frequency, only has an effect when verbose > 0
#' @param init_model path of model file or \code{lgb.Booster} object, will continue training from this model
#' @param nrounds number of training rounds
......@@ -76,9 +74,18 @@ lightgbm <- function(data,
}
# Train a model using the regular way
bst <- lgb.train(params, dtrain, nrounds, valids, verbose = verbose, eval_freq = eval_freq,
early_stopping_rounds = early_stopping_rounds,
init_model = init_model, callbacks = callbacks, ...)
bst <- lgb.train(
params = params
, data = dtrain
, nrounds = nrounds
, valids = valids
, verbose = verbose
, eval_freq = eval_freq
, early_stopping_rounds = early_stopping_rounds
, init_model = init_model
, callbacks = callbacks
, ...
)
# Store model under a specific name
bst$save_model(save_name)
......
......@@ -17,13 +17,15 @@
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#' dtrain,
#' 10,
#' valids,
#' min_data = 1,
#' learning_rate = 1,
#' early_stopping_rounds = 5)
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' )
#' saveRDS.lgb.Booster(model, "model.rds")
#' new_model <- readRDS.lgb.Booster("model.rds")
#'
......