"vscode:/vscode.git/clone" did not exist on "3abff370bb353293e4a03e516111dd02785fbd97"
Commit b6c973af authored by Laurae's avatar Laurae Committed by Guolin Ke
Browse files

[R-package] Improvements, readability, and bug fixes (#378)

* Define environment in examples (xgboost clash)

* Large R code changes
parent e9275fb9
......@@ -25,6 +25,7 @@ export(lgb.plot.importance)
export(lgb.plot.interpretation)
export(lgb.save)
export(lgb.train)
export(lgb.unloader)
export(lightgbm)
export(readRDS.lgb.Booster)
export(saveRDS.lgb.Booster)
......
# Environment shared between the training loop and its callbacks
CB_ENV <- R6Class(
  "lgb.cb_env",
  cloneable = FALSE,
  public = list(
    model = NULL,            # Booster being trained
    iteration = NULL,        # Current boosting iteration
    begin_iteration = NULL,  # First iteration of this training run
    end_iteration = NULL,    # Last iteration of this training run
    eval_list = list(),      # Evaluation results of the current iteration
    eval_err_list = list(),  # Evaluation errors (e.g. from cross-validation)
    best_iter = -1,          # Best iteration seen so far (-1 when unknown)
    met_early_stop = FALSE   # TRUE once early stopping has triggered
  )
)
# Callback that resets booster parameters before each iteration.
#
# new_params: a list; each element is either a vector of per-iteration values
#   (length equal to the number of boosting rounds) or a function of
#   (iteration, nrounds) returning the value for that iteration.
# Returns a pre-iteration callback function(env).
cb.reset.parameters <- function(new_params) {
  # Check for parameter list
  if (!is.list(new_params)) {
    stop(sQuote("new_params"), " must be a list")
  }
  # Deparse parameter list: normalize "a.b" style names to "a_b"
  pnames <- gsub("\\.", "_", names(new_params))
  nrounds <- NULL
  # Run some checks in the beginning
  init <- function(env) {
    # Store boosting rounds
    nrounds <<- env$end_iteration - env$begin_iteration + 1
    # Check for model environment
    if (is.null(env$model)) {
      stop("Env should have a ", sQuote("model"))
    }
    # Some parameters are not allowed to be changed,
    # since changing them would simply wreck some chaos
    not_allowed <- c("num_class", "metric", "boosting_type")
    if (any(pnames %in% not_allowed)) {
      stop("Parameters ", paste0(pnames[pnames %in% not_allowed], collapse = ", "), " cannot be changed during boosting")
    }
    # Check parameter names
    for (n in pnames) {
      # Set name
      p <- new_params[[n]]
      if (is.function(p)) {
        # A function parameter must take exactly (iteration, nrounds)
        if (length(formals(p)) != 2) {
          stop("Parameter ", sQuote(n), " is a function but not of two arguments")
        }
      } else if (is.numeric(p) || is.character(p)) {
        # A vector parameter must supply one value per boosting round
        if (length(p) != nrounds) {
          stop("Length of ", sQuote(n), " has to be equal to length of ", sQuote("nrounds"))
        }
      } else {
        stop("Parameter ", sQuote(n), " is not a function or a vector")
      }
    }
  }
  callback <- function(env) {
    # Run the one-time checks on first invocation
    if (is.null(nrounds)) {
      init(env)
    }
    # Store iteration (offset from the first iteration)
    i <- env$iteration - env$begin_iteration
    # Compute this iteration's value for every parameter
    pars <- lapply(new_params, function(p) {
      if (is.function(p)) {
        return(p(i, nrounds))
      }
      p[i]
    })
    # To-do check pars
    if (!is.null(env$model)) {
      env$model$reset_parameter(pars)
    }
  }
  attr(callback, "call") <- match.call()
  attr(callback, "is_pre_iteration") <- TRUE
  attr(callback, "name") <- "cb.reset.parameters"
  callback
}
# Format a single evaluation result as "<data>'s <metric>:<value>[+<err>]"
format.eval.string <- function(eval_res, eval_err = NULL) {
  # Check for empty evaluation result
  if (is.null(eval_res) || length(eval_res) == 0) {
    stop("no evaluation results")
  }
  # Append the error term when one is supplied (e.g. from cross-validation)
  if (!is.null(eval_err)) {
    sprintf("%s's %s:%g+%g", eval_res$data_name, eval_res$name, eval_res$value, eval_err)
  } else {
    sprintf("%s's %s:%g", eval_res$data_name, eval_res$name, eval_res$value)
  }
}
# Merge all evaluation results in env into one tab-separated message
merge.eval.string <- function(env) {
  # Nothing to report without evaluation results
  if (length(env$eval_list) <= 0) {
    return("")
  }
  # Start the message with the iteration number
  msg <- list(sprintf("[%d]:", env$iteration))
  # Errors are present only when the error list is populated
  is_eval_err <- length(env$eval_err_list) > 0
  # Loop through evaluation list
  for (j in seq_along(env$eval_list)) {
    # Store evaluation error
    eval_err <- NULL
    if (is_eval_err) {
      eval_err <- env$eval_err_list[[j]]
    }
    # Append the formatted evaluation result
    msg <- c(msg, format.eval.string(env$eval_list[[j]], eval_err))
  }
  # Return tab-separated message
  paste0(msg, collapse = "\t")
}
# Callback that prints evaluation results every `period` iterations
cb.print.evaluation <- function(period = 1) {
  # Create callback
  callback <- function(env) {
    # A non-positive period disables printing entirely
    if (period > 0) {
      # Store iteration
      i <- env$iteration
      # Print on the period, and always on the first and last iteration.
      # (|| rather than |: these are scalar conditions)
      if ((i - 1) %% period == 0 || i == env$begin_iteration || i == env$end_iteration) {
        # Merge evaluation string
        msg <- merge.eval.string(env)
        # Print only when there is something to show (reuse msg rather
        # than recomputing merge.eval.string)
        if (nchar(msg) > 0) {
          cat(msg, "\n")
        }
      }
    }
  }
  # Store attributes
  attr(callback, "call") <- match.call()
  attr(callback, "name") <- "cb.print.evaluation"
  # Return callback
  callback
}
# Callback that records per-iteration evaluation results on the booster
cb.record.evaluation <- function() {
  # Create callback
  callback <- function(env) {
    # Nothing to record without evaluation results
    if (length(env$eval_list) <= 0) {
      return()
    }
    # Errors are present only when the error list is populated
    is_eval_err <- length(env$eval_err_list) > 0
    # First call: initialize the record structure on the model
    if (length(env$model$record_evals) == 0) {
      # Loop through each evaluation list element
      for (j in seq_along(env$eval_list)) {
        # Store names
        data_name <- env$eval_list[[j]]$data_name
        name <- env$eval_list[[j]]$name
        env$model$record_evals$start_iter <- env$begin_iteration
        # Check if evaluation record exists
        if (is.null(env$model$record_evals[[data_name]])) {
          env$model$record_evals[[data_name]] <- list()
        }
        # Create empty lists for values and errors
        env$model$record_evals[[data_name]][[name]] <- list()
        env$model$record_evals[[data_name]][[name]]$eval <- list()
        env$model$record_evals[[data_name]][[name]]$eval_err <- list()
      }
    }
    # Append this iteration's results
    for (j in seq_along(env$eval_list)) {
      # Get evaluation data
      eval_res <- env$eval_list[[j]]
      eval_err <- NULL
      if (is_eval_err) {
        eval_err <- env$eval_err_list[[j]]
      }
      # Store names
      data_name <- eval_res$data_name
      name <- eval_res$name
      # Store evaluation data
      env$model$record_evals[[data_name]][[name]]$eval <- c(env$model$record_evals[[data_name]][[name]]$eval, eval_res$value)
      env$model$record_evals[[data_name]][[name]]$eval_err <- c(env$model$record_evals[[data_name]][[name]]$eval_err, eval_err)
    }
  }
  # Store attributes
  attr(callback, "call") <- match.call()
  attr(callback, "name") <- "cb.record.evaluation"
  # Return callback
  callback
}
# Callback that stops training when no metric improves for `stopping_rounds`
cb.early.stop <- function(stopping_rounds, verbose = TRUE) {
  # Closure state, filled in by init() on the first callback invocation
  factor_to_bigger_better <- NULL
  best_iter <- NULL
  best_score <- NULL
  best_msg <- NULL
  eval_len <- NULL
  # Initialization function
  init <- function(env) {
    # Store evaluation length
    eval_len <<- length(env$eval_list)
    # Early stopping cannot work without metrics
    if (eval_len == 0) {
      stop("For early stopping, valids must have at least one element")
    }
    # Check if verbose or not
    if (isTRUE(verbose)) {
      cat("Will train until there is no improvement in ", stopping_rounds, " rounds.\n\n", sep = "")
    }
    # Maximization or minimization task: scores are always compared with >,
    # so minimized metrics are sign-flipped via factor -1
    factor_to_bigger_better <<- rep(1.0, eval_len)
    best_iter <<- rep(-1, eval_len)
    best_score <<- rep(-Inf, eval_len)
    best_msg <<- list()
    # Loop through evaluation elements
    for (i in seq_len(eval_len)) {
      # Prepend message
      best_msg <<- c(best_msg, "")
      # Check if maximization or minimization
      if (!env$eval_list[[i]]$higher_better) {
        factor_to_bigger_better[i] <<- -1.0
      }
    }
  }
  # Create callback
  callback <- function(env, finalize = FALSE) {
    # Initialize state on first invocation
    if (is.null(eval_len)) {
      init(env)
    }
    # Store iteration
    cur_iter <- env$iteration
    # Loop through evaluation
    for (i in seq_len(eval_len)) {
      # Store score, sign-adjusted so bigger is always better
      score <- env$eval_list[[i]]$value * factor_to_bigger_better[i]
      # Check if score is better
      if (score > best_score[i]) {
        # Store new scores
        best_score[i] <<- score
        best_iter[i] <<- cur_iter
        # Prepare to print if verbose
        if (verbose) {
          best_msg[[i]] <<- as.character(merge.eval.string(env))
        }
      } else {
        # Check if early stopping is required
        if (cur_iter - best_iter[i] >= stopping_rounds) {
          # Record the best iteration on the model, when one is attached
          if (!is.null(env$model)) {
            env$model$best_iter <- best_iter[i]
          }
          # Print message if verbose
          if (isTRUE(verbose)) {
            cat("Early stopping, best iteration is:", "\n")
            cat(best_msg[[i]], "\n")
          }
          # Store best iteration and stop
          env$best_iter <- best_iter[i]
          env$met_early_stop <- TRUE
        }
      }
    }
  }
  # Set attributes
  attr(callback, "call") <- match.call()
  attr(callback, "name") <- "cb.early.stop"
  # Return callback
  callback
}
# Collect the "name" attribute of every callback into a character vector
callback.names <- function(cb_list) {
  unlist(lapply(cb_list, function(cb) attr(cb, "name")))
}
# Append a callback to the list, keeping cb.early.stop last so it sees the
# final evaluation state of each iteration
add.cb <- function(cb_list, cb) {
  # Combine two elements
  cb_list <- c(cb_list, cb)
  # Set names of elements
  names(cb_list) <- callback.names(cb_list)
  # Check for existence
  if ("cb.early.stop" %in% names(cb_list)) {
    # Re-append it at the end
    cb_list <- c(cb_list, cb_list["cb.early.stop"])
    # This removes only the first occurrence
    cb_list["cb.early.stop"] <- NULL
  }
  # Return element
  cb_list
}
# Split callbacks into those run before vs after each iteration
categorize.callbacks <- function(cb_list) {
  # A callback is pre-iteration when its "is_pre_iteration" attribute is TRUE
  list(
    pre_iter = Filter(function(x) {
      pre <- attr(x, "is_pre_iteration")
      !is.null(pre) && pre
    }, cb_list),
    post_iter = Filter(function(x) {
      pre <- attr(x, "is_pre_iteration")
      is.null(pre) || !pre
    }, cb_list)
  )
}
# Booster: handle-backed wrapper around a LightGBM model
Booster <- R6Class(
  classname = "lgb.Booster",
  cloneable = FALSE,
  public = list(
    best_iter = -1,        # Best iteration found by early stopping (-1 if none)
    record_evals = list(), # Per-iteration evaluation history
# Finalize will free up the handles
finalize = function() {
  # Check the need for freeing handle
  if (!lgb.is.null.handle(private$handle)) {
    # Freeing up handle
    lgb.call("LGBM_BoosterFree_R", ret = NULL, private$handle)
    private$handle <- NULL
  }
},
# Initialize will create a starter booster, either from a training
# dataset or from a saved model file
initialize = function(params = list(),
                      train_set = NULL,
                      modelfile = NULL,
                      ...) {
  # Create parameters and handle
  params <- append(params, list(...))
  params_str <- lgb.params2str(params)
  handle <- lgb.new.handle()
  # Check if training dataset is not null
  if (!is.null(train_set)) {
    # Check if training dataset is lgb.Dataset or not
    if (!lgb.check.r6.class(train_set, "lgb.Dataset")) {
      stop("lgb.Booster: Can only use lgb.Dataset as training data")
    }
    # Store booster handle
    handle <- lgb.call("LGBM_BoosterCreate_R", ret = handle, train_set$.__enclos_env__$private$get_handle(), params_str)
    # Create private booster information
    private$train_set <- train_set
    private$num_dataset <- 1
    private$init_predictor <- train_set$.__enclos_env__$private$predictor
    # Check if predictor is existing
    if (!is.null(private$init_predictor)) {
      # Merge booster with the initial predictor
      lgb.call("LGBM_BoosterMerge_R",
               ret = NULL,
               handle,
               private$init_predictor$.__enclos_env__$private$handle)
    }
    # Mark current iteration as not yet predicted
    private$is_predicted_cur_iter <- c(private$is_predicted_cur_iter, FALSE)
  } else if (!is.null(modelfile)) {
    # Do we have a model file as character?
    if (!is.character(modelfile)) {
      stop("lgb.Booster: Can only use a string as model file path")
    }
    # Create booster from model
    handle <- lgb.call("LGBM_BoosterCreateFromModelfile_R",
                       ret = handle,
                       lgb.c_str(modelfile))
  } else {
    # Booster non existent
    stop("lgb.Booster: Need at least either training dataset or model file to create booster instance")
  }
  # Create class
  class(handle) <- "lgb.Booster.handle"
  private$handle <- handle
  private$num_class <- 1L
  private$num_class <- lgb.call("LGBM_BoosterGetNumClasses_R",
                                ret = private$num_class,
                                private$handle)
},
# Set training data name
set_train_data_name = function(name) {
# Set name
private$name_train_set <- name
# Return the booster itself to allow call chaining
self
},
# Add validation data
add_valid = function(data, name) {
  # Check if data is lgb.Dataset
  if (!lgb.check.r6.class(data, "lgb.Dataset")) {
    stop("lgb.Booster.add_valid: Can only use lgb.Dataset as validation data")
  }
  # Check if predictors are identical
  if (!identical(data$.__enclos_env__$private$predictor, private$init_predictor)) {
    stop("lgb.Booster.add_valid: Failed to add validation data; you should use the same predictor for these data")
  }
  # Check if names are character
  if (!is.character(name)) {
    stop("lgb.Booster.add_valid: Can only use characters as data name")
  }
  # Add validation data to booster
  lgb.call("LGBM_BoosterAddValidData_R",
           ret = NULL,
           private$handle,
           data$.__enclos_env__$private$get_handle())
  # Store private information
  private$valid_sets <- c(private$valid_sets, data)
  private$name_valid_sets <- c(private$name_valid_sets, name)
  private$num_dataset <- private$num_dataset + 1
  private$is_predicted_cur_iter <- c(private$is_predicted_cur_iter, FALSE)
  # Return self
  return(self)
},
# Reset parameters of booster
reset_parameter = function(params, ...) {
  # Append parameters
  params <- append(params, list(...))
  # Bug fix: was algb.params2str (undefined name); must be lgb.params2str
  params_str <- lgb.params2str(params)
  # Reset parameters
  lgb.call("LGBM_BoosterResetParameter_R",
           ret = NULL,
           private$handle,
           params_str)
  # Return self
  return(self)
},
# Perform boosting update iteration
update = function(train_set = NULL, fobj = NULL) {
  # Check if training set is not null
  if (!is.null(train_set)) {
    # Check if training set is lgb.Dataset
    if (!lgb.check.r6.class(train_set, "lgb.Dataset")) {
      stop("lgb.Booster.update: Only can use lgb.Dataset as training data")
    }
    # Check if predictors are identical
    if (!identical(train_set$predictor, private$init_predictor)) {
      stop("lgb.Booster.update: Change train_set failed, you should use the same predictor for these data")
    }
    # Reset training data on booster
    lgb.call("LGBM_BoosterResetTrainingData_R",
             ret = NULL,
             private$handle,
             train_set$.__enclos_env__$private$get_handle())
    # Store private train set (<- instead of =)
    private$train_set <- train_set
  }
  # Check if objective is empty
  if (is.null(fobj)) {
    # Boost iteration from known objective
    ret <- lgb.call("LGBM_BoosterUpdateOneIter_R", ret = NULL, private$handle)
  } else {
    # Check if objective is function
    if (!is.function(fobj)) {
      stop("lgb.Booster.update: fobj should be a function")
    }
    # Perform objective calculation
    gpair <- fobj(private$inner_predict(1), private$train_set)
    # Check for gradient and hessian as list (|| since conditions are scalar)
    if (is.null(gpair$grad) || is.null(gpair$hess)) {
      stop("lgb.Booster.update: custom objective should return a list with attributes (hess, grad)")
    }
    # Boost one iteration from the custom gradient/hessian
    ret <- lgb.call("LGBM_BoosterUpdateOneIterCustom_R",
                    ret = NULL,
                    private$handle,
                    gpair$grad,
                    gpair$hess,
                    length(gpair$grad))
  }
  # Invalidate cached predictions for all datasets
  for (i in seq_along(private$is_predicted_cur_iter)) {
    private$is_predicted_cur_iter[[i]] <- FALSE
  }
  # Return result
  return(ret)
},
# Roll back one boosting iteration
rollback_one_iter = function() {
  # Undo the most recent boosting iteration
  lgb.call("LGBM_BoosterRollbackOneIter_R",
           ret = NULL,
           private$handle)
  # Invalidate cached predictions for all datasets
  for (i in seq_along(private$is_predicted_cur_iter)) {
    private$is_predicted_cur_iter[[i]] <- FALSE
  }
  # Return self
  return(self)
},
# Get current iteration
current_iter = function() {
  # 0L placeholder receives the iteration count from the C API
  cur_iter <- 0L
  lgb.call("LGBM_BoosterGetCurrentIteration_R",
           ret = cur_iter,
           private$handle)
},
# Evaluate data on metrics
eval = function(data, name, feval = NULL) {
  # Check if dataset is lgb.Dataset
  if (!lgb.check.r6.class(data, "lgb.Dataset")) {
    stop("lgb.Booster.eval: Can only use lgb.Dataset to eval")
  }
  # Resolve the dataset's index: 1 = training data, 2.. = validation sets
  data_idx <- 0
  if (identical(data, private$train_set)) {
    data_idx <- 1
  } else {
    # Check for validation data
    if (length(private$valid_sets) > 0) {
      # Loop through each validation set
      for (i in seq_along(private$valid_sets)) {
        # Check for identical validation data
        if (identical(data, private$valid_sets[[i]])) {
          # Found it; validation sets start at index 2
          data_idx <- i + 1
          break
        }
      }
    }
  }
  # Unknown dataset: register it as a new validation set
  if (data_idx == 0) {
    # Add validation data by name
    self$add_valid(data, name)
    data_idx <- private$num_dataset
  }
  # Evaluate data
  private$inner_eval(name, data_idx, feval)
},
# Evaluation training data
eval_train = function(feval = NULL) {
# Training data always sits at index 1; delegate to the inner evaluator
private$inner_eval(private$name_train_set, 1, feval)
},
# Evaluation validation data
eval_valid = function(feval = NULL) {
  # Create ret list (<- instead of =)
  ret <- list()
  # Check if validation is empty
  if (length(private$valid_sets) <= 0) {
    return(ret)
  }
  # Loop through each validation set (index 1 is the training data)
  for (i in seq_along(private$valid_sets)) {
    ret <- append(ret, private$inner_eval(private$name_valid_sets[[i]], i + 1, feval))
  }
  # Return ret
  return(ret)
},
# Save model
save_model = function(filename, num_iteration = NULL) {
  # Default to the best iteration when none is given
  if (is.null(num_iteration)) {
    num_iteration <- self$best_iter
  }
  # Save booster model
  lgb.call("LGBM_BoosterSaveModel_R",
           ret = NULL,
           private$handle,
           as.integer(num_iteration),
           lgb.c_str(filename))
  # Return self
  return(self)
},
# Dump model in memory
dump_model = function(num_iteration = NULL) {
  # Default to the best iteration when none is given
  if (is.null(num_iteration)) {
    num_iteration <- self$best_iter
  }
  # Return dumped model
  lgb.call.return.str("LGBM_BoosterDumpModel_R",
                      private$handle,
                      as.integer(num_iteration))
},
# Predict on new data
predict = function(data,
                   num_iteration = NULL,
                   rawscore = FALSE,
                   predleaf = FALSE,
                   header = FALSE,
                   reshape = FALSE) {
  # Default to the best iteration when none is given
  if (is.null(num_iteration)) {
    num_iteration <- self$best_iter
  }
  # Delegate prediction to a fresh Predictor on this handle
  predictor <- Predictor$new(private$handle)
  predictor$predict(data, num_iteration, rawscore, predleaf, header, reshape)
},
# Transform into predictor
to_predictor = function() {
  Predictor$new(private$handle)
},
# Used for save
raw = NA,
# Save model to temporary file for in-memory saving
save = function() {
# Create temporary file
temp <- tempfile()
# Save model to file
lgb.save(self, temp)
# Overwrite model in object
self$raw <- readChar(temp, file.info(temp)$size)
# Remove temporary file
file.remove(temp)
}
),
private = list(
  handle = NULL,                   # lgb.Booster.handle from the C API
  train_set = NULL,                # lgb.Dataset used for training
  name_train_set = "training",     # Display name of the training data
  valid_sets = list(),             # Registered validation lgb.Datasets
  name_valid_sets = list(),        # Display names of validation data
  predict_buffer = list(),         # Cached predictions, keyed by data name
  is_predicted_cur_iter = list(),  # Per-dataset prediction-cache validity flags
  num_class = 1,                   # Number of classes, queried from the C API
  num_dataset = 0,                 # Count of datasets (train + valid)
  init_predictor = NULL,           # Predictor used to initialize training
  eval_names = NULL,               # Cached metric names
  higher_better_inner_eval = NULL, # Per-metric maximization flags
# Predict on an internal dataset (1 = train, 2.. = validation), with caching
inner_predict = function(idx) {
  # Store data name
  data_name <- private$name_train_set
  # Indexes above 1 refer to validation sets
  if (idx > 1) {
    data_name <- private$name_valid_sets[[idx - 1]]
  }
  # Check for unknown dataset (over the maximum provided range)
  if (idx > private$num_dataset) {
    stop("data_idx should not be greater than num_dataset")
  }
  # Allocate the prediction buffer on first use
  if (is.null(private$predict_buffer[[data_name]])) {
    # Query the number of predictions for this dataset
    npred <- 0L
    npred <- lgb.call("LGBM_BoosterGetNumPredict_R",
                      ret = npred,
                      private$handle,
                      as.integer(idx - 1))
    private$predict_buffer[[data_name]] <- numeric(npred)
  }
  # Refresh the cache when the current iteration was not predicted yet
  if (!private$is_predicted_cur_iter[[idx]]) {
    # Use buffer
    private$predict_buffer[[data_name]] <- lgb.call("LGBM_BoosterGetPredict_R",
                                                    ret = private$predict_buffer[[data_name]],
                                                    private$handle,
                                                    as.integer(idx - 1))
    private$is_predicted_cur_iter[[idx]] <- TRUE
  }
  # Return prediction buffer
  return(private$predict_buffer[[data_name]])
},
# Get evaluation information
get_eval_info = function() {
  # Query and cache metric names on first call
  if (is.null(private$eval_names)) {
    # Get evaluation names
    names <- lgb.call.return.str("LGBM_BoosterGetEvalNames_R",
                                 private$handle)
    # Check names' length
    if (nchar(names) > 0) {
      # Parse the tab-separated list and store it privately
      names <- strsplit(names, "\t")[[1]]
      private$eval_names <- names
      private$higher_better_inner_eval <- rep(FALSE, length(names))
      # auc and ndcg* are maximized; everything else is minimized
      # (|| since conditions are scalar)
      for (i in seq_along(names)) {
        if (names[i] == "auc" || grepl("^ndcg", names[i])) {
          private$higher_better_inner_eval[i] <- TRUE
        }
      }
    }
  }
  # Return evaluation names
  return(private$eval_names)
},
# Perform inner evaluation
inner_eval = function(data_name, data_idx, feval = NULL) {
  # Check for unknown dataset (over the maximum provided range)
  if (data_idx > private$num_dataset) {
    stop("data_idx should not be greater than num_dataset")
  }
  # Refresh the cached evaluation information
  private$get_eval_info()
  # Prepare return
  ret <- list()
  # Built-in metrics
  if (length(private$eval_names) > 0) {
    # Fetch evaluation values from the C API
    tmp_vals <- numeric(length(private$eval_names))
    tmp_vals <- lgb.call("LGBM_BoosterGetEval_R",
                         ret = tmp_vals,
                         private$handle,
                         as.integer(data_idx - 1))
    # Loop through all evaluation names
    for (i in seq_along(private$eval_names)) {
      # Store evaluation and append to return
      res <- list()
      res$data_name <- data_name
      res$name <- private$eval_names[i]
      res$value <- tmp_vals[i]
      res$higher_better <- private$higher_better_inner_eval[i]
      ret <- append(ret, list(res))
    }
  }
  # Custom evaluation function
  if (!is.null(feval)) {
    # Check if evaluation metric is a function
    if (!is.function(feval)) {
      stop("lgb.Booster.eval: feval should be a function")
    }
    # Prepare data
    data <- private$train_set
    # Indexes above 1 refer to validation sets
    if (data_idx > 1) {
      data <- private$valid_sets[[data_idx - 1]]
    }
    # Perform function evaluation
    res <- feval(private$inner_predict(data_idx), data)
    # Check the returned list shape (|| since conditions are scalar)
    if (is.null(res$name) || is.null(res$value) || is.null(res$higher_better)) {
      stop("lgb.Booster.eval: custom eval function should return a list with attribute (name, value, higher_better)")
    }
    # Append names and evaluation
    res$data_name <- data_name
    ret <- append(ret, list(res))
  }
  # Return ret
  return(ret)
}
)
)
......@@ -322,32 +598,49 @@ Booster <- R6Class(
#'
#' When \code{predleaf = TRUE}, the output is a matrix object with the
#' number of columns corresponding to the number of trees.
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#'                    dtrain,
#'                    100,
#'                    valids,
#'                    min_data = 1,
#'                    learning_rate = 1,
#'                    early_stopping_rounds = 10)
#' preds <- predict(model, test$data)
#'
#' @rdname predict.lgb.Booster
#' @export
predict.lgb.Booster <- function(object, data,
                                num_iteration = NULL,
                                rawscore = FALSE,
                                predleaf = FALSE,
                                header = FALSE,
                                reshape = FALSE) {
  # Check booster existence
  if (!lgb.is.Booster(object)) {
    stop("predict.lgb.Booster: object should be an ", sQuote("lgb.Booster"))
  }
  # Return booster predictions
  object$predict(data,
                 num_iteration,
                 rawscore,
                 predleaf,
                 header,
                 reshape)
}
#' Load LightGBM model
......@@ -357,26 +650,41 @@ predict.lgb.Booster <- function(object, data,
#' @param filename path of model file
#'
#' @return booster
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#'                    dtrain,
#'                    100,
#'                    valids,
#'                    min_data = 1,
#'                    learning_rate = 1,
#'                    early_stopping_rounds = 10)
#' lgb.save(model, "model.txt")
#' load_booster <- lgb.load("model.txt")
#' }
#'
#' @rdname lgb.load
#' @export
lgb.load <- function(filename) {
  # Check if file name is character or not
  if (!is.character(filename)) {
    stop("lgb.load: filename should be character")
  }
  # Return new booster
  Booster$new(modelfile = filename)
}
#' Save LightGBM model
......@@ -388,26 +696,45 @@ lgb.load <- function(filename){
#' @param num_iteration number of iteration want to predict with, NULL or <= 0 means use best iteration
#'
#' @return booster
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#'                    dtrain,
#'                    100,
#'                    valids,
#'                    min_data = 1,
#'                    learning_rate = 1,
#'                    early_stopping_rounds = 10)
#' lgb.save(model, "model.txt")
#' }
#'
#' @rdname lgb.save
#' @export
lgb.save <- function(booster, filename, num_iteration = NULL) {
  # Check if booster is booster
  if (!lgb.is.Booster(booster)) {
    stop("lgb.save: booster should be an ", sQuote("lgb.Booster"))
  }
  # Check if file name is character
  if (!is.character(filename)) {
    stop("lgb.save: filename should be a character")
  }
  # Store booster
  booster$save_model(filename, num_iteration)
}
#' Dump LightGBM model to json
......@@ -418,25 +745,40 @@ lgb.save <- function(booster, filename, num_iteration = NULL){
#' @param num_iteration number of iteration want to predict with, NULL or <= 0 means use best iteration
#'
#' @return json format of model
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#'                    dtrain,
#'                    100,
#'                    valids,
#'                    min_data = 1,
#'                    learning_rate = 1,
#'                    early_stopping_rounds = 10)
#' json_model <- lgb.dump(model)
#' }
#'
#' @rdname lgb.dump
#' @export
lgb.dump <- function(booster, num_iteration = NULL) {
  # Check if booster is booster
  # (error message fixed: previously said "lgb.save:" inside lgb.dump)
  if (!lgb.is.Booster(booster)) {
    stop("lgb.dump: booster should be an ", sQuote("lgb.Booster"))
  }
  # Return booster model dump at requested iteration
  booster$dump_model(num_iteration)
}
#' Get record evaluation result from booster
......@@ -447,31 +789,51 @@ lgb.dump <- function(booster, num_iteration = NULL){
#' @param eval_name name of evaluation
#' @param iters iterations, NULL will return all
#' @param is_err TRUE will return evaluation error instead
#'
#' @return vector of evaluation result
#'
#' @rdname lgb.get.eval.result
#' @export
# Retrieve one metric's recorded values (or errors) from a trained booster
lgb.get.eval.result <- function(booster, data_name, eval_name, iters = NULL, is_err = FALSE) {
  # Validate the booster object
  if (!lgb.is.Booster(booster)) {
    stop("lgb.get.eval.result: Can only use ", sQuote("lgb.Booster"), " to get eval result")
  }
  # Both lookup keys must be character
  if (!is.character(data_name) || !is.character(eval_name)) {
    stop("lgb.get.eval.result: data_name and eval_name should be characters")
  }
  # Validate the data name
  data_record <- booster$record_evals[[data_name]]
  if (is.null(data_record)) {
    stop("lgb.get.eval.result: wrong data name")
  }
  # Validate the evaluation name
  metric_record <- data_record[[eval_name]]
  if (is.null(metric_record)) {
    stop("lgb.get.eval.result: wrong eval name")
  }
  # Pick the value series or the error series, as requested
  result <- if (is_err) metric_record$eval_err else metric_record$eval
  # Without an iteration filter, return the full series
  if (is.null(iters)) {
    return(as.numeric(result))
  }
  # Translate absolute iteration numbers into list offsets
  offset <- booster$record_evals$start_iter - 1
  picked <- as.integer(iters) - offset
  # Return requested result
  as.numeric(result[picked])
}
# R6 class wrapping a native LightGBM dataset handle.
# Construction is lazy: data is kept in `raw_data` until `construct()` is
# called, at which point the native handle is created and (optionally)
# the raw data is freed.
Dataset <- R6Class(
  classname = "lgb.Dataset",
  cloneable = FALSE,
  public = list(

    # Finalize will free up the handle held by the native library
    finalize = function() {

      # Check the need for freeing handle
      if (!lgb.is.null.handle(private$handle)) {

        # Freeing up handle
        lgb.call("LGBM_DatasetFree_R", ret = NULL, private$handle)
        private$handle <- NULL

      }

    },

    # Initialize will create a starter dataset (no native handle yet)
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          info = list(),
                          ...) {

      # Check for additional parameters
      additional_params <- list(...)

      # Create known attributes list
      INFO_KEYS <- c("label", "weight", "init_score", "group")

      # Route each extra argument either to info (known key) or to params
      for (key in names(additional_params)) {

        if (key %in% INFO_KEYS) {

          # Store as info
          info[[key]] <- additional_params[[key]]

        } else {

          # Store as param
          params[[key]] <- additional_params[[key]]

        }

      }

      # Check for dataset reference
      if (!is.null(reference)) {
        if (!lgb.check.r6.class(reference, "lgb.Dataset")) {
          stop("lgb.Dataset: Can only use ", sQuote("lgb.Dataset"), " as reference")
        }
      }

      # Check for predictor reference
      if (!is.null(predictor)) {
        if (!lgb.check.r6.class(predictor, "lgb.Predictor")) {
          stop("lgb.Dataset: Only can use ", sQuote("lgb.Predictor"), " as predictor")
        }
      }

      # Setup private attributes
      private$raw_data <- data
      private$params <- params
      private$reference <- reference
      private$colnames <- colnames
      private$categorical_feature <- categorical_feature
      private$predictor <- predictor
      private$free_raw_data <- free_raw_data
      private$used_indices <- used_indices
      private$info <- info

    },

    # Create a validation dataset sharing this dataset's settings,
    # with `self` as the binning reference
    create_valid = function(data,
                            info = list(),
                            ...) {

      # Create new dataset
      ret <- Dataset$new(data,
                         private$params,
                         self,
                         private$colnames,
                         private$categorical_feature,
                         private$predictor,
                         private$free_raw_data,
                         NULL,
                         info,
                         ...)

      # Return ret
      return(ret)

    },

    # Dataset constructor: materializes the native handle from raw_data
    construct = function() {

      # Already constructed: nothing to do
      if (!lgb.is.null.handle(private$handle)) {
        return(self)
      }

      # Get feature names from the raw data, if available
      cnames <- NULL
      if (is.matrix(private$raw_data) || is(private$raw_data, "dgCMatrix")) {
        cnames <- colnames(private$raw_data)
      }

      # Set feature names if they were not provided explicitly
      if (is.null(private$colnames) && !is.null(cnames)) {
        private$colnames <- as.character(cnames)
      }

      # Get categorical feature index
      if (!is.null(private$categorical_feature)) {

        # Check for character name
        if (typeof(private$categorical_feature) == "character") {

          cate_indices <- as.list(match(private$categorical_feature, private$colnames) - 1)

          # Provided names, but some names do not match any feature?
          if (sum(is.na(cate_indices)) > 0) {
            stop("lgb.self.get.handle: supplied an unknown feature in categorical_feature: ", sQuote(private$categorical_feature[is.na(cate_indices)]))
          }

        } else {

          # Check if an index larger than the feature space was supplied
          if (max(private$categorical_feature) > length(private$colnames)) {
            stop("lgb.self.get.handle: supplied a too large value in categorical_feature: ", max(private$categorical_feature), " but only ", length(private$colnames), " features")
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1)

        }

        # Store indices for categorical features
        private$params$categorical_feature <- cate_indices

      }

      # Check has header or not (either parameter spelling is accepted)
      has_header <- FALSE
      if (!is.null(private$params$has_header) || !is.null(private$params$header)) {
        if (tolower(as.character(private$params$has_header)) == "true" || tolower(as.character(private$params$header)) == "true") {
          has_header <- TRUE
        }
      }

      # Generate parameter str
      params_str <- lgb.params2str(private$params)

      # Get handle of reference dataset
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }
      handle <- lgb.new.handle()

      # Not subsetting: build from the raw data itself
      if (is.null(private$used_indices)) {

        # Are we using a data file?
        if (is.character(private$raw_data)) {

          handle <- lgb.call("LGBM_DatasetCreateFromFile_R",
                             ret = handle,
                             lgb.c_str(private$raw_data),
                             params_str,
                             ref_handle)

        } else if (is.matrix(private$raw_data)) {

          # Are we using a matrix?
          handle <- lgb.call("LGBM_DatasetCreateFromMat_R",
                             ret = handle,
                             private$raw_data,
                             nrow(private$raw_data),
                             ncol(private$raw_data),
                             params_str,
                             ref_handle)

        } else if (is(private$raw_data, "dgCMatrix")) {

          # Are we using a dgCMatrix (sparse matrix, column compressed)?
          handle <- lgb.call("LGBM_DatasetCreateFromCSC_R",
                             ret = handle,
                             private$raw_data@p,
                             private$raw_data@i,
                             private$raw_data@x,
                             length(private$raw_data@p),
                             length(private$raw_data@x),
                             nrow(private$raw_data),
                             params_str,
                             ref_handle)

        } else {

          # Unknown data type
          stop("lgb.Dataset.construct: does not support constructing from ", sQuote(class(private$raw_data)))

        }

      } else {

        # Subsetting requires a reference dataset to take rows from
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        # Construct subset
        handle <- lgb.call("LGBM_DatasetGetSubset_R",
                           ret = handle,
                           ref_handle,
                           private$used_indices,
                           length(private$used_indices),
                           params_str)

      }

      # Setup class and store the native handle
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Push feature names down to the native dataset
      if (!is.null(private$colnames)) {
        self$set_colnames(private$colnames)
      }

      # Load init score from the predictor if requested (full data only)
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        # Setup initial scores
        init_score <- private$predictor$predict(private$raw_data, rawscore = TRUE, reshape = TRUE)

        # No transpose needed: predictions are column-major already
        init_score <- as.vector(init_score)
        private$info$init_score <- init_score

      }

      # Should we free raw data?
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Push stored info fields (label, weight, ...) to the native dataset
      if (length(private$info) > 0) {

        # Set infos
        for (i in seq_along(private$info)) {
          p <- private$info[i]
          self$setinfo(names(p), p[[1]])
        }

      }

      # A label is mandatory for a constructed dataset
      if (is.null(self$getinfo("label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      # Return oneself
      return(self)

    },

    # Dimension function: c(num_rows, num_features)
    nim = NULL,
    dim = function() {

      # Check for handle
      if (!lgb.is.null.handle(private$handle)) {

        num_row <- 0L
        num_col <- 0L

        # Query row and feature counts from the native dataset
        c(lgb.call("LGBM_DatasetGetNumData_R", ret = num_row, private$handle),
          lgb.call("LGBM_DatasetGetNumFeature_R", ret = num_col, private$handle))

      } else if (is.matrix(private$raw_data) || is(private$raw_data, "dgCMatrix")) {

        # Fall back to the raw data's dimensions
        dim(private$raw_data)

      } else {

        # Trying to work with unknown dimensions is not possible
        stop("dim: cannot get dimensions before dataset has been constructed, please call lgb.Dataset.construct explicitly")

      }

    },

    # Get column names
    get_colnames = function() {

      # Check for handle
      if (!lgb.is.null.handle(private$handle)) {

        # Get feature names from the native dataset and cache them
        cnames <- lgb.call.return.str("LGBM_DatasetGetFeatureNames_R", private$handle)
        private$colnames <- as.character(base::strsplit(cnames, "\t")[[1]])
        private$colnames

      } else if (is.matrix(private$raw_data) || is(private$raw_data, "dgCMatrix")) {

        # Fall back to the raw data's column names
        colnames(private$raw_data)

      } else {

        # Trying to work with unknown dimensions is not possible
        stop("dim: cannot get dimensions before dataset has been constructed, please call lgb.Dataset.construct explicitly")

      }

    },

    # Set column names
    set_colnames = function(colnames) {

      # NULL input: nothing to do
      if (is.null(colnames)) {
        return(self)
      }

      # Empty input: nothing to do
      colnames <- as.character(colnames)
      if (length(colnames) == 0) {
        return(self)
      }

      # Write column names
      private$colnames <- colnames
      if (!lgb.is.null.handle(private$handle)) {

        # Merge names with tab separation (native API format)
        merged_name <- paste0(as.list(private$colnames), collapse = "\t")
        lgb.call("LGBM_DatasetSetFeatureNames_R",
                 ret = NULL,
                 private$handle,
                 lgb.c_str(merged_name))

      }

      # Return self
      return(self)

    },

    # Get information (label / weight / init_score / group)
    getinfo = function(name) {

      # Create known attributes list
      INFONAMES <- c("label", "weight", "init_score", "group")

      # Check if attribute key is in the known attribute list
      if (!is.character(name) || length(name) != 1 || !name %in% INFONAMES) {
        stop("getinfo: name must one of the following: ", paste0(sQuote(INFONAMES), collapse = ", "))
      }

      # Not cached yet but dataset is constructed: fetch from native side
      if (is.null(private$info[[name]]) && !lgb.is.null.handle(private$handle)) {

        # Get field size of info
        info_len <- 0L
        info_len <- lgb.call("LGBM_DatasetGetFieldSize_R",
                             ret = info_len,
                             private$handle,
                             lgb.c_str(name))

        # Check if info is not empty
        if (info_len > 0) {

          # Allocate the right buffer type for the field
          ret <- NULL
          ret <- if (name == "group") {
            integer(info_len) # Integer
          } else {
            numeric(info_len) # Numeric
          }

          ret <- lgb.call("LGBM_DatasetGetField_R",
                          ret = ret,
                          private$handle,
                          lgb.c_str(name))

          private$info[[name]] <- ret

        }

      }

      private$info[[name]]

    },

    # Set information (label / weight / init_score / group)
    setinfo = function(name, info) {

      # Create known attributes list
      INFONAMES <- c("label", "weight", "init_score", "group")

      # Check if attribute key is in the known attribute list
      if (!is.character(name) || length(name) != 1 || !name %in% INFONAMES) {
        stop("setinfo: name must one of the following: ", paste0(sQuote(INFONAMES), collapse = ", "))
      }

      # Coerce to the type the native field expects
      info <- if (name == "group") {
        as.integer(info) # Integer
      } else {
        as.numeric(info) # Numeric
      }

      # Store information privately
      private$info[[name]] <- info

      # If already constructed, push the field to the native dataset too
      if (!lgb.is.null.handle(private$handle) && !is.null(info)) {

        if (length(info) > 0) {

          lgb.call("LGBM_DatasetSetField_R",
                   ret = NULL,
                   private$handle,
                   lgb.c_str(name),
                   info,
                   length(info))

        }

      }

      # Return self
      return(self)

    },

    # Slice dataset: a lazy subset referencing self for the given rows
    slice = function(idxset, ...) {

      # Perform slicing
      Dataset$new(NULL,
                  private$params,
                  self,
                  private$colnames,
                  private$categorical_feature,
                  private$predictor,
                  private$free_raw_data,
                  idxset,
                  NULL,
                  ...)

    },

    # Update parameters (shallow merge over the stored list)
    update_params = function(params) {

      # Parameter updating
      private$params <- modifyList(private$params, params)
      self

    },

    # Set categorical feature parameter
    set_categorical_feature = function(categorical_feature) {

      # Identical input: nothing to do
      if (identical(private$categorical_feature, categorical_feature)) {
        return(self)
      }

      # Raw data is required to rebuild the dataset with new categoricals
      if (is.null(private$raw_data)) {
        stop("set_categorical_feature: cannot set categorical feature after freeing raw data,
             please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # Overwrite categorical features
      private$categorical_feature <- categorical_feature

      # Finalize (drop the stale handle) and return self
      self$finalize()
      return(self)

    },

    # Set reference (training data used for binning)
    set_reference = function(reference) {

      # Propagate settings from the reference
      self$set_categorical_feature(reference$.__enclos_env__$private$categorical_feature)
      self$set_colnames(reference$get_colnames())
      private$set_predictor(reference$.__enclos_env__$private$predictor)

      # Identical reference: nothing to do
      if (identical(private$reference, reference)) {
        return(self)
      }

      # Raw data is required to rebuild the dataset against a new reference
      if (is.null(private$raw_data)) {
        stop("set_reference: cannot set reference after freeing raw data,
             please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # Check for non-existing reference
      if (!is.null(reference)) {

        # Reference is unknown
        if (!lgb.check.r6.class(reference, "lgb.Dataset")) {
          stop("set_reference: Can only use lgb.Dataset as a reference")
        }

      }

      # Store reference
      private$reference <- reference

      # Finalize (drop the stale handle) and return self
      self$finalize()
      return(self)

    },

    # Save the constructed dataset to a binary file
    save_binary = function(fname) {

      # Ensure the native dataset exists, then write it out
      self$construct()
      lgb.call("LGBM_DatasetSaveBinary_R",
               ret = NULL,
               private$handle,
               lgb.c_str(fname))
      return(self)

    }

  ),
  private = list(
    handle = NULL,
    raw_data = NULL,
    params = list(),
    reference = NULL,
    colnames = NULL,
    categorical_feature = NULL,
    predictor = NULL,
    free_raw_data = TRUE,
    used_indices = NULL,
    info = NULL,

    # Get handle, constructing the dataset first if needed
    get_handle = function() {

      # Get handle and construct if needed
      if (lgb.is.null.handle(private$handle)) {
        self$construct()
      }
      private$handle

    },

    # Set predictor (source of init scores)
    set_predictor = function(predictor) {

      # Identical predictor: nothing to do
      if (identical(private$predictor, predictor)) {
        return(self)
      }

      # Raw data is required to rebuild the dataset with a new predictor
      if (is.null(private$raw_data)) {
        stop("set_predictor: cannot set predictor after free raw data,
             please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # Check for empty predictor
      if (!is.null(predictor)) {

        # Predictor is unknown
        if (!lgb.check.r6.class(predictor, "lgb.Predictor")) {
          stop("set_predictor: Can only use lgb.Predictor as predictor")
        }

      }

      # Store predictor
      private$predictor <- predictor

      # Finalize (drop the stale handle) and return self
      self$finalize()
      return(self)

    }
  )
)
......@@ -390,112 +624,141 @@ Dataset <- R6Class(
#' @param free_raw_data TRUE for need to free raw data after construct
#' @param info a list of information of the lgb.Dataset object
#' @param ... other information to pass to \code{info} or parameters pass to \code{params}
#'
#' @return constructed dataset
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.save(dtrain, "lgb.Dataset.data")
#' dtrain <- lgb.Dataset("lgb.Dataset.data")
#' lgb.Dataset.construct(dtrain)
#' }
#'
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        info = list(),
                        ...) {

  # Create new dataset (predictor and used_indices are NULL for a fresh dataset)
  Dataset$new(data,
              params,
              reference,
              colnames,
              categorical_feature,
              NULL,
              free_raw_data,
              NULL,
              info,
              ...)

}
#' Construct validation data
#'
#' Construct validation data according to training data
#'
#' @param dataset \code{lgb.Dataset} object, training data
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
#' @param info a list of information of the lgb.Dataset object
#' @param ... other information to pass to \code{info}.
#'
#' @return constructed dataset
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' }
#'
#' @export
lgb.Dataset.create.valid <- function(dataset, data, info = list(), ...) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(dataset)) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # Create validation dataset sharing the training data's settings
  dataset$create_valid(data, info, ...)

}
#' Construct Dataset explicitly
#'
#' @param dataset Object of class \code{lgb.Dataset}
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' }
#'
#' @export
lgb.Dataset.construct <- function(dataset) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(dataset)) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }

  # Construct the dataset (creates the native handle)
  dataset$construct()

}
#' Dimensions of an lgb.Dataset
#'
#' Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
#' @param x Object of class \code{lgb.Dataset}
#' @param ... other parameters
#'
#' @return a vector of numbers of rows and of columns
#'
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
#' }
#'
#' @rdname dim
#' @export
dim.lgb.Dataset <- function(x, ...) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(x)) {
    stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # Return dimensions
  x$dim()

}
#' Handling of column names of \code{lgb.Dataset}
#'
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @param x object of class \code{lgb.Dataset}
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
#' colnames(dtrain) <- make.names(1:ncol(train$data))
#' print(dtrain, verbose = TRUE)
#' }
#'
#' @rdname dimnames.lgb.Dataset
#' @export
dimnames.lgb.Dataset <- function(x) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(x)) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # Row names are always NULL; only column names are meaningful
  list(NULL, x$get_colnames())

}
#' @rdname dimnames.lgb.Dataset
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # Value must be a list of exactly two elements (rownames, colnames)
  if (!is.list(value) || length(value) != 2L) {
    stop("invalid ", sQuote("value"), " given: must be a list of two elements")
  }

  # lgb.Dataset has no row names, so the first element must be NULL
  if (!is.null(value[[1L]])) {
    stop("lgb.Dataset does not have rownames")
  }

  # NULL column names: clear them
  if (is.null(value[[2]])) {
    x$set_colnames(NULL)
    return(x)
  }

  # Column name count must match the number of columns
  if (ncol(x) != length(value[[2]])) {
    stop("can't assign ", sQuote(length(value[[2]])), " colnames to an lgb.Dataset with ", sQuote(ncol(x)), " columns")
  }

  # Set column names properly, and return
  x$set_colnames(value[[2]])
  x

}
#' Slice a dataset
#'
#' Get a new \code{lgb.Dataset} containing the specified rows of
#' original lgb.Dataset object
#'
#' @param dataset Object of class "lgb.Dataset"
#' @param idxset a integer vector of indices of rows needed
#' @param ... other parameters (currently not used)
#' @return constructed sub dataset
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' dsub <- lightgbm::slice(dtrain, 1:42)
#' labels <- lightgbm::getinfo(dsub, "label")
#' }
#'
#' @export
slice <- function(dataset, ...) {
  UseMethod("slice")
}

#' @rdname slice
#' @export
slice.lgb.Dataset <- function(dataset, idxset, ...) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(dataset)) {
    stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Return sliced set
  dataset$slice(idxset, ...)

}
#' Get information of an lgb.Dataset object
#'
#' @param dataset Object of class \code{lgb.Dataset}
#' @param name the name of the information field to get (see details)
#' @param ... other parameters
#' @return info data
#'
#' @details
#' The \code{name} field can be one of the following:
#'
#' \itemize{
#'     \item \code{label}: label lightgbm learn from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item \code{group}: group size
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from ;
#' }
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
#' }
#'
#' @export
getinfo <- function(dataset, ...) {
  UseMethod("getinfo")
}

#' @rdname getinfo
#' @export
getinfo.lgb.Dataset <- function(dataset, name, ...) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(dataset)) {
    stop("getinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Return information
  dataset$getinfo(name)

}
#' Set information of an lgb.Dataset object
#'
#' @param dataset Object of class "lgb.Dataset"
#' @param name the name of the field to get
#' @param info the specific field of information to set
#' @param ... other parameters
#' @return passed object
#'
#' @details
#' The \code{name} field can be one of the following:
#'
#' \itemize{
#'     \item \code{label}: label lightgbm learn from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from ;
#'     \item \code{group}.
#' }
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
#' }
#'
#' @export
setinfo <- function(dataset, ...) {
  UseMethod("setinfo")
}

#' @rdname setinfo
#' @export
setinfo.lgb.Dataset <- function(dataset, name, info, ...) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(dataset)) {
    stop("setinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Set information
  dataset$setinfo(name, info)

}
#' Set categorical feature of \code{lgb.Dataset}
#'
#' @param dataset object of class \code{lgb.Dataset}
#' @param categorical_feature categorical features
#'
#' @return passed dataset
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.save(dtrain, "lgb.Dataset.data")
#' dtrain <- lgb.Dataset("lgb.Dataset.data")
#' lgb.Dataset.set.categorical(dtrain, 1:2)
#' }
#'
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(dataset)) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # Set categoricals
  dataset$set_categorical_feature(categorical_feature)

}
#' Set reference of \code{lgb.Dataset}
#'
#' If you want to use validation data, you should set reference to training data
#'
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
#'
#' @return passed dataset
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset(test$data, label = test$label)
#' lgb.Dataset.set.reference(dtest, dtrain)
#' }
#'
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(dataset)) {
    stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")
  }

  # Set reference
  dataset$set_reference(reference)

}
#' Save \code{lgb.Dataset} to a binary file
#'
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname object filename of output file
#'
#' @return passed dataset
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.save(dtrain, "data.bin")
#' }
#'
#' @rdname lgb.Dataset.save
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(dataset)) {
    # NOTE: error prefix fixed from "lgb.Dataset.set:" to match this function
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  # File-type is not matching
  if (!is.character(fname)) {
    stop("lgb.Dataset.save: fname should be a character or a file connection")
  }

  # Store binary
  dataset$save_binary(fname)

}
# R6 wrapper around a LightGBM booster handle, used only for prediction.
# NOTE(review): this block contained duplicated diff residue; it has been
# reconstructed as a single coherent class definition.
Predictor <- R6Class(
  "lgb.Predictor",
  classname = "lgb.Predictor",
  cloneable = FALSE,
  public = list(

    # Finalize will free up the handle, but only if this object owns it
    finalize = function() {
      # Check the need for freeing handle
      if (private$need_free_handle && !lgb.is.null.handle(private$handle)) {
        cat("free booster handle\n")
        # Freeing up handle
        lgb.call("LGBM_BoosterFree_R", ret = NULL, private$handle)
        private$handle <- NULL
      }
    },

    # Initialize from a model file path or an existing booster handle
    initialize = function(modelfile) {
      handle <- lgb.new.handle()
      if (is.character(modelfile)) {
        # A filename: load the model from disk; we own the new handle
        handle <- lgb.call("LGBM_BoosterCreateFromModelfile_R", ret = handle, lgb.c_str(modelfile))
        private$need_free_handle <- TRUE
      } else if (is(modelfile, "lgb.Booster.handle")) {
        # Already a booster handle: reuse it, do not take ownership
        handle <- modelfile
        private$need_free_handle <- FALSE
      } else {
        # Model file is unknown
        stop("lgb.Predictor: modelfile must be either a character filename or an lgb.Booster.handle")
      }
      # Override class and store it
      class(handle) <- "lgb.Booster.handle"
      private$handle <- handle
    },

    # Get current iteration of the underlying booster
    current_iter = function() {
      cur_iter <- 0L
      lgb.call("LGBM_BoosterGetCurrentIteration_R", ret = cur_iter, private$handle)
    },

    # Predict from data (a file path, a dense matrix, or a dgCMatrix)
    predict = function(data,
                       num_iteration = NULL,
                       rawscore = FALSE,
                       predleaf = FALSE,
                       header = FALSE,
                       reshape = FALSE) {

      # NULL iteration count means "use all iterations" (-1)
      if (is.null(num_iteration)) {
        num_iteration <- -1
      }

      # Set temporary variable
      num_row <- 0L

      if (is.character(data)) {

        # Data is a filename: predict into a temporary file, then read it back
        tmp_filename <- tempfile(pattern = "lightgbm_")
        on.exit(unlink(tmp_filename), add = TRUE)
        lgb.call("LGBM_BoosterPredictForFile_R", ret = NULL, private$handle, data,
                 as.integer(header),
                 as.integer(rawscore),
                 as.integer(predleaf),
                 as.integer(num_iteration),
                 lgb.c_str(tmp_filename))

        # BUGFIX: argument was misspelled "seq"; read.delim expects "sep"
        preds <- read.delim(tmp_filename, header = FALSE, sep = "\t")
        num_row <- nrow(preds)
        preds <- as.vector(t(preds))

      } else {

        # Not a file, we need to predict from an R object
        num_row <- nrow(data)
        npred <- 0L

        # Ask the backend how many prediction values to expect
        npred <- lgb.call("LGBM_BoosterCalcNumPredict_R",
                          ret = npred,
                          private$handle,
                          as.integer(num_row),
                          as.integer(rawscore),
                          as.integer(predleaf),
                          as.integer(num_iteration))

        # Pre-allocate empty vector
        preds <- numeric(npred)

        if (is.matrix(data)) {
          # Dense matrix input
          preds <- lgb.call("LGBM_BoosterPredictForMat_R",
                            ret = preds,
                            private$handle,
                            data,
                            as.integer(nrow(data)),
                            as.integer(ncol(data)),
                            as.integer(rawscore),
                            as.integer(predleaf),
                            as.integer(num_iteration))
        } else if (is(data, "dgCMatrix")) {
          # Sparse matrix input (column-compressed format)
          preds <- lgb.call("LGBM_BoosterPredictForCSC_R",
                            ret = preds,
                            private$handle,
                            data@p,
                            data@i,
                            data@x,
                            length(data@p),
                            length(data@x),
                            nrow(data),
                            as.integer(rawscore),
                            as.integer(predleaf),
                            as.integer(num_iteration))
        } else {
          # Cannot predict on unknown class
          # to-do: predict from lgb.Dataset
          stop("predict: cannot predict on data of class ", sQuote(class(data)))
        }
      }

      # Sanity check: prediction length must be a multiple of the row count
      if (length(preds) %% num_row != 0) {
        stop("predict: prediction length ", sQuote(length(preds)), " is not a multiple of nrows(data): ", sQuote(num_row))
      }

      # Get number of prediction values per row
      npred_per_case <- length(preds) / num_row

      # Data reshaping
      if (predleaf) {
        # Predict leaves only: reshaping is mandatory
        preds <- matrix(preds, ncol = npred_per_case, byrow = TRUE)
      } else if (reshape && npred_per_case > 1) {
        # Reshape multiclass output on request
        preds <- matrix(preds, ncol = npred_per_case, byrow = TRUE)
      }

      # Return predictions
      return(preds)
    }
  ),
  private = list(handle = NULL,
                 need_free_handle = FALSE)
)
CVBooster <- R6Class(
"lgb.CVBooster",
classname = "lgb.CVBooster",
cloneable = FALSE,
public = list(
best_iter = -1,
best_iter = -1,
record_evals = list(),
boosters = list(),
initialize = function(x) {
boosters = list(),
initialize = function(x) {
self$boosters <- x
},
reset_parameter = function(new_params) {
......@@ -15,7 +15,6 @@ CVBooster <- R6Class(
)
)
#' Main CV logic for LightGBM
#'
#' Main CV logic for LightGBM
......@@ -58,233 +57,377 @@ CVBooster <- R6Class(
#' @param callbacks list of callback functions
#' List of callback functions that are applied at each iteration.
#' @param ... other parameters, see parameters.md for more informations
#'
#' @return a trained model \code{lgb.CVBooster}.
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' params <- list(objective = "regression", metric = "l2")
#' model <- lgb.cv(params,
#'                 dtrain,
#'                 10,
#'                 nfold = 5,
#'                 min_data = 1,
#'                 learning_rate = 1,
#'                 early_stopping_rounds = 10)
#' }
#' @rdname lgb.train
#' @export
# Main CV driver: builds per-fold boosters and runs the boosting loop.
# NOTE(review): this block contained duplicated diff residue (two fused
# signatures and doubled statements); reconstructed as one definition.
lgb.cv <- function(params = list(),
                   data,
                   nrounds = 10,
                   nfold = 3,
                   label = NULL,
                   weight = NULL,
                   obj = NULL,
                   eval = NULL,
                   verbose = 1,
                   record = TRUE,
                   eval_freq = 1L,
                   showsd = TRUE,
                   stratified = TRUE,
                   folds = NULL,
                   init_model = NULL,
                   colnames = NULL,
                   categorical_feature = NULL,
                   early_stopping_rounds = NULL,
                   callbacks = list(),
                   ...) {

  # Merge dot-args into params and normalize objective/metric settings
  addiction_params <- list(...)
  params <- append(params, addiction_params)
  params$verbose <- verbose
  params <- lgb.check.obj(params, obj)
  params <- lgb.check.eval(params, eval)
  fobj <- NULL
  feval <- NULL

  # A function objective means a custom objective: tell the backend "NONE"
  if (is.function(params$objective)) {
    fobj <- params$objective
    params$objective <- "NONE"
  }

  # A function eval means a custom metric
  if (is.function(eval)) {
    feval <- eval
  }

  # Check for parameters
  lgb.check.params(params)

  # Init predictor to empty
  predictor <- NULL

  # Check for boosting from a trained model
  if (is.character(init_model)) {
    predictor <- Predictor$new(init_model)
  } else if (lgb.is.Booster(init_model)) {
    predictor <- init_model$to_predictor()
  }

  # Set the iteration range; continue counting when boosting from a model
  begin_iteration <- 1
  if (!is.null(predictor)) {
    begin_iteration <- predictor$current_iter() + 1
  }
  end_iteration <- begin_iteration + nrounds - 1

  # Wrap raw data into an lgb.Dataset when needed (labels then mandatory)
  if (!lgb.is.Dataset(data)) {
    if (is.null(label)) {
      stop("Labels must be provided for lgb.cv")
    }
    data <- lgb.Dataset(data, label = label)
  }

  # Check for weights
  if (!is.null(weight)) {
    data$set_info("weight", weight)
  }

  # Update parameters with parsed parameters
  data$update_params(params)

  # Create the predictor set
  data$.__enclos_env__$private$set_predictor(predictor)

  # Write column names
  if (!is.null(colnames)) {
    data$set_colnames(colnames)
  }

  # Write categorical features
  if (!is.null(categorical_feature)) {
    data$set_categorical_feature(categorical_feature)
  }

  # Construct datasets, if needed
  data$construct()

  # Use user-supplied folds when given, otherwise generate them
  if (!is.null(folds)) {
    # BUGFIX: scalar condition, use short-circuit || instead of |
    if (!is.list(folds) || length(folds) < 2) {
      stop(sQuote("folds"), " must be a list with 2 or more elements that are vectors of indices for each CV-fold")
    }
    # Set number of folds
    nfold <- length(folds)
  } else {
    # Check fold value
    if (nfold <= 1) {
      stop(sQuote("nfold"), " must be > 1")
    }
    # Create folds
    folds <- generate.cv.folds(nfold,
                               nrow(data),
                               stratified,
                               getinfo(data, "label"),
                               params)
  }

  # Add printing log callback
  # BUGFIX: scalar condition, use short-circuit && instead of &
  if (verbose > 0 && eval_freq > 0) {
    callbacks <- add.cb(callbacks, cb.print.evaluation(eval_freq))
  }

  # Add evaluation log callback
  if (record) {
    callbacks <- add.cb(callbacks, cb.record.evaluation())
  }

  # Add early stopping callback
  if (!is.null(early_stopping_rounds)) {
    if (early_stopping_rounds > 0) {
      callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, verbose = verbose))
    }
  }

  # Categorize callbacks
  cb <- categorize.callbacks(callbacks)

  # Construct one booster per fold, each validated on its held-out slice
  bst_folds <- lapply(seq_along(folds), function(k) {
    dtest <- slice(data, folds[[k]])
    dtrain <- slice(data, unlist(folds[-k]))
    booster <- Booster$new(params, dtrain)
    booster$add_valid(dtest, "valid")
    list(booster = booster)
  })

  # Create new booster
  cv_booster <- CVBooster$new(bst_folds)

  # Callback env
  env <- CB_ENV$new()
  env$model <- cv_booster
  env$begin_iteration <- begin_iteration
  env$end_iteration <- end_iteration

  # Start training model using number of iterations to start and end with
  for (i in seq(from = begin_iteration, to = end_iteration)) {

    # Overwrite iteration in environment
    env$iteration <- i
    env$eval_list <- list()

    # Run pre-iteration callbacks
    for (f in cb$pre_iter) {
      f(env)
    }

    # Update one boosting iteration on every fold and collect eval messages
    msg <- lapply(cv_booster$boosters, function(fd) {
      fd$booster$update(fobj = fobj)
      fd$booster$eval_valid(feval = feval)
    })

    # Merge per-fold results (mean across folds, sd if requested)
    merged_msg <- lgb.merge.cv.result(msg)

    # Write evaluation result in environment
    env$eval_list <- merged_msg$eval_list

    # Check for standard deviation requirement
    if (showsd) {
      env$eval_err_list <- merged_msg$eval_err_list
    }

    # Run post-iteration callbacks
    for (f in cb$post_iter) {
      f(env)
    }

    # Check for early stopping and break if needed
    if (env$met_early_stop) break

  }

  # Return booster
  return(cv_booster)
}
# Generates random (stratified if needed) CV folds.
# Returns a list of `nfold` integer vectors of row indices.
generate.cv.folds <- function(nfold, nrows, stratified, label, params) {

  # Automatic fold generation cannot be done for ranking objectives
  if (exists("objective", where = params) && is.character(params$objective) && params$objective == "lambdarank") {
    stop("\n\tAutomatic generation of CV-folds is not implemented for lambdarank!\n", "\tConsider providing pre-computed CV-folds through the 'folds=' parameter.\n")
  }

  # Shuffle row indices
  rnd_idx <- sample(seq_len(nrows))

  # Stratify only when a label of matching length is available
  if (isTRUE(stratified) && length(label) == length(rnd_idx)) {
    y <- factor(label[rnd_idx])
    folds <- lgb.stratified.folds(y, nfold)
  } else {
    # Make simple non-stratified folds of (almost) equal size
    kstep <- length(rnd_idx) %/% nfold
    folds <- list()
    for (i in seq_len(nfold - 1)) {
      folds[[i]] <- rnd_idx[seq_len(kstep)]
      rnd_idx <- rnd_idx[-seq_len(kstep)]
    }
    # The last fold absorbs the remainder
    folds[[nfold]] <- rnd_idx
  }

  # Return folds
  return(folds)
}
# Creates CV folds stratified by the values of y.
# Borrowed from caret (createFolds) and simplified by always
# returning an unnamed list of fold indices.
lgb.stratified.folds <- function(y, k = 10) {

  ## Group the numeric data based on their magnitudes
  ## and sample within those groups.
  ## When the number of samples is low, we may have
  ## issues further slicing the numeric data into
  ## groups. The number of groups will depend on the
  ## ratio of the number of folds to the sample size.
  ## At most, we will use quantiles. If the sample
  ## is too small, we just do regular unstratified CV
  if (is.numeric(y)) {
    cuts <- floor(length(y) / k)
    if (cuts < 2) { cuts <- 2 }
    if (cuts > 5) { cuts <- 5 }
    y <- cut(y,
             unique(stats::quantile(y, probs = seq(0, 1, length = cuts))),
             include.lowest = TRUE)
  }

  if (k < length(y)) {

    ## Reset levels so that the possible levels and
    ## the levels in the vector are the same
    y <- factor(as.character(y))
    numInClass <- table(y)
    foldVector <- vector(mode = "integer", length(y))

    ## For each class, balance the fold allocation as far
    ## as possible, then resample the remainder.
    ## The final assignment of folds is also randomized.
    for (i in seq_along(numInClass)) {

      ## Create a vector of integers from 1:k as many times as possible without
      ## going over the number of samples in the class. Note that if the number
      ## of samples in a class is less than k, nothing is produced here.
      seqVector <- rep(seq_len(k), numInClass[i] %/% k)

      ## Add enough random integers to get length(seqVector) == numInClass[i]
      if (numInClass[i] %% k > 0) {
        seqVector <- c(seqVector, sample(seq_len(k), numInClass[i] %% k))
      }

      ## Shuffle the integers for fold assignment and assign to this class's data
      foldVector[y == dimnames(numInClass)$y[i]] <- sample(seqVector)

    }

  } else {

    foldVector <- seq(along = y)

  }

  ## Split indices by fold id and drop the names
  out <- split(seq(along = y), foldVector)
  `names<-`(out, NULL)
}
# Merge per-fold evaluation messages into per-metric means
# (and population standard deviations when showsd = TRUE).
lgb.merge.cv.result <- function(msg, showsd = TRUE) {

  # There must be at least one fold result
  if (length(msg) == 0) {
    stop("lgb.cv: size of cv result error")
  }

  # Number of metrics reported per fold
  eval_len <- length(msg[[1]])

  # There must be at least one metric
  if (eval_len == 0) {
    stop("lgb.cv: should provide at least one metric for CV")
  }

  # For each metric, gather its value across all folds
  eval_result <- lapply(seq_len(eval_len), function(j) {
    as.numeric(lapply(seq_along(msg), function(i) {
      msg[[i]][[j]]$value }))
  })

  # Use the first fold's structure as a template and fill in cross-fold means
  ret_eval <- msg[[1]]
  for (j in seq_len(eval_len)) {
    ret_eval[[j]]$value <- mean(eval_result[[j]])
  }

  # Preinit evaluation error
  ret_eval_err <- NULL

  # Population standard deviation across folds, per metric
  if (showsd) {
    for (j in seq_len(eval_len)) {
      ret_eval_err <- c(ret_eval_err,
                        sqrt(mean(eval_result[[j]] ^ 2) - mean(eval_result[[j]]) ^ 2))
    }
    # Convert to list
    ret_eval_err <- as.list(ret_eval_err)
  }

  # Return merged means and errors
  list(eval_list = ret_eval,
       eval_err_list = ret_eval_err)
}
#' Compute feature importance in a model
#'
#'
#' Creates a \code{data.table} of feature importances in a model.
#'
#'
#' @param model object of class \code{lgb.Booster}.
#' @param percentage whether to show importance in relative percentage.
#'
#'
#' @return
#'
#'
#' For a tree model, a \code{data.table} with the following columns:
#' \itemize{
#' \item \code{Feature} Feature names in the model.
......@@ -14,10 +14,11 @@
#' \item \code{Cover} The number of observation related to this feature.
#' \item \code{Frequency} The number of times a feature splited in trees.
#' }
#'
#'
#' @examples
#'
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
......@@ -29,16 +30,22 @@
#'
#' tree_imp1 <- lgb.importance(model, percentage = TRUE)
#' tree_imp2 <- lgb.importance(model, percentage = FALSE)
#'
#' }
#'
#' @importFrom magrittr %>% %T>%
#' @importFrom data.table :=
#' @export
lgb.importance <- function(model, percentage = TRUE) {
# Check if model is a lightgbm model
if (!any(class(model) == "lgb.Booster")) {
stop("'model' has to be an object of class lgb.Booster")
}
# Setup importance
tree_dt <- lgb.model.dt.tree(model)
# Extract elements
tree_imp <- tree_dt %>%
magrittr::extract(.,
i = is.na(split_index) == FALSE,
......@@ -46,10 +53,15 @@ lgb.importance <- function(model, percentage = TRUE) {
by = "split_feature") %T>%
data.table::setnames(., old = "split_feature", new = "Feature") %>%
magrittr::extract(., i = order(Gain, decreasing = TRUE))
# Check if relative values are requested
if (percentage) {
tree_imp[, ":="(Gain = Gain / sum(Gain),
Cover = Cover / sum(Cover),
Frequency = Frequency / sum(Frequency))]
}
# Return importance table
return(tree_imp)
}
#' Compute feature contribution of prediction
#'
#'
#' Computes feature contribution components of rawscore prediction.
#'
#'
#' @param model object of class \code{lgb.Booster}.
#' @param data a matrix object or a dgCMatrix object.
#' @param idxset a integer vector of indices of rows needed.
#' @param num_iteration number of iteration want to predict with, NULL or <= 0 means use best iteration.
#'
#'
#' @return
#'
#'
#' For regression, binary classification and lambdarank model, a \code{list} of \code{data.table} with the following columns:
#' \itemize{
#' \item \code{Feature} Feature names in the model.
#' \item \code{Contribution} The total contribution of this feature's splits.
#' }
#' For multiclass classification, a \code{list} of \code{data.table} with the Feature column and Contribution columns to each class.
#'
#'
#' @examples
#'
#' \dontrun{
#' library(lightgbm)
#' Sigmoid <- function(x) 1 / (1 + exp(-x))
#' Logit <- function(x) log(x / (1 - x))
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label)))
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#'
#' params = list(objective = "binary",
......@@ -34,47 +35,94 @@
#' model <- lgb.train(params, dtrain, 20)
#'
#' tree_interpretation <- lgb.interprete(model, test$data, 1:5)
#'
#' }
#'
#' @importFrom magrittr %>% %T>%
#' @export
# Compute per-row feature contributions by walking each tree's path.
lgb.interprete <- function(model,
                           data,
                           idxset,
                           num_iteration = NULL) {

  # Get tree model
  tree_dt <- lgb.model.dt.tree(model, num_iteration)

  # Check number of classes
  num_class <- model$.__enclos_env__$private$num_class

  # Preallocate one result slot per requested row
  tree_interpretation_dt_list <- vector(mode = "list", length = length(idxset))

  # Predict leaf indices for the requested rows, one matrix per row
  leaf_index_mat_list <- model$predict(data[idxset, , drop = FALSE],
                                       num_iteration = num_iteration,
                                       predleaf = TRUE) %>%
    t(.) %>%
    data.table::as.data.table(.) %>%
    lapply(., FUN = function(x) matrix(x, ncol = num_class, byrow = TRUE))

  # Matching matrices of 0-based tree indices
  tree_index_mat_list <- lapply(leaf_index_mat_list,
                                FUN = function(x) matrix(seq_len(length(x)) - 1, ncol = num_class, byrow = TRUE))

  # Interpret each requested row
  for (i in seq_along(idxset)) {
    tree_interpretation_dt_list[[i]] <- single.row.interprete(tree_dt, num_class, tree_index_mat_list[[i]], leaf_index_mat_list[[i]])
  }

  # Return interpretation list
  return(tree_interpretation_dt_list)
}
# Feature contributions for a single (tree, leaf) path:
# walk from the leaf up to the root and diff the node values.
single.tree.interprete <- function(tree_dt,
                                   tree_id,
                                   leaf_id) {

  # Restrict to the requested tree
  single_tree_dt <- tree_dt[tree_index == tree_id, ]

  # Row for the target leaf
  leaf_dt <- single_tree_dt[leaf_index == leaf_id, .(leaf_index, leaf_parent, leaf_value)]

  # All internal nodes of the tree
  node_dt <- single_tree_dt[!is.na(split_index), .(split_index, split_feature, node_parent, internal_value)]

  # Sequences accumulated while walking to the root
  feature_seq <- character(0)
  value_seq <- numeric(0)

  # Walk from leaf to root, prepending each node's feature and value
  leaf_to_root <- function(parent_id, current_value) {

    # Store value
    value_seq <<- c(current_value, value_seq)

    # NA parent means we reached the root
    if (!is.na(parent_id)) {

      # Not NA means an existing internal node; recurse upwards
      this_node <- node_dt[split_index == parent_id, ]
      feature_seq <<- c(this_node[["split_feature"]], feature_seq)
      leaf_to_root(this_node[["node_parent"]], this_node[["internal_value"]])

    }

  }

  # Perform leaf to root conversion
  leaf_to_root(leaf_dt[["leaf_parent"]], leaf_dt[["leaf_value"]])

  # Contribution of each split is the change in value along the path
  data.table::data.table(Feature = feature_seq, Contribution = diff.default(value_seq))
}
multiple.tree.interprete <- function(tree_dt, tree_index, leaf_index) {
multiple.tree.interprete <- function(tree_dt,
tree_index,
leaf_index) {
# Apply each trees
mapply(single.tree.interprete,
tree_id = tree_index, leaf_id = leaf_index,
MoreArgs = list(tree_dt = tree_dt),
......@@ -82,29 +130,51 @@ multiple.tree.interprete <- function(tree_dt, tree_index, leaf_index) {
data.table::rbindlist(., use.names = TRUE) %>%
magrittr::extract(., j = .(Contribution = sum(Contribution)), by = "Feature") %>%
magrittr::extract(., i = order(abs(Contribution), decreasing = TRUE))
}
# Aggregate contributions for one data row across all trees and classes.
single.row.interprete <- function(tree_dt, num_class, tree_index_mat, leaf_index_mat) {

  # Prepare vector list
  tree_interpretation <- vector(mode = "list", length = num_class)

  # Loop through each class
  for (i in seq_len(num_class)) {
    tree_interpretation[[i]] <- multiple.tree.interprete(tree_dt, tree_index_mat[,i], leaf_index_mat[,i]) %T>% {
      # Number of classes larger than 1 requires adjustment
      if (num_class > 1) {
        data.table::setnames(., old = "Contribution", new = paste("Class", i - 1))
      }
    }
  }

  # Check for number of classes larger than 1
  if (num_class == 1) {
    # Single class: the one interpretation table is the result
    tree_interpretation_dt <- tree_interpretation[[1]]
  } else {
    # Multiclass: outer-join all per-class tables on Feature
    tree_interpretation_dt <- Reduce(f = function(x, y) merge(x, y, by = "Feature", all = TRUE),
                                     x = tree_interpretation)
    # Replace NA contributions introduced by the outer join with 0
    for (j in 2:ncol(tree_interpretation_dt)) {
      data.table::set(tree_interpretation_dt,
                      i = which(is.na(tree_interpretation_dt[[j]])),
                      j = j,
                      value = 0)
    }
  }

  # Return interpretation tree
  return(tree_interpretation_dt)
}
#' Parse a LightGBM model json dump
#'
#'
#' Parse a LightGBM model json dump into a \code{data.table} structure.
#'
#'
#' @param model object of class \code{lgb.Booster}
#'
#'
#' @return
#' A \code{data.table} with detailed information about model trees' nodes and leafs.
#'
#'
#' The columns of the \code{data.table} are:
#'
#'
#' \itemize{
#' \item \code{tree_index}: ID of a tree in a model (integer)
#' \item \code{split_index}: ID of a node in a tree (integer)
#' \item \code{split_feature}: for a node, it's a feature name (character);
#' for a leaf, it simply labels it as \code{'NA'}
#' for a leaf, it simply labels it as \code{"NA"}
#' \item \code{node_parent}: ID of the parent node for current node (integer)
#' \item \code{leaf_index}: ID of a leaf in a tree (integer)
#' \item \code{leaf_parent}: ID of the parent node for current leaf (integer)
......@@ -25,10 +25,12 @@
#' \item \code{leaf_value}: Leaf value
#' \item \code{leaf_count}: The number of observation collected by a leaf
#' }
#'
#'
#' @examples
#'
#' data(agaricus.train, package = 'lightgbm')
#' \dontrun{
#' library(lightgbm)
#'
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
......@@ -39,57 +41,112 @@
#' model <- lgb.train(params, dtrain, 20)
#'
#' tree_dt <- lgb.model.dt.tree(model)
#'
#' }
#'
#' @importFrom magrittr %>%
#' @importFrom data.table :=
#' @export
# Convert a Booster into a data.table with one row per node/leaf of every tree.
lgb.model.dt.tree <- function(model, num_iteration = NULL) {

  # Dump the model to JSON and parse it as nested lists
  parsed_json_model <- jsonlite::fromJSON(lgb.dump(model, num_iteration = num_iteration),
                                          simplifyVector = TRUE,
                                          simplifyDataFrame = FALSE,
                                          simplifyMatrix = FALSE,
                                          flatten = FALSE)

  # Flatten every tree into a table and stack them all
  tree_dt <- data.table::rbindlist(lapply(parsed_json_model$tree_info, single.tree.parse),
                                   use.names = TRUE)

  # Map 0-based feature indices back to feature names (modifies in place)
  tree_dt[, split_feature := Lookup(split_feature,
                                    seq(0, parsed_json_model$max_feature_idx, by = 1),
                                    parsed_json_model$feature_names)]

  # Return tree table
  return(tree_dt)
}
# Flatten one parsed JSON tree into a data.table via pre-order traversal.
single.tree.parse <- function(lgb_tree) {

  # Setup initial default data.table with default types
  single_tree_dt <- data.table::data.table(tree_index = integer(0),
                                           split_index = integer(0),
                                           split_feature = integer(0),
                                           node_parent = integer(0),
                                           leaf_index = integer(0),
                                           leaf_parent = integer(0),
                                           split_gain = numeric(0),
                                           threshold = numeric(0),
                                           decision_type = character(0),
                                           internal_value = integer(0),
                                           internal_count = integer(0),
                                           leaf_value = integer(0),
                                           leaf_count = integer(0))

  # Traverse tree function
  pre_order_traversal <- function(tree_node_leaf, parent_index = NA) {

    # Internal node: append its split record, then recurse into children
    if (!is.null(tree_node_leaf$split_index)) {

      # Overwrite data.table - this should be switched to an envir in the future
      single_tree_dt <<- data.table::rbindlist(l = list(single_tree_dt,
                                                        c(tree_node_leaf[c("split_index",
                                                                           "split_feature",
                                                                           "split_gain",
                                                                           "threshold",
                                                                           "decision_type",
                                                                           "internal_value",
                                                                           "internal_count")],
                                                          "node_parent" = parent_index)),
                                               use.names = TRUE,
                                               fill = TRUE)

      # Traverse both children with this node as their parent
      pre_order_traversal(tree_node_leaf$left_child,
                          parent_index = tree_node_leaf$split_index)
      pre_order_traversal(tree_node_leaf$right_child,
                          parent_index = tree_node_leaf$split_index)

    } else if (!is.null(tree_node_leaf$leaf_index)) {

      # Leaf: append its record
      # Overwrite data.table - this should be switched to an envir in the future
      single_tree_dt <<- data.table::rbindlist(l = list(single_tree_dt,
                                                        tree_node_leaf[c("leaf_index",
                                                                         "leaf_parent",
                                                                         "leaf_value",
                                                                         "leaf_count")]),
                                               use.names = TRUE,
                                               fill = TRUE)

    }

  }

  # Traverse structure
  pre_order_traversal(lgb_tree$tree_structure)

  # Store index
  single_tree_dt[, tree_index := lgb_tree$tree_index]

  # Return tree
  return(single_tree_dt)
}
# Translate each key through a key -> value lookup table,
# substituting `missing` for keys that are not found.
Lookup <- function(key, key_lookup, value_lookup, missing = NA) {
  # Position of each key in the lookup keys (NA when absent)
  positions <- match(key, key_lookup)
  # Pull the corresponding values; unmatched keys yield NA
  values <- value_lookup[positions]
  # Substitute the default for any miss
  values[is.na(values)] <- missing
  values
}
#' Plot feature importance as a bar graph
#'
#'
#' Plot previously calculated feature importance: Gain, Cover and Frequency, as a bar graph.
#'
#'
#' @param tree_imp a \code{data.table} returned by \code{\link{lgb.importance}}.
#' @param top_n maximal number of top features to include into the plot.
#' @param measure the name of importance measure to plot, can be "Gain", "Cover" or "Frequency".
#' @param left_margin (base R barplot) allows to adjust the left margin size to fit feature names.
#' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}.
#'
#'
#' @details
#' The graph represents each feature as a horizontal bar of length proportional to the defined importance of a feature.
#' Features are shown ranked in a decreasing importance order.
#'
#'
#' @return
#' The \code{lgb.plot.importance} function creates a \code{barplot}
#' and silently returns a processed data.table with \code{top_n} features sorted by defined importance.
#'
#'
#' @examples
#'
#' \dontrun{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
......@@ -30,21 +30,48 @@
#'
#' tree_imp <- lgb.importance(model, percentage = TRUE)
#' lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain")
#'
#' }
#'
#' @export
# Bar plot of the top_n features by the chosen importance measure.
# Silently returns the plotted subset of tree_imp.
lgb.plot.importance <- function(tree_imp,
                                top_n = 10,
                                measure = "Gain",
                                left_margin = 10,
                                cex = NULL) {

  # Check for measurement (column names) correctness
  measure <- match.arg(measure, choices = c("Gain", "Cover", "Frequency"), several.ok = FALSE)

  # Get top N importance (defaults to 10), capped at the table size
  top_n <- min(top_n, nrow(tree_imp))

  # Keep only the top_n features by absolute importance
  # (seq_len avoids the 1:0 pitfall of 1:top_n)
  tree_imp <- tree_imp[order(abs(get(measure)), decreasing = TRUE),][seq_len(top_n),]

  # Attempt to setup a correct cex
  if (is.null(cex)) {
    cex <- 2.5 / log2(1 + top_n)
  }

  # Save graphical parameters and restore them on exit
  op <- par(no.readonly = TRUE)
  on.exit(par(op))

  # Widen the left margin so feature names fit
  par(mar = op$mar %>% magrittr::inset(., 2, left_margin))

  # Draw horizontal bars, most important feature at the top
  tree_imp[.N:1,
           barplot(height = get(measure),
                   names.arg = Feature,
                   horiz = TRUE,
                   border = NA,
                   main = "Feature Importance",
                   xlab = measure,
                   cex.names = cex,
                   las = 1)]

  # Return the plotted data invisibly
  invisible(tree_imp)
}
#' Plot feature contribution as a bar graph
#'
#' Plot previously calculated feature contribution as a bar graph.
#'
#' @param tree_interpretation_dt a \code{data.table} returned by \code{\link{lgb.interprete}}.
#' @param top_n maximal number of top features to include into the plot.
#' @param cols the column numbers of layout, will be used only for multiclass classification feature contribution.
#' @param left_margin (base R barplot) allows to adjust the left margin size to fit feature names.
#' @param cex (base R barplot) passed as \code{cex.names} parameter to \code{barplot}.
#'
#' @details
#' The graph represents each feature as a horizontal bar of length proportional to the defined contribution of a feature.
#' Features are shown ranked in a decreasing contribution order.
#'
#' @return
#' The \code{lgb.plot.interpretation} function creates a \code{barplot}.
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' Sigmoid <- function(x) {1 / (1 + exp(-x))}
#' Logit <- function(x) {log(x / (1 - x))}
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' setinfo(dtrain, "init_score", rep(Logit(mean(train$label)), length(train$label)))
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#'
#' params = list(objective = "binary",
#'               learning_rate = 0.01, num_leaves = 63, max_depth = -1,
#'               min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
#' model <- lgb.train(params, dtrain, 20)
#'
#' tree_interpretation <- lgb.interprete(model, test$data, 1:5)
#' lgb.plot.interpretation(tree_interpretation[[1]], top_n = 10)
#' }
#'
#' @export
lgb.plot.interpretation <- function(tree_interpretation_dt,
                                    top_n = 10,
                                    cols = 1,
                                    left_margin = 10,
                                    cex = NULL) {
  # Number of classes = all columns except the feature-name column
  num_class <- ncol(tree_interpretation_dt) - 1
  # Save graphical parameters and restore them on exit
  op <- par(no.readonly = TRUE)
  on.exit(par(op))
  # Set bottom/left/top margins (wide left margin for feature names)
  par(mar = op$mar %>% magrittr::inset(., 1:3, c(3, left_margin, 2)))
  # Check for number of classes
  if (num_class == 1) {
    # Only one class, plot straight away
    multiple.tree.plot.interpretation(tree_interpretation_dt,
                                      top_n = top_n,
                                      title = NULL,
                                      cex = cex)
  } else {
    # More than one class, shape the plotting layout first
    layout_mat <- matrix(seq(1, cols * ceiling(num_class / cols)),
                         ncol = cols, nrow = ceiling(num_class / cols))
    # Shape output
    par(mfcol = c(nrow(layout_mat), ncol(layout_mat)))
    # Loop throughout all classes
    for (i in seq_len(num_class)) {
      # Select feature column + the i-th class column, rename, then plot
      tree_interpretation_dt[, c(1, i + 1), with = FALSE] %T>%
        data.table::setnames(., old = names(.), new = c("Feature", "Contribution")) %>%
        multiple.tree.plot.interpretation(.,
                                          top_n = top_n,
                                          title = paste("Class", i - 1),
                                          cex = cex)
    }
  }
}
# Internal helper: barplot of one class' feature contributions.
# Positive contributions are drawn in red ("firebrick"),
# negative contributions in blue ("steelblue").
multiple.tree.plot.interpretation <- function(tree_interpretation,
                                              top_n,
                                              title,
                                              cex) {
  # Keep the top_n rows with the largest absolute contribution
  tree_interpretation <- tree_interpretation[order(abs(Contribution), decreasing = TRUE), ][seq_len(min(top_n, .N)), ]
  # Attempt to setup a correct cex (shrink labels as the feature count grows)
  if (is.null(cex)) {
    cex <- 2.5 / log2(1 + top_n)
  }
  # Plot in reversed row order so the largest contribution is on top
  tree_interpretation[.N:1,
                      barplot(height = Contribution,
                              names.arg = Feature,
                              horiz = TRUE,
                              col = ifelse(Contribution > 0, "firebrick", "steelblue"),
                              border = NA,
                              main = title,
                              cex.names = cex,
                              las = 1)]
  # Return invisibly (called for its plotting side effect)
  invisible(NULL)
}
#' Main training logic for LightGBM
#'
#' @param params List of parameters
#' @param data a \code{lgb.Dataset} object, used for training
#' @param nrounds number of training rounds
#' @param callbacks list of callback functions
#'        List of callback functions that are applied at each iteration.
#' @param ... other parameters, see parameters.md for more informations
#'
#' @return a trained booster model \code{lgb.Booster}.
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#'                    dtrain,
#'                    100,
#'                    valids,
#'                    min_data = 1,
#'                    learning_rate = 1,
#'                    early_stopping_rounds = 10)
#' }
#'
#' @rdname lgb.train
#'
#' @export
lgb.train <- function(params = list(),
                      data,
                      nrounds = 10,
                      valids = list(),
                      obj = NULL,
                      eval = NULL,
                      verbose = 1,
                      record = TRUE,
                      eval_freq = 1L,
                      init_model = NULL,
                      colnames = NULL,
                      categorical_feature = NULL,
                      early_stopping_rounds = NULL,
                      callbacks = list(),
                      ...) {
  # Merge dot-parameters into the parameter list and validate them
  additional_params <- list(...)
  params <- append(params, additional_params)
  params$verbose <- verbose
  params <- lgb.check.obj(params, obj)
  params <- lgb.check.eval(params, eval)
  fobj <- NULL
  feval <- NULL
  # Check for objective (custom function or built-in name)
  if (is.function(params$objective)) {
    fobj <- params$objective
    params$objective <- "NONE"
  }
  # Check for custom evaluation function
  if (is.function(eval)) {
    feval <- eval
  }
  # Check for parameters
  lgb.check.params(params)
  # Init predictor to empty
  predictor <- NULL
  # Check for boosting from a trained model (file path or Booster)
  if (is.character(init_model)) {
    predictor <- Predictor$new(init_model)
  } else if (lgb.is.Booster(init_model)) {
    predictor <- init_model$to_predictor()
  }
  # Set the iteration range; continue counting when boosting from a model
  begin_iteration <- 1
  if (!is.null(predictor)) {
    begin_iteration <- predictor$current_iter() + 1
  }
  end_iteration <- begin_iteration + nrounds - 1
  # Check for training dataset type correctness
  if (!lgb.is.Dataset(data)) {
    stop("lgb.train: data only accepts lgb.Dataset object")
  }
  # Check for validation dataset type correctness
  if (length(valids) > 0) {
    # Check for list as input and type correctness by object
    if (!is.list(valids) || !all(sapply(valids, lgb.is.Dataset))) {
      stop("lgb.train: valids must be a list of lgb.Dataset elements")
    }
    # Each validation dataset must be tagged with a non-empty name
    evnames <- names(valids)
    if (is.null(evnames) || !all(nzchar(evnames))) {
      stop("lgb.train: each element of the valids must have a name tag")
    }
  }
  # Update parameters with parsed parameters
  data$update_params(params)
  # Attach the predictor (may be NULL) to the training dataset
  data$.__enclos_env__$private$set_predictor(predictor)
  # Write column names
  if (!is.null(colnames)) {
    data$set_colnames(colnames)
  }
  # Write categorical features
  if (!is.null(categorical_feature)) {
    data$set_categorical_feature(categorical_feature)
  }
  # Construct datasets, if needed
  data$construct()
  vaild_contain_train <- FALSE
  train_data_name <- "train"
  reduced_valid_sets <- list()
  # Parse validation datasets
  if (length(valids) > 0) {
    # Loop through all validation datasets using name
    for (key in names(valids)) {
      valid_data <- valids[[key]]
      # The training dataset may appear among valids; remember its name
      if (identical(data, valid_data)) {
        vaild_contain_train <- TRUE
        train_data_name <- key
        next
      }
      # Update parameters, link the validation data to the training data
      valid_data$update_params(params)
      valid_data$set_reference(data)
      reduced_valid_sets[[key]] <- valid_data
    }
  }
  # Add printing log callback (scalar condition: use &&)
  if (verbose > 0 && eval_freq > 0) {
    callbacks <- add.cb(callbacks, cb.print.evaluation(eval_freq))
  }
  # Add evaluation log callback
  if (record && length(valids) > 0) {
    callbacks <- add.cb(callbacks, cb.record.evaluation())
  }
  # Add early stopping callback
  if (!is.null(early_stopping_rounds)) {
    if (early_stopping_rounds > 0) {
      callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, verbose = verbose))
    }
  }
  # "Categorize" callbacks into pre/post-iteration groups
  cb <- categorize.callbacks(callbacks)
  # Construct booster with datasets
  booster <- Booster$new(params = params, train_set = data)
  if (vaild_contain_train) {
    booster$set_train_data_name(train_data_name)
  }
  for (key in names(reduced_valid_sets)) {
    booster$add_valid(reduced_valid_sets[[key]], key)
  }
  # Callback environment shared by all callbacks
  env <- CB_ENV$new()
  env$model <- booster
  env$begin_iteration <- begin_iteration
  env$end_iteration <- end_iteration
  # Start training model using number of iterations to start and end with
  for (i in seq(from = begin_iteration, to = end_iteration)) {
    # Overwrite iteration in environment
    env$iteration <- i
    env$eval_list <- list()
    # Run "pre_iter" callbacks
    for (f in cb$pre_iter) {
      f(env)
    }
    # Update one boosting iteration
    booster$update(fobj = fobj)
    # Collect evaluation results when validation datasets exist
    eval_list <- list()
    if (length(valids) > 0) {
      # Evaluate on the training data too when it is among valids
      if (vaild_contain_train) {
        eval_list <- append(eval_list, booster$eval_train(feval = feval))
      }
      eval_list <- append(eval_list, booster$eval_valid(feval = feval))
    }
    # Write evaluation result in environment
    env$eval_list <- eval_list
    # Run "post_iter" callbacks
    for (f in cb$post_iter) {
      f(env)
    }
    # Check for early stopping and break if needed
    if (env$met_early_stop) break
  }
  # Return booster
  return(booster)
}
#' LightGBM unloading error fix
#'
#' Attempts to unload LightGBM packages so you can remove objects cleanly without having to restart R. This is useful for instance if an object becomes stuck for no apparent reason and you do not want to restart R to fix the lost object.
#'
#' @param restore Whether to reload \code{LightGBM} immediately after detaching from R. Defaults to \code{TRUE} which means automatically reload \code{LightGBM} once unloading is performed.
#' @param wipe Whether to wipe all \code{lgb.Dataset} and \code{lgb.Booster} from the global environment. Defaults to \code{FALSE} which means to not remove them.
#' @param envir The environment to perform wiping on if \code{wipe == TRUE}. Defaults to \code{.GlobalEnv} which is the global environment.
#'
#' @return NULL invisibly.
#'
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#'                    dtrain,
#'                    100,
#'                    valids,
#'                    min_data = 1,
#'                    learning_rate = 1,
#'                    early_stopping_rounds = 10)
#' lgb.unloader(restore = FALSE, wipe = FALSE, envir = .GlobalEnv)
#' rm(model, dtrain, dtest) # Not needed if wipe = TRUE
#' gc() # Not needed if wipe = TRUE
#'
#' library(lightgbm)
#' # Do whatever you want again with LightGBM without object clashing
#' }
#'
#' @export
lgb.unloader <- function(restore = TRUE, wipe = FALSE, envir = .GlobalEnv) {
  # Unload package
  try(detach("package:lightgbm", unload = TRUE), silent = TRUE)
  # Should we wipe variables? (lgb.Booster, lgb.Dataset)
  # NOTE: both ls() calls must use `envir` so the selection indices match
  # the names passed to rm() (previously the inner ls() used .GlobalEnv).
  if (wipe) {
    rm(list = ls(envir = envir)[which(sapply(ls(envir = envir), function(x) {"lgb.Booster" %in% class(get(x, envir = envir))}))], envir = envir)
    rm(list = ls(envir = envir)[which(sapply(ls(envir = envir), function(x) {"lgb.Dataset" %in% class(get(x, envir = envir))}))], envir = envir)
    gc(verbose = FALSE)
  }
  # Load package back?
  if (restore) {
    library(lightgbm)
  }
  invisible()
}
#'
#' @rdname lgb.train
#' @export
lightgbm <- function(data,
                     label = NULL,
                     weight = NULL,
                     params = list(),
                     nrounds = 10,
                     verbose = 1,
                     eval_freq = 1L,
                     early_stopping_rounds = NULL,
                     save_name = "lightgbm.model",
                     init_model = NULL,
                     callbacks = list(),
                     ...) {
  # Set data to a temporary variable
  dtrain <- data
  # Check whether data is lgb.Dataset, if not then create lgb.Dataset manually
  if (!lgb.is.Dataset(dtrain)) {
    dtrain <- lgb.Dataset(data, label = label, weight = weight)
  }
  # Monitor the training data itself when verbose output is requested
  valids <- list()
  if (verbose > 0) {
    valids$train <- dtrain
  }
  # Train a model using the regular way
  bst <- lgb.train(params, dtrain, nrounds, valids, verbose = verbose, eval_freq = eval_freq,
                   early_stopping_rounds = early_stopping_rounds,
                   init_model = init_model, callbacks = callbacks, ...)
  # Store model under a specific name
  bst$save_model(save_name)
  # Return booster
  return(bst)
}
#' Training part from Mushroom Data Set
......
......@@ -10,33 +10,52 @@
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#'                    dtrain,
#'                    100,
#'                    valids,
#'                    min_data = 1,
#'                    learning_rate = 1,
#'                    early_stopping_rounds = 10)
#' saveRDS.lgb.Booster(model, "model.rds")
#' new_model <- readRDS.lgb.Booster("model.rds")
#' }
#'
#' @export
readRDS.lgb.Booster <- function(file = "", refhook = NULL) {
  # Read RDS file
  object <- readRDS(file = file, refhook = refhook)
  # Check if object has the model stored
  if (!is.na(object$raw)) {
    # Create temporary file for the model loading
    temp <- tempfile()
    # Remove the temporary file on exit even if lgb.load() errors
    on.exit(file.remove(temp), add = TRUE)
    write(object$raw, temp)
    object2 <- lgb.load(temp)
    # Restore best iteration and recorded evaluations
    object2$best_iter <- object$best_iter
    object2$record_evals <- object$record_evals
    # Return newly loaded object
    return(object2)
  } else {
    # Return RDS loaded object
    return(object)
  }
}
#' @examples
#' \dontrun{
#' library(lightgbm)
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(params,
#'                    dtrain,
#'                    100,
#'                    valids,
#'                    min_data = 1,
#'                    learning_rate = 1,
#'                    early_stopping_rounds = 10)
#' saveRDS.lgb.Booster(model, "model.rds")
#' }
#'
#' @export
saveRDS.lgb.Booster <- function(object,
                                file = "",
                                ascii = FALSE,
                                version = NULL,
                                compress = TRUE,
                                refhook = NULL,
                                raw = TRUE) {
  # Check if object lacks a raw value and the user wants to store the raw
  # (scalar condition: use &&)
  if (is.na(object$raw) && raw) {
    # Serialize the C-side model into object$raw first
    object$save()
    # Save RDS
    saveRDS(object,
            file = file,
            ascii = ascii,
            version = version,
            compress = compress,
            refhook = refhook)
    # Free model from memory
    object$raw <- NA
  } else {
    # Save as usual
    saveRDS(object,
            file = file,
            ascii = ascii,
            version = version,
            compress = compress,
            refhook = refhook)
  }
}
lgb.is.Booster <- function(x) {
  # Checking if it is of class lgb.Booster or not
  lgb.check.r6.class(x, "lgb.Booster")
}
lgb.is.Dataset <- function(x) {
  # Checking if it is of class lgb.Dataset or not
  lgb.check.r6.class(x, "lgb.Dataset")
}
# Use 64bit data (an R double) to store a C-side handle address
lgb.new.handle <- function() {
  0.0 # Return numeric type in R
}
lgb.is.null.handle <- function(x) {
  # A handle is "null" when it is NULL or still the zero placeholder
  is.null(x) || x == 0
}
# Decode the first `len` bytes of a raw vector into a character string.
lgb.encode.char <- function(arr, len) {
  # Only raw vectors can be decoded
  if (!is.raw(arr)) {
    stop("lgb.encode.char: Can only encode from raw type")
  }
  # Return the conversion of raw type to character type
  rawToChar(arr[seq_len(len)])
}
# Invoke a LightGBM C API entry point via .Call and translate a non-zero
# call state into an R error carrying the last C-side error message.
lgb.call <- function(fun_name, ret, ...) {
  # Set call state to a zero value
  call_state <- 0L
  # Check for a ret call
  if (!is.null(ret)) {
    call_state <- .Call(fun_name, ..., ret, call_state, PACKAGE = "lightgbm") # Call with ret
  } else {
    call_state <- .Call(fun_name, ..., call_state, PACKAGE = "lightgbm") # Call without ret
  }
  # Check for call state value post call
  if (call_state != 0L) {
    # Fetch the last error message into a raw buffer
    buf_len <- 200L
    act_len <- 0L
    err_msg <- raw(buf_len)
    err_msg <- .Call("LGBM_GetLastError_R", buf_len, act_len, err_msg, PACKAGE = "lightgbm")
    # Grow the buffer and retry when the message was truncated
    if (act_len > buf_len) {
      buf_len <- act_len
      err_msg <- raw(buf_len)
      err_msg <- .Call("LGBM_GetLastError_R",
                       buf_len,
                       act_len,
                       err_msg,
                       PACKAGE = "lightgbm")
    }
    # Raise the decoded error
    stop(paste0("api error: ", lgb.encode.char(err_msg, act_len)))
  }
  return(ret)
}
# Call a C API entry point that writes a string result into a raw buffer,
# growing the buffer and retrying if the first attempt was truncated.
lgb.call.return.str <- function(fun_name, ...) {
  # Create a 1 MiB buffer
  buf_len <- as.integer(1024 * 1024)
  act_len <- 0L
  buf <- raw(buf_len)
  # Call buffer
  buf <- lgb.call(fun_name, ret = buf, ..., buf_len, act_len)
  # Grow the buffer and retry when the output was truncated
  if (act_len > buf_len) {
    buf_len <- act_len
    buf <- raw(buf_len)
    buf <- lgb.call(fun_name, ret = buf, ..., buf_len, act_len)
  }
  # Return encoded character
  return(lgb.encode.char(buf, act_len))
}
# Serialize a parameter list (plus dot-parameters) into the
# space-separated "key=value" C string LightGBM expects.
lgb.params2str <- function(params, ...) {
  # Check for a list as input
  if (!is.list(params)) {
    stop("params must be a list")
  }
  # Normalize parameter names (dots to underscores)
  names(params) <- gsub("\\.", "_", names(params))
  # Merge parameters from the params and the dots-expansion
  dot_params <- list(...)
  names(dot_params) <- gsub("\\.", "_", names(dot_params))
  # Check for identical parameters
  if (length(intersect(names(params), names(dot_params))) > 0) {
    stop("Same parameters in ", sQuote("params"), " and in the call are not allowed. Please check your ", sQuote("params"), " list")
  }
  # Merge parameters
  params <- c(params, dot_params)
  # Setup temporary variable
  ret <- list()
  # Perform key value join
  for (key in names(params)) {
    # Join multi value first
    val <- paste0(params[[key]], collapse = ",")
    if (nchar(val) <= 0) next # Skip empty values
    # Join key value
    pair <- paste0(c(key, val), collapse = "=")
    ret <- c(ret, pair)
  }
  # Check ret length
  if (length(ret) == 0) {
    # Return empty string
    lgb.c_str("")
  } else {
    # Return string separated by a space per element
    lgb.c_str(paste0(ret, collapse = " "))
  }
}
# Build a NUL-terminated raw byte vector from any value, suitable for
# handing to C code that expects a C string.
lgb.c_str <- function(x) {
  bytes <- charToRaw(as.character(x))
  c(bytes, as.raw(0))
}
# Check whether `object` is an R6 object that also carries class `name`.
lgb.check.r6.class <- function(object, name) {
  # Check for non-existence of R6 class
  if (!("R6" %in% class(object))) {
    return(FALSE)
  }
  # Check for non-existence of the named class
  if (!(name %in% class(object))) {
    return(FALSE)
  }
  # Both classes present
  return(TRUE)
}
lgb.check.params <- function(params) {
  # TODO: parameter validation is not implemented yet; pass through unchanged
  params
}
# Validate the objective in `params`, optionally overriding it with `obj`.
# The objective must be either a known objective name or a custom function.
lgb.check.obj <- function(params, obj) {
  # List known objectives in a vector
  OBJECTIVES <- c("regression", "regression_l1", "regression_l2", "huber", "fair", "poisson", "binary", "lambdarank", "multiclass")
  # Take the objective from obj if provided
  if (!is.null(obj)) {
    params$objective <- obj
  }
  # Check whether the objective is a character
  if (is.character(params$objective)) {
    # If the objective is a character, check if it is a known objective
    if (!(params$objective %in% OBJECTIVES)) {
      # Interrupt on unknown objective name
      stop("lgb.check.obj: objective name error should be one of (", paste0(OBJECTIVES, collapse = ", "), ")")
    }
  } else if (!is.function(params$objective)) {
    # If objective is not a character nor a function, then stop
    stop("lgb.check.obj: objective should be a character or a function")
  }
  # Return parameters
  return(params)
}
# Ensure `params$metric` is populated: append any user-supplied metric(s)
# and fall back to the default metric for the chosen objective.
lgb.check.eval <- function(params, eval) {
  # Check if metric is null, if yes put a list instead
  if (is.null(params$metric)) {
    params$metric <- list()
  }
  # Append user-supplied metric(s) if character or list
  if (!is.null(eval)) {
    if (is.character(eval) || is.list(eval)) {
      params$metric <- append(params$metric, eval)
    }
  }
  # When eval is not a custom function and no metric was given,
  # pick the default metric for the objective
  if (!is.function(eval)) {
    if (length(params$metric) == 0) {
      # Add default metric
      params$metric <- switch(
        params$objective,
        regression = "l2",            # MSE
        regression_l1 = "l1",         # MAE
        regression_l2 = "l2",         # MSE
        huber = "l1",                 # Proxy for MAE
        fair = "l1",                  # Proxy for MAE
        poisson = "poisson",          # Poisson
        binary = "binary_logloss",    # Logloss
        multiclass = "multi_logloss", # Multiclass logloss
        lambdarank = "ndcg",          # Normalized discounted cumulative gain
        stop("lgb.check.eval: No default metric available for objective ", sQuote(params$objective)) # Unknown objective
      )
    }
  }
  # Return parameters
  return(params)
}
require(lightgbm)
require(methods)

# We load in the agaricus dataset
# In this example, we are aiming to predict whether a mushroom is edible
data(agaricus.train, package = "lightgbm")
data(agaricus.test, package = "lightgbm")
train <- agaricus.train
test <- agaricus.test

# The loaded data is stored in sparseMatrix, and label is a numeric vector in {0,1}
class(train$label)
class(train$data)

#--------------------Basic Training using lightgbm----------------
# This is the basic usage of lightgbm you can put matrix in data field
# Note: we are putting in sparse matrix here, lightgbm naturally handles sparse input
# Use sparse matrix when your feature is sparse (e.g. when you are using one-hot encoding vector)
print("Training lightgbm with sparseMatrix")
bst <- lightgbm(data = train$data,
                label = train$label,
                num_leaves = 4,
                learning_rate = 1,
                nrounds = 2,
                objective = "binary")

# Alternatively, you can put in dense matrix, i.e. basic R-matrix
print("Training lightgbm with Matrix")
bst <- lightgbm(data = as.matrix(train$data),
                label = train$label,
                num_leaves = 4,
                learning_rate = 1,
                nrounds = 2,
                objective = "binary")

# You can also put in lgb.Dataset object, which stores label, data and other meta datas needed for advanced features
print("Training lightgbm with lgb.Dataset")
dtrain <- lgb.Dataset(data = train$data,
                      label = train$label)
bst <- lightgbm(data = dtrain,
                num_leaves = 4,
                learning_rate = 1,
                nrounds = 2,
                objective = "binary")

# Verbose = 0,1,2
print("Train lightgbm with verbose 0, no message")
bst <- lightgbm(data = dtrain,
                num_leaves = 4,
                learning_rate = 1,
                nrounds = 2,
                objective = "binary",
                verbose = 0)

print("Train lightgbm with verbose 1, print evaluation metric")
bst <- lightgbm(data = dtrain,
                num_leaves = 4,
                learning_rate = 1,
                nrounds = 2,
                nthread = 2,
                objective = "binary",
                verbose = 1)

print("Train lightgbm with verbose 2, also print information about tree")
bst <- lightgbm(data = dtrain,
                num_leaves = 4,
                learning_rate = 1,
                nrounds = 2,
                nthread = 2,
                objective = "binary",
                verbose = 2)

# You can also specify data as file path to a LibSVM/TCV/CSV format input
# Since we do not have this file with us, the following line is just for illustration
# bst <- lightgbm(data = "agaricus.train.svm", num_leaves = 4, learning_rate = 1, nrounds = 2,objective = "binary")

#--------------------Basic prediction using lightgbm--------------
# You can do prediction using the following line
# You can put in Matrix, sparseMatrix, or lgb.Dataset
pred <- predict(bst, test$data)
err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error=", err))

#--------------------Save and load models-------------------------
# Save model to binary local file
lgb.save(bst, "lightgbm.model")
# Load binary model to R
bst2 <- lgb.load("lightgbm.model")
pred2 <- predict(bst2, test$data)
# pred2 should be identical to pred
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred))))

#--------------------Advanced features ---------------------------
# To use advanced features, we need to put data in lgb.Dataset
dtrain <- lgb.Dataset(data = train$data, label = train$label, free_raw_data = FALSE)
dtest <- lgb.Dataset(data = test$data, label = test$label, free_raw_data = FALSE)

#--------------------Using validation set-------------------------
# valids is a list of lgb.Dataset, each of them is tagged with name
valids <- list(train = dtrain, test = dtest)
# To train with valids, use lgb.train, which contains more advanced features
# valids allows us to monitor the evaluation result on all data in the list
print("Train lightgbm using lgb.train with valids")
bst <- lgb.train(data = dtrain,
                 num_leaves = 4,
                 learning_rate = 1,
                 nrounds = 2,
                 valids = valids,
                 nthread = 2,
                 objective = "binary")

# We can change evaluation metrics, or use multiple evaluation metrics
print("Train lightgbm using lgb.train with valids, watch logloss and error")
bst <- lgb.train(data = dtrain,
                 num_leaves = 4,
                 learning_rate = 1,
                 nrounds = 2,
                 valids = valids,
                 eval = c("binary_error", "binary_logloss"),
                 nthread = 2,
                 objective = "binary")

# lgb.Dataset can also be saved using lgb.Dataset.save
lgb.Dataset.save(dtrain, "dtrain.buffer")
# To load it in, simply call lgb.Dataset
dtrain2 <- lgb.Dataset("dtrain.buffer")
bst <- lgb.train(data = dtrain2,
                 num_leaves = 4,
                 learning_rate = 1,
                 nrounds = 2,
                 valids = valids,
                 nthread = 2,
                 objective = "binary")

# Information can be extracted from lgb.Dataset using getinfo
label <- getinfo(dtest, "label")
pred <- predict(bst, test$data)
err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label)
print(paste("test-error=", err))
library(lightgbm)
library(methods)

# Load in the agaricus dataset
data(agaricus.train, package = "lightgbm")
data(agaricus.test, package = "lightgbm")
dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
valids <- list(eval = dtest, train = dtrain)

#--------------------Advanced features---------------------------
# Advanced: start from an initial base prediction
print("Start running example to start from an initial prediction")

# Train lightgbm for 1 round
param <- list(num_leaves = 4,
              learning_rate = 1,
              nthread = 2,
              silent = 1,
              objective = "binary")
bst <- lgb.train(param, dtrain, 1, valids = valids)

# Note: we need the margin value instead of transformed prediction in set_init_score
ptrain <- predict(bst, agaricus.train$data, rawscore = TRUE)
ptest <- predict(bst, agaricus.test$data, rawscore = TRUE)

# Set the init_score property of dtrain and dtest
# base margin is the base prediction we will boost from
setinfo(dtrain, "init_score", ptrain)
setinfo(dtest, "init_score", ptest)

print("This is result of boost from initial prediction")
bst <- lgb.train(params = param,
                 data = dtrain,
                 nrounds = 5,
                 valids = valids)
library(lightgbm)

# Load in the agaricus dataset
data(agaricus.train, package = "lightgbm")
data(agaricus.test, package = "lightgbm")
dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)

nrounds <- 2
param <- list(num_leaves = 4,
              learning_rate = 1,
              objective = "binary")

print("Running cross validation")
# Do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
lgb.cv(param,
       dtrain,
       nrounds,
       nfold = 5,
       eval = "binary_error")

print("Running cross validation, disable standard deviation display")
# Same cross validation, but without the std_value column
lgb.cv(param,
       dtrain,
       nrounds,
       nfold = 5,
       eval = "binary_error",
       showsd = FALSE)

###
# You can also do cross validation with a customized loss function
# See custom_objective.R
##
print("Running cross validation, with customized loss function")
# Customized objective for binary logistic regression.
# Given raw margin scores and the training data, returns the first- and
# second-order derivatives of the log-loss needed by the booster.
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  # Sigmoid of the raw margin gives the predicted probability
  prob <- 1 / (1 + exp(-preds))
  list(grad = prob - labels,
       hess = prob * (1 - prob))
}
# Customized evaluation metric: classification error rate where a raw
# score greater than 0 counts as a positive prediction. Lower is better.
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  n_wrong <- sum(labels != (preds > 0))
  list(name = "error",
       value = as.numeric(n_wrong) / length(labels),
       higher_better = FALSE)
}
# Train with the customized objective and evaluation metric defined above
lgb.cv(params = param,
       data = dtrain,
       nrounds = nrounds,
       obj = logregobj,
       eval = evalerror,
       nfold = 5)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment