Unverified commit fc991c9d authored by James Lamb, committed by GitHub

[R-package] added R linting and changed R code to comma-first (fixes #2373) (#2437)

parent b4bb38d9
#' saveRDS for \code{lgb.Booster} models
#'
#' Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides whether to save the raw model or not.
#' Attempts to save a model using RDS. Has an additional parameter (\code{raw}) which decides
#' whether to save the raw model or not.
#'
#' @param object R object to serialize.
#' @param file a connection or the name of the file where the R object is saved to or read from.
#' @param ascii a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), a binary one is used. See the comments in the help for save.
#' @param version the workspace format version to use. \code{NULL} specifies the current default version (2). Versions prior to 2 are not supported, so this will only be relevant when there are later versions.
#' @param compress a logical specifying whether saving to a named file is to use "gzip" compression, or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of compression to be used. Ignored if file is a connection.
#' @param ascii a logical. If TRUE or NA, an ASCII representation is written; otherwise (default),
#' a binary one is used. See the comments in the help for save.
#' @param version the workspace format version to use. \code{NULL} specifies the current default
#' version (2). Versions prior to 2 are not supported, so this will only be relevant
#' when there are later versions.
#' @param compress a logical specifying whether saving to a named file is to use "gzip" compression,
#' or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of
#' compression to be used. Ignored if file is a connection.
#' @param refhook a hook function for handling reference objects.
#' @param raw whether to save the model in a raw variable or not, recommended to leave it to \code{TRUE}.
#'
......@@ -23,10 +29,10 @@
#' params <- list(objective = "regression", metric = "l2")
#' valids <- list(test = dtest)
#' model <- lgb.train(
#' params
#' , dtrain
#' , 10
#' , valids
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
......@@ -48,12 +54,14 @@ saveRDS.lgb.Booster <- function(object,
object$save()
# Save RDS
saveRDS(object,
file = file,
ascii = ascii,
version = version,
compress = compress,
refhook = refhook)
saveRDS(
object
, file = file
, ascii = ascii
, version = version
, compress = compress
, refhook = refhook
)
# Free model from memory
object$raw <- NA
......@@ -61,12 +69,14 @@ saveRDS.lgb.Booster <- function(object,
} else {
# Save as usual
saveRDS(object,
file = file,
ascii = ascii,
version = version,
compress = compress,
refhook = refhook)
saveRDS(
object
, file = file
, ascii = ascii
, version = version
, compress = compress
, refhook = refhook
)
}
......
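A minimal usage sketch of the function above (editorial, not part of this commit; it assumes the dtrain/model objects from the roxygen example and the package's companion readRDS.lgb.Booster() loader):

# save with the raw model embedded (the recommended raw = TRUE), then restore it later
saveRDS.lgb.Booster(object = model, file = "model.rds", raw = TRUE)
model2 <- readRDS.lgb.Booster(file = "model.rds")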
......@@ -25,9 +25,20 @@ lgb.call <- function(fun_name, ret, ...) {
# Check for a ret call
if (!is.null(ret)) {
call_state <- .Call(fun_name, ..., ret, call_state, PACKAGE = "lib_lightgbm") # Call with ret
call_state <- .Call(
fun_name
, ...
, ret
, call_state
, PACKAGE = "lib_lightgbm"
)
} else {
call_state <- .Call(fun_name, ..., call_state, PACKAGE = "lib_lightgbm") # Call without ret
call_state <- .Call(
fun_name
, ...
, call_state
, PACKAGE = "lib_lightgbm"
)
}
call_state <- as.integer(call_state)
# Check for call state value post call
......@@ -37,17 +48,25 @@ lgb.call <- function(fun_name, ret, ...) {
buf_len <- 200L
act_len <- 0L
err_msg <- raw(buf_len)
err_msg <- .Call("LGBM_GetLastError_R", buf_len, act_len, err_msg, PACKAGE = "lib_lightgbm")
err_msg <- .Call(
"LGBM_GetLastError_R"
, buf_len
, act_len
, err_msg
, PACKAGE = "lib_lightgbm"
)
# Check error buffer
if (act_len > buf_len) {
buf_len <- act_len
err_msg <- raw(buf_len)
err_msg <- .Call("LGBM_GetLastError_R",
buf_len,
act_len,
err_msg,
PACKAGE = "lib_lightgbm")
err_msg <- .Call(
"LGBM_GetLastError_R"
, buf_len
, act_len
, err_msg
, PACKAGE = "lib_lightgbm"
)
}
# Return error
......@@ -97,7 +116,13 @@ lgb.params2str <- function(params, ...) {
# Check for identical parameters
if (length(intersect(names(params), names(dot_params))) > 0) {
stop("Same parameters in ", sQuote("params"), " and in the call are not allowed. Please check your ", sQuote("params"), " list")
stop(
"Same parameters in "
, sQuote("params")
, " and in the call are not allowed. Please check your "
, sQuote("params")
, " list"
)
}
# Merge parameters
......@@ -160,15 +185,43 @@ lgb.check.params <- function(params) {
lgb.check.obj <- function(params, obj) {
# List known objectives in a vector
OBJECTIVES <- c("regression", "regression_l1", "regression_l2", "mean_squared_error", "mse", "l2_root", "root_mean_squared_error", "rmse",
"mean_absolute_error", "mae", "quantile",
"huber", "fair", "poisson", "binary", "lambdarank",
"multiclass", "softmax", "multiclassova", "multiclass_ova", "ova", "ovr",
"xentropy", "cross_entropy", "xentlambda", "cross_entropy_lambda", "mean_absolute_percentage_error", "mape",
"gamma", "tweedie")
OBJECTIVES <- c(
"regression"
, "regression_l1"
, "regression_l2"
, "mean_squared_error"
, "mse"
, "l2_root"
, "root_mean_squared_error"
, "rmse"
, "mean_absolute_error"
, "mae"
, "quantile"
, "huber"
, "fair"
, "poisson"
, "binary"
, "lambdarank"
, "multiclass"
, "softmax"
, "multiclassova"
, "multiclass_ova"
, "ova"
, "ovr"
, "xentropy"
, "cross_entropy"
, "xentlambda"
, "cross_entropy_lambda"
, "mean_absolute_percentage_error"
, "mape"
, "gamma"
, "tweedie"
)
# Check whether the objective is empty or not, and take it from params if needed
if (!is.null(obj)) { params$objective <- obj }
if (!is.null(obj)) {
params$objective <- obj
}
# Check whether the objective is a character
if (is.character(params$objective)) {
......
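As the lgb.check.obj() logic above shows, the objective can arrive either inside params or through the separate obj argument of the training functions; when obj is non-NULL it is copied into params$objective. A sketch of the two routes (editorial; assumes an lgb.Dataset named dtrain as in the demos below):

model_a <- lgb.train(params = list(objective = "binary"), data = dtrain, nrounds = 2)
model_b <- lgb.train(params = list(), data = dtrain, nrounds = 2, obj = "binary")  # same objective, passed via obj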
......@@ -17,62 +17,82 @@ class(train$data)
# Note: we are putting in sparse matrix here, lightgbm naturally handles sparse input
# Use a sparse matrix when your features are sparse (e.g. when you are using one-hot encoded vectors)
print("Training lightgbm with sparseMatrix")
bst <- lightgbm(data = train$data,
label = train$label,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
objective = "binary")
bst <- lightgbm(
data = train$data
, label = train$label
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, objective = "binary"
)
# Alternatively, you can put in dense matrix, i.e. basic R-matrix
print("Training lightgbm with Matrix")
bst <- lightgbm(data = as.matrix(train$data),
label = train$label,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
objective = "binary")
bst <- lightgbm(
data = as.matrix(train$data)
, label = train$label
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, objective = "binary"
)
# You can also put in an lgb.Dataset object, which stores label, data and other metadata needed for advanced features
print("Training lightgbm with lgb.Dataset")
dtrain <- lgb.Dataset(data = train$data,
label = train$label)
bst <- lightgbm(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
objective = "binary")
dtrain <- lgb.Dataset(
data = train$data
, label = train$label
)
bst <- lightgbm(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, objective = "binary"
)
# Verbose = 0,1,2
print("Train lightgbm with verbose 0, no message")
bst <- lightgbm(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
objective = "binary",
verbose = 0)
bst <- lightgbm(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, objective = "binary"
, verbose = 0
)
print("Train lightgbm with verbose 1, print evaluation metric")
bst <- lightgbm(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
nthread = 2,
objective = "binary",
verbose = 1)
bst <- lightgbm(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, nthread = 2
, objective = "binary"
, verbose = 1
)
print("Train lightgbm with verbose 2, also print information about tree")
bst <- lightgbm(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
nthread = 2,
objective = "binary",
verbose = 2)
bst <- lightgbm(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, nthread = 2
, objective = "binary"
, verbose = 2
)
# You can also specify data as a file path to a LibSVM/TSV/CSV format input
# Since we do not have this file with us, the following line is just for illustration
# bst <- lightgbm(data = "agaricus.train.svm", num_leaves = 4, learning_rate = 1, nrounds = 2,objective = "binary")
# bst <- lightgbm(
# data = "agaricus.train.svm"
# , num_leaves = 4
# , learning_rate = 1
# , nrounds = 2
# , objective = "binary"
# )
#--------------------Basic prediction using lightgbm--------------
# You can do prediction using the following line
......@@ -104,37 +124,43 @@ valids <- list(train = dtrain, test = dtest)
# To train with valids, use lgb.train, which contains more advanced features
# valids allows us to monitor the evaluation result on all data in the list
print("Train lightgbm using lgb.train with valids")
bst <- lgb.train(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
valids = valids,
nthread = 2,
objective = "binary")
bst <- lgb.train(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, valids = valids
, nthread = 2
, objective = "binary"
)
# We can change evaluation metrics, or use multiple evaluation metrics
print("Train lightgbm using lgb.train with valids, watch logloss and error")
bst <- lgb.train(data = dtrain,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
valids = valids,
eval = c("binary_error", "binary_logloss"),
nthread = 2,
objective = "binary")
bst <- lgb.train(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, valids = valids
, eval = c("binary_error", "binary_logloss")
, nthread = 2
, objective = "binary"
)
# lgb.Dataset can also be saved using lgb.Dataset.save
lgb.Dataset.save(dtrain, "dtrain.buffer")
# To load it in, simply call lgb.Dataset
dtrain2 <- lgb.Dataset("dtrain.buffer")
bst <- lgb.train(data = dtrain2,
num_leaves = 4,
learning_rate = 1,
nrounds = 2,
valids = valids,
nthread = 2,
objective = "binary")
bst <- lgb.train(
data = dtrain2
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, valids = valids
, nthread = 2
, objective = "binary"
)
# information can be extracted from lgb.Dataset using getinfo
label = getinfo(dtest, "label")
......
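To round out the walkthrough, the prediction step referenced by the "Basic prediction using lightgbm" comment looks roughly like this (editorial sketch; assumes the agaricus test set loaded earlier in the demo as test):

pred <- predict(bst, test$data)
# simple error rate against the labels stored in the validation lgb.Dataset
err <- mean(as.numeric(pred > 0.5) != getinfo(dtest, "label"))
print(paste("test-error =", err))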
......@@ -13,10 +13,12 @@ valids <- list(eval = dtest, train = dtrain)
print("Start running example to start from an initial prediction")
# Train lightgbm for 1 round
param <- list(num_leaves = 4,
learning_rate = 1,
nthread = 2,
objective = "binary")
param <- list(
num_leaves = 4
, learning_rate = 1
, nthread = 2
, objective = "binary"
)
bst <- lgb.train(param, dtrain, 1, valids = valids)
# Note: we need the margin value instead of transformed prediction in set_init_score
......@@ -29,7 +31,9 @@ setinfo(dtrain, "init_score", ptrain)
setinfo(dtest, "init_score", ptest)
print("This is result of boost from initial prediction")
bst <- lgb.train(params = param,
data = dtrain,
nrounds = 5,
valids = valids)
bst <- lgb.train(
params = param
, data = dtrain
, nrounds = 5
, valids = valids
)
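The margin values mentioned in the set_init_score note above come from predicting with rawscore = TRUE, the same flag used in the multiclass demos later in this diff (editorial sketch; assumes the agaricus objects loaded at the top of this demo):

ptrain <- predict(bst, agaricus.train$data, rawscore = TRUE)
ptest <- predict(bst, agaricus.test$data, rawscore = TRUE)
# these raw margins are what setinfo(dtrain, "init_score", ptrain) expects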
......@@ -60,21 +60,28 @@ my_data <- as.matrix(bank[, 1:16, with = FALSE])
# Creating the LightGBM dataset with categorical features
# The categorical features must be indexed like in R (1-indexed, not 0-indexed)
lgb_data <- lgb.Dataset(data = my_data,
label = bank$y,
categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16))
lgb_data <- lgb.Dataset(
data = my_data
, label = bank$y
, categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)
)
# We can now train a model
model <- lgb.train(list(objective = "binary",
metric = "l2",
min_data = 1,
learning_rate = 0.1,
min_data = 0,
min_hessian = 1,
max_depth = 2),
lgb_data,
100,
valids = list(train = lgb_data))
params <- list(
objective = "binary"
, metric = "l2"
, min_data = 1
, learning_rate = 0.1
, min_data = 0
, min_hessian = 1
, max_depth = 2
)
model <- lgb.train(
params = params
, data = lgb_data
, nrounds = 100
, valids = list(train = lgb_data)
)
# Try to find split_feature: 2
# If you find it, it means it used a categorical feature in the first tree
......
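One way to follow the "Try to find split_feature: 2" hint above is to dump the fitted model and inspect the tree structure, using lgb.dump() as in the .Rd example further down this diff (editorial sketch):

json_model <- lgb.dump(model)
# search the JSON for the "split_feature" entries of the first tree
cat(substr(json_model, 1, 2000))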
......@@ -68,24 +68,33 @@ my_data_test <- as.matrix(bank_test[, 1:16, with = FALSE])
# Creating the LightGBM dataset with categorical features
# The categorical features can be passed to lgb.train to not copy and paste a lot
dtrain <- lgb.Dataset(data = my_data_train,
label = bank_train$y,
categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16))
dtest <- lgb.Dataset.create.valid(dtrain,
data = my_data_test,
label = bank_test$y)
dtrain <- lgb.Dataset(
data = my_data_train
, label = bank_train$y
, categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)
)
dtest <- lgb.Dataset.create.valid(
dtrain
, data = my_data_test
, label = bank_test$y
)
# We can now train a model
model <- lgb.train(list(objective = "binary",
metric = "l2",
min_data = 1,
learning_rate = 0.1,
min_data = 0,
min_hessian = 1,
max_depth = 2),
dtrain,
100,
valids = list(train = dtrain, valid = dtest))
params <- list(
objective = "binary"
, metric = "l2"
, min_data = 1
, learning_rate = 0.1
, min_data = 0
, min_hessian = 1
, max_depth = 2
)
model <- lgb.train(
params = params
, data = dtrain
, nrounds = 100
, valids = list(train = dtrain, valid = dtest)
)
# Try to find split_feature: 11
# If you find it, it means it used a categorical feature in the first tree
......
......@@ -6,30 +6,36 @@ dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
dtest <- lgb.Dataset.create.valid(dtrain, data = agaricus.test$data, label = agaricus.test$label)
nrounds <- 2
param <- list(num_leaves = 4,
learning_rate = 1,
objective = "binary")
param <- list(
num_leaves = 4
, learning_rate = 1
, objective = "binary"
)
print("Running cross validation")
# Do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
lgb.cv(param,
dtrain,
nrounds,
nfold = 5,
eval = "binary_error")
lgb.cv(
param
, dtrain
, nrounds
, nfold = 5
, eval = "binary_error"
)
print("Running cross validation, disable standard deviation display")
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
lgb.cv(param,
dtrain,
nrounds,
nfold = 5,
eval = "binary_error",
showsd = FALSE)
lgb.cv(
param
, dtrain
, nrounds
, nfold = 5
, eval = "binary_error"
, showsd = FALSE
)
# You can also do cross validation with a customized loss function
print("Running cross validation, with customized loss function")
......@@ -48,9 +54,11 @@ evalerror <- function(preds, dtrain) {
}
# train with customized objective
lgb.cv(params = param,
data = dtrain,
nrounds = nrounds,
obj = logregobj,
eval = evalerror,
nfold = 5)
lgb.cv(
params = param
, data = dtrain
, nrounds = nrounds
, obj = logregobj
, eval = evalerror
, nfold = 5
)
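The obj and eval arguments above accept custom functions. Their exact definitions are elided here, so the following is only a generic sketch of the required shapes (hypothetical names; the objective returns gradients and hessians with respect to the raw scores, and the metric returns the name/value/higher_better list also used by the multiclass custom-metric demo below):

my_binary_obj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  probs <- 1 / (1 + exp(-preds))               # raw scores -> probabilities
  grad <- probs - labels                       # gradient of binary logloss
  hess <- probs * (1 - probs)                  # hessian of binary logloss
  return(list(grad = grad, hess = hess))
}
my_binary_error <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- mean(as.numeric(preds > 0) != labels) # preds are raw scores here
  return(list(name = "error", value = err, higher_better = FALSE))
}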
......@@ -11,8 +11,10 @@ dtest <- lgb.Dataset.create.valid(dtrain, data = agaricus.test$data, label = aga
# Note: for customized objective function, we leave objective as default
# Note: what we are getting is margin value in prediction
# You must know what you are doing
param <- list(num_leaves = 4,
learning_rate = 1)
param <- list(
num_leaves = 4
, learning_rate = 1
)
valids <- list(eval = dtest)
num_round <- 20
......@@ -39,10 +41,12 @@ evalerror <- function(preds, dtrain) {
}
print("Start training with early Stopping setting")
bst <- lgb.train(param,
dtrain,
num_round,
valids,
objective = logregobj,
eval = evalerror,
early_stopping_round = 3)
bst <- lgb.train(
param
, dtrain
, num_round
, valids
, objective = logregobj
, eval = evalerror
, early_stopping_round = 3
)
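Once early stopping triggers, the booster records the best iteration (the lgb.cv documentation further down notes "the model will have 'best_iter' field"); a short sketch of using it, assuming the agaricus test data loaded in this demo:

print(bst$best_iter)
pred <- predict(bst, agaricus.test$data, num_iteration = bst$best_iter)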
......@@ -26,9 +26,11 @@ gbm <- list()
for (i in 1:1000) {
print(i)
gbm[[i]] <- lgb.train(params = list(objective = "regression"),
data = data,
1,
reset_data = TRUE)
gbm[[i]] <- lgb.train(
params = list(objective = "regression")
, data = data
, 1
, reset_data = TRUE
)
gc(verbose = FALSE)
}
......@@ -17,32 +17,47 @@ dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
# Third, we setup parameters and we train a model
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
dtrain,
50,
valids,
min_data = 1,
learning_rate = 0.1,
bagging_fraction = 0.1,
bagging_freq = 1,
bagging_seed = 1)
model <- lgb.train(
params
, dtrain
, 50
, valids
, min_data = 1
, learning_rate = 0.1
, bagging_fraction = 0.1
, bagging_freq = 1
, bagging_seed = 1
)
# We create a data.frame with the following structure:
# X = average leaf of the observation throughout all trees
# Y = prediction probability (clamped to [1e-15, 1-1e-15])
# Z = logloss
# binned = binned quantile of average leaf
new_data <- data.frame(X = rowMeans(predict(model,
agaricus.test$data,
predleaf = TRUE)),
Y = pmin(pmax(predict(model,
agaricus.test$data), 1e-15), 1 - 1e-15))
new_data$Z <- -(agaricus.test$label * log(new_data$Y) + (1 - agaricus.test$label) * log(1 - new_data$Y))
new_data$binned <- .bincode(x = new_data$X,
breaks = quantile(x = new_data$X,
probs = (1:9)/10),
right = TRUE,
include.lowest = TRUE)
new_data <- data.frame(
X = rowMeans(predict(
model
, agaricus.test$data
, predleaf = TRUE
))
, Y = pmin(
pmax(
predict(model, agaricus.test$data)
, 1e-15
)
, 1 - 1e-15
)
)
new_data$Z <- -1 * (agaricus.test$label * log(new_data$Y) + (1 - agaricus.test$label) * log(1 - new_data$Y))
new_data$binned <- .bincode(
x = new_data$X
, breaks = quantile(
x = new_data$X
, probs = (1:9) / 10
)
, right = TRUE
, include.lowest = TRUE
)
new_data$binned[is.na(new_data$binned)] <- 0
new_data$binned <- as.factor(new_data$binned)
......@@ -52,31 +67,64 @@ table(new_data$binned)
# We can plot the binned content
# On the second plot, we clearly notice the lower the bin (the lower the leaf value), the higher the loss
# On the third plot, it is smooth!
ggplot(data = new_data, mapping = aes(x = X, y = Y, color = binned)) + geom_point() + theme_bw() + labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
ggplot(data = new_data, mapping = aes(x = binned, y = Z, fill = binned, group = binned)) + geom_boxplot() + theme_bw() + labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
ggplot(data = new_data, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
ggplot(
data = new_data
, mapping = aes(x = X, y = Y, color = binned)
) + geom_point() +
theme_bw() +
labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
ggplot(
data = new_data
, mapping = aes(x = binned, y = Z, fill = binned, group = binned)
) + geom_boxplot() +
theme_bw() +
labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
ggplot(
data = new_data
, mapping = aes(x = Y, y = ..count.., fill = binned)
) + geom_density(position = "fill") +
theme_bw() +
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
# Now, let's show with other parameters
model2 <- lgb.train(params,
dtrain,
100,
valids,
min_data = 1,
learning_rate = 1)
model2 <- lgb.train(
params
, dtrain
, 100
, valids
, min_data = 1
, learning_rate = 1
)
# We create the data structure, but for model2
new_data2 <- data.frame(X = rowMeans(predict(model2,
agaricus.test$data,
predleaf = TRUE)),
Y = pmin(pmax(predict(model2,
agaricus.test$data), 1e-15), 1 - 1e-15))
new_data2$Z <- -(agaricus.test$label * log(new_data2$Y) + (1 - agaricus.test$label) * log(1 - new_data2$Y))
new_data2$binned <- .bincode(x = new_data2$X,
breaks = quantile(x = new_data2$X,
probs = (1:9)/10),
right = TRUE,
include.lowest = TRUE)
new_data2 <- data.frame(
X = rowMeans(predict(
model2
, agaricus.test$data
, predleaf = TRUE
))
, Y = pmin(
pmax(
predict(
model2
, agaricus.test$data
)
, 1e-15
)
, 1 - 1e-15
)
)
new_data2$Z <- -1 * (agaricus.test$label * log(new_data2$Y) + (1 - agaricus.test$label) * log(1 - new_data2$Y))
new_data2$binned <- .bincode(
x = new_data2$X
, breaks = quantile(
x = new_data2$X
, probs = (1:9) / 10
)
, right = TRUE
, include.lowest = TRUE
)
new_data2$binned[is.na(new_data2$binned)] <- 0
new_data2$binned <- as.factor(new_data2$binned)
......@@ -87,31 +135,64 @@ table(new_data2$binned)
# On the second plot, we clearly notice the lower the bin (the lower the leaf value), the higher the loss
# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are real thus it is not an issue
# However, if the rules were not true, the loss would explode.
ggplot(data = new_data2, mapping = aes(x = X, y = Y, color = binned)) + geom_point() + theme_bw() + labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
ggplot(data = new_data2, mapping = aes(x = binned, y = Z, fill = binned, group = binned)) + geom_boxplot() + theme_bw() + labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
ggplot(data = new_data2, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
ggplot(
data = new_data2
, mapping = aes(x = X, y = Y, color = binned)
) + geom_point() +
theme_bw() +
labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
ggplot(
data = new_data2
, mapping = aes(x = binned, y = Z, fill = binned, group = binned)
) + geom_boxplot() +
theme_bw() +
labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
ggplot(
data = new_data2
, mapping = aes(x = Y, y = ..count.., fill = binned)
) + geom_density(position = "fill") +
theme_bw() +
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
# Now, try with very severe overfitting
model3 <- lgb.train(params,
dtrain,
1000,
valids,
min_data = 1,
learning_rate = 1)
model3 <- lgb.train(
params
, dtrain
, 1000
, valids
, min_data = 1
, learning_rate = 1
)
# We create the data structure, but for model3
new_data3 <- data.frame(X = rowMeans(predict(model3,
agaricus.test$data,
predleaf = TRUE)),
Y = pmin(pmax(predict(model3,
agaricus.test$data), 1e-15), 1 - 1e-15))
new_data3$Z <- -(agaricus.test$label * log(new_data3$Y) + (1 - agaricus.test$label) * log(1 - new_data3$Y))
new_data3$binned <- .bincode(x = new_data3$X,
breaks = quantile(x = new_data3$X,
probs = (1:9)/10),
right = TRUE,
include.lowest = TRUE)
new_data3 <- data.frame(
X = rowMeans(predict(
model3
, agaricus.test$data
, predleaf = TRUE
))
, Y = pmin(
pmax(
predict(
model3
, agaricus.test$data
)
, 1e-15
)
, 1 - 1e-15
)
)
new_data3$Z <- -1 * (agaricus.test$label * log(new_data3$Y) + (1 - agaricus.test$label) * log(1 - new_data3$Y))
new_data3$binned <- .bincode(
x = new_data3$X
, breaks = quantile(
x = new_data3$X
, probs = (1:9) / 10
)
, right = TRUE
, include.lowest = TRUE
)
new_data3$binned[is.na(new_data3$binned)] <- 0
new_data3$binned <- as.factor(new_data3$binned)
......@@ -119,9 +200,21 @@ new_data3$binned <- as.factor(new_data3$binned)
table(new_data3$binned)
# We can plot the binned content
# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are real thus it is not an issue.
# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules
# are real thus it is not an issue.
# However, if the rules were not true, the loss would explode. See the sudden spikes?
ggplot(data = new_data3, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
ggplot(
data = new_data3
, mapping = aes(x = Y, y = ..count.., fill = binned)
) +
geom_density(position = "fill") +
theme_bw() +
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
# Compare with our second model, the difference is severe. This is smooth.
ggplot(data = new_data2, mapping = aes(x = Y, y = ..count.., fill = binned)) + geom_density(position = "fill") + theme_bw() + labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
ggplot(
data = new_data2
, mapping = aes(x = Y, y = ..count.., fill = binned)
) + geom_density(position = "fill") +
theme_bw() +
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
......@@ -19,29 +19,33 @@ valids <- list(test = dtest)
# Method 1 of training
params <- list(objective = "multiclass", metric = "multi_error", num_class = 3)
model <- lgb.train(params,
dtrain,
100,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 10)
model <- lgb.train(
params
, dtrain
, 100
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
)
# We can predict on test data, outputs a 90-length vector
# Order: obs1 class1, obs1 class2, obs1 class3, obs2 class1, obs2 class2, obs2 class3...
my_preds <- predict(model, test[, 1:4])
# Method 2 of training, identical
model <- lgb.train(list(),
dtrain,
100,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 10,
objective = "multiclass",
metric = "multi_error",
num_class = 3)
model <- lgb.train(
list()
, dtrain
, 100
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
, objective = "multiclass"
, metric = "multi_error"
, num_class = 3
)
# We can predict on test data, identical
my_preds <- predict(model, test[, 1:4])
......
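Given the per-observation ordering described above (obs1 class1, obs1 class2, obs1 class3, obs2 class1, ...), the flat prediction vector can be reshaped into one row per observation, or predict() can do it directly through the reshape = TRUE flag used in the next demo (editorial sketch):

pred_mat <- matrix(my_preds, ncol = 3, byrow = TRUE)  # rows = observations, columns = classes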
......@@ -20,17 +20,19 @@ valids <- list(train = dtrain, test = dtest)
# Method 1 of training with built-in multiclass objective
# Note: need to turn off boost from average to match custom objective
# (https://github.com/microsoft/LightGBM/issues/1846)
model_builtin <- lgb.train(list(),
dtrain,
boost_from_average = FALSE,
100,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 10,
objective = "multiclass",
metric = "multi_logloss",
num_class = 3)
model_builtin <- lgb.train(
list()
, dtrain
, boost_from_average = FALSE
, 100
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
, objective = "multiclass"
, metric = "multi_logloss"
, num_class = 3
)
preds_builtin <- predict(model_builtin, test[, 1:4], rawscore = TRUE, reshape = TRUE)
probs_builtin <- exp(preds_builtin) / rowSums(exp(preds_builtin))
......@@ -65,21 +67,25 @@ custom_multiclass_metric = function(preds, dtrain) {
preds = preds - apply(preds, 1, max)
prob = exp(preds) / rowSums(exp(preds))
return(list(name = "error",
value = -mean(log(prob[cbind(1:length(labels), labels + 1)])),
higher_better = FALSE))
return(list(
name = "error"
, value = -mean(log(prob[cbind(1:length(labels), labels + 1)]))
, higher_better = FALSE
))
}
model_custom <- lgb.train(list(),
dtrain,
100,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 10,
objective = custom_multiclass_obj,
eval = custom_multiclass_metric,
num_class = 3)
model_custom <- lgb.train(
list()
, dtrain
, 100
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
, objective = custom_multiclass_obj
, eval = custom_multiclass_metric
, num_class = 3
)
preds_custom <- predict(model_custom, test[, 1:4], rawscore = TRUE, reshape = TRUE)
probs_custom <- exp(preds_custom) / rowSums(exp(preds_custom))
......@@ -87,4 +93,3 @@ probs_custom <- exp(preds_custom) / rowSums(exp(preds_custom))
# compare predictions
stopifnot(identical(probs_builtin, probs_custom))
stopifnot(identical(preds_builtin, preds_custom))
......@@ -11,8 +11,8 @@ library(lightgbm)
# - Run 3: sum of weights equal to 6513 (x 1e5) with adjusted regularization (learning)
# Setup small weights
weights1 <- rep(1/100000, 6513)
weights2 <- rep(1/100000, 1611)
weights1 <- rep(1 / 100000, 6513)
weights2 <- rep(1 / 100000, 1611)
# Load data and create datasets
data(agaricus.train, package = "lightgbm")
......@@ -26,40 +26,48 @@ valids <- list(test = dtest)
# Run 1: sum of weights equal to 0.06513 without adjusted regularization (not learning)
# It cannot learn because regularization is too large!
# min_sum_hessian alone is bigger than the sum of weights, thus you will never learn anything
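# (Editorial sketch, not part of this commit.) To make the arithmetic concrete:
# sum(weights1) = 6513 * (1 / 100000) = 0.06513, far below min_sum_hessian = 10,
# so no leaf can ever accumulate enough hessian mass and no split is accepted.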
params <- list(objective = "regression",
metric = "l2",
device = "cpu",
min_sum_hessian = 10,
num_leaves = 7,
max_depth = 3,
nthread = 1)
model <- lgb.train(params,
dtrain,
50,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 10)
params <- list(
objective = "regression"
, metric = "l2"
, device = "cpu"
, min_sum_hessian = 10
, num_leaves = 7
, max_depth = 3
, nthread = 1
)
model <- lgb.train(
params
, dtrain
, 50
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
)
weight_loss <- as.numeric(model$record_evals$test$l2$eval)
plot(weight_loss) # Shows how poor the learning was: a straight line!
# Run 2: sum of weights equal to 0.06513 with adjusted regularization (learning)
# Adjusted regularization just consists of multiplying results by 1e4 (x10000)
# Notice how it learns, there is no issue as we adjusted regularization ourselves
params <- list(objective = "regression",
metric = "l2",
device = "cpu",
min_sum_hessian = 1e-4,
num_leaves = 7,
max_depth = 3,
nthread = 1)
model <- lgb.train(params,
dtrain,
50,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 10)
params <- list(
objective = "regression"
, metric = "l2"
, device = "cpu"
, min_sum_hessian = 1e-4
, num_leaves = 7
, max_depth = 3
, nthread = 1
)
model <- lgb.train(
params
, dtrain
, 50
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
)
small_weight_loss <- as.numeric(model$record_evals$test$l2$eval)
plot(small_weight_loss) # It learns!
......@@ -78,24 +86,28 @@ dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
valids <- list(test = dtest)
# Setup parameters and run model...
params <- list(objective = "regression",
metric = "l2",
device = "cpu",
min_sum_hessian = 10,
num_leaves = 7,
max_depth = 3,
nthread = 1)
model <- lgb.train(params,
dtrain,
50,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 10)
params <- list(
objective = "regression"
, metric = "l2"
, device = "cpu"
, min_sum_hessian = 10
, num_leaves = 7
, max_depth = 3
, nthread = 1
)
model <- lgb.train(
params
, dtrain
, 50
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
)
large_weight_loss <- as.numeric(model$record_evals$test$l2$eval)
plot(large_weight_loss) # It learns!
# Do you want to compare the learning? They both converge.
plot(small_weight_loss, large_weight_loss)
curve(1*x, from = 0, to = 0.02, add = TRUE)
curve(1 * x, from = 0, to = 0.02, add = TRUE)
......@@ -12,9 +12,9 @@ getinfo(dataset, ...)
\arguments{
\item{dataset}{Object of class \code{lgb.Dataset}}
\item{...}{other parameters}
\item{name}{the name of the information field to get (see details)}
\item{...}{other parameters}
}
\value{
info data
......
......@@ -4,9 +4,16 @@
\alias{lgb.Dataset}
\title{Construct \code{lgb.Dataset} object}
\usage{
lgb.Dataset(data, params = list(), reference = NULL, colnames = NULL,
categorical_feature = NULL, free_raw_data = TRUE, info = list(),
...)
lgb.Dataset(
data,
params = list(),
reference = NULL,
colnames = NULL,
categorical_feature = NULL,
free_raw_data = TRUE,
info = list(),
...
)
}
\arguments{
\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename}
......
......@@ -4,13 +4,29 @@
\alias{lgb.cv}
\title{Main CV logic for LightGBM}
\usage{
lgb.cv(params = list(), data, nrounds = 10, nfold = 3,
label = NULL, weight = NULL, obj = NULL, eval = NULL,
verbose = 1, record = TRUE, eval_freq = 1L, showsd = TRUE,
stratified = TRUE, folds = NULL, init_model = NULL,
colnames = NULL, categorical_feature = NULL,
early_stopping_rounds = NULL, callbacks = list(),
reset_data = FALSE, ...)
lgb.cv(
params = list(),
data,
nrounds = 10,
nfold = 3,
label = NULL,
weight = NULL,
obj = NULL,
eval = NULL,
verbose = 1,
record = TRUE,
eval_freq = 1L,
showsd = TRUE,
stratified = TRUE,
folds = NULL,
init_model = NULL,
colnames = NULL,
categorical_feature = NULL,
early_stopping_rounds = NULL,
callbacks = list(),
reset_data = FALSE,
...
)
}
\arguments{
\item{params}{List of parameters}
......@@ -27,7 +43,7 @@ lgb.cv(params = list(), data, nrounds = 10, nfold = 3,
\item{obj}{objective function, can be character or custom objective function. Examples include
\code{regression}, \code{regression_l1}, \code{huber},
\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
\code{binary}, \code{lambdarank}, \code{multiclass}, \code{multiclass}}
\item{eval}{evaluation function, can be (list of) character or custom eval function}
......@@ -54,17 +70,15 @@ the \code{nfold} and \code{stratified} parameters are ignored.}
type int represents index,
type str represents feature names}
\item{early_stopping_rounds}{int
Activates early stopping.
Requires at least one validation data and one metric
If there's more than one, will check all of them except the training data
Returns the model with (best_iter + early_stopping_rounds)
If early stopping occurs, the model will have 'best_iter' field}
\item{early_stopping_rounds}{int. Activates early stopping. Requires at least one validation data
and one metric. If there's more than one, will check all of them
except the training data. Returns the model with (best_iter + early_stopping_rounds).
If early stopping occurs, the model will have 'best_iter' field.}
\item{callbacks}{list of callback functions
List of callback functions that are applied at each iteration.}
\item{callbacks}{List of callback functions that are applied at each iteration.}
\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model into a predictor model which frees up memory and the original datasets}
\item{reset_data}{Boolean, setting it to TRUE (not the default value) will transform the booster model
into a predictor model which frees up memory and the original datasets}
\item{...}{other parameters, see Parameters.rst for more information. A few key parameters:
\itemize{
......@@ -89,11 +103,13 @@ data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
params <- list(objective = "regression", metric = "l2")
model <- lgb.cv(params,
dtrain,
10,
nfold = 3,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 5)
model <- lgb.cv(
params = params
, data = dtrain
, nrounds = 10
, nfold = 3
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 5
)
}
......@@ -27,13 +27,15 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
dtrain,
10,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 5)
model <- lgb.train(
params = params
, data = dtrain
, nrounds = 10
, valids = valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 5
)
json_model <- lgb.dump(model)
}
......@@ -4,8 +4,13 @@
\alias{lgb.get.eval.result}
\title{Get record evaluation result from booster}
\usage{
lgb.get.eval.result(booster, data_name, eval_name, iters = NULL,
is_err = FALSE)
lgb.get.eval.result(
booster,
data_name,
eval_name,
iters = NULL,
is_err = FALSE
)
}
\arguments{
\item{booster}{Object of class \code{lgb.Booster}}
......@@ -34,12 +39,14 @@ test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
params <- list(objective = "regression", metric = "l2")
valids <- list(test = dtest)
model <- lgb.train(params,
dtrain,
10,
valids,
min_data = 1,
learning_rate = 1,
early_stopping_rounds = 5)
model <- lgb.train(
params = params
, data = dtrain
, nrounds = 10
, valids = valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 5
)
lgb.get.eval.result(model, "test", "l2")
}
......@@ -29,9 +29,14 @@ data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
params <- list(objective = "binary",
learning_rate = 0.01, num_leaves = 63, max_depth = -1,
min_data_in_leaf = 1, min_sum_hessian_in_leaf = 1)
params <- list(
objective = "binary"
, learning_rate = 0.01
, num_leaves = 63
, max_depth = -1
, min_data_in_leaf = 1
, min_sum_hessian_in_leaf = 1
)
model <- lgb.train(params, dtrain, 10)
tree_imp1 <- lgb.importance(model, percentage = TRUE)
......
......@@ -11,7 +11,7 @@ lgb.interprete(model, data, idxset, num_iteration = NULL)
\item{data}{a matrix object or a dgCMatrix object.}
\item{idxset}{a integer vector of indices of rows needed.}
\item{idxset}{an integer vector of indices of rows needed.}
\item{num_iteration}{number of iterations to predict with; NULL or <= 0 means use the best iteration.}
}
......