Commit 1f7b06b9 authored by James Lamb, committed by Nikita Titov

[R-package][ci] Added more R linters (fixes #2477) (#2533)



* Added more linters on R code

* started working on implicit integers

* finished style changes to handle implicit integers

* regenned documentation and added concatenation linter

* changed channel for r-lintr

* try building stringi before lintr

* trying to get libicui18n

* trying another thing

* trying conda-forge again

* added re-install of stringi

* uncommented other stages

* Update .ci/test.sh
Co-Authored-By: Nikita Titov <nekit94-08@mail.ru>

* removed apt update and changed lintr version floor

* get lintr from CRAN

* R needs to come before C++ linting

* testing lintr install from CRAN

* trying one more thing

* more verbose

* order might matter

* removed commented code

* cleaned up linting block in test.sh

* grouped conda install calls and fixed a few integer array things
parent 11f9682b
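
The bulk of the diff below is mechanical fallout from the new linters: bare numeric literals become explicit integers (10 -> 10L) or explicit doubles (1 -> 1.0), top-level = assignment becomes <-, and 1:n constructs become seq_len()/seq_along(). As a rough sketch of the kind of check the CI scripts touched here (e.g. .ci/test.sh) end up driving — assuming a recent {lintr} release; the linter list and invocation are illustrative, not copied from this PR:

library(lintr)

# Hypothetical subset of the linters this PR enables (names follow current {lintr})
LINTERS_TO_USE <- list(
    lintr::assignment_linter()         # require '<-' for top-level assignment
    , lintr::implicit_integer_linter() # flag '10' where '10L' is intended
    , lintr::seq_linter()              # flag '1:length(x)' in favor of seq_len()/seq_along()
)

# Lint every R file in the package and collect the results;
# a CI script would fail the build when this list is non-empty
results <- lintr::lint_dir(path = "R-package", linters = LINTERS_TO_USE)
print(results)
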
@@ -20,11 +20,11 @@
 #' model <- lgb.train(
 #' params = params
 #' , data = dtrain
-#' , nrounds = 10
+#' , nrounds = 10L
 #' , valids = valids
-#' , min_data = 1
+#' , min_data = 1L
-#' , learning_rate = 1
+#' , learning_rate = 1.0
-#' , early_stopping_rounds = 5
+#' , early_stopping_rounds = 5L
 #' )
 #' saveRDS.lgb.Booster(model, "model.rds")
 #' new_model <- readRDS.lgb.Booster("model.rds")
...
@@ -31,11 +31,11 @@
 #' model <- lgb.train(
 #' params = params
 #' , data = dtrain
-#' , nrounds = 10
+#' , nrounds = 10L
 #' , valids = valids
-#' , min_data = 1
+#' , min_data = 1L
-#' , learning_rate = 1
+#' , learning_rate = 1.0
-#' , early_stopping_rounds = 5
+#' , early_stopping_rounds = 5L
 #' )
 #' saveRDS.lgb.Booster(model, "model.rds")
 #' @export
...
@@ -81,7 +81,7 @@ lgb.call <- function(fun_name, ret, ...) {
 lgb.call.return.str <- function(fun_name, ...) {
 # Create buffer
-buf_len <- as.integer(1024 * 1024)
+buf_len <- as.integer(1024L * 1024L)
 act_len <- 0L
 buf <- raw(buf_len)
@@ -115,7 +115,7 @@ lgb.params2str <- function(params, ...) {
 names(dot_params) <- gsub("\\.", "_", names(dot_params))
 # Check for identical parameters
-if (length(intersect(names(params), names(dot_params))) > 0) {
+if (length(intersect(names(params), names(dot_params))) > 0L) {
 stop(
 "Same parameters in "
 , sQuote("params")
@@ -136,7 +136,7 @@ lgb.params2str <- function(params, ...) {
 # Join multi value first
 val <- paste0(format(params[[key]], scientific = FALSE), collapse = ",")
-if (nchar(val) <= 0) next # Skip join
+if (nchar(val) <= 0L) next # Skip join
 # Join key value
 pair <- paste0(c(key, val), collapse = "=")
@@ -145,7 +145,7 @@ lgb.params2str <- function(params, ...) {
 }
 # Check ret length
-if (length(ret) == 0) {
+if (length(ret) == 0L) {
 # Return empty string
 lgb.c_str("")
@@ -163,7 +163,7 @@ lgb.c_str <- function(x) {
 # Perform character to raw conversion
 ret <- charToRaw(as.character(x))
-ret <- c(ret, as.raw(0))
+ret <- c(ret, as.raw(0L))
 ret
 }
...
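
The changes above apply the same implicit-integer rule to internal code: comparisons like > 0 become > 0L and literals like 1024 become 1024L. For readers new to R's literal types, a short base-R illustration (plain R, nothing LightGBM-specific) of why the distinction is real:

is.integer(1024)       # FALSE -- a bare numeric literal is a double
is.integer(1024L)      # TRUE  -- the L suffix makes a true integer
identical(0, 0L)       # FALSE -- same value, different type
typeof(1024L * 1024L)  # "integer"; 1048576 fits well under .Machine$integer.max
# Integer arithmetic that overflows warns and yields NA, unlike double arithmetic:
# .Machine$integer.max + 1L  # NA, with an "NAs produced by integer overflow" warning
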
@@ -20,9 +20,9 @@ print("Training lightgbm with sparseMatrix")
 bst <- lightgbm(
 data = train$data
 , label = train$label
-, num_leaves = 4
+, num_leaves = 4L
-, learning_rate = 1
+, learning_rate = 1.0
-, nrounds = 2
+, nrounds = 2L
 , objective = "binary"
 )
@@ -31,9 +31,9 @@ print("Training lightgbm with Matrix")
 bst <- lightgbm(
 data = as.matrix(train$data)
 , label = train$label
-, num_leaves = 4
+, num_leaves = 4L
-, learning_rate = 1
+, learning_rate = 1.0
-, nrounds = 2
+, nrounds = 2L
 , objective = "binary"
 )
@@ -45,9 +45,9 @@ dtrain <- lgb.Dataset(
 )
 bst <- lightgbm(
 data = dtrain
-, num_leaves = 4
+, num_leaves = 4L
-, learning_rate = 1
+, learning_rate = 1.0
-, nrounds = 2
+, nrounds = 2L
 , objective = "binary"
 )
@@ -55,42 +55,42 @@ bst <- lightgbm(
 print("Train lightgbm with verbose 0, no message")
 bst <- lightgbm(
 data = dtrain
-, num_leaves = 4
+, num_leaves = 4L
-, learning_rate = 1
+, learning_rate = 1.0
-, nrounds = 2
+, nrounds = 2L
 , objective = "binary"
-, verbose = 0
+, verbose = 0L
 )
 print("Train lightgbm with verbose 1, print evaluation metric")
 bst <- lightgbm(
 data = dtrain
-, num_leaves = 4
+, num_leaves = 4L
-, learning_rate = 1
+, learning_rate = 1.0
-, nrounds = 2
+, nrounds = 2L
-, nthread = 2
+, nthread = 2L
 , objective = "binary"
-, verbose = 1
+, verbose = 1L
 )
 print("Train lightgbm with verbose 2, also print information about tree")
 bst <- lightgbm(
 data = dtrain
-, num_leaves = 4
+, num_leaves = 4L
-, learning_rate = 1
+, learning_rate = 1.0
-, nrounds = 2
+, nrounds = 2L
-, nthread = 2
+, nthread = 2L
 , objective = "binary"
-, verbose = 2
+, verbose = 2L
 )
 # You can also specify data as file path to a LibSVM/TCV/CSV format input
 # Since we do not have this file with us, the following line is just for illustration
 # bst <- lightgbm(
 # data = "agaricus.train.svm"
-# , num_leaves = 4
+# , num_leaves = 4L
-# , learning_rate = 1
+# , learning_rate = 1.0
-# , nrounds = 2
+# , nrounds = 2L
 # , objective = "binary"
 # )
@@ -126,11 +126,11 @@ valids <- list(train = dtrain, test = dtest)
 print("Train lightgbm using lgb.train with valids")
 bst <- lgb.train(
 data = dtrain
-, num_leaves = 4
+, num_leaves = 4L
-, learning_rate = 1
+, learning_rate = 1.0
-, nrounds = 2
+, nrounds = 2L
 , valids = valids
-, nthread = 2
+, nthread = 2L
 , objective = "binary"
 )
@@ -138,12 +138,12 @@ bst <- lgb.train(
 print("Train lightgbm using lgb.train with valids, watch logloss and error")
 bst <- lgb.train(
 data = dtrain
-, num_leaves = 4
+, num_leaves = 4L
-, learning_rate = 1
+, learning_rate = 1.0
-, nrounds = 2
+, nrounds = 2L
 , valids = valids
 , eval = c("binary_error", "binary_logloss")
-, nthread = 2
+, nthread = 2L
 , objective = "binary"
 )
@@ -154,16 +154,16 @@ lgb.Dataset.save(dtrain, "dtrain.buffer")
 dtrain2 <- lgb.Dataset("dtrain.buffer")
 bst <- lgb.train(
 data = dtrain2
-, num_leaves = 4
+, num_leaves = 4L
-, learning_rate = 1
+, learning_rate = 1.0
-, nrounds = 2
+, nrounds = 2L
 , valids = valids
-, nthread = 2
+, nthread = 2L
 , objective = "binary"
 )
 # information can be extracted from lgb.Dataset using getinfo
-label = getinfo(dtest, "label")
+label <- getinfo(dtest, "label")
 pred <- predict(bst, test$data)
 err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label)
 print(paste("test-error=", err))
@@ -14,12 +14,12 @@ print("Start running example to start from an initial prediction")
 # Train lightgbm for 1 round
 param <- list(
-num_leaves = 4
+num_leaves = 4L
-, learning_rate = 1
+, learning_rate = 1.0
-, nthread = 2
+, nthread = 2L
 , objective = "binary"
 )
-bst <- lgb.train(param, dtrain, 1, valids = valids)
+bst <- lgb.train(param, dtrain, 1L, valids = valids)
 # Note: we need the margin value instead of transformed prediction in set_init_score
 ptrain <- predict(bst, agaricus.train$data, rawscore = TRUE)
@@ -34,6 +34,6 @@ print("This is result of boost from initial prediction")
 bst <- lgb.train(
 params = param
 , data = dtrain
-, nrounds = 5
+, nrounds = 5L
 , valids = valids
 )
@@ -53,36 +53,36 @@ bank <- lgb.prepare(data = bank)
 str(bank)
 # Remove 1 to label because it must be between 0 and 1
-bank$y <- bank$y - 1
+bank$y <- bank$y - 1L
 # Data input to LightGBM must be a matrix, without the label
-my_data <- as.matrix(bank[, 1:16, with = FALSE])
+my_data <- as.matrix(bank[, 1L:16L, with = FALSE])
 # Creating the LightGBM dataset with categorical features
 # The categorical features must be indexed like in R (1-indexed, not 0-indexed)
 lgb_data <- lgb.Dataset(
 data = my_data
 , label = bank$y
-, categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)
+, categorical_feature = c(2L, 3L, 4L, 5L, 7L, 8L, 9L, 11L, 16L)
 )
 # We can now train a model
 params <- list(
 objective = "binary"
 , metric = "l2"
-, min_data = 1
+, min_data = 1L
 , learning_rate = 0.1
-, min_data = 0
+, min_data = 0L
-, min_hessian = 1
+, min_hessian = 1.0
-, max_depth = 2
+, max_depth = 2L
 )
 model <- lgb.train(
 params = params
 , data = lgb_data
-, nrounds = 100
+, nrounds = 100L
 , valids = list(train = lgb_data)
 )
 # Try to find split_feature: 2
 # If you find it, it means it used a categorical feature in the first tree
-lgb.dump(model, num_iteration = 1)
+lgb.dump(model, num_iteration = 1L)
@@ -28,8 +28,8 @@ data(bank, package = "lightgbm")
 str(bank)
 # We are dividing the dataset into two: one train, one validation
-bank_train <- bank[1:4000, ]
+bank_train <- bank[1L:4000L, ]
-bank_test <- bank[4001:4521, ]
+bank_test <- bank[4001L:4521L, ]
 # We must now transform the data to fit in LightGBM
 # For this task, we use lgb.prepare
@@ -59,19 +59,19 @@ bank_test <- lgb.prepare_rules(data = bank_test, rules = bank_rules$rules)$data
 str(bank_test)
 # Remove 1 to label because it must be between 0 and 1
-bank_train$y <- bank_train$y - 1
+bank_train$y <- bank_train$y - 1L
-bank_test$y <- bank_test$y - 1
+bank_test$y <- bank_test$y - 1L
 # Data input to LightGBM must be a matrix, without the label
-my_data_train <- as.matrix(bank_train[, 1:16, with = FALSE])
+my_data_train <- as.matrix(bank_train[, 1L:16L, with = FALSE])
-my_data_test <- as.matrix(bank_test[, 1:16, with = FALSE])
+my_data_test <- as.matrix(bank_test[, 1L:16L, with = FALSE])
 # Creating the LightGBM dataset with categorical features
 # The categorical features can be passed to lgb.train to not copy and paste a lot
 dtrain <- lgb.Dataset(
 data = my_data_train
 , label = bank_train$y
-, categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)
+, categorical_feature = c(2L, 3L, 4L, 5L, 7L, 8L, 9L, 11L, 16L)
 )
 dtest <- lgb.Dataset.create.valid(
 dtrain
@@ -83,19 +83,19 @@ dtest <- lgb.Dataset.create.valid(
 params <- list(
 objective = "binary"
 , metric = "l2"
-, min_data = 1
+, min_data = 1L
 , learning_rate = 0.1
-, min_data = 0
+, min_data = 0L
-, min_hessian = 1
+, min_hessian = 1.0
-, max_depth = 2
+, max_depth = 2L
 )
 model <- lgb.train(
 params = params
 , data = dtrain
-, nrounds = 100
+, nrounds = 100L
 , valids = list(train = dtrain, valid = dtest)
 )
 # Try to find split_feature: 11
 # If you find it, it means it used a categorical feature in the first tree
-lgb.dump(model, num_iteration = 1)
+lgb.dump(model, num_iteration = 1L)
@@ -5,10 +5,10 @@ data(agaricus.test, package = "lightgbm")
 dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
 dtest <- lgb.Dataset.create.valid(dtrain, data = agaricus.test$data, label = agaricus.test$label)
-nrounds <- 2
+nrounds <- 2L
 param <- list(
-num_leaves = 4
+num_leaves = 4L
-, learning_rate = 1
+, learning_rate = 1.0
 , objective = "binary"
 )
@@ -20,7 +20,7 @@ lgb.cv(
 param
 , dtrain
 , nrounds
-, nfold = 5
+, nfold = 5L
 , eval = "binary_error"
 )
@@ -32,7 +32,7 @@ lgb.cv(
 param
 , dtrain
 , nrounds
-, nfold = 5
+, nfold = 5L
 , eval = "binary_error"
 , showsd = FALSE
 )
@@ -42,14 +42,14 @@ print("Running cross validation, with cutomsized loss function")
 logregobj <- function(preds, dtrain) {
 labels <- getinfo(dtrain, "label")
-preds <- 1 / (1 + exp(-preds))
+preds <- 1.0 / (1.0 + exp(-preds))
 grad <- preds - labels
-hess <- preds * (1 - preds)
+hess <- preds * (1.0 - preds)
 return(list(grad = grad, hess = hess))
 }
 evalerror <- function(preds, dtrain) {
 labels <- getinfo(dtrain, "label")
-err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
+err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
 return(list(name = "error", value = err, higher_better = FALSE))
 }
@@ -60,5 +60,5 @@ lgb.cv(
 , nrounds = nrounds
 , obj = logregobj
 , eval = evalerror
-, nfold = 5
+, nfold = 5L
 )
@@ -12,19 +12,19 @@ dtest <- lgb.Dataset.create.valid(dtrain, data = agaricus.test$data, label = aga
 # Note: what we are getting is margin value in prediction
 # You must know what you are doing
 param <- list(
-num_leaves = 4
+num_leaves = 4L
-, learning_rate = 1
+, learning_rate = 1.0
 )
 valids <- list(eval = dtest)
-num_round <- 20
+num_round <- 20L
 # User define objective function, given prediction, return gradient and second order gradient
 # This is loglikelihood loss
 logregobj <- function(preds, dtrain) {
 labels <- getinfo(dtrain, "label")
-preds <- 1 / (1 + exp(-preds))
+preds <- 1.0 / (1.0 + exp(-preds))
 grad <- preds - labels
-hess <- preds * (1 - preds)
+hess <- preds * (1.0 - preds)
 return(list(grad = grad, hess = hess))
 }
@@ -48,5 +48,5 @@ bst <- lgb.train(
 , valids
 , objective = logregobj
 , eval = evalerror
-, early_stopping_round = 3
+, early_stopping_round = 3L
 )
@@ -12,9 +12,9 @@
 library(lightgbm)
 # Generate fictive data of size 1M x 100
-set.seed(11111)
+set.seed(11111L)
-x_data <- matrix(rnorm(n = 100000000, mean = 0, sd = 100), nrow = 1000000, ncol = 100)
+x_data <- matrix(rnorm(n = 100000000L, mean = 0.0, sd = 100.0), nrow = 1000000L, ncol = 100L)
-y_data <- rnorm(n = 1000000, mean = 0, sd = 5)
+y_data <- rnorm(n = 1000000L, mean = 0.0, sd = 5.0)
 # Create lgb.Dataset for training
 data <- lgb.Dataset(x_data, label = y_data)
@@ -24,12 +24,12 @@ data$construct()
 # It MUST remain constant (if not increasing very slightly)
 gbm <- list()
-for (i in 1:1000) {
+for (i in 1L:1000L) {
 print(i)
 gbm[[i]] <- lgb.train(
 params = list(objective = "regression")
 , data = data
-, 1
+, 1L
 , reset_data = TRUE
 )
 gc(verbose = FALSE)
...
@@ -20,13 +20,13 @@ valids <- list(test = dtest)
 model <- lgb.train(
 params
 , dtrain
-, 50
+, 50L
 , valids
-, min_data = 1
+, min_data = 1L
 , learning_rate = 0.1
 , bagging_fraction = 0.1
-, bagging_freq = 1
+, bagging_freq = 1L
-, bagging_seed = 1
+, bagging_seed = 1L
 )
 # We create a data.frame with the following structure:
@@ -45,20 +45,20 @@ new_data <- data.frame(
 predict(model, agaricus.test$data)
 , 1e-15
 )
-, 1 - 1e-15
+, 1.0 - 1e-15
 )
 )
-new_data$Z <- -1 * (agaricus.test$label * log(new_data$Y) + (1 - agaricus.test$label) * log(1 - new_data$Y))
+new_data$Z <- -1.0 * (agaricus.test$label * log(new_data$Y) + (1L - agaricus.test$label) * log(1L - new_data$Y))
 new_data$binned <- .bincode(
 x = new_data$X
 , breaks = quantile(
 x = new_data$X
-, probs = (1:9) / 10
+, probs = seq_len(9L) / 10.0
 )
 , right = TRUE
 , include.lowest = TRUE
 )
-new_data$binned[is.na(new_data$binned)] <- 0
+new_data$binned[is.na(new_data$binned)] <- 0L
 new_data$binned <- as.factor(new_data$binned)
 # We can check the binned content
@@ -91,10 +91,10 @@ ggplot(
 model2 <- lgb.train(
 params
 , dtrain
-, 100
+, 100L
 , valids
-, min_data = 1
+, min_data = 1L
-, learning_rate = 1
+, learning_rate = 1.0
 )
 # We create the data structure, but for model2
@@ -112,20 +112,20 @@ new_data2 <- data.frame(
 )
 , 1e-15
 )
-, 1 - 1e-15
+, 1.0 - 1e-15
 )
 )
-new_data2$Z <- -1 * (agaricus.test$label * log(new_data2$Y) + (1 - agaricus.test$label) * log(1 - new_data2$Y))
+new_data2$Z <- -1.0 * (agaricus.test$label * log(new_data2$Y) + (1L - agaricus.test$label) * log(1L - new_data2$Y))
 new_data2$binned <- .bincode(
 x = new_data2$X
 , breaks = quantile(
 x = new_data2$X
-, probs = (1:9) / 10
+, probs = seq_len(9L) / 10.0
 )
 , right = TRUE
 , include.lowest = TRUE
 )
-new_data2$binned[is.na(new_data2$binned)] <- 0
+new_data2$binned[is.na(new_data2$binned)] <- 0L
 new_data2$binned <- as.factor(new_data2$binned)
 # We can check the binned content
@@ -133,7 +133,8 @@ table(new_data2$binned)
 # We can plot the binned content
 # On the second plot, we clearly notice the lower the bin (the lower the leaf value), the higher the loss
-# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are real thus it is not an issue
+# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are
+# real thus it is not an issue
 # However, if the rules were not true, the loss would explode.
 ggplot(
 data = new_data2
@@ -159,10 +160,10 @@ ggplot(
 model3 <- lgb.train(
 params
 , dtrain
-, 1000
+, 1000L
 , valids
-, min_data = 1
+, min_data = 1L
-, learning_rate = 1
+, learning_rate = 1.0
 )
 # We create the data structure, but for model3
@@ -180,20 +181,20 @@ new_data3 <- data.frame(
 )
 , 1e-15
 )
-, 1 - 1e-15
+, 1.0 - 1e-15
 )
 )
-new_data3$Z <- -1 * (agaricus.test$label * log(new_data3$Y) + (1 - agaricus.test$label) * log(1 - new_data3$Y))
+new_data3$Z <- -1.0 * (agaricus.test$label * log(new_data3$Y) + (1L - agaricus.test$label) * log(1L - new_data3$Y))
 new_data3$binned <- .bincode(
 x = new_data3$X
 , breaks = quantile(
 x = new_data3$X
-, probs = (1:9) / 10
+, probs = seq_len(9L) / 10.0
 )
 , right = TRUE
 , include.lowest = TRUE
 )
-new_data3$binned[is.na(new_data3$binned)] <- 0
+new_data3$binned[is.na(new_data3$binned)] <- 0L
 new_data3$binned <- as.factor(new_data3$binned)
 # We can check the binned content
...
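
The probs = seq_len(9L) / 10.0 rewrites above come from the seq linter. With a constant bound like 9 the change is purely stylistic, but the rule exists because 1:n silently counts down when n is 0, as this plain-R sketch shows:

x <- character(0)
1:length(x)         # c(1L, 0L) -- counts down for empty input, a classic off-by-one bug
seq_len(length(x))  # integer(0) -- an empty sequence, as intended
seq_along(x)        # integer(0) -- equivalent here, and clearer when indexing x
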
@@ -6,65 +6,65 @@ data(iris)
 # We must convert factors to numeric
 # They must be starting from number 0 to use multiclass
 # For instance: 0, 1, 2, 3, 4, 5...
-iris$Species <- as.numeric(as.factor(iris$Species)) - 1
+iris$Species <- as.numeric(as.factor(iris$Species)) - 1L
 # We cut the data set into 80% train and 20% validation
 # The 10 last samples of each class are for validation
-train <- as.matrix(iris[c(1:40, 51:90, 101:140), ])
+train <- as.matrix(iris[c(1L:40L, 51L:90L, 101L:140L), ])
-test <- as.matrix(iris[c(41:50, 91:100, 141:150), ])
+test <- as.matrix(iris[c(41L:50L, 91L:100L, 141L:150L), ])
-dtrain <- lgb.Dataset(data = train[, 1:4], label = train[, 5])
+dtrain <- lgb.Dataset(data = train[, 1L:4L], label = train[, 5L])
-dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1:4], label = test[, 5])
+dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1L:4L], label = test[, 5L])
 valids <- list(test = dtest)
 # Method 1 of training
-params <- list(objective = "multiclass", metric = "multi_error", num_class = 3)
+params <- list(objective = "multiclass", metric = "multi_error", num_class = 3L)
 model <- lgb.train(
 params
 , dtrain
-, 100
+, 100L
 , valids
-, min_data = 1
+, min_data = 1L
-, learning_rate = 1
+, learning_rate = 1.0
-, early_stopping_rounds = 10
+, early_stopping_rounds = 10L
 )
 # We can predict on test data, outputs a 90-length vector
 # Order: obs1 class1, obs1 class2, obs1 class3, obs2 class1, obs2 class2, obs2 class3...
-my_preds <- predict(model, test[, 1:4])
+my_preds <- predict(model, test[, 1L:4L])
 # Method 2 of training, identical
 model <- lgb.train(
 list()
 , dtrain
-, 100
+, 100L
 , valids
-, min_data = 1
+, min_data = 1L
-, learning_rate = 1
+, learning_rate = 1.0
-, early_stopping_rounds = 10
+, early_stopping_rounds = 10L
 , objective = "multiclass"
 , metric = "multi_error"
-, num_class = 3
+, num_class = 3L
 )
 # We can predict on test data, identical
-my_preds <- predict(model, test[, 1:4])
+my_preds <- predict(model, test[, 1L:4L])
 # A (30x3) matrix with the predictions, use parameter reshape
 # class1 class2 class3
 # obs1 obs1 obs1
 # obs2 obs2 obs2
 # .... .... ....
-my_preds <- predict(model, test[, 1:4], reshape = TRUE)
+my_preds <- predict(model, test[, 1L:4L], reshape = TRUE)
 # We can also get the predicted scores before the Sigmoid/Softmax application
-my_preds <- predict(model, test[, 1:4], rawscore = TRUE)
+my_preds <- predict(model, test[, 1L:4L], rawscore = TRUE)
 # Raw score predictions as matrix instead of vector
-my_preds <- predict(model, test[, 1:4], rawscore = TRUE, reshape = TRUE)
+my_preds <- predict(model, test[, 1L:4L], rawscore = TRUE, reshape = TRUE)
 # We can also get the leaf index
-my_preds <- predict(model, test[, 1:4], predleaf = TRUE)
+my_preds <- predict(model, test[, 1L:4L], predleaf = TRUE)
 # Predict leaf index as matrix instead of vector
-my_preds <- predict(model, test[, 1:4], predleaf = TRUE, reshape = TRUE)
+my_preds <- predict(model, test[, 1L:4L], predleaf = TRUE, reshape = TRUE)
@@ -6,15 +6,15 @@ data(iris)
 # We must convert factors to numeric
 # They must be starting from number 0 to use multiclass
 # For instance: 0, 1, 2, 3, 4, 5...
-iris$Species <- as.numeric(as.factor(iris$Species)) - 1
+iris$Species <- as.numeric(as.factor(iris$Species)) - 1L
 # Create imbalanced training data (20, 30, 40 examples for classes 0, 1, 2)
-train <- as.matrix(iris[c(1:20, 51:80, 101:140), ])
+train <- as.matrix(iris[c(1L:20L, 51L:80L, 101L:140L), ])
 # The 10 last samples of each class are for validation
-test <- as.matrix(iris[c(41:50, 91:100, 141:150), ])
+test <- as.matrix(iris[c(41L:50L, 91L:100L, 141L:150L), ])
-dtrain <- lgb.Dataset(data = train[, 1:4], label = train[, 5])
+dtrain <- lgb.Dataset(data = train[, 1L:4L], label = train[, 5L])
-dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1:4], label = test[, 5])
+dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1L:4L], label = test[, 5L])
 valids <- list(train = dtrain, test = dtest)
 # Method 1 of training with built-in multiclass objective
@@ -24,52 +24,52 @@ model_builtin <- lgb.train(
 list()
 , dtrain
 , boost_from_average = FALSE
-, 100
+, 100L
 , valids
-, min_data = 1
+, min_data = 1L
-, learning_rate = 1
+, learning_rate = 1.0
-, early_stopping_rounds = 10
+, early_stopping_rounds = 10L
 , objective = "multiclass"
 , metric = "multi_logloss"
-, num_class = 3
+, num_class = 3L
 )
-preds_builtin <- predict(model_builtin, test[, 1:4], rawscore = TRUE, reshape = TRUE)
+preds_builtin <- predict(model_builtin, test[, 1L:4L], rawscore = TRUE, reshape = TRUE)
 probs_builtin <- exp(preds_builtin) / rowSums(exp(preds_builtin))
 # Method 2 of training with custom objective function
 # User defined objective function, given prediction, return gradient and second order gradient
-custom_multiclass_obj = function(preds, dtrain) {
+custom_multiclass_obj <- function(preds, dtrain) {
-labels = getinfo(dtrain, "label")
+labels <- getinfo(dtrain, "label")
 # preds is a matrix with rows corresponding to samples and colums corresponding to choices
-preds = matrix(preds, nrow = length(labels))
+preds <- matrix(preds, nrow = length(labels))
 # to prevent overflow, normalize preds by row
-preds = preds - apply(preds, 1, max)
+preds <- preds - apply(preds, 1L, max)
-prob = exp(preds) / rowSums(exp(preds))
+prob <- exp(preds) / rowSums(exp(preds))
 # compute gradient
-grad = prob
+grad <- prob
-grad[cbind(1:length(labels), labels + 1)] = grad[cbind(1:length(labels), labels + 1)] - 1
+grad[cbind(seq_len(length(labels)), labels + 1L)] <- grad[cbind(seq_len(length(labels)), labels + 1L)] - 1L
 # compute hessian (approximation)
-hess = 2 * prob * (1 - prob)
+hess <- 2.0 * prob * (1.0 - prob)
 return(list(grad = grad, hess = hess))
 }
 # define custom metric
-custom_multiclass_metric = function(preds, dtrain) {
+custom_multiclass_metric <- function(preds, dtrain) {
-labels = getinfo(dtrain, "label")
+labels <- getinfo(dtrain, "label")
-preds = matrix(preds, nrow = length(labels))
+preds <- matrix(preds, nrow = length(labels))
-preds = preds - apply(preds, 1, max)
+preds <- preds - apply(preds, 1L, max)
-prob = exp(preds) / rowSums(exp(preds))
+prob <- exp(preds) / rowSums(exp(preds))
 return(list(
 name = "error"
-, value = -mean(log(prob[cbind(1:length(labels), labels + 1)]))
+, value = -mean(log(prob[cbind(seq_len(length(labels)), labels + 1L)]))
 , higher_better = FALSE
 ))
 }
@@ -77,17 +77,17 @@ custom_multiclass_metric = function(preds, dtrain) {
 model_custom <- lgb.train(
 list()
 , dtrain
-, 100
+, 100L
 , valids
-, min_data = 1
+, min_data = 1L
-, learning_rate = 1
+, learning_rate = 1.0
-, early_stopping_rounds = 10
+, early_stopping_rounds = 10L
 , objective = custom_multiclass_obj
 , eval = custom_multiclass_metric
-, num_class = 3
+, num_class = 3L
 )
-preds_custom <- predict(model_custom, test[, 1:4], rawscore = TRUE, reshape = TRUE)
+preds_custom <- predict(model_custom, test[, 1L:4L], rawscore = TRUE, reshape = TRUE)
 probs_custom <- exp(preds_custom) / rowSums(exp(preds_custom))
 # compare predictions
...
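
The custom-objective hunks above also switch top-level = assignment to <-, which the assignment linter enforces. At top level the two operators do the same thing; the linter simply standardizes on the conventional one, while = keeps its separate role inside calls. A brief plain-R illustration:

labels <- c(0L, 1L, 2L)     # preferred assignment style
labels = c(0L, 1L, 2L)      # same effect at top level, but flagged by the linter
mean(x = c(1.0, 2.0, 3.0))  # '=' inside a call is argument matching, never flagged
mean(x <- c(1.0, 2.0, 3.0)) # '<-' inside a call assigns in the caller -- a common surprise
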
@@ -11,8 +11,8 @@ library(lightgbm)
 # - Run 3: sum of weights equal to 6513 (x 1e5) with adjusted regularization (learning)
 # Setup small weights
-weights1 <- rep(1 / 100000, 6513)
+weights1 <- rep(1.0 / 100000.0, 6513L)
-weights2 <- rep(1 / 100000, 1611)
+weights2 <- rep(1.0 / 100000.0, 1611L)
 # Load data and create datasets
 data(agaricus.train, package = "lightgbm")
@@ -30,19 +30,19 @@ params <- list(
 objective = "regression"
 , metric = "l2"
 , device = "cpu"
-, min_sum_hessian = 10
+, min_sum_hessian = 10.0
-, num_leaves = 7
+, num_leaves = 7L
-, max_depth = 3
+, max_depth = 3L
-, nthread = 1
+, nthread = 1L
 )
 model <- lgb.train(
 params
 , dtrain
-, 50
+, 50L
 , valids
-, min_data = 1
+, min_data = 1L
-, learning_rate = 1
+, learning_rate = 1.0
-, early_stopping_rounds = 10
+, early_stopping_rounds = 10L
 )
 weight_loss <- as.numeric(model$record_evals$test$l2$eval)
 plot(weight_loss) # Shows how poor the learning was: a straight line!
@@ -55,18 +55,18 @@ params <- list(
 , metric = "l2"
 , device = "cpu"
 , min_sum_hessian = 1e-4
-, num_leaves = 7
+, num_leaves = 7L
-, max_depth = 3
+, max_depth = 3L
-, nthread = 1
+, nthread = 1L
 )
 model <- lgb.train(
 params
 , dtrain
-, 50
+, 50L
 , valids
-, min_data = 1
+, min_data = 1L
-, learning_rate = 1
+, learning_rate = 1.0
-, early_stopping_rounds = 10
+, early_stopping_rounds = 10L
 )
 small_weight_loss <- as.numeric(model$record_evals$test$l2$eval)
 plot(small_weight_loss) # It learns!
@@ -90,19 +90,19 @@ params <- list(
 objective = "regression"
 , metric = "l2"
 , device = "cpu"
-, min_sum_hessian = 10
+, min_sum_hessian = 10.0
-, num_leaves = 7
+, num_leaves = 7L
-, max_depth = 3
+, max_depth = 3L
-, nthread = 1
+, nthread = 1L
 )
 model <- lgb.train(
 params
 , dtrain
-, 50
+, 50L
 , valids
-, min_data = 1
+, min_data = 1L
-, learning_rate = 1
+, learning_rate = 1.0
-, early_stopping_rounds = 10
+, early_stopping_rounds = 10L
 )
 large_weight_loss <- as.numeric(model$record_evals$test$l2$eval)
 plot(large_weight_loss) # It learns!
@@ -110,4 +110,4 @@ plot(large_weight_loss) # It learns!
 # Do you want to compare the learning? They both converge.
 plot(small_weight_loss, large_weight_loss)
-curve(1 * x, from = 0, to = 0.02, add = TRUE)
+curve(1.0 * x, from = 0L, to = 0.02, add = TRUE)
@@ -31,7 +31,7 @@ dtrain <- lgb.Dataset(train$data, label = train$label)
 lgb.Dataset.construct(dtrain)
 dimnames(dtrain)
 colnames(dtrain)
-colnames(dtrain) <- make.names(1:ncol(train$data))
+colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
 print(dtrain, verbose = TRUE)
 }
@@ -12,9 +12,9 @@ getinfo(dataset, ...)
 \arguments{
 \item{dataset}{Object of class \code{lgb.Dataset}}
-\item{name}{the name of the information field to get (see details)}
 \item{...}{other parameters}
+\item{name}{the name of the information field to get (see details)}
 }
 \value{
 info data
...
@@ -24,6 +24,6 @@ train <- agaricus.train
 dtrain <- lgb.Dataset(train$data, label = train$label)
 lgb.Dataset.save(dtrain, "lgb.Dataset.data")
 dtrain <- lgb.Dataset("lgb.Dataset.data")
-lgb.Dataset.set.categorical(dtrain, 1:2)
+lgb.Dataset.set.categorical(dtrain, 1L:2L)
 }
@@ -7,16 +7,16 @@
 lgb.cv(
 params = list(),
 data,
-nrounds = 10,
+nrounds = 10L,
-nfold = 3,
+nfold = 3L,
 label = NULL,
 weight = NULL,
 obj = NULL,
 eval = NULL,
-verbose = 1,
+verbose = 1L,
 record = TRUE,
 eval_freq = 1L,
 showsd = TRUE,
 stratified = TRUE,
 folds = NULL,
 init_model = NULL,
@@ -53,8 +53,6 @@
 \item{eval_freq}{evaluation output frequency, only effect when verbose > 0}
-\item{showsd}{\code{boolean}, whether to show standard deviation of cross validation}
-
 \item{stratified}{a \code{boolean} indicating whether sampling of folds should be stratified
 by the values of outcome labels.}
@@ -90,6 +88,8 @@ into a predictor model which frees up memory and the original datasets}
 the number of real CPU cores, not the number of threads (most
 CPU using hyper-threading to generate 2 threads per CPU core).}
 }}
+
+\item{showsd}{\code{boolean}, whether to show standard deviation of cross validation}
 }
 \value{
 a trained model \code{lgb.CVBooster}.
@@ -106,10 +106,10 @@ params <- list(objective = "regression", metric = "l2")
 model <- lgb.cv(
 params = params
 , data = dtrain
-, nrounds = 10
+, nrounds = 10L
-, nfold = 3
+, nfold = 3L
-, min_data = 1
+, min_data = 1L
-, learning_rate = 1
+, learning_rate = 1.0
-, early_stopping_rounds = 5
+, early_stopping_rounds = 5L
 )
 }
@@ -30,11 +30,11 @@ valids <- list(test = dtest)
 model <- lgb.train(
 params = params
 , data = dtrain
-, nrounds = 10
+, nrounds = 10L
 , valids = valids
-, min_data = 1
+, min_data = 1L
-, learning_rate = 1
+, learning_rate = 1.0
-, early_stopping_rounds = 5
+, early_stopping_rounds = 5L
 )
 json_model <- lgb.dump(model)
...
@@ -42,11 +42,11 @@ valids <- list(test = dtest)
 model <- lgb.train(
 params = params
 , data = dtrain
-, nrounds = 10
+, nrounds = 10L
 , valids = valids
-, min_data = 1
+, min_data = 1L
-, learning_rate = 1
+, learning_rate = 1.0
-, early_stopping_rounds = 5
+, early_stopping_rounds = 5L
 )
 lgb.get.eval.result(model, "test", "l2")
 }