Commit 1f7b06b9 authored by James Lamb, committed by Nikita Titov
Browse files

[R-package][ci] Added more R linters (fixes #2477) (#2533)



* Added more linters on R code

* started working on implicit integers

* finished style changes to handle implicit integers

* regenned documentation and added concatenation linter

* changed channel for r-lintr

* try building stringi before lintr

* trying to get libicui18n

* trying another thing

* trying conda-forge again

* added re-install of stringi

* uncommented other stages

* Update .ci/test.sh
Co-Authored-By: Nikita Titov <nekit94-08@mail.ru>

* removed apt update and changed lintr version floor

* get lintr from CRAN

* R needs to come before C++ linting

* testing lintr install from CRAN

* trying one more thing

* more verbose

* order might matter

* removed commented code

* cleaned up linting block in test.sh

* grouped conda install calls and fixed a few integer array things
parent 11f9682b
......@@ -20,11 +20,11 @@
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , nrounds = 10L
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' , min_data = 1L
#' , learning_rate = 1.0
#' , early_stopping_rounds = 5L
#' )
#' saveRDS.lgb.Booster(model, "model.rds")
#' new_model <- readRDS.lgb.Booster("model.rds")
......
......@@ -31,11 +31,11 @@
#' model <- lgb.train(
#' params = params
#' , data = dtrain
#' , nrounds = 10
#' , nrounds = 10L
#' , valids = valids
#' , min_data = 1
#' , learning_rate = 1
#' , early_stopping_rounds = 5
#' , min_data = 1L
#' , learning_rate = 1.0
#' , early_stopping_rounds = 5L
#' )
#' saveRDS.lgb.Booster(model, "model.rds")
#' @export
......
......@@ -81,7 +81,7 @@ lgb.call <- function(fun_name, ret, ...) {
lgb.call.return.str <- function(fun_name, ...) {
# Create buffer
buf_len <- as.integer(1024 * 1024)
buf_len <- as.integer(1024L * 1024L)
act_len <- 0L
buf <- raw(buf_len)
......@@ -115,7 +115,7 @@ lgb.params2str <- function(params, ...) {
names(dot_params) <- gsub("\\.", "_", names(dot_params))
# Check for identical parameters
if (length(intersect(names(params), names(dot_params))) > 0) {
if (length(intersect(names(params), names(dot_params))) > 0L) {
stop(
"Same parameters in "
, sQuote("params")
......@@ -136,7 +136,7 @@ lgb.params2str <- function(params, ...) {
# Join multi value first
val <- paste0(format(params[[key]], scientific = FALSE), collapse = ",")
if (nchar(val) <= 0) next # Skip join
if (nchar(val) <= 0L) next # Skip join
# Join key value
pair <- paste0(c(key, val), collapse = "=")
......@@ -145,7 +145,7 @@ lgb.params2str <- function(params, ...) {
}
# Check ret length
if (length(ret) == 0) {
if (length(ret) == 0L) {
# Return empty string
lgb.c_str("")
......@@ -163,7 +163,7 @@ lgb.c_str <- function(x) {
# Perform character to raw conversion
ret <- charToRaw(as.character(x))
ret <- c(ret, as.raw(0))
ret <- c(ret, as.raw(0L))
ret
}
......
......@@ -20,9 +20,9 @@ print("Training lightgbm with sparseMatrix")
bst <- lightgbm(
data = train$data
, label = train$label
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, num_leaves = 4L
, learning_rate = 1.0
, nrounds = 2L
, objective = "binary"
)
......@@ -31,9 +31,9 @@ print("Training lightgbm with Matrix")
bst <- lightgbm(
data = as.matrix(train$data)
, label = train$label
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, num_leaves = 4L
, learning_rate = 1.0
, nrounds = 2L
, objective = "binary"
)
......@@ -45,9 +45,9 @@ dtrain <- lgb.Dataset(
)
bst <- lightgbm(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, num_leaves = 4L
, learning_rate = 1.0
, nrounds = 2L
, objective = "binary"
)
......@@ -55,42 +55,42 @@ bst <- lightgbm(
print("Train lightgbm with verbose 0, no message")
bst <- lightgbm(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, num_leaves = 4L
, learning_rate = 1.0
, nrounds = 2L
, objective = "binary"
, verbose = 0
, verbose = 0L
)
print("Train lightgbm with verbose 1, print evaluation metric")
bst <- lightgbm(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, nthread = 2
, num_leaves = 4L
, learning_rate = 1.0
, nrounds = 2L
, nthread = 2L
, objective = "binary"
, verbose = 1
, verbose = 1L
)
print("Train lightgbm with verbose 2, also print information about tree")
bst <- lightgbm(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, nthread = 2
, num_leaves = 4L
, learning_rate = 1.0
, nrounds = 2L
, nthread = 2L
, objective = "binary"
, verbose = 2
, verbose = 2L
)
# You can also specify data as file path to a LibSVM/TSV/CSV format input
# Since we do not have this file with us, the following line is just for illustration
# bst <- lightgbm(
# data = "agaricus.train.svm"
# , num_leaves = 4
# , learning_rate = 1
# , nrounds = 2
# , num_leaves = 4L
# , learning_rate = 1.0
# , nrounds = 2L
# , objective = "binary"
# )
......@@ -126,11 +126,11 @@ valids <- list(train = dtrain, test = dtest)
print("Train lightgbm using lgb.train with valids")
bst <- lgb.train(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, num_leaves = 4L
, learning_rate = 1.0
, nrounds = 2L
, valids = valids
, nthread = 2
, nthread = 2L
, objective = "binary"
)
......@@ -138,12 +138,12 @@ bst <- lgb.train(
print("Train lightgbm using lgb.train with valids, watch logloss and error")
bst <- lgb.train(
data = dtrain
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, num_leaves = 4L
, learning_rate = 1.0
, nrounds = 2L
, valids = valids
, eval = c("binary_error", "binary_logloss")
, nthread = 2
, nthread = 2L
, objective = "binary"
)
......@@ -154,16 +154,16 @@ lgb.Dataset.save(dtrain, "dtrain.buffer")
dtrain2 <- lgb.Dataset("dtrain.buffer")
bst <- lgb.train(
data = dtrain2
, num_leaves = 4
, learning_rate = 1
, nrounds = 2
, num_leaves = 4L
, learning_rate = 1.0
, nrounds = 2L
, valids = valids
, nthread = 2
, nthread = 2L
, objective = "binary"
)
# information can be extracted from lgb.Dataset using getinfo
label = getinfo(dtest, "label")
label <- getinfo(dtest, "label")
pred <- predict(bst, test$data)
err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label)
print(paste("test-error=", err))
......@@ -14,12 +14,12 @@ print("Start running example to start from an initial prediction")
# Train lightgbm for 1 round
param <- list(
num_leaves = 4
, learning_rate = 1
, nthread = 2
num_leaves = 4L
, learning_rate = 1.0
, nthread = 2L
, objective = "binary"
)
bst <- lgb.train(param, dtrain, 1, valids = valids)
bst <- lgb.train(param, dtrain, 1L, valids = valids)
# Note: we need the margin value instead of transformed prediction in set_init_score
ptrain <- predict(bst, agaricus.train$data, rawscore = TRUE)
......@@ -34,6 +34,6 @@ print("This is result of boost from initial prediction")
bst <- lgb.train(
params = param
, data = dtrain
, nrounds = 5
, nrounds = 5L
, valids = valids
)
......@@ -53,36 +53,36 @@ bank <- lgb.prepare(data = bank)
str(bank)
# Remove 1 to label because it must be between 0 and 1
bank$y <- bank$y - 1
bank$y <- bank$y - 1L
# Data input to LightGBM must be a matrix, without the label
my_data <- as.matrix(bank[, 1:16, with = FALSE])
my_data <- as.matrix(bank[, 1L:16L, with = FALSE])
# Creating the LightGBM dataset with categorical features
# The categorical features must be indexed like in R (1-indexed, not 0-indexed)
lgb_data <- lgb.Dataset(
data = my_data
, label = bank$y
, categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)
, categorical_feature = c(2L, 3L, 4L, 5L, 7L, 8L, 9L, 11L, 16L)
)
# We can now train a model
params <- list(
objective = "binary"
, metric = "l2"
, min_data = 1
, min_data = 1L
, learning_rate = 0.1
, min_data = 0
, min_hessian = 1
, max_depth = 2
, min_data = 0L
, min_hessian = 1.0
, max_depth = 2L
)
model <- lgb.train(
params = params
, data = lgb_data
, nrounds = 100
, nrounds = 100L
, valids = list(train = lgb_data)
)
# Try to find split_feature: 2
# If you find it, it means it used a categorical feature in the first tree
lgb.dump(model, num_iteration = 1)
lgb.dump(model, num_iteration = 1L)
......@@ -28,8 +28,8 @@ data(bank, package = "lightgbm")
str(bank)
# We are dividing the dataset into two: one train, one validation
bank_train <- bank[1:4000, ]
bank_test <- bank[4001:4521, ]
bank_train <- bank[1L:4000L, ]
bank_test <- bank[4001L:4521L, ]
# We must now transform the data to fit in LightGBM
# For this task, we use lgb.prepare
......@@ -59,19 +59,19 @@ bank_test <- lgb.prepare_rules(data = bank_test, rules = bank_rules$rules)$data
str(bank_test)
# Remove 1 to label because it must be between 0 and 1
bank_train$y <- bank_train$y - 1
bank_test$y <- bank_test$y - 1
bank_train$y <- bank_train$y - 1L
bank_test$y <- bank_test$y - 1L
# Data input to LightGBM must be a matrix, without the label
my_data_train <- as.matrix(bank_train[, 1:16, with = FALSE])
my_data_test <- as.matrix(bank_test[, 1:16, with = FALSE])
my_data_train <- as.matrix(bank_train[, 1L:16L, with = FALSE])
my_data_test <- as.matrix(bank_test[, 1L:16L, with = FALSE])
# Creating the LightGBM dataset with categorical features
# The categorical features can be passed to lgb.train to not copy and paste a lot
dtrain <- lgb.Dataset(
data = my_data_train
, label = bank_train$y
, categorical_feature = c(2, 3, 4, 5, 7, 8, 9, 11, 16)
, categorical_feature = c(2L, 3L, 4L, 5L, 7L, 8L, 9L, 11L, 16L)
)
dtest <- lgb.Dataset.create.valid(
dtrain
......@@ -83,19 +83,19 @@ dtest <- lgb.Dataset.create.valid(
params <- list(
objective = "binary"
, metric = "l2"
, min_data = 1
, min_data = 1L
, learning_rate = 0.1
, min_data = 0
, min_hessian = 1
, max_depth = 2
, min_data = 0L
, min_hessian = 1.0
, max_depth = 2L
)
model <- lgb.train(
params = params
, data = dtrain
, nrounds = 100
, nrounds = 100L
, valids = list(train = dtrain, valid = dtest)
)
# Try to find split_feature: 11
# If you find it, it means it used a categorical feature in the first tree
lgb.dump(model, num_iteration = 1)
lgb.dump(model, num_iteration = 1L)
......@@ -5,10 +5,10 @@ data(agaricus.test, package = "lightgbm")
dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
dtest <- lgb.Dataset.create.valid(dtrain, data = agaricus.test$data, label = agaricus.test$label)
nrounds <- 2
nrounds <- 2L
param <- list(
num_leaves = 4
, learning_rate = 1
num_leaves = 4L
, learning_rate = 1.0
, objective = "binary"
)
......@@ -20,7 +20,7 @@ lgb.cv(
param
, dtrain
, nrounds
, nfold = 5
, nfold = 5L
, eval = "binary_error"
)
......@@ -32,7 +32,7 @@ lgb.cv(
param
, dtrain
, nrounds
, nfold = 5
, nfold = 5L
, eval = "binary_error"
, showsd = FALSE
)
......@@ -42,14 +42,14 @@ print("Running cross validation, with customized loss function")
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1 / (1 + exp(-preds))
preds <- 1.0 / (1.0 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
hess <- preds * (1.0 - preds)
return(list(grad = grad, hess = hess))
}
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
err <- as.numeric(sum(labels != (preds > 0.0))) / length(labels)
return(list(name = "error", value = err, higher_better = FALSE))
}
......@@ -60,5 +60,5 @@ lgb.cv(
, nrounds = nrounds
, obj = logregobj
, eval = evalerror
, nfold = 5
, nfold = 5L
)
......@@ -12,19 +12,19 @@ dtest <- lgb.Dataset.create.valid(dtrain, data = agaricus.test$data, label = aga
# Note: what we are getting is margin value in prediction
# You must know what you are doing
param <- list(
num_leaves = 4
, learning_rate = 1
num_leaves = 4L
, learning_rate = 1.0
)
valids <- list(eval = dtest)
num_round <- 20
num_round <- 20L
# User define objective function, given prediction, return gradient and second order gradient
# This is loglikelihood loss
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1 / (1 + exp(-preds))
preds <- 1.0 / (1.0 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
hess <- preds * (1.0 - preds)
return(list(grad = grad, hess = hess))
}
......@@ -48,5 +48,5 @@ bst <- lgb.train(
, valids
, objective = logregobj
, eval = evalerror
, early_stopping_round = 3
, early_stopping_round = 3L
)
......@@ -12,9 +12,9 @@
library(lightgbm)
# Generate fictive data of size 1M x 100
set.seed(11111)
x_data <- matrix(rnorm(n = 100000000, mean = 0, sd = 100), nrow = 1000000, ncol = 100)
y_data <- rnorm(n = 1000000, mean = 0, sd = 5)
set.seed(11111L)
x_data <- matrix(rnorm(n = 100000000L, mean = 0.0, sd = 100.0), nrow = 1000000L, ncol = 100L)
y_data <- rnorm(n = 1000000L, mean = 0.0, sd = 5.0)
# Create lgb.Dataset for training
data <- lgb.Dataset(x_data, label = y_data)
......@@ -24,12 +24,12 @@ data$construct()
# It MUST remain constant (if not increasing very slightly)
gbm <- list()
for (i in 1:1000) {
for (i in 1L:1000L) {
print(i)
gbm[[i]] <- lgb.train(
params = list(objective = "regression")
, data = data
, 1
, 1L
, reset_data = TRUE
)
gc(verbose = FALSE)
......
......@@ -20,13 +20,13 @@ valids <- list(test = dtest)
model <- lgb.train(
params
, dtrain
, 50
, 50L
, valids
, min_data = 1
, min_data = 1L
, learning_rate = 0.1
, bagging_fraction = 0.1
, bagging_freq = 1
, bagging_seed = 1
, bagging_freq = 1L
, bagging_seed = 1L
)
# We create a data.frame with the following structure:
......@@ -45,20 +45,20 @@ new_data <- data.frame(
predict(model, agaricus.test$data)
, 1e-15
)
, 1 - 1e-15
, 1.0 - 1e-15
)
)
new_data$Z <- -1 * (agaricus.test$label * log(new_data$Y) + (1 - agaricus.test$label) * log(1 - new_data$Y))
new_data$Z <- -1.0 * (agaricus.test$label * log(new_data$Y) + (1L - agaricus.test$label) * log(1L - new_data$Y))
new_data$binned <- .bincode(
x = new_data$X
, breaks = quantile(
x = new_data$X
, probs = (1:9) / 10
, probs = seq_len(9L) / 10.0
)
, right = TRUE
, include.lowest = TRUE
)
new_data$binned[is.na(new_data$binned)] <- 0
new_data$binned[is.na(new_data$binned)] <- 0L
new_data$binned <- as.factor(new_data$binned)
# We can check the binned content
......@@ -91,10 +91,10 @@ ggplot(
model2 <- lgb.train(
params
, dtrain
, 100
, 100L
, valids
, min_data = 1
, learning_rate = 1
, min_data = 1L
, learning_rate = 1.0
)
# We create the data structure, but for model2
......@@ -112,20 +112,20 @@ new_data2 <- data.frame(
)
, 1e-15
)
, 1 - 1e-15
, 1.0 - 1e-15
)
)
new_data2$Z <- -1 * (agaricus.test$label * log(new_data2$Y) + (1 - agaricus.test$label) * log(1 - new_data2$Y))
new_data2$Z <- -1.0 * (agaricus.test$label * log(new_data2$Y) + (1L - agaricus.test$label) * log(1L - new_data2$Y))
new_data2$binned <- .bincode(
x = new_data2$X
, breaks = quantile(
x = new_data2$X
, probs = (1:9) / 10
, probs = seq_len(9L) / 10.0
)
, right = TRUE
, include.lowest = TRUE
)
new_data2$binned[is.na(new_data2$binned)] <- 0
new_data2$binned[is.na(new_data2$binned)] <- 0L
new_data2$binned <- as.factor(new_data2$binned)
# We can check the binned content
......@@ -133,7 +133,8 @@ table(new_data2$binned)
# We can plot the binned content
# On the second plot, we clearly notice the lower the bin (the lower the leaf value), the higher the loss
# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are real thus it is not an issue
# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are
# real thus it is not an issue
# However, if the rules were not true, the loss would explode.
ggplot(
data = new_data2
......@@ -159,10 +160,10 @@ ggplot(
model3 <- lgb.train(
params
, dtrain
, 1000
, 1000L
, valids
, min_data = 1
, learning_rate = 1
, min_data = 1L
, learning_rate = 1.0
)
# We create the data structure, but for model3
......@@ -180,20 +181,20 @@ new_data3 <- data.frame(
)
, 1e-15
)
, 1 - 1e-15
, 1.0 - 1e-15
)
)
new_data3$Z <- -1 * (agaricus.test$label * log(new_data3$Y) + (1 - agaricus.test$label) * log(1 - new_data3$Y))
new_data3$Z <- -1.0 * (agaricus.test$label * log(new_data3$Y) + (1L - agaricus.test$label) * log(1L - new_data3$Y))
new_data3$binned <- .bincode(
x = new_data3$X
, breaks = quantile(
x = new_data3$X
, probs = (1:9) / 10
, probs = seq_len(9L) / 10.0
)
, right = TRUE
, include.lowest = TRUE
)
new_data3$binned[is.na(new_data3$binned)] <- 0
new_data3$binned[is.na(new_data3$binned)] <- 0L
new_data3$binned <- as.factor(new_data3$binned)
# We can check the binned content
......
......@@ -6,65 +6,65 @@ data(iris)
# We must convert factors to numeric
# They must be starting from number 0 to use multiclass
# For instance: 0, 1, 2, 3, 4, 5...
iris$Species <- as.numeric(as.factor(iris$Species)) - 1
iris$Species <- as.numeric(as.factor(iris$Species)) - 1L
# We cut the data set into 80% train and 20% validation
# The 10 last samples of each class are for validation
train <- as.matrix(iris[c(1:40, 51:90, 101:140), ])
test <- as.matrix(iris[c(41:50, 91:100, 141:150), ])
dtrain <- lgb.Dataset(data = train[, 1:4], label = train[, 5])
dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1:4], label = test[, 5])
train <- as.matrix(iris[c(1L:40L, 51L:90L, 101L:140L), ])
test <- as.matrix(iris[c(41L:50L, 91L:100L, 141L:150L), ])
dtrain <- lgb.Dataset(data = train[, 1L:4L], label = train[, 5L])
dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1L:4L], label = test[, 5L])
valids <- list(test = dtest)
# Method 1 of training
params <- list(objective = "multiclass", metric = "multi_error", num_class = 3)
params <- list(objective = "multiclass", metric = "multi_error", num_class = 3L)
model <- lgb.train(
params
, dtrain
, 100
, 100L
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
, min_data = 1L
, learning_rate = 1.0
, early_stopping_rounds = 10L
)
# We can predict on test data, outputs a 90-length vector
# Order: obs1 class1, obs1 class2, obs1 class3, obs2 class1, obs2 class2, obs2 class3...
my_preds <- predict(model, test[, 1:4])
my_preds <- predict(model, test[, 1L:4L])
# Method 2 of training, identical
model <- lgb.train(
list()
, dtrain
, 100
, 100L
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
, min_data = 1L
, learning_rate = 1.0
, early_stopping_rounds = 10L
, objective = "multiclass"
, metric = "multi_error"
, num_class = 3
, num_class = 3L
)
# We can predict on test data, identical
my_preds <- predict(model, test[, 1:4])
my_preds <- predict(model, test[, 1L:4L])
# A (30x3) matrix with the predictions, use parameter reshape
# class1 class2 class3
# obs1 obs1 obs1
# obs2 obs2 obs2
# .... .... ....
my_preds <- predict(model, test[, 1:4], reshape = TRUE)
my_preds <- predict(model, test[, 1L:4L], reshape = TRUE)
# We can also get the predicted scores before the Sigmoid/Softmax application
my_preds <- predict(model, test[, 1:4], rawscore = TRUE)
my_preds <- predict(model, test[, 1L:4L], rawscore = TRUE)
# Raw score predictions as matrix instead of vector
my_preds <- predict(model, test[, 1:4], rawscore = TRUE, reshape = TRUE)
my_preds <- predict(model, test[, 1L:4L], rawscore = TRUE, reshape = TRUE)
# We can also get the leaf index
my_preds <- predict(model, test[, 1:4], predleaf = TRUE)
my_preds <- predict(model, test[, 1L:4L], predleaf = TRUE)
# Predict leaf index as matrix instead of vector
my_preds <- predict(model, test[, 1:4], predleaf = TRUE, reshape = TRUE)
my_preds <- predict(model, test[, 1L:4L], predleaf = TRUE, reshape = TRUE)
......@@ -6,15 +6,15 @@ data(iris)
# We must convert factors to numeric
# They must be starting from number 0 to use multiclass
# For instance: 0, 1, 2, 3, 4, 5...
iris$Species <- as.numeric(as.factor(iris$Species)) - 1
iris$Species <- as.numeric(as.factor(iris$Species)) - 1L
# Create imbalanced training data (20, 30, 40 examples for classes 0, 1, 2)
train <- as.matrix(iris[c(1:20, 51:80, 101:140), ])
train <- as.matrix(iris[c(1L:20L, 51L:80L, 101L:140L), ])
# The 10 last samples of each class are for validation
test <- as.matrix(iris[c(41:50, 91:100, 141:150), ])
test <- as.matrix(iris[c(41L:50L, 91L:100L, 141L:150L), ])
dtrain <- lgb.Dataset(data = train[, 1:4], label = train[, 5])
dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1:4], label = test[, 5])
dtrain <- lgb.Dataset(data = train[, 1L:4L], label = train[, 5L])
dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1L:4L], label = test[, 5L])
valids <- list(train = dtrain, test = dtest)
# Method 1 of training with built-in multiclass objective
......@@ -24,52 +24,52 @@ model_builtin <- lgb.train(
list()
, dtrain
, boost_from_average = FALSE
, 100
, 100L
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
, min_data = 1L
, learning_rate = 1.0
, early_stopping_rounds = 10L
, objective = "multiclass"
, metric = "multi_logloss"
, num_class = 3
, num_class = 3L
)
preds_builtin <- predict(model_builtin, test[, 1:4], rawscore = TRUE, reshape = TRUE)
preds_builtin <- predict(model_builtin, test[, 1L:4L], rawscore = TRUE, reshape = TRUE)
probs_builtin <- exp(preds_builtin) / rowSums(exp(preds_builtin))
# Method 2 of training with custom objective function
# User defined objective function, given prediction, return gradient and second order gradient
custom_multiclass_obj = function(preds, dtrain) {
labels = getinfo(dtrain, "label")
custom_multiclass_obj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
# preds is a matrix with rows corresponding to samples and colums corresponding to choices
preds = matrix(preds, nrow = length(labels))
preds <- matrix(preds, nrow = length(labels))
# to prevent overflow, normalize preds by row
preds = preds - apply(preds, 1, max)
prob = exp(preds) / rowSums(exp(preds))
preds <- preds - apply(preds, 1L, max)
prob <- exp(preds) / rowSums(exp(preds))
# compute gradient
grad = prob
grad[cbind(1:length(labels), labels + 1)] = grad[cbind(1:length(labels), labels + 1)] - 1
grad <- prob
grad[cbind(seq_len(length(labels)), labels + 1L)] <- grad[cbind(seq_len(length(labels)), labels + 1L)] - 1L
# compute hessian (approximation)
hess = 2 * prob * (1 - prob)
hess <- 2.0 * prob * (1.0 - prob)
return(list(grad = grad, hess = hess))
}
# define custom metric
custom_multiclass_metric = function(preds, dtrain) {
labels = getinfo(dtrain, "label")
preds = matrix(preds, nrow = length(labels))
preds = preds - apply(preds, 1, max)
prob = exp(preds) / rowSums(exp(preds))
custom_multiclass_metric <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- matrix(preds, nrow = length(labels))
preds <- preds - apply(preds, 1L, max)
prob <- exp(preds) / rowSums(exp(preds))
return(list(
name = "error"
, value = -mean(log(prob[cbind(1:length(labels), labels + 1)]))
, value = -mean(log(prob[cbind(seq_len(length(labels)), labels + 1L)]))
, higher_better = FALSE
))
}
......@@ -77,17 +77,17 @@ custom_multiclass_metric = function(preds, dtrain) {
model_custom <- lgb.train(
list()
, dtrain
, 100
, 100L
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
, min_data = 1L
, learning_rate = 1.0
, early_stopping_rounds = 10L
, objective = custom_multiclass_obj
, eval = custom_multiclass_metric
, num_class = 3
, num_class = 3L
)
preds_custom <- predict(model_custom, test[, 1:4], rawscore = TRUE, reshape = TRUE)
preds_custom <- predict(model_custom, test[, 1L:4L], rawscore = TRUE, reshape = TRUE)
probs_custom <- exp(preds_custom) / rowSums(exp(preds_custom))
# compare predictions
......
......@@ -11,8 +11,8 @@ library(lightgbm)
# - Run 3: sum of weights equal to 6513 (x 1e5) with adjusted regularization (learning)
# Setup small weights
weights1 <- rep(1 / 100000, 6513)
weights2 <- rep(1 / 100000, 1611)
weights1 <- rep(1.0 / 100000.0, 6513L)
weights2 <- rep(1.0 / 100000.0, 1611L)
# Load data and create datasets
data(agaricus.train, package = "lightgbm")
......@@ -30,19 +30,19 @@ params <- list(
objective = "regression"
, metric = "l2"
, device = "cpu"
, min_sum_hessian = 10
, num_leaves = 7
, max_depth = 3
, nthread = 1
, min_sum_hessian = 10.0
, num_leaves = 7L
, max_depth = 3L
, nthread = 1L
)
model <- lgb.train(
params
, dtrain
, 50
, 50L
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
, min_data = 1L
, learning_rate = 1.0
, early_stopping_rounds = 10L
)
weight_loss <- as.numeric(model$record_evals$test$l2$eval)
plot(weight_loss) # Shows how poor the learning was: a straight line!
......@@ -55,18 +55,18 @@ params <- list(
, metric = "l2"
, device = "cpu"
, min_sum_hessian = 1e-4
, num_leaves = 7
, max_depth = 3
, nthread = 1
, num_leaves = 7L
, max_depth = 3L
, nthread = 1L
)
model <- lgb.train(
params
, dtrain
, 50
, 50L
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
, min_data = 1L
, learning_rate = 1.0
, early_stopping_rounds = 10L
)
small_weight_loss <- as.numeric(model$record_evals$test$l2$eval)
plot(small_weight_loss) # It learns!
......@@ -90,19 +90,19 @@ params <- list(
objective = "regression"
, metric = "l2"
, device = "cpu"
, min_sum_hessian = 10
, num_leaves = 7
, max_depth = 3
, nthread = 1
, min_sum_hessian = 10.0
, num_leaves = 7L
, max_depth = 3L
, nthread = 1L
)
model <- lgb.train(
params
, dtrain
, 50
, 50L
, valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 10
, min_data = 1L
, learning_rate = 1.0
, early_stopping_rounds = 10L
)
large_weight_loss <- as.numeric(model$record_evals$test$l2$eval)
plot(large_weight_loss) # It learns!
......@@ -110,4 +110,4 @@ plot(large_weight_loss) # It learns!
# Do you want to compare the learning? They both converge.
plot(small_weight_loss, large_weight_loss)
curve(1 * x, from = 0, to = 0.02, add = TRUE)
curve(1.0 * x, from = 0L, to = 0.02, add = TRUE)
......@@ -31,7 +31,7 @@ dtrain <- lgb.Dataset(train$data, label = train$label)
lgb.Dataset.construct(dtrain)
dimnames(dtrain)
colnames(dtrain)
colnames(dtrain) <- make.names(1:ncol(train$data))
colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
print(dtrain, verbose = TRUE)
}
......@@ -12,9 +12,9 @@ getinfo(dataset, ...)
\arguments{
\item{dataset}{Object of class \code{lgb.Dataset}}
\item{name}{the name of the information field to get (see details)}
\item{...}{other parameters}
\item{name}{the name of the information field to get (see details)}
}
\value{
info data
......
......@@ -24,6 +24,6 @@ train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
lgb.Dataset.save(dtrain, "lgb.Dataset.data")
dtrain <- lgb.Dataset("lgb.Dataset.data")
lgb.Dataset.set.categorical(dtrain, 1:2)
lgb.Dataset.set.categorical(dtrain, 1L:2L)
}
......@@ -7,16 +7,16 @@
lgb.cv(
params = list(),
data,
nrounds = 10,
nfold = 3,
nrounds = 10L,
nfold = 3L,
label = NULL,
weight = NULL,
obj = NULL,
eval = NULL,
verbose = 1,
verbose = 1L,
record = TRUE,
eval_freq = 1L,
showsd = TRUE,
shows = TRUE,
stratified = TRUE,
folds = NULL,
init_model = NULL,
......@@ -53,8 +53,6 @@ lgb.cv(
\item{eval_freq}{evaluation output frequency, only effect when verbose > 0}
\item{showsd}{\code{boolean}, whether to show standard deviation of cross validation}
\item{stratified}{a \code{boolean} indicating whether sampling of folds should be stratified
by the values of outcome labels.}
......@@ -90,6 +88,8 @@ into a predictor model which frees up memory and the original datasets}
the number of real CPU cores, not the number of threads (most
CPU using hyper-threading to generate 2 threads per CPU core).}
}}
\item{showsd}{\code{boolean}, whether to show standard deviation of cross validation}
}
\value{
a trained model \code{lgb.CVBooster}.
......@@ -106,10 +106,10 @@ params <- list(objective = "regression", metric = "l2")
model <- lgb.cv(
params = params
, data = dtrain
, nrounds = 10
, nfold = 3
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 5
, nrounds = 10L
, nfold = 3L
, min_data = 1L
, learning_rate = 1.0
, early_stopping_rounds = 5L
)
}
......@@ -30,11 +30,11 @@ valids <- list(test = dtest)
model <- lgb.train(
params = params
, data = dtrain
, nrounds = 10
, nrounds = 10L
, valids = valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 5
, min_data = 1L
, learning_rate = 1.0
, early_stopping_rounds = 5L
)
json_model <- lgb.dump(model)
......
......@@ -42,11 +42,11 @@ valids <- list(test = dtest)
model <- lgb.train(
params = params
, data = dtrain
, nrounds = 10
, nrounds = 10L
, valids = valids
, min_data = 1
, learning_rate = 1
, early_stopping_rounds = 5
, min_data = 1L
, learning_rate = 1.0
, early_stopping_rounds = 5L
)
lgb.get.eval.result(model, "test", "l2")
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment