Commit b6c973af authored by Laurae, committed by Guolin Ke

[R-package] Improvements, readability, and bug fixes (#378)

* Define environment in examples (xgboost clash)

* Large R code changes
parent e9275fb9
require(lightgbm)
require(methods)

# Load in the agaricus dataset
data(agaricus.train, package = "lightgbm")
data(agaricus.test, package = "lightgbm")
dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)

# Note: for a customized objective function, we leave objective as default
# Note: what we are getting is the margin value in prediction
# You must know what you are doing
param <- list(num_leaves = 4,
              learning_rate = 1)
valids <- list(eval = dtest)
num_round <- 20

# User-defined objective function: given predictions, return gradient and second order gradient
# This is loglikelihood loss
logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}

# User-defined evaluation function, returning metric_name, result, higher_better
# NOTE: with a customized loss function, the default prediction value is the margin
# This may make built-in evaluation metrics not function properly
# For example, with logistic loss the prediction is the score before the logistic transformation
# The built-in evaluation error assumes the input is after the logistic transformation
# Keep this in mind when you use the customization; you may need to write a customized evaluation function
evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
  return(list(name = "error", value = err, higher_better = FALSE))
}

print("Start training with early stopping setting")
bst <- lgb.train(param,
                 dtrain,
                 num_round,
                 valids,
                 objective = logregobj,
                 eval = evalerror,
                 early_stopping_round = 3)
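
Because the objective above is customized, predict() on this booster returns raw margins rather than probabilities. A minimal sketch (not part of this diff) of recovering probabilities, using the agaricus test matrix loaded above:

# Sketch: apply the same sigmoid used in logregobj to turn margins into probabilities
margins <- predict(bst, agaricus.test$data)
probs <- 1 / (1 + exp(-margins))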
require(lightgbm)

# We load the default iris dataset shipped with R
data(iris)

# We must convert factors to numeric
# Labels must start from 0 to use multiclass
# For instance: 0, 1, 2, 3, 4, 5...
iris$Species <- as.numeric(as.factor(iris$Species)) - 1

# We cut the data set into 80% train and 20% validation
# The last 10 samples of each class are used for validation
train <- as.matrix(iris[c(1:40, 51:90, 101:140), ])
test <- as.matrix(iris[c(41:50, 91:100, 141:150), ])
dtrain <- lgb.Dataset(data = train[, 1:4], label = train[, 5])
dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1:4], label = test[, 5])
valids <- list(test = dtest)

# Method 1 of training
params <- list(objective = "multiclass", metric = "multi_error", num_class = 3)
model <- lgb.train(params,
                   dtrain,
                   100,
                   valids,
                   min_data = 1,
                   learning_rate = 1,
                   early_stopping_rounds = 10)

# We can predict on test data; this outputs a 90-length vector
# Order: obs1 class1, obs1 class2, obs1 class3, obs2 class1, obs2 class2, obs2 class3...
my_preds <- predict(model, test[, 1:4])

# Method 2 of training, identical
model <- lgb.train(list(),
                   dtrain,
                   100,
                   valids,
                   min_data = 1,
                   learning_rate = 1,
                   early_stopping_rounds = 10,
                   objective = "multiclass",
                   metric = "multi_error",
                   num_class = 3)

# We can predict on test data, identical
my_preds <- predict(model, test[, 1:4])

# A (30x3) matrix with the predictions, use parameter reshape
# class1 class2 class3
#   obs1   obs1   obs1
#   obs2   obs2   obs2
#   ....   ....   ....
my_preds <- predict(model, test[, 1:4], reshape = TRUE)

# We can also get the predicted scores before the Sigmoid/Softmax application
my_preds <- predict(model, test[, 1:4], rawscore = TRUE)

# Raw score predictions as matrix instead of vector
my_preds <- predict(model, test[, 1:4], rawscore = TRUE, reshape = TRUE)

# We can also get the leaf index
my_preds <- predict(model, test[, 1:4], predleaf = TRUE)

# Predict leaf index as matrix instead of vector
my_preds <- predict(model, test[, 1:4], predleaf = TRUE, reshape = TRUE)
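
As a quick sanity check on the reshaped output, here is a sketch (not part of this diff) that assumes column j of the matrix holds class j - 1, matching the ordering described in the comments above:

# Sketch: derive predicted classes and validation accuracy from the (30x3) matrix
prob_mat <- predict(model, test[, 1:4], reshape = TRUE)
pred_class <- max.col(prob_mat) - 1  # predicted classes are 0-based
mean(pred_class == test[, 5])        # fraction of correct predictions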
@@ -22,14 +22,16 @@
Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
be directly used with an \code{lgb.Dataset} object.
}
\examples{
\dontrun{
library(lightgbm)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)

stopifnot(nrow(dtrain) == nrow(train$data))
stopifnot(ncol(dtrain) == ncol(train$data))
stopifnot(all(dim(dtrain) == dim(train$data)))
}
}
@@ -24,15 +24,17 @@
Generic \code{dimnames} methods are used by \code{colnames}.
Since row names are irrelevant, it is recommended to use \code{colnames} directly.
}
\examples{
\dontrun{
library(lightgbm)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
lgb.Dataset.construct(dtrain)

dimnames(dtrain)
colnames(dtrain)
colnames(dtrain) <- make.names(1:ncol(train$data))
print(dtrain, verbose = TRUE)
}
}
@@ -34,15 +34,18 @@
The \code{name} field can be one of the following:
}
\examples{
\dontrun{
library(lightgbm)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
lgb.Dataset.construct(dtrain)

labels <- lightgbm::getinfo(dtrain, "label")
lightgbm::setinfo(dtrain, "label", 1 - labels)

labels2 <- lightgbm::getinfo(dtrain, "label")
stopifnot(all(labels2 == 1 - labels))
}
}
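
The same accessors apply to the other fields mentioned above; a hedged sketch (not in this commit), assuming "weight" is among the supported field names:

# Sketch: set and read back per-observation weights
lightgbm::setinfo(dtrain, "weight", rep(1, length(labels)))
weights <- lightgbm::getinfo(dtrain, "weight")
stopifnot(all(weights == 1))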
@@ -33,12 +33,14 @@
or local file (that was created previously by saving an \code{lgb.Dataset}).
}
\examples{
\dontrun{
library(lightgbm)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
lgb.Dataset.save(dtrain, "lgb.Dataset.data")
dtrain <- lgb.Dataset("lgb.Dataset.data")
lgb.Dataset.construct(dtrain)
}
}