Commit 535cdc65 authored by Guolin Ke

[R-package] update examples

parent 551d59ca
R-package/DESCRIPTION
@@ -33,5 +33,6 @@ Depends:
     R (>= 3.0),
     R6
 Imports:
-    Matrix (>= 1.1-0)
+    Matrix (>= 1.1-0),
+    methods
 RoxygenNote: 5.0.1
\ No newline at end of file
R-package/NAMESPACE
@@ -22,5 +22,6 @@ export(lgb.train)
 export(lightgbm)
 export(setinfo)
 export(slice)
+import(methods)
 importFrom(R6,R6Class)
 useDynLib(lightgbm)
R-package/R/lgb.train.R
@@ -95,7 +95,7 @@ lgb.train <- function(params=list(), data, nrounds=10,
     data$set_colnames(colnames)
   }
   data$set_categorical_feature(categorical_feature)
+  data$construct()
   vaild_contain_train <- FALSE
   train_data_name <- "train"
   reduced_valid_sets <- list()
R-package/R/lightgbm.R
@@ -9,8 +9,10 @@ lightgbm <- function(data, label = NULL, weight = NULL,
                      early_stopping_rounds = NULL,
                      save_name = "lightgbm.model",
                      init_model = NULL, callbacks = list(), ...) {
-  dtrain <- lgb.Dataset(data, label=label, weight=weight)
+  dtrain <- data
+  if(!lgb.is.Dataset(dtrain)) {
+    dtrain <- lgb.Dataset(data, label=label, weight=weight)
+  }
   valids <- list()
   if (verbose > 0)
@@ -78,6 +80,7 @@ NULL
 NULL
 # Various imports
+#' @import methods
 #' @importFrom R6 R6Class
 #' @useDynLib lightgbm
 NULL
\ No newline at end of file
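With this change, `lightgbm()` accepts either raw training data (a dense matrix or a sparse dgCMatrix) or an already-constructed `lgb.Dataset`; the wrapper only builds a Dataset when handed raw data. A minimal sketch of the two call styles, mirroring the demos below (parameter values are illustrative only):

```r
library(lightgbm)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train

# style 1: raw data plus label; lightgbm() wraps them in an lgb.Dataset internally
bst1 <- lightgbm(data = train$data, label = train$label,
                 num_leaves = 4, learning_rate = 1, nrounds = 2,
                 objective = "binary")

# style 2: a pre-built lgb.Dataset; the label is already attached,
# so lgb.is.Dataset() is TRUE and no second Dataset is constructed
dtrain <- lgb.Dataset(data = train$data, label = train$label)
bst2 <- lightgbm(data = dtrain, num_leaves = 4, learning_rate = 1,
                 nrounds = 2, objective = "binary")
```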
R-package/README.md
@@ -3,7 +3,15 @@ LightGBM R Package
 Installation
 ------------
 
 ```
 cd R-package
 R CMD INSTALL --build .
 ```
+
+For Windows users, you may need to run the command prompt as administrator.
+
+Examples
+--------
+
+* Please visit [demo](demo).
\ No newline at end of file
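Once the package is installed, the demo scripts can also be listed and launched from an R session with base R's `demo()` function; a minimal sketch (assuming the demo files ship in the installed package, as the new demo/00Index below suggests):

```r
# list the demos bundled with the lightgbm package
demo(package = "lightgbm")

# run one of them, e.g. the basic feature walkthrough
demo("basic_walkthrough", package = "lightgbm")
```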
R-package/demo/00Index (new file)
basic_walkthrough       Basic feature walkthrough
boost_from_prediction   Boosting from existing prediction
early_stopping          Early stopping in training
R-package/demo/README.md (new file)
LightGBM R examples
====
* [Basic walkthrough of wrappers](basic_walkthrough.R)
* [Boosting from existing prediction](boost_from_prediction.R)
* [Early Stopping](early_stopping.R)
R-package/demo/basic_walkthrough.R (new file)
require(lightgbm)
require(methods)
# we load in the agaricus dataset
# In this example, we are aiming to predict whether a mushroom is edible
data(agaricus.train, package='lightgbm')
data(agaricus.test, package='lightgbm')
train <- agaricus.train
test <- agaricus.test
# the loaded data is stored as a sparse matrix, and the label is a numeric vector in {0,1}
class(train$label)
class(train$data)
#-------------Basic Training using lightgbm-----------------
# this is the basic usage of lightgbm: you can put a matrix in the data field
# note: we are putting in a sparse matrix here; lightgbm naturally handles sparse input
# use a sparse matrix when your features are sparse (e.g. when you are using one-hot encoded vectors)
print("Training lightgbm with sparseMatrix")
bst <- lightgbm(data = train$data, label = train$label, num_leaves = 4, learning_rate = 1, nrounds = 2,
objective = "binary")
# alternatively, you can put in a dense matrix, i.e. a basic R matrix
print("Training lightgbm with Matrix")
bst <- lightgbm(data = as.matrix(train$data), label = train$label, num_leaves = 4, learning_rate = 1, nrounds = 2,
objective = "binary")
# you can also put in an lgb.Dataset object, which stores the label, data and other metadata needed for advanced features
print("Training lightgbm with lgb.Dataset")
dtrain <- lgb.Dataset(data = train$data, label = train$label)
bst <- lightgbm(data = dtrain, num_leaves = 4, learning_rate = 1, nrounds = 2,
objective = "binary")
# Verbose = 0,1,2
print("Train lightgbm with verbose 0, no message")
bst <- lightgbm(data = dtrain, num_leaves = 4, learning_rate = 1, nrounds = 2,
objective = "binary", verbose = 0)
print("Train lightgbm with verbose 1, print evaluation metric")
bst <- lightgbm(data = dtrain, num_leaves = 4, learning_rate = 1, nrounds = 2,
nthread = 2, objective = "binary", verbose = 1)
print("Train lightgbm with verbose 2, also print information about tree")
bst <- lightgbm(data = dtrain, num_leaves = 4, learning_rate = 1, nrounds = 2,
nthread = 2, objective = "binary", verbose = 2)
# you can also specify data as a file path to a LibSVM/TSV/CSV format input
# since we do not have this file with us, the following line is just for illustration
# bst <- lightgbm(data = 'agaricus.train.svm', num_leaves = 4, learning_rate = 1, nrounds = 2,objective = "binary")
#--------------------basic prediction using lightgbm--------------
# you can do prediction using the following line
# you can put in Matrix, sparseMatrix, or lgb.Dataset
pred <- predict(bst, test$data)
err <- mean(as.numeric(pred > 0.5) != test$label)
print(paste("test-error=", err))
#-------------------save and load models-------------------------
# save model to binary local file
lgb.save(bst, "lightgbm.model")
# load binary model to R
bst2 <- lgb.load("lightgbm.model")
pred2 <- predict(bst2, test$data)
# pred2 should be identical to pred
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
#----------------Advanced features --------------
# to use advanced features, we need to put data in lgb.Dataset
dtrain <- lgb.Dataset(data = train$data, label=train$label, free_raw_data=FALSE)
dtest <- lgb.Dataset(data = test$data, label=test$label, free_raw_data=FALSE)
#---------------Using valids----------------
# valids is a list of lgb.Dataset objects, each tagged with a name
valids <- list(train=dtrain, test=dtest)
# to train with valids, use lgb.train, which contains more advanced features
# valids allows us to monitor the evaluation result on all data in the list
print("Train lightgbm using lgb.train with valids")
bst <- lgb.train(data=dtrain, num_leaves=4, learning_rate=1, nrounds=2, valids=valids,
nthread = 2, objective = "binary")
# we can change evaluation metrics, or use multiple evaluation metrics
print("train lightgbm using lgb.train with valids, watch logloss and error")
bst <- lgb.train(data=dtrain, num_leaves=4, learning_rate=1, nrounds=2, valids=valids,
eval = c("binary_error","binary_logloss"),
nthread = 2, objective = "binary")
# lgb.Dataset can also be saved using lgb.Dataset.save
lgb.Dataset.save(dtrain, "dtrain.buffer")
# to load it in, simply call lgb.Dataset
dtrain2 <- lgb.Dataset("dtrain.buffer")
bst <- lgb.train(data=dtrain2, num_leaves=4, learning_rate=1, nrounds=2, valids=valids,
nthread = 2, objective = "binary")
# information can be extracted from lgb.Dataset using getinfo
label <- getinfo(dtest, "label")
pred <- predict(bst, test$data)
err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
print(paste("test-error=", err))
R-package/demo/boost_from_prediction.R (new file)
require(lightgbm)
require(methods)
# load in the agaricus dataset
data(agaricus.train, package='lightgbm')
data(agaricus.test, package='lightgbm')
dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
valids <- list(eval = dtest, train = dtrain)
###
# advanced: start from an initial base prediction
#
print('start running example to start from an initial prediction')
# train lightgbm for 1 round
param <- list(num_leaves=4, learning_rate=1, nthread = 2, silent=1, objective='binary')
bst <- lgb.train(param, dtrain, 1, valids=valids)
# Note: we need the margin value instead of the transformed prediction for init_score
ptrain <- predict(bst, agaricus.train$data, rawscore=TRUE)
ptest <- predict(bst, agaricus.test$data, rawscore=TRUE)
# set the init_score property of dtrain and dtest
# base margin is the base prediction we will boost from
setinfo(dtrain, "init_score", ptrain)
setinfo(dtest, "init_score", ptest)
print('this is the result of boosting from the initial prediction')
bst <- lgb.train(params = param, data = dtrain, nrounds = 5, valids = valids)
R-package/demo/early_stopping.R (new file)
require(lightgbm)
require(methods)
# load in the agaricus dataset
data(agaricus.train, package='lightgbm')
data(agaricus.test, package='lightgbm')
dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)
# note: for a customized objective function, we leave objective as default
# note: what we are getting is the margin value in prediction
# you must know what you are doing
param <- list(num_leaves=4, learning_rate=1)
valids <- list(eval = dtest)
num_round <- 20
# user-defined objective function: given predictions, return the gradient and second-order gradient
# this is log-likelihood loss
logregobj <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
preds <- 1/(1 + exp(-preds))
grad <- preds - labels
hess <- preds * (1 - preds)
return(list(grad = grad, hess = hess))
}
# user-defined evaluation function: returns a triple of metric name, result, and higher_better
# NOTE: when using a customized loss function, the default prediction value is the margin
# this may make the built-in evaluation metrics not function properly
# for example, with logistic loss, the prediction is the score before the logistic transformation
# the built-in evaluation error assumes the input is after the logistic transformation
# keep this in mind when you use the customization, as you may need to write a customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- getinfo(dtrain, "label")
err <- as.numeric(sum(labels != (preds > 0)))/length(labels)
return(list(name = "error", value = err, higher_better=FALSE))
}
print('start training with early stopping setting')
bst <- lgb.train(param, dtrain, num_round, valids,
objective = logregobj, eval = evalerror,
early_stopping_round = 3)
README.md
@@ -16,10 +16,12 @@ For more details, please refer to [Features](https://github.com/Microsoft/LightG
 News
 ----
+01/08/2017 : Release [**R-package**](./R-package) beta version, welcome to have a try and provide feedback.
+
 12/05/2016 : **Categorical Features as input directly** (without one-hot coding). Experiment on [Expo data](http://stat-computing.org/dataexpo/2009/) shows about 8x speed-up with the same accuracy compared with one-hot coding (refer to [categorical log](https://github.com/guolinke/boosting_tree_benchmarks/blob/master/lightgbm/lightgbm_dataexpo_speed.log) and [one-hot log](https://github.com/guolinke/boosting_tree_benchmarks/blob/master/lightgbm/lightgbm_dataexpo_onehot_speed.log)).
 For the setting details, please refer to [IO Parameters](./docs/Parameters.md#io-parameters).
-12/02/2016 : Release [**python-package**](./python-package) beta version, welcome to have a try and provide issues and feedback.
+12/02/2016 : Release [**python-package**](./python-package) beta version, welcome to have a try and provide feedback.
 
 Get Started And Documents
 -------------------------
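The categorical-feature support highlighted in the news also surfaces in this commit's R changes: the lgb.train hunk above forwards a `categorical_feature` argument to the Dataset via `data$set_categorical_feature()`. A hedged sketch of how that argument could be used (the column indices and the exact argument placement are assumptions for illustration, not confirmed by this diff):

```r
library(lightgbm)
data(agaricus.train, package = "lightgbm")
train <- agaricus.train

dtrain <- lgb.Dataset(data = train$data, label = train$label)
param <- list(objective = "binary", num_leaves = 4, learning_rate = 1)

# declare columns 1 and 2 as categorical (indices chosen purely for illustration);
# LightGBM can then split on raw category values instead of one-hot encodings
bst <- lgb.train(params = param, data = dtrain, nrounds = 2,
                 categorical_feature = c(1, 2))
```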