[R-package] Add lgb.convert from smart matrix building (#561)

* Add lgb.prepare * Update roxygen to 6.0.1 and add lgb.prepare * Add prepared rules. * Add recommendation when needing rules for transformation.

[R-package] Add lgb.convert from smart matrix building (#561)
* Add lgb.prepare * Update roxygen to 6.0.1 and add lgb.prepare * Add prepared rules. * Add recommendation when needing rules for transformation.
deaa5cda · Laurae · Guolin Ke · 86d3de78 · deaa5cda · deaa5cda
Commit deaa5cda authored May 28, 2017 by Laurae Committed by Guolin Ke May 28, 2017
17 changed files
--- a/R-package/man/lgb.interprete.Rd
+++ b/R-package/man/lgb.interprete.Rd
@@ -48,4 +48,3 @@ tree_interpretation <- lgb.interprete(model, test$data, 1:5)
 }

 }
-
--- a/R-package/man/lgb.load.Rd
+++ b/R-package/man/lgb.load.Rd
@@ -44,4 +44,3 @@ load_booster_from_str <- lgb.load(model_str = model_string)
 }

 }
-
--- a/R-package/man/lgb.model.dt.tree.Rd
+++ b/R-package/man/lgb.model.dt.tree.Rd
@@ -52,4 +52,3 @@ tree_dt <- lgb.model.dt.tree(model)
 }

 }
-
--- a/R-package/man/lgb.plot.importance.Rd
+++ b/R-package/man/lgb.plot.importance.Rd
@@ -46,4 +46,3 @@ lgb.plot.importance(tree_imp, top_n = 10, measure = "Gain")
 }

 }
-
--- a/R-package/man/lgb.plot.interpretation.Rd
+++ b/R-package/man/lgb.plot.interpretation.Rd
@@ -51,4 +51,3 @@ lgb.plot.interpretation(tree_interpretation[[1]], top_n = 10)
 }

 }
-
--- a/R-package/man/lgb.prepare.Rd
+++ b/R-package/man/lgb.prepare.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lgb.prepare.R
+\name{lgb.prepare}
+\alias{lgb.prepare}
+\title{Data preparator for LightGBM datasets (integer)}
+\usage{
+lgb.prepare(data)
+}
+\arguments{
+\item{data}{A data.frame or data.table to prepare.}
+}
+\value{
+The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
+}
+\description{
+Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). Please use \code{lgb.prepare_rules} if you want to apply this transformation to other datasets.
+}
+\examples{
+\dontrun{
+  library(lightgbm)
+  data(iris)
+  
+  str(iris)
+  # 'data.frame':	150 obs. of  5 variables:
+  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
+  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
+  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
+  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
+  # $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
+  
+  str(lgb.prepare(data = iris)) # Convert all factors/chars to integer
+  # 'data.frame':	150 obs. of  5 variables:
+  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
+  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
+  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
+  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
+  # $ Species     : int  1 1 1 1 1 1 1 1 1 1 ...
+  
+  # When lightgbm package is installed, and you do not want to load it
+  # You can still use the function!
+  lgb.unloader()
+  str(lightgbm::lgb.prepare(data = iris))
+  # 'data.frame':	150 obs. of  5 variables:
+  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
+  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
+  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
+  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
+  # $ Species     : int  1 1 1 1 1 1 1 1 1 1 ...
+  
+}
+
+}
--- a/R-package/man/lgb.prepare2.Rd
+++ b/R-package/man/lgb.prepare2.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lgb.prepare2.R
+\name{lgb.prepare2}
+\alias{lgb.prepare2}
+\title{Data preparator for LightGBM datasets (numeric)}
+\usage{
+lgb.prepare2(data)
+}
+\arguments{
+\item{data}{A data.frame or data.table to prepare.}
+}
+\value{
+The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
+}
+\description{
+Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric (not integer). This is useful if you have a specific need for a numeric dataset instead of an integer dataset. There are programs which do not support integer-only input. Consider this a fallback solution if you cannot use integers. Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets.
+}
+\examples{
+\dontrun{
+  library(lightgbm)
+  data(iris)
+  
+  str(iris)
+  # 'data.frame':	150 obs. of  5 variables:
+  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
+  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
+  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
+  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
+  # $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
+  
+  str(lgb.prepare2(data = iris)) # Convert all factors/chars to numeric
+  # 'data.frame':	150 obs. of  5 variables:
+  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
+  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
+  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
+  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
+  # $ Species     : num  1 1 1 1 1 1 1 1 1 1 ...
+  
+  # When lightgbm package is installed, and you do not want to load it
+  # You can still use the function!
+  lgb.unloader()
+  str(lightgbm::lgb.prepare2(data = iris))
+  # 'data.frame':	150 obs. of  5 variables:
+  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
+  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
+  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
+  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
+  # $ Species     : num  1 1 1 1 1 1 1 1 1 1 ...
+}
+
+}
--- a/R-package/man/lgb.prepare_rules.Rd
+++ b/R-package/man/lgb.prepare_rules.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lgb.prepare_rules.R
+\name{lgb.prepare_rules}
+\alias{lgb.prepare_rules}
+\title{Data preparator for LightGBM datasets with rules (integer)}
+\usage{
+lgb.prepare_rules(data, rules = NULL)
+}
+\arguments{
+\item{data}{A data.frame or data.table to prepare.}
+
+\item{rules}{A set of rules from the data preparator, if already used.}
+}
+\value{
+A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
+}
+\description{
+Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). In addition, keeps the rules created so you can convert other datasets using this converter.
+}
+\examples{
+\dontrun{
+  library(lightgbm)
+  data(iris)
+  
+  str(iris)
+  # 'data.frame':	150 obs. of  5 variables:
+  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
+  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
+  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
+  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
+  # $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
+  
+  new_iris <- lgb.prepare_rules(data = iris) # Autoconverter
+  str(new_iris$data)
+  # 'data.frame':	150 obs. of  5 variables:
+  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
+  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
+  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
+  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
+  # $ Species     : int  1 1 1 1 1 1 1 1 1 1 ...
+  
+  data(iris) # Reload the original iris dataset
+  iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA)
+  # Warning message:
+  # In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L,  :
+  #   invalid factor level, NA generated
+  
+  # Use conversion using known rules
+  # Unknown factors become 0, excellent for sparse datasets
+  newer_iris <- lgb.prepare_rules(data = iris, rules = new_iris$rules)
+  
+  # Unknown factor is now zero, perfect for sparse datasets
+  newer_iris$data[1, ] # Species became 0 as it is an unknown factor
+  #   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+  # 1          5.1         3.5          1.4         0.2       0
+  
+  newer_iris$data[1, 5] <- 1 # Put back real initial value
+  
+  # Is the newly created dataset equal? YES!
+  all.equal(new_iris$data, newer_iris$data)
+  # [1] TRUE
+  
+  # Can we test our own rules?
+  data(iris) # Reload the original iris dataset
+  
+  # We remapped values differently
+  personal_rules <- list(Species = c("setosa" = 3L,
+                                     "versicolor" = 2L,
+                                     "virginica" = 1L))
+  newest_iris <- lgb.prepare_rules(data = iris, rules = personal_rules)
+  str(newest_iris$data) # SUCCESS!
+  # 'data.frame':	150 obs. of  5 variables:
+  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
+  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
+  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
+  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
+  # $ Species     : int  3 3 3 3 3 3 3 3 3 3 ...
+  
+}
+
+}
--- a/R-package/man/lgb.prepare_rules2.Rd
+++ b/R-package/man/lgb.prepare_rules2.Rd
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/lgb.prepare_rules2.R
+\name{lgb.prepare_rules2}
+\alias{lgb.prepare_rules2}
+\title{Data preparator for LightGBM datasets with rules (numeric)}
+\usage{
+lgb.prepare_rules2(data, rules = NULL)
+}
+\arguments{
+\item{data}{A data.frame or data.table to prepare.}
+
+\item{rules}{A set of rules from the data preparator, if already used.}
+}
+\value{
+A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
+}
+\description{
+Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric. In addition, keeps the rules created so you can convert other datasets using this converter. This is useful if you have a specific need for a numeric dataset instead of an integer dataset. There are programs which do not support integer-only input. Consider this a fallback solution if you cannot use integers.
+}
+\examples{
+\dontrun{
+  library(lightgbm)
+  data(iris)
+  
+  str(iris)
+  # 'data.frame':	150 obs. of  5 variables:
+  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
+  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
+  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
+  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
+  # $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
+  
+  new_iris <- lgb.prepare_rules2(data = iris) # Autoconverter
+  str(new_iris$data)
+  # 'data.frame':	150 obs. of  5 variables:
+  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
+  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
+  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
+  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
+  # $ Species     : num  1 1 1 1 1 1 1 1 1 1 ...
+  
+  data(iris) # Reload the original iris dataset
+  iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA)
+  # Warning message:
+  # In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L,  :
+  #   invalid factor level, NA generated
+  
+  # Use conversion using known rules
+  # Unknown factors become 0, excellent for sparse datasets
+  newer_iris <- lgb.prepare_rules2(data = iris, rules = new_iris$rules)
+  
+  # Unknown factor is now zero, perfect for sparse datasets
+  newer_iris$data[1, ] # Species became 0 as it is an unknown factor
+  #   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+  # 1          5.1         3.5          1.4         0.2       0
+  
+  newer_iris$data[1, 5] <- 1 # Put back real initial value
+  
+  # Is the newly created dataset equal? YES!
+  all.equal(new_iris$data, newer_iris$data)
+  # [1] TRUE
+  
+  # Can we test our own rules?
+  data(iris) # Reload the original iris dataset
+  
+  # We remapped values differently
+  personal_rules <- list(Species = c("setosa" = 3,
+                                     "versicolor" = 2,
+                                     "virginica" = 1))
+  newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules)
+  str(newest_iris$data) # SUCCESS!
+  # 'data.frame':	150 obs. of  5 variables:
+  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
+  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
+  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
+  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
+  # $ Species     : num  3 3 3 3 3 3 3 3 3 3 ...
+  
+}
+
+}
--- a/R-package/man/lgb.save.Rd
+++ b/R-package/man/lgb.save.Rd
@@ -41,4 +41,3 @@ lgb.save(model, "model.txt")
 }

 }
-
--- a/R-package/man/lgb.train.Rd
+++ b/R-package/man/lgb.train.Rd
@@ -182,4 +182,3 @@ model <- lgb.train(params,
 }

 }
-
--- a/R-package/man/lgb.unloader.Rd
+++ b/R-package/man/lgb.unloader.Rd
@@ -46,4 +46,3 @@ library(lightgbm)
 }

 }
-
--- a/R-package/man/predict.lgb.Booster.Rd
+++ b/R-package/man/predict.lgb.Booster.Rd
@@ -59,4 +59,3 @@ preds <- predict(model, test$data)
 }

 }
-
--- a/R-package/man/readRDS.lgb.Booster.Rd
+++ b/R-package/man/readRDS.lgb.Booster.Rd
@@ -40,4 +40,3 @@ Attemps to load a model using RDS.
 }

 }
-
--- a/R-package/man/saveRDS.lgb.Booster.Rd
+++ b/R-package/man/saveRDS.lgb.Booster.Rd
@@ -50,4 +50,3 @@ Attemps to save a model using RDS. Has an additional parameter (\code{raw}) whic
 }

 }
-
--- a/R-package/man/setinfo.Rd
+++ b/R-package/man/setinfo.Rd
@@ -50,4 +50,3 @@ stopifnot(all.equal(labels2, 1 - labels))
 }

 }
-
--- a/R-package/man/slice.Rd
+++ b/R-package/man/slice.Rd
@@ -35,4 +35,3 @@ labels <- lightgbm::getinfo(dsub, "label")
 }

 }
-