Swap integers/numerics naming (#563)

* Fix https://github.com/Microsoft/LightGBM/pull/561
* GitHub local broke, uploading on browser (R-package)
* GitHub local broke, uploading on browser (R-package)

Swap integers/numerics naming (#563)
* Fix https://github.com/Microsoft/LightGBM/pull/561
* GitHub local broke, uploading on browser (R-package)
* GitHub local broke, uploading on browser (R-package)
2cca8283 · Laurae · Guolin Ke · 7517eefa · 2cca8283 · 2cca8283
Commit 2cca8283 authored May 29, 2017 by Laurae Committed by Guolin Ke May 29, 2017
8 changed files
--- a/R-package/R/lgb.prepare.R
+++ b/R-package/R/lgb.prepare.R
-#' Data preparator for LightGBM datasets (integer)
+#' Data preparator for LightGBM datasets (numeric)
 #'
-#' Attempts to prepare a clean dataset to prepare to put in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). Please use \code{lgb.prepare_rules} if you want to apply this transformation to other datasets.
+#' Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric (double precision, not integer). Please use \code{lgb.prepare_rules} if you want to apply this transformation to other datasets.
 #' 
 #' @param data A data.frame or data.table to prepare.
 #' 
 #' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
 #' 
 #' @examples
 #' \dontrun{
 #'   library(lightgbm)
 #'   data(iris)
 #'   
 #'   str(iris)
 #'   # 'data.frame':	150 obs. of  5 variables:
 #'   # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #'   # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #'   # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #'   # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 #'   # $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
 #'   
-#'   str(lgb.prepare(data = iris)) # Convert all factors/chars to integer
+#'   str(lgb.prepare(data = iris)) # Convert all factors/chars to numeric
 #'   # 'data.frame':	150 obs. of  5 variables:
 #'   # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #'   # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #'   # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #'   # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-#'   # $ Species     : int  1 1 1 1 1 1 1 1 1 1 ...
+#'   # $ Species     : num  1 1 1 1 1 1 1 1 1 1 ...
 #'   
 #'   # When lightgbm package is installed, and you do not want to load it
 #'   # You can still use the function!
 #'   lgb.unloader()
 #'   str(lightgbm::lgb.prepare(data = iris))
 #'   # 'data.frame':	150 obs. of  5 variables:
 #'   # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #'   # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #'   # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #'   # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-#'   # $ Species     : int  1 1 1 1 1 1 1 1 1 1 ...
+#'   # $ Species     : num  1 1 1 1 1 1 1 1 1 1 ...
-#'   
+#' }
-#' }
+#' 
-#' 
+#' @export
-#' @export
+lgb.prepare <- function(data) {
-lgb.prepare <- function(data) {
+  # data.table not behaving like data.frame
-  # data.table not behaving like data.frame
+  if ("data.table" %in% class(data)) {
-  if ("data.table" %in% class(data)) {
+    # Get data classes
-    # Get data classes
+    list_classes <- sapply(data, class)
-    list_classes <- sapply(data, class)
+    # Convert characters to factors only (we can change them to numeric after)
-    # Convert characters to factors only (we can change them to numeric after)
+    is_char <- which(list_classes == "character")
-    is_char <- which(list_classes == "character")
+    if (length(is_char) > 0) {
-    if (length(is_char) > 0) {
+      data[, (is_char) := lapply(.SD, function(x) {as.numeric(as.factor(x))}), .SDcols = is_char]
-      data[, (is_char) := lapply(.SD, function(x) {as.integer(as.factor(x))}), .SDcols = is_char]
+    }
-    }
+    # Convert factors to numeric (integer is more efficient actually)
-    # Convert factors to numeric (integer is more efficient actually)
+    is_fact <- c(which(list_classes == "factor"), is_char)
-    is_fact <- c(which(list_classes == "factor"), is_char)
+    if (length(is_fact) > 0) {
-    if (length(is_fact) > 0) {
+      data[, (is_fact) := lapply(.SD, function(x) {as.numeric(x)}), .SDcols = is_fact]
-      data[, (is_fact) := lapply(.SD, function(x) {as.integer(x)}), .SDcols = is_fact]
+    }
-    }
+  } else {
-  } else {
+    # Default routine (data.frame)
-    # Default routine (data.frame)
+    if ("data.frame" %in% class(data)) {
-    if ("data.frame" %in% class(data)) {
+      # Get data classes
-      # Get data classes
+      list_classes <- sapply(data, class)
-      list_classes <- sapply(data, class)
+      # Convert characters to factors to numeric (integer is more efficient actually)
-      # Convert characters to factors to numeric (integer is more efficient actually)
+      is_char <- which(list_classes == "character")
-      is_char <- which(list_classes == "character")
+      if (length(is_char) > 0) {
-      if (length(is_char) > 0) {
+        data[is_char] <- lapply(data[is_char], function(x) {as.numeric(as.factor(x))})
-        data[is_char] <- lapply(data[is_char], function(x) {as.integer(as.factor(x))})
+      }
-      }
+      # Convert factors to numeric (integer is more efficient actually)
-      # Convert factors to numeric (integer is more efficient actually)
+      is_fact <- which(list_classes == "factor")
-      is_fact <- which(list_classes == "factor")
+      if (length(is_fact) > 0) {
-      if (length(is_fact) > 0) {
+        data[is_fact] <- lapply(data[is_fact], function(x) {as.numeric(x)})
-        data[is_fact] <- lapply(data[is_fact], function(x) {as.integer(x)})
+      }
-      }
+    } else {
-    } else {
+      # What do you think you are doing here? Throw error.
-      # What do you think you are doing here? Throw error.
+      stop("lgb.prepare2: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
-      stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
+    }
-    }
+  }
-  }
+  return(data)
-  return(data)
+}
-}
--- a/R-package/R/lgb.prepare2.R
+++ b/R-package/R/lgb.prepare2.R
-#' Data preparator for LightGBM datasets (numeric)
+#' Data preparator for LightGBM datasets (integer)
 #'
-#' Attempts to prepare a clean dataset to prepare to put in a lgb.Dataset. Factors and characters are converted to numeric without integers. This is useful if you have a specific need for numeric dataset instead of integer dataset. There are programs which do not support integer-only input. Consider this is a fallback solution if you cannot use integers. Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets.
+#' Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets. This is useful if you have a specific need for an integer dataset instead of a numeric dataset. Note that there are programs which do not support integer-only input. Consider this a technique that halves memory usage but is dangerous, especially for LightGBM.
 #' 
 #' @param data A data.frame or data.table to prepare.
 #' 
 #' @return The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
 #' 
 #' @examples
 #' \dontrun{
 #'   library(lightgbm)
 #'   data(iris)
 #'   
 #'   str(iris)
 #'   # 'data.frame':	150 obs. of  5 variables:
 #'   # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #'   # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #'   # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #'   # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 #'   # $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
 #'   
-#'   str(lgb.prepare2(data = iris)) # Convert all factors/chars to numeric
+#'   str(lgb.prepare2(data = iris)) # Convert all factors/chars to integer
 #'   # 'data.frame':	150 obs. of  5 variables:
 #'   # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #'   # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #'   # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #'   # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-#'   # $ Species     : num  1 1 1 1 1 1 1 1 1 1 ...
+#'   # $ Species     : int  1 1 1 1 1 1 1 1 1 1 ...
 #'   
 #'   # When lightgbm package is installed, and you do not want to load it
 #'   # You can still use the function!
 #'   lgb.unloader()
 #'   str(lightgbm::lgb.prepare2(data = iris))
 #'   # 'data.frame':	150 obs. of  5 variables:
 #'   # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #'   # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #'   # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #'   # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-#'   # $ Species     : num  1 1 1 1 1 1 1 1 1 1 ...
+#'   # $ Species     : int  1 1 1 1 1 1 1 1 1 1 ...
-#' }
+#'   
-#' 
+#' }
-#' @export
+#' 
-lgb.prepare2 <- function(data) {
+#' @export
+lgb.prepare2 <- function(data) {
-  # data.table not behaving like data.frame
-  if ("data.table" %in% class(data)) {
+  # data.table not behaving like data.frame
+  if ("data.table" %in% class(data)) {
-    # Get data classes
-    list_classes <- sapply(data, class)
+    # Get data classes
+    list_classes <- sapply(data, class)
-    # Convert characters to factors only (we can change them to numeric after)
-    is_char <- which(list_classes == "character")
+    # Convert characters to factors only (we can change them to numeric after)
-    if (length(is_char) > 0) {
+    is_char <- which(list_classes == "character")
-      data[, (is_char) := lapply(.SD, function(x) {as.numeric(as.factor(x))}), .SDcols = is_char]
+    if (length(is_char) > 0) {
-    }
+      data[, (is_char) := lapply(.SD, function(x) {as.integer(as.factor(x))}), .SDcols = is_char]
+    }
-    # Convert factors to numeric (integer is more efficient actually)
-    is_fact <- c(which(list_classes == "factor"), is_char)
+    # Convert factors to numeric (integer is more efficient actually)
-    if (length(is_fact) > 0) {
+    is_fact <- c(which(list_classes == "factor"), is_char)
-      data[, (is_fact) := lapply(.SD, function(x) {as.numeric(x)}), .SDcols = is_fact]
+    if (length(is_fact) > 0) {
-    }
+      data[, (is_fact) := lapply(.SD, function(x) {as.integer(x)}), .SDcols = is_fact]
+    }
-  } else {
+  } else {
-    # Default routine (data.frame)
-    if ("data.frame" %in% class(data)) {
+    # Default routine (data.frame)
+    if ("data.frame" %in% class(data)) {
-      # Get data classes
-      list_classes <- sapply(data, class)
+      # Get data classes
+      list_classes <- sapply(data, class)
-      # Convert characters to factors to numeric (integer is more efficient actually)
-      is_char <- which(list_classes == "character")
+      # Convert characters to factors to numeric (integer is more efficient actually)
-      if (length(is_char) > 0) {
+      is_char <- which(list_classes == "character")
-        data[is_char] <- lapply(data[is_char], function(x) {as.numeric(as.factor(x))})
+      if (length(is_char) > 0) {
-      }
+        data[is_char] <- lapply(data[is_char], function(x) {as.integer(as.factor(x))})
+      }
-      # Convert factors to numeric (integer is more efficient actually)
-      is_fact <- which(list_classes == "factor")
+      # Convert factors to numeric (integer is more efficient actually)
-      if (length(is_fact) > 0) {
+      is_fact <- which(list_classes == "factor")
-        data[is_fact] <- lapply(data[is_fact], function(x) {as.numeric(x)})
+      if (length(is_fact) > 0) {
-      }
+        data[is_fact] <- lapply(data[is_fact], function(x) {as.integer(x)})
+      }
-    } else {
+    } else {
-      # What do you think you are doing here? Throw error.
-      stop("lgb.prepare2: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
+      # What do you think you are doing here? Throw error.
+      stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
-    }
+    }
-  }
+  }
-  return(data)
+  return(data)
-}
+}
--- a/R-package/R/lgb.prepare_rules.R
+++ b/R-package/R/lgb.prepare_rules.R
-#' Data preparator for LightGBM datasets with rules (integer)
+#' Data preparator for LightGBM datasets with rules (numeric)
 #'
-#' Attempts to prepare a clean dataset to prepare to put in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). In addition, keeps rules created so you can convert other datasets using this converter.
+#' Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric. In addition, keeps the rules it created so you can convert other datasets using this converter.
 #' 
 #' @param data A data.frame or data.table to prepare.
 #' @param rules A set of rules from the data preparator, if already used.
 #' 
 #' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
 #' 
 #' @examples
 #' \dontrun{
 #'   library(lightgbm)
 #'   data(iris)
 #'   
 #'   str(iris)
 #'   # 'data.frame':	150 obs. of  5 variables:
 #'   # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #'   # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #'   # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #'   # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 #'   # $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
 #'   
 #'   new_iris <- lgb.prepare_rules(data = iris) # Autoconverter
 #'   str(new_iris$data)
 #'   # 'data.frame':	150 obs. of  5 variables:
 #'   # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #'   # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #'   # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #'   # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-#'   # $ Species     : int  1 1 1 1 1 1 1 1 1 1 ...
+#'   # $ Species     : num  1 1 1 1 1 1 1 1 1 1 ...
 #'   
 #'   data(iris) # Erase iris dataset
 #'   iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA)
 #'   # Warning message:
 #'   In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L,  :
 #'     invalid factor level, NA generated
 #'   
 #'   # Use conversion using known rules
 #'   # Unknown factors become 0, excellent for sparse datasets
 #'   newer_iris <- lgb.prepare_rules(data = iris, rules = new_iris$rules)
 #'   
 #'   # Unknown factor is now zero, perfect for sparse datasets
 #'   newer_iris$data[1, ] # Species became 0 as it is an unknown factor
 #'   #   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
 #'   # 1          5.1         3.5          1.4         0.2       0
 #'   
 #'   newer_iris$data[1, 5] <- 1 # Put back real initial value
 #'   
 #'   # Is the newly created dataset equal? YES!
 #'   all.equal(new_iris$data, newer_iris$data)
 #'   # [1] TRUE
 #'   
 #'   # Can we test our own rules?
 #'   data(iris) # Erase iris dataset
 #'   
 #'   # We remapped values differently
-#'   personal_rules <- list(Species = c("setosa" = 3L,
+#'   personal_rules <- list(Species = c("setosa" = 3,
-#'                                      "versicolor" = 2L,
+#'                                      "versicolor" = 2,
-#'                                      "virginica" = 1L))
+#'                                      "virginica" = 1))
 #'   newest_iris <- lgb.prepare_rules(data = iris, rules = personal_rules)
 #'   str(newest_iris$data) # SUCCESS!
 #'   # 'data.frame':	150 obs. of  5 variables:
 #'   # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #'   # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #'   # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #'   # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-#'   # $ Species     : int  3 3 3 3 3 3 3 3 3 3 ...
+#'   # $ Species     : num  3 3 3 3 3 3 3 3 3 3 ...
 #'   
 #' }
 #' 
 #' @export
 lgb.prepare_rules <- function(data, rules = NULL) {
  # data.table not behaving like data.frame
  if ("data.table" %in% class(data)) {
    # Must use existing rules
    if (!is.null(rules)) {
      # Loop through rules
      for (i in names(rules)) {
        set(data, j = i, value = unname(rules[[i]][data[[i]]]))
-        data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer
+        data[[i]][is.na(data[[i]])] <- 0 # Overwrite NAs by 0s
      }
    } else {
      # Get data classes
      list_classes <- sapply(data, class)
      # Map characters/factors
      is_fix <- which(list_classes %in% c("character", "factor"))
      ruleset <- list()
      # Need to create rules?
      if (length(is_fix) > 0) {
        # Go through all characters/factors
        for (i in is_fix) {
          # Store column elsewhere
          mini_data <- data[[i]]
          # Get unique values
          if (class(mini_data) == "factor") {
            mini_unique <- levels(mini_data) # Factor
-            mini_numeric <- 1:length(mini_unique) # Respect ordinal if needed
+            mini_numeric <- numeric(length(mini_unique))
-          } else {
+            mini_numeric[1:length(mini_unique)] <- 1:length(mini_unique) # Respect ordinal if needed
-            mini_unique <- as.factor(unique(mini_data)) # Character
+          } else {
-            mini_numeric <- as.integer(mini_unique) # No respect of ordinality
+            mini_unique <- as.factor(unique(mini_data)) # Character
-          }
+            mini_numeric <- as.numeric(mini_unique) # No respect of ordinality
+          }
-          # Create ruleset
-          indexed <- colnames(data)[i] # Index value
+          # Create ruleset
-          ruleset[[indexed]] <- mini_numeric # Numeric content
+          indexed <- colnames(data)[i] # Index value
-          names(ruleset[[indexed]]) <- mini_unique # Character equivalent
+          ruleset[[indexed]] <- mini_numeric # Numeric content
+          names(ruleset[[indexed]]) <- mini_unique # Character equivalent
-          # Apply to real data column
-          set(data, j = i, value = unname(ruleset[[indexed]][mini_data]))
+          # Apply to real data column
+          set(data, j = i, value = unname(ruleset[[indexed]][mini_data]))
-        }
+        }
-      }
+      }
-    }
+    }
-  } else {
+  } else {
-    # Must use existing rules
-    if (!is.null(rules)) {
+    # Must use existing rules
+    if (!is.null(rules)) {
-      # Loop through rules
-      for (i in names(rules)) {
+      # Loop through rules
+      for (i in names(rules)) {
-        data[[i]] <- unname(rules[[i]][data[[i]]])
-        data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer
+        data[[i]] <- unname(rules[[i]][data[[i]]])
+        data[[i]][is.na(data[[i]])] <- 0 # Overwrite NAs by 0s
-      }
+      }
-    } else {
+    } else {
-      # Default routine (data.frame)
-      if ("data.frame" %in% class(data)) {
+      # Default routine (data.frame)
+      if ("data.frame" %in% class(data)) {
-        # Get data classes
-        list_classes <- sapply(data, class)
+        # Get data classes
+        list_classes <- sapply(data, class)
-        # Map characters/factors
-        is_fix <- which(list_classes %in% c("character", "factor"))
+        # Map characters/factors
-        ruleset <- list()
+        is_fix <- which(list_classes %in% c("character", "factor"))
+        ruleset <- list()
-        # Need to create rules?
-        if (length(is_fix) > 0) {
+        # Need to create rules?
+        if (length(is_fix) > 0) {
-          # Go through all characters/factors
-          for (i in is_fix) {
+          # Go through all characters/factors
+          for (i in is_fix) {
-            # Store column elsewhere
-            mini_data <- data[[i]]
+            # Store column elsewhere
+            mini_data <- data[[i]]
-            # Get unique values
-            if (class(mini_data) == "factor") {
+            # Get unique values
-              mini_unique <- levels(mini_data) # Factor
+            if (class(mini_data) == "factor") {
-              mini_numeric <- 1:length(mini_unique) # Respect ordinal if needed
+              mini_unique <- levels(mini_data) # Factor
-            } else {
+              mini_numeric <- numeric(length(mini_unique))
-              mini_unique <- as.factor(unique(mini_data)) # Character
+              mini_numeric[1:length(mini_unique)] <- 1:length(mini_unique) # Respect ordinal if needed
-              mini_numeric <- as.integer(mini_unique) # No respect of ordinality
+            } else {
-            }
+              mini_unique <- as.factor(unique(mini_data)) # Character
+              mini_numeric <- as.numeric(mini_unique) # No respect of ordinality
-            # Create ruleset
+            }
-            indexed <- colnames(data)[i] # Index value
-            ruleset[[indexed]] <- mini_numeric # Numeric content
+            # Create ruleset
-            names(ruleset[[indexed]]) <- mini_unique # Character equivalent
+            indexed <- colnames(data)[i] # Index value
+            ruleset[[indexed]] <- mini_numeric # Numeric content
-            # Apply to real data column
+            names(ruleset[[indexed]]) <- mini_unique # Character equivalent
-            data[[i]] <- unname(ruleset[[indexed]][mini_data])
+            # Apply to real data column
-          }
+            data[[i]] <- unname(ruleset[[indexed]][mini_data])
-        }
+          }
-      } else {
+        }
-        # What do you think you are doing here? Throw error.
+      } else {
-        stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
+        # What do you think you are doing here? Throw error.
-      }
+        stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
-    }
+      }
-  }
+    }
-  return(list(data = data, rules = ruleset))
+  }
-}
+  return(list(data = data, rules = ruleset))
+}
--- a/R-package/R/lgb.prepare_rules2.R
+++ b/R-package/R/lgb.prepare_rules2.R
-#' Data preparator for LightGBM datasets with rules (numeric)
+#' Data preparator for LightGBM datasets with rules (integer)
 #'
-#' Attempts to prepare a clean dataset to prepare to put in a lgb.Dataset. Factors and characters are converted to numeric. In addition, keeps rules created so you can convert other datasets using this converter. This is useful if you have a specific need for numeric dataset instead of integer dataset. There are programs which do not support integer-only input. Consider this is a fallback solution if you cannot use integers.
+#' Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). In addition, keeps the rules it created so you can convert other datasets using this converter. This is useful if you have a specific need for an integer dataset instead of a numeric dataset. Note that there are programs which do not support integer-only input. Consider this a technique that halves memory usage but is dangerous, especially for LightGBM.
 #' 
 #' @param data A data.frame or data.table to prepare.
 #' @param rules A set of rules from the data preparator, if already used.
 #' 
 #' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
 #' 
 #' @examples
 #' \dontrun{
 #'   library(lightgbm)
 #'   data(iris)
 #'   
 #'   str(iris)
 #'   # 'data.frame':	150 obs. of  5 variables:
 #'   # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #'   # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #'   # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #'   # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 #'   # $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
 #'   
 #'   new_iris <- lgb.prepare_rules2(data = iris) # Autoconverter
 #'   str(new_iris$data)
 #'   # 'data.frame':	150 obs. of  5 variables:
 #'   # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #'   # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #'   # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #'   # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-#'   # $ Species     : num  1 1 1 1 1 1 1 1 1 1 ...
+#'   # $ Species     : int  1 1 1 1 1 1 1 1 1 1 ...
 #'   
 #'   data(iris) # Erase iris dataset
 #'   iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA)
 #'   # Warning message:
 #'   In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L,  :
 #'     invalid factor level, NA generated
 #'   
 #'   # Use conversion using known rules
 #'   # Unknown factors become 0, excellent for sparse datasets
 #'   newer_iris <- lgb.prepare_rules2(data = iris, rules = new_iris$rules)
 #'   
 #'   # Unknown factor is now zero, perfect for sparse datasets
 #'   newer_iris$data[1, ] # Species became 0 as it is an unknown factor
 #'   #   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
 #'   # 1          5.1         3.5          1.4         0.2       0
 #'   
 #'   newer_iris$data[1, 5] <- 1 # Put back real initial value
 #'   
 #'   # Is the newly created dataset equal? YES!
 #'   all.equal(new_iris$data, newer_iris$data)
 #'   # [1] TRUE
 #'   
 #'   # Can we test our own rules?
 #'   data(iris) # Erase iris dataset
 #'   
 #'   # We remapped values differently
-#'   personal_rules <- list(Species = c("setosa" = 3,
+#'   personal_rules <- list(Species = c("setosa" = 3L,
-#'                                      "versicolor" = 2,
+#'                                      "versicolor" = 2L,
-#'                                      "virginica" = 1))
+#'                                      "virginica" = 1L))
 #'   newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules)
 #'   str(newest_iris$data) # SUCCESS!
 #'   # 'data.frame':	150 obs. of  5 variables:
 #'   # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 #'   # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 #'   # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 #'   # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-#'   # $ Species     : num  3 3 3 3 3 3 3 3 3 3 ...
+#'   # $ Species     : int  3 3 3 3 3 3 3 3 3 3 ...
 #'   
 #' }
 #' 
 #' @export
 lgb.prepare_rules2 <- function(data, rules = NULL) {
  # data.table not behaving like data.frame
  if ("data.table" %in% class(data)) {
    # Must use existing rules
    if (!is.null(rules)) {
      # Loop through rules
      for (i in names(rules)) {
        set(data, j = i, value = unname(rules[[i]][data[[i]]]))
-        data[[i]][is.na(data[[i]])] <- 0 # Overwrite NAs by 0s
+        data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer
      }
    } else {
      # Get data classes
      list_classes <- sapply(data, class)
      # Map characters/factors
      is_fix <- which(list_classes %in% c("character", "factor"))
      ruleset <- list()
      # Need to create rules?
      if (length(is_fix) > 0) {
        # Go through all characters/factors
        for (i in is_fix) {
          # Store column elsewhere
          mini_data <- data[[i]]
          # Get unique values
          if (class(mini_data) == "factor") {
            mini_unique <- levels(mini_data) # Factor
-            mini_numeric <- numeric(length(mini_unique))
+            mini_numeric <- 1:length(mini_unique) # Respect ordinal if needed
-            mini_numeric[1:length(mini_unique)] <- 1:length(mini_unique) # Respect ordinal if needed
+          } else {
-          } else {
+            mini_unique <- as.factor(unique(mini_data)) # Character
-            mini_unique <- as.factor(unique(mini_data)) # Character
+            mini_numeric <- as.integer(mini_unique) # No respect of ordinality
-            mini_numeric <- as.numeric(mini_unique) # No respect of ordinality
+          }
-          }
+          # Create ruleset
-          # Create ruleset
+          indexed <- colnames(data)[i] # Index value
-          indexed <- colnames(data)[i] # Index value
+          ruleset[[indexed]] <- mini_numeric # Numeric content
-          ruleset[[indexed]] <- mini_numeric # Numeric content
+          names(ruleset[[indexed]]) <- mini_unique # Character equivalent
-          names(ruleset[[indexed]]) <- mini_unique # Character equivalent
+          # Apply to real data column
-          # Apply to real data column
+          set(data, j = i, value = unname(ruleset[[indexed]][mini_data]))
-          set(data, j = i, value = unname(ruleset[[indexed]][mini_data]))
+        }
-        }
+      }
-      }
+    }
-    }
+  } else {
-  } else {
+    # Must use existing rules
-    # Must use existing rules
+    if (!is.null(rules)) {
-    if (!is.null(rules)) {
+      # Loop through rules
-      # Loop through rules
+      for (i in names(rules)) {
-      for (i in names(rules)) {
+        data[[i]] <- unname(rules[[i]][data[[i]]])
-        data[[i]] <- unname(rules[[i]][data[[i]]])
+        data[[i]][is.na(data[[i]])] <- 0L # Overwrite NAs by 0s as integer
-        data[[i]][is.na(data[[i]])] <- 0 # Overwrite NAs by 0s
+      }
-      }
+    } else {
-    } else {
+      # Default routine (data.frame)
-      # Default routine (data.frame)
+      if ("data.frame" %in% class(data)) {
-      if ("data.frame" %in% class(data)) {
+        # Get data classes
-        # Get data classes
+        list_classes <- sapply(data, class)
-        list_classes <- sapply(data, class)
+        # Map characters/factors
-        # Map characters/factors
+        is_fix <- which(list_classes %in% c("character", "factor"))
-        is_fix <- which(list_classes %in% c("character", "factor"))
+        ruleset <- list()
-        ruleset <- list()
+        # Need to create rules?
-        # Need to create rules?
+        if (length(is_fix) > 0) {
-        if (length(is_fix) > 0) {
+          # Go through all characters/factors
-          # Go through all characters/factors
+          for (i in is_fix) {
-          for (i in is_fix) {
+            # Store column elsewhere
-            # Store column elsewhere
+            mini_data <- data[[i]]
-            mini_data <- data[[i]]
+            # Get unique values
-            # Get unique values
+            if (class(mini_data) == "factor") {
-            if (class(mini_data) == "factor") {
+              mini_unique <- levels(mini_data) # Factor
-              mini_unique <- levels(mini_data) # Factor
+              mini_numeric <- 1:length(mini_unique) # Respect ordinal if needed
-              mini_numeric <- numeric(length(mini_unique))
+            } else {
-              mini_numeric[1:length(mini_unique)] <- 1:length(mini_unique) # Respect ordinal if needed
+              mini_unique <- as.factor(unique(mini_data)) # Character
-            } else {
+              mini_numeric <- as.integer(mini_unique) # No respect of ordinality
-              mini_unique <- as.factor(unique(mini_data)) # Character
+            }
-              mini_numeric <- as.numeric(mini_unique) # No respect of ordinality
-            }
+            # Create ruleset
+            indexed <- colnames(data)[i] # Index value
-            # Create ruleset
+            ruleset[[indexed]] <- mini_numeric # Numeric content
-            indexed <- colnames(data)[i] # Index value
+            names(ruleset[[indexed]]) <- mini_unique # Character equivalent
-            ruleset[[indexed]] <- mini_numeric # Numeric content
-            names(ruleset[[indexed]]) <- mini_unique # Character equivalent
+            # Apply to real data column
+            data[[i]] <- unname(ruleset[[indexed]][mini_data])
-            # Apply to real data column
-            data[[i]] <- unname(ruleset[[indexed]][mini_data])
+          }
-          }
+        }
-        }
+      } else {
-      } else {
+        # What do you think you are doing here? Throw error.
+        stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
-        # What do you think you are doing here? Throw error.
-        stop("lgb.prepare: you provided ", paste(class(data), collapse = " & "), " but data should have class data.frame")
+      }
-      }
+    }
-    }
+  }
-  }
+  return(list(data = data, rules = ruleset))
-  return(list(data = data, rules = ruleset))
+}
-}
--- a/R-package/man/lgb.prepare.Rd
+++ b/R-package/man/lgb.prepare.Rd
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/lgb.prepare.R
 \name{lgb.prepare}
 \alias{lgb.prepare}
-\title{Data preparator for LightGBM datasets (integer)}
+\title{Data preparator for LightGBM datasets (numeric)}
 \usage{
 lgb.prepare(data)
 }
 \arguments{
 \item{data}{A data.frame or data.table to prepare.}
 }
 \value{
 The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
 }
 \description{
-Attempts to prepare a clean dataset to prepare to put in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). Please use \code{lgb.prepare_rules} if you want to apply this transformation to other datasets.
+Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric values (stored as doubles, not integers). Please use \code{lgb.prepare_rules} if you want to apply this transformation to other datasets.
 }
 \examples{
 \dontrun{
  library(lightgbm)
  data(iris)
  str(iris)
  # 'data.frame':	150 obs. of  5 variables:
  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
  # $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
-  str(lgb.prepare(data = iris)) # Convert all factors/chars to integer
+  str(lgb.prepare(data = iris)) # Convert all factors/chars to numeric
  # 'data.frame':	150 obs. of  5 variables:
  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-  # $ Species     : int  1 1 1 1 1 1 1 1 1 1 ...
+  # $ Species     : num  1 1 1 1 1 1 1 1 1 1 ...
  # When lightgbm package is installed, and you do not want to load it
  # You can still use the function!
  lgb.unloader()
  str(lightgbm::lgb.prepare(data = iris))
  # 'data.frame':	150 obs. of  5 variables:
  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-  # $ Species     : int  1 1 1 1 1 1 1 1 1 1 ...
+  # $ Species     : num  1 1 1 1 1 1 1 1 1 1 ...
+}
-}
+}
-}
--- a/R-package/man/lgb.prepare2.Rd
+++ b/R-package/man/lgb.prepare2.Rd
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/lgb.prepare2.R
 \name{lgb.prepare2}
 \alias{lgb.prepare2}
-\title{Data preparator for LightGBM datasets (numeric)}
+\title{Data preparator for LightGBM datasets (integer)}
 \usage{
 lgb.prepare2(data)
 }
 \arguments{
 \item{data}{A data.frame or data.table to prepare.}
 }
 \value{
 The cleaned dataset. It must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
 }
 \description{
-Attempts to prepare a clean dataset to prepare to put in a lgb.Dataset. Factors and characters are converted to numeric without integers. This is useful if you have a specific need for numeric dataset instead of integer dataset. There are programs which do not support integer-only input. Consider this is a fallback solution if you cannot use integers. Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets.
+Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). Please use \code{lgb.prepare_rules2} if you want to apply this transformation to other datasets. This is useful if you have a specific need for an integer dataset instead of a numeric dataset. Note that there are programs which do not support integer-only input. Consider this a memory-halving technique that is dangerous, especially for LightGBM.
 }
 \examples{
 \dontrun{
  library(lightgbm)
  data(iris)
  str(iris)
  # 'data.frame':	150 obs. of  5 variables:
  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
  # $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
-  str(lgb.prepare2(data = iris)) # Convert all factors/chars to numeric
+  str(lgb.prepare2(data = iris)) # Convert all factors/chars to integer
  # 'data.frame':	150 obs. of  5 variables:
  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-  # $ Species     : num  1 1 1 1 1 1 1 1 1 1 ...
+  # $ Species     : int  1 1 1 1 1 1 1 1 1 1 ...
  # When lightgbm package is installed, and you do not want to load it
  # You can still use the function!
  lgb.unloader()
  str(lightgbm::lgb.prepare2(data = iris))
  # 'data.frame':	150 obs. of  5 variables:
  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-  # $ Species     : num  1 1 1 1 1 1 1 1 1 1 ...
+  # $ Species     : int  1 1 1 1 1 1 1 1 1 1 ...
-}
+}
-}
+}
--- a/R-package/man/lgb.prepare_rules.Rd
+++ b/R-package/man/lgb.prepare_rules.Rd
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/lgb.prepare_rules.R
 \name{lgb.prepare_rules}
 \alias{lgb.prepare_rules}
-\title{Data preparator for LightGBM datasets with rules (integer)}
+\title{Data preparator for LightGBM datasets with rules (numeric)}
 \usage{
 lgb.prepare_rules(data, rules = NULL)
 }
 \arguments{
 \item{data}{A data.frame or data.table to prepare.}
 \item{rules}{A set of rules from the data preparator, if already used.}
 }
 \value{
 A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
 }
 \description{
-Attempts to prepare a clean dataset to prepare to put in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). In addition, keeps rules created so you can convert other datasets using this converter.
+Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric. In addition, it keeps the rules it creates so you can convert other datasets using this converter.
 }
 \examples{
 \dontrun{
  library(lightgbm)
  data(iris)
  str(iris)
  # 'data.frame':	150 obs. of  5 variables:
  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
  # $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
  new_iris <- lgb.prepare_rules(data = iris) # Autoconverter
  str(new_iris$data)
  # 'data.frame':	150 obs. of  5 variables:
  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-  # $ Species     : int  1 1 1 1 1 1 1 1 1 1 ...
+  # $ Species     : num  1 1 1 1 1 1 1 1 1 1 ...
  data(iris) # Erase iris dataset
  iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA)
  # Warning message:
  # In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L,  :
  #   invalid factor level, NA generated
  # Use conversion using known rules
  # Unknown factors become 0, excellent for sparse datasets
  newer_iris <- lgb.prepare_rules(data = iris, rules = new_iris$rules)
  # Unknown factor is now zero, perfect for sparse datasets
  newer_iris$data[1, ] # Species became 0 as it is an unknown factor
  #   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
  # 1          5.1         3.5          1.4         0.2       0
  newer_iris$data[1, 5] <- 1 # Put back real initial value
  # Is the newly created dataset equal? YES!
  all.equal(new_iris$data, newer_iris$data)
  # [1] TRUE
  # Can we test our own rules?
  data(iris) # Erase iris dataset
  # We remapped values differently
-  personal_rules <- list(Species = c("setosa" = 3L,
+  personal_rules <- list(Species = c("setosa" = 3,
-                                     "versicolor" = 2L,
+                                     "versicolor" = 2,
-                                     "virginica" = 1L))
+                                     "virginica" = 1))
  newest_iris <- lgb.prepare_rules(data = iris, rules = personal_rules)
  str(newest_iris$data) # SUCCESS!
  # 'data.frame':	150 obs. of  5 variables:
  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-  # $ Species     : int  3 3 3 3 3 3 3 3 3 3 ...
+  # $ Species     : num  3 3 3 3 3 3 3 3 3 3 ...
 }
 }
--- a/R-package/man/lgb.prepare_rules2.Rd
+++ b/R-package/man/lgb.prepare_rules2.Rd
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/lgb.prepare_rules2.R
 \name{lgb.prepare_rules2}
 \alias{lgb.prepare_rules2}
-\title{Data preparator for LightGBM datasets with rules (numeric)}
+\title{Data preparator for LightGBM datasets with rules (integer)}
 \usage{
 lgb.prepare_rules2(data, rules = NULL)
 }
 \arguments{
 \item{data}{A data.frame or data.table to prepare.}
 \item{rules}{A set of rules from the data preparator, if already used.}
 }
 \value{
 A list with the cleaned dataset (\code{data}) and the rules (\code{rules}). The data must be converted to a matrix format (\code{as.matrix}) for input in lgb.Dataset.
 }
 \description{
-Attempts to prepare a clean dataset to prepare to put in a lgb.Dataset. Factors and characters are converted to numeric. In addition, keeps rules created so you can convert other datasets using this converter. This is useful if you have a specific need for numeric dataset instead of integer dataset. There are programs which do not support integer-only input. Consider this is a fallback solution if you cannot use integers.
+Attempts to prepare a clean dataset to put in a lgb.Dataset. Factors and characters are converted to numeric (specifically: integer). In addition, it keeps the rules it creates so you can convert other datasets using this converter. This is useful if you have a specific need for an integer dataset instead of a numeric dataset. Note that there are programs which do not support integer-only input. Consider this a memory-halving technique that is dangerous, especially for LightGBM.
 }
 \examples{
 \dontrun{
  library(lightgbm)
  data(iris)
  str(iris)
  # 'data.frame':	150 obs. of  5 variables:
  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
  # $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 ...
  new_iris <- lgb.prepare_rules2(data = iris) # Autoconverter
  str(new_iris$data)
  # 'data.frame':	150 obs. of  5 variables:
  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-  # $ Species     : num  1 1 1 1 1 1 1 1 1 1 ...
+  # $ Species     : int  1 1 1 1 1 1 1 1 1 1 ...
  data(iris) # Erase iris dataset
  iris$Species[1] <- "NEW FACTOR" # Introduce junk factor (NA)
  # Warning message:
  # In `[<-.factor`(`*tmp*`, 1, value = c(NA, 1L, 1L, 1L, 1L, 1L, 1L,  :
  #   invalid factor level, NA generated
  # Use conversion using known rules
  # Unknown factors become 0, excellent for sparse datasets
  newer_iris <- lgb.prepare_rules2(data = iris, rules = new_iris$rules)
  # Unknown factor is now zero, perfect for sparse datasets
  newer_iris$data[1, ] # Species became 0 as it is an unknown factor
  #   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
  # 1          5.1         3.5          1.4         0.2       0
  newer_iris$data[1, 5] <- 1 # Put back real initial value
  # Is the newly created dataset equal? YES!
  all.equal(new_iris$data, newer_iris$data)
  # [1] TRUE
  # Can we test our own rules?
  data(iris) # Erase iris dataset
  # We remapped values differently
-  personal_rules <- list(Species = c("setosa" = 3,
+  personal_rules <- list(Species = c("setosa" = 3L,
-                                     "versicolor" = 2,
+                                     "versicolor" = 2L,
-                                     "virginica" = 1))
+                                     "virginica" = 1L))
  newest_iris <- lgb.prepare_rules2(data = iris, rules = personal_rules)
  str(newest_iris$data) # SUCCESS!
  # 'data.frame':	150 obs. of  5 variables:
  # $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
  # $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
  # $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
  # $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
-  # $ Species     : num  3 3 3 3 3 3 3 3 3 3 ...
+  # $ Species     : int  3 3 3 3 3 3 3 3 3 3 ...
 }
 }