lgb.convert_with_rules.R 7.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# [description] get all column classes of a data.table or data.frame.
#               This function collapses the result of class() into a single string
.get_column_classes <- function(df) {
    return(
        vapply(
            X = df
            , FUN = function(x) {paste0(class(x), collapse = ",")}
            , FUN.VALUE = character(1L)
        )
    )
}

# [description] check a data frame or data table for columns tthat are any
#               type other than numeric and integer. This is used by lgb.convert()
#               and lgb.convert_with_rules() too warn if more action is needed by users
#               before a dataset can be converted to a lgb.Dataset.
.warn_for_unconverted_columns <- function(df, function_name) {
18
    column_classes <- .get_column_classes(df = df)
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
    unconverted_columns <- column_classes[!(column_classes %in% c("numeric", "integer"))]
    if (length(unconverted_columns) > 0L) {
        col_detail_string <- paste0(
            paste0(
                names(unconverted_columns)
                , " ("
                , unconverted_columns
                , ")"
            )
            , collapse = ", "
        )
        msg <- paste0(
            function_name
            , ": "
            , length(unconverted_columns)
            , " columns are not numeric or integer. These need to be dropped or converted to "
            , "be used in an lgb.Dataset object. "
            , col_detail_string
        )
        warning(msg)
    }
    return(invisible(NULL))
}

.LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA <- function() {return(-1L)}
.LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA <- function() {return(0L)}


47
#' @name lgb.convert_with_rules
48
49
#' @title Data preparator for LightGBM datasets with rules (integer)
#' @description Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
50
51
52
53
54
55
56
57
58
#'              Factor, character, and logical columns are converted to integer. Missing values
#'              in factors and characters will be filled with 0L. Missing values in logicals
#'              will be filled with -1L.
#'
#'              This function returns and optionally takes in "rules" the describe exactly
#'              how to convert values in columns.
#'
#'              Columns that contain only NA values will be converted by this function but will
#'              not show up in the returned \code{rules}.
59
60
#'
#'              NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare_rules2}.
James Lamb's avatar
James Lamb committed
61
#' @param data A data.frame or data.table to prepare.
62
63
64
65
#' @param rules A set of rules from the data preparator, if already used. This should be an R list,
#'              where names are column names in \code{data} and values are named character
#'              vectors whose names are column values and whose values are new values to
#'              replace them with.
66
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
67
#'         Note that the data must be converted to a matrix format (\code{as.matrix}) for input in
68
#'         \code{lgb.Dataset}.
69
#'
James Lamb's avatar
James Lamb committed
70
#' @examples
71
#' \donttest{
James Lamb's avatar
James Lamb committed
72
#' data(iris)
73
#'
James Lamb's avatar
James Lamb committed
74
#' str(iris)
75
#'
76
#' new_iris <- lgb.convert_with_rules(data = iris)
James Lamb's avatar
James Lamb committed
77
#' str(new_iris$data)
78
#'
James Lamb's avatar
James Lamb committed
79
#' data(iris) # Erase iris dataset
80
#' iris$Species[1L] <- "NEW FACTOR" # Introduce junk factor (NA)
81
#'
James Lamb's avatar
James Lamb committed
82
83
#' # Use conversion using known rules
#' # Unknown factors become 0, excellent for sparse datasets
84
#' newer_iris <- lgb.convert_with_rules(data = iris, rules = new_iris$rules)
85
#'
James Lamb's avatar
James Lamb committed
86
#' # Unknown factor is now zero, perfect for sparse datasets
87
#' newer_iris$data[1L, ] # Species became 0 as it is an unknown factor
88
#'
89
#' newer_iris$data[1L, 5L] <- 1.0 # Put back real initial value
90
#'
James Lamb's avatar
James Lamb committed
91
92
#' # Is the newly created dataset equal? YES!
#' all.equal(new_iris$data, newer_iris$data)
93
#'
James Lamb's avatar
James Lamb committed
94
95
#' # Can we test our own rules?
#' data(iris) # Erase iris dataset
96
#'
James Lamb's avatar
James Lamb committed
97
#' # We remapped values differently
98
99
100
101
#' personal_rules <- list(
#'   Species = c(
#'     "setosa" = 3L
#'     , "versicolor" = 2L
102
#'     , "virginica" = 1L
103
104
#'   )
#' )
105
#' newest_iris <- lgb.convert_with_rules(data = iris, rules = personal_rules)
James Lamb's avatar
James Lamb committed
106
#' str(newest_iris$data) # SUCCESS!
107
#' }
James Lamb's avatar
James Lamb committed
108
109
#' @importFrom data.table set
#' @export
110
lgb.convert_with_rules <- function(data, rules = NULL) {
111

112
    column_classes <- .get_column_classes(df = data)
113

114
115
116
    is_char <- which(column_classes == "character")
    is_factor <- which(column_classes == "factor")
    is_logical <- which(column_classes == "logical")
117

118
    is_data_table <- data.table::is.data.table(x = data)
119
    is_data_frame <- is.data.frame(data)
120

121
122
123
124
125
126
127
    if (!(is_data_table || is_data_frame)) {
        stop(
            "lgb.convert_with_rules: you provided "
            , paste(class(data), collapse = " & ")
            , " but data should have class data.frame or data.table"
        )
    }
128

129
130
131
132
    # if user didn't provide rules, create them
    if (is.null(rules)) {
        rules <- list()
        columns_to_fix <- which(column_classes %in% c("character", "factor", "logical"))
133

134
        for (i in columns_to_fix) {
135

136
          col_values <- data[[i]]
137

James Lamb's avatar
James Lamb committed
138
          # Get unique values
139
140
141
142
143
144
145
146
147
148
149
          if (is.factor(col_values)) {
              unique_vals <- levels(col_values)
              unique_vals <- unique_vals[!is.na(unique_vals)]
              mini_numeric <- seq_along(unique_vals) # respect ordinal
          } else if (is.character(col_values)) {
              unique_vals <- as.factor(unique(col_values))
              unique_vals <- unique_vals[!is.na(unique_vals)]
              mini_numeric <- as.integer(unique_vals)  # no respect for ordinal
          } else if (is.logical(col_values)) {
              unique_vals <- c(FALSE, TRUE)
              mini_numeric <- c(0L, 1L)
James Lamb's avatar
James Lamb committed
150
          }
151

152
153
154
155
156
157
          # don't add rules for all-NA columns
          if (length(unique_vals) > 0L) {
              col_name <- names(data)[i]
              rules[[col_name]] <- mini_numeric
              names(rules[[col_name]]) <- unique_vals
          }
James Lamb's avatar
James Lamb committed
158
159
160
        }
    }

161
162
163
164
165
    for (col_name in names(rules)) {
        if (column_classes[[col_name]] == "logical") {
            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
        } else {
            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
James Lamb's avatar
James Lamb committed
166
        }
167
168
        if (is_data_table) {
            data.table::set(
169
                x = data
170
171
172
173
174
175
176
177
178
                , j = col_name
                , value = unname(rules[[col_name]][data[[col_name]]])
            )
            data[is.na(get(col_name)), (col_name) := default_value_for_na]
        } else {
            data[[col_name]] <- unname(rules[[col_name]][data[[col_name]]])
            data[is.na(data[col_name]), col_name] <- default_value_for_na
        }
    }
179

180
181
182
183
184
185
186
    # if any all-NA columns exist, they won't be in rules. Convert them
    all_na_cols <- which(
        sapply(
            X = data
            , FUN = function(x) {
                (is.factor(x) || is.character(x) || is.logical(x)) && all(is.na(unique(x)))
            }
187
        )
188
189
190
191
192
193
194
195
196
197
198
199
    )
    for (col_name in all_na_cols) {
        if (column_classes[[col_name]] == "logical") {
            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
        } else {
            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
        }
        if (is_data_table) {
            data[, (col_name) := rep(default_value_for_na, .N)]
        } else {
            data[[col_name]] <- default_value_for_na
        }
James Lamb's avatar
James Lamb committed
200
    }
201

202
    .warn_for_unconverted_columns(df = data, function_name = "lgb.convert_with_rules")
203

204
    return(list(data = data, rules = rules))
205

James Lamb's avatar
James Lamb committed
206
}