lgb.convert_with_rules.R 7.42 KB
Newer Older
1
2
3
4
5
6
# [description] get all column classes of a data.table or data.frame.
#               This function collapses the result of class() into a single string
.get_column_classes <- function(df) {
    return(
        vapply(
            X = df
7
8
9
            , FUN = function(x) {
                paste0(class(x), collapse = ",")
            }
10
11
12
13
14
            , FUN.VALUE = character(1L)
        )
    )
}

15
16
17
# [description] check a data frame or data table for columns that are any
#               type other than numeric and integer. This is used by lgb.convert_with_rules()
#               to warn if more action is needed by users
18
19
#               before a dataset can be converted to a lgb.Dataset.
.warn_for_unconverted_columns <- function(df, function_name) {
20
    column_classes <- .get_column_classes(df = df)
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
    unconverted_columns <- column_classes[!(column_classes %in% c("numeric", "integer"))]
    if (length(unconverted_columns) > 0L) {
        col_detail_string <- paste0(
            paste0(
                names(unconverted_columns)
                , " ("
                , unconverted_columns
                , ")"
            )
            , collapse = ", "
        )
        msg <- paste0(
            function_name
            , ": "
            , length(unconverted_columns)
            , " columns are not numeric or integer. These need to be dropped or converted to "
            , "be used in an lgb.Dataset object. "
            , col_detail_string
        )
        warning(msg)
    }
    return(invisible(NULL))
}

45
46
47
48
49
50
.LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA <- function() {
    return(-1L)
}
.LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA <- function() {
    return(0L)
}
51
52


53
#' @name lgb.convert_with_rules
54
55
#' @title Data preparator for LightGBM datasets with rules (integer)
#' @description Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
56
57
58
59
60
61
62
63
64
#'              Factor, character, and logical columns are converted to integer. Missing values
#'              in factors and characters will be filled with 0L. Missing values in logicals
#'              will be filled with -1L.
#'
#'              This function returns and optionally takes in "rules" the describe exactly
#'              how to convert values in columns.
#'
#'              Columns that contain only NA values will be converted by this function but will
#'              not show up in the returned \code{rules}.
65
66
#'
#'              NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare_rules2}.
James Lamb's avatar
James Lamb committed
67
#' @param data A data.frame or data.table to prepare.
68
69
70
71
#' @param rules A set of rules from the data preparator, if already used. This should be an R list,
#'              where names are column names in \code{data} and values are named character
#'              vectors whose names are column values and whose values are new values to
#'              replace them with.
72
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
73
#'         Note that the data must be converted to a matrix format (\code{as.matrix}) for input in
74
#'         \code{lgb.Dataset}.
75
#'
James Lamb's avatar
James Lamb committed
76
#' @examples
77
#' \donttest{
James Lamb's avatar
James Lamb committed
78
#' data(iris)
79
#'
James Lamb's avatar
James Lamb committed
80
#' str(iris)
81
#'
82
#' new_iris <- lgb.convert_with_rules(data = iris)
James Lamb's avatar
James Lamb committed
83
#' str(new_iris$data)
84
#'
James Lamb's avatar
James Lamb committed
85
#' data(iris) # Erase iris dataset
86
#' iris$Species[1L] <- "NEW FACTOR" # Introduce junk factor (NA)
87
#'
James Lamb's avatar
James Lamb committed
88
89
#' # Use conversion using known rules
#' # Unknown factors become 0, excellent for sparse datasets
90
#' newer_iris <- lgb.convert_with_rules(data = iris, rules = new_iris$rules)
91
#'
James Lamb's avatar
James Lamb committed
92
#' # Unknown factor is now zero, perfect for sparse datasets
93
#' newer_iris$data[1L, ] # Species became 0 as it is an unknown factor
94
#'
95
#' newer_iris$data[1L, 5L] <- 1.0 # Put back real initial value
96
#'
James Lamb's avatar
James Lamb committed
97
98
#' # Is the newly created dataset equal? YES!
#' all.equal(new_iris$data, newer_iris$data)
99
#'
James Lamb's avatar
James Lamb committed
100
101
#' # Can we test our own rules?
#' data(iris) # Erase iris dataset
102
#'
James Lamb's avatar
James Lamb committed
103
#' # We remapped values differently
104
105
106
107
#' personal_rules <- list(
#'   Species = c(
#'     "setosa" = 3L
#'     , "versicolor" = 2L
108
#'     , "virginica" = 1L
109
110
#'   )
#' )
111
#' newest_iris <- lgb.convert_with_rules(data = iris, rules = personal_rules)
James Lamb's avatar
James Lamb committed
112
#' str(newest_iris$data) # SUCCESS!
113
#' }
James Lamb's avatar
James Lamb committed
114
115
#' @importFrom data.table set
#' @export
116
lgb.convert_with_rules <- function(data, rules = NULL) {
117

118
    column_classes <- .get_column_classes(df = data)
119

120
121
122
    is_char <- which(column_classes == "character")
    is_factor <- which(column_classes == "factor")
    is_logical <- which(column_classes == "logical")
123

124
    is_data_table <- data.table::is.data.table(x = data)
125
    is_data_frame <- is.data.frame(data)
126

127
128
129
130
131
132
133
    if (!(is_data_table || is_data_frame)) {
        stop(
            "lgb.convert_with_rules: you provided "
            , paste(class(data), collapse = " & ")
            , " but data should have class data.frame or data.table"
        )
    }
134

135
136
137
138
    # if user didn't provide rules, create them
    if (is.null(rules)) {
        rules <- list()
        columns_to_fix <- which(column_classes %in% c("character", "factor", "logical"))
139

140
        for (i in columns_to_fix) {
141

142
          col_values <- data[[i]]
143

James Lamb's avatar
James Lamb committed
144
          # Get unique values
145
146
147
148
149
150
151
152
153
154
155
          if (is.factor(col_values)) {
              unique_vals <- levels(col_values)
              unique_vals <- unique_vals[!is.na(unique_vals)]
              mini_numeric <- seq_along(unique_vals) # respect ordinal
          } else if (is.character(col_values)) {
              unique_vals <- as.factor(unique(col_values))
              unique_vals <- unique_vals[!is.na(unique_vals)]
              mini_numeric <- as.integer(unique_vals)  # no respect for ordinal
          } else if (is.logical(col_values)) {
              unique_vals <- c(FALSE, TRUE)
              mini_numeric <- c(0L, 1L)
James Lamb's avatar
James Lamb committed
156
          }
157

158
159
160
161
162
163
          # don't add rules for all-NA columns
          if (length(unique_vals) > 0L) {
              col_name <- names(data)[i]
              rules[[col_name]] <- mini_numeric
              names(rules[[col_name]]) <- unique_vals
          }
James Lamb's avatar
James Lamb committed
164
165
166
        }
    }

167
168
169
170
171
    for (col_name in names(rules)) {
        if (column_classes[[col_name]] == "logical") {
            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
        } else {
            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
James Lamb's avatar
James Lamb committed
172
        }
173
174
        if (is_data_table) {
            data.table::set(
175
                x = data
176
177
178
179
180
181
182
183
184
                , j = col_name
                , value = unname(rules[[col_name]][data[[col_name]]])
            )
            data[is.na(get(col_name)), (col_name) := default_value_for_na]
        } else {
            data[[col_name]] <- unname(rules[[col_name]][data[[col_name]]])
            data[is.na(data[col_name]), col_name] <- default_value_for_na
        }
    }
185

186
187
188
189
190
191
192
    # if any all-NA columns exist, they won't be in rules. Convert them
    all_na_cols <- which(
        sapply(
            X = data
            , FUN = function(x) {
                (is.factor(x) || is.character(x) || is.logical(x)) && all(is.na(unique(x)))
            }
193
        )
194
195
196
197
198
199
200
201
202
203
204
205
    )
    for (col_name in all_na_cols) {
        if (column_classes[[col_name]] == "logical") {
            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
        } else {
            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
        }
        if (is_data_table) {
            data[, (col_name) := rep(default_value_for_na, .N)]
        } else {
            data[[col_name]] <- default_value_for_na
        }
James Lamb's avatar
James Lamb committed
206
    }
207

208
    .warn_for_unconverted_columns(df = data, function_name = "lgb.convert_with_rules")
209

210
    return(list(data = data, rules = rules))
211

James Lamb's avatar
James Lamb committed
212
}