lgb.convert_with_rules.R 7.24 KB
Newer Older
1
2
3
4
5
6
# [description] get all column classes of a data.table or data.frame.
#               This function collapses the result of class() into a single string
.get_column_classes <- function(df) {
    return(
        vapply(
            X = df
7
8
9
            , FUN = function(x) {
                paste0(class(x), collapse = ",")
            }
10
11
12
13
14
            , FUN.VALUE = character(1L)
        )
    )
}

15
16
17
# [description] check a data frame or data table for columns that are any
#               type other than numeric and integer. This is used by lgb.convert_with_rules()
#               to warn if more action is needed by users
18
19
#               before a dataset can be converted to a lgb.Dataset.
.warn_for_unconverted_columns <- function(df, function_name) {
20
    column_classes <- .get_column_classes(df = df)
21
22
    unconverted_columns <- column_classes[!(column_classes %in% c("numeric", "integer"))]
    if (length(unconverted_columns) > 0L) {
23
        col_detail_string <- toString(
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
            paste0(
                names(unconverted_columns)
                , " ("
                , unconverted_columns
                , ")"
            )
        )
        msg <- paste0(
            function_name
            , ": "
            , length(unconverted_columns)
            , " columns are not numeric or integer. These need to be dropped or converted to "
            , "be used in an lgb.Dataset object. "
            , col_detail_string
        )
        warning(msg)
    }
    return(invisible(NULL))
}

44
45
46
47
48
49
.LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA <- function() {
    return(-1L)
}
.LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA <- function() {
    return(0L)
}
50
51


52
#' @name lgb.convert_with_rules
53
54
#' @title Data preparator for LightGBM datasets with rules (integer)
#' @description Attempts to prepare a clean dataset to prepare to put in a \code{lgb.Dataset}.
55
56
57
58
59
60
61
62
63
#'              Factor, character, and logical columns are converted to integer. Missing values
#'              in factors and characters will be filled with 0L. Missing values in logicals
#'              will be filled with -1L.
#'
#'              This function returns and optionally takes in "rules" the describe exactly
#'              how to convert values in columns.
#'
#'              Columns that contain only NA values will be converted by this function but will
#'              not show up in the returned \code{rules}.
64
65
#'
#'              NOTE: In previous releases of LightGBM, this function was called \code{lgb.prepare_rules2}.
James Lamb's avatar
James Lamb committed
66
#' @param data A data.frame or data.table to prepare.
67
68
69
70
#' @param rules A set of rules from the data preparator, if already used. This should be an R list,
#'              where names are column names in \code{data} and values are named character
#'              vectors whose names are column values and whose values are new values to
#'              replace them with.
71
#' @return A list with the cleaned dataset (\code{data}) and the rules (\code{rules}).
72
#'         Note that the data must be converted to a matrix format (\code{as.matrix}) for input in
73
#'         \code{lgb.Dataset}.
74
#'
James Lamb's avatar
James Lamb committed
75
#' @examples
76
#' \donttest{
James Lamb's avatar
James Lamb committed
77
#' data(iris)
78
#'
James Lamb's avatar
James Lamb committed
79
#' str(iris)
80
#'
81
#' new_iris <- lgb.convert_with_rules(data = iris)
James Lamb's avatar
James Lamb committed
82
#' str(new_iris$data)
83
#'
James Lamb's avatar
James Lamb committed
84
#' data(iris) # Erase iris dataset
85
#' iris$Species[1L] <- "NEW FACTOR" # Introduce junk factor (NA)
86
#'
James Lamb's avatar
James Lamb committed
87
88
#' # Use conversion using known rules
#' # Unknown factors become 0, excellent for sparse datasets
89
#' newer_iris <- lgb.convert_with_rules(data = iris, rules = new_iris$rules)
90
#'
James Lamb's avatar
James Lamb committed
91
#' # Unknown factor is now zero, perfect for sparse datasets
92
#' newer_iris$data[1L, ] # Species became 0 as it is an unknown factor
93
#'
94
#' newer_iris$data[1L, 5L] <- 1.0 # Put back real initial value
95
#'
James Lamb's avatar
James Lamb committed
96
97
#' # Is the newly created dataset equal? YES!
#' all.equal(new_iris$data, newer_iris$data)
98
#'
James Lamb's avatar
James Lamb committed
99
100
#' # Can we test our own rules?
#' data(iris) # Erase iris dataset
101
#'
James Lamb's avatar
James Lamb committed
102
#' # We remapped values differently
103
104
105
106
#' personal_rules <- list(
#'   Species = c(
#'     "setosa" = 3L
#'     , "versicolor" = 2L
107
#'     , "virginica" = 1L
108
109
#'   )
#' )
110
#' newest_iris <- lgb.convert_with_rules(data = iris, rules = personal_rules)
James Lamb's avatar
James Lamb committed
111
#' str(newest_iris$data) # SUCCESS!
112
#' }
James Lamb's avatar
James Lamb committed
113
114
#' @importFrom data.table set
#' @export
115
lgb.convert_with_rules <- function(data, rules = NULL) {
116

117
    column_classes <- .get_column_classes(df = data)
118

119
    is_data_table <- data.table::is.data.table(x = data)
120
    is_data_frame <- is.data.frame(data)
121

122
123
124
125
126
127
128
    if (!(is_data_table || is_data_frame)) {
        stop(
            "lgb.convert_with_rules: you provided "
            , paste(class(data), collapse = " & ")
            , " but data should have class data.frame or data.table"
        )
    }
129

130
131
132
133
    # if user didn't provide rules, create them
    if (is.null(rules)) {
        rules <- list()
        columns_to_fix <- which(column_classes %in% c("character", "factor", "logical"))
134

135
        for (i in columns_to_fix) {
136

137
          col_values <- data[[i]]
138

James Lamb's avatar
James Lamb committed
139
          # Get unique values
140
141
142
143
144
145
146
147
148
149
150
          if (is.factor(col_values)) {
              unique_vals <- levels(col_values)
              unique_vals <- unique_vals[!is.na(unique_vals)]
              mini_numeric <- seq_along(unique_vals) # respect ordinal
          } else if (is.character(col_values)) {
              unique_vals <- as.factor(unique(col_values))
              unique_vals <- unique_vals[!is.na(unique_vals)]
              mini_numeric <- as.integer(unique_vals)  # no respect for ordinal
          } else if (is.logical(col_values)) {
              unique_vals <- c(FALSE, TRUE)
              mini_numeric <- c(0L, 1L)
James Lamb's avatar
James Lamb committed
151
          }
152

153
154
155
156
157
158
          # don't add rules for all-NA columns
          if (length(unique_vals) > 0L) {
              col_name <- names(data)[i]
              rules[[col_name]] <- mini_numeric
              names(rules[[col_name]]) <- unique_vals
          }
James Lamb's avatar
James Lamb committed
159
160
161
        }
    }

162
163
164
165
166
    for (col_name in names(rules)) {
        if (column_classes[[col_name]] == "logical") {
            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
        } else {
            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
James Lamb's avatar
James Lamb committed
167
        }
168
169
        if (is_data_table) {
            data.table::set(
170
                x = data
171
172
173
174
175
176
177
178
179
                , j = col_name
                , value = unname(rules[[col_name]][data[[col_name]]])
            )
            data[is.na(get(col_name)), (col_name) := default_value_for_na]
        } else {
            data[[col_name]] <- unname(rules[[col_name]][data[[col_name]]])
            data[is.na(data[col_name]), col_name] <- default_value_for_na
        }
    }
180

181
182
183
184
185
186
187
    # if any all-NA columns exist, they won't be in rules. Convert them
    all_na_cols <- which(
        sapply(
            X = data
            , FUN = function(x) {
                (is.factor(x) || is.character(x) || is.logical(x)) && all(is.na(unique(x)))
            }
188
        )
189
190
191
192
193
194
195
196
197
198
199
200
    )
    for (col_name in all_na_cols) {
        if (column_classes[[col_name]] == "logical") {
            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_LOGICAL_NA()
        } else {
            default_value_for_na <- .LGB_CONVERT_DEFAULT_FOR_NON_LOGICAL_NA()
        }
        if (is_data_table) {
            data[, (col_name) := rep(default_value_for_na, .N)]
        } else {
            data[[col_name]] <- default_value_for_na
        }
James Lamb's avatar
James Lamb committed
201
    }
202

203
    .warn_for_unconverted_columns(df = data, function_name = "lgb.convert_with_rules")
204

205
    return(list(data = data, rules = rules))
206

James Lamb's avatar
James Lamb committed
207
}