# test_lgb.convert_with_rules.R
# Names the group of tests defined in this file.
# NOTE(review): context() is deprecated in testthat's 3rd edition — confirm
# whether this call is still wanted.
context("lgb.convert_with_rules()")
2

3
test_that("lgb.convert_with_rules() rejects inputs that are not a data.table or data.frame", {
    # none of these objects is a data.frame or a data.table
    numeric_matrix <- matrix(1.0:10.0, 2L, 5L)
    unsupported_inputs <- list(
        numeric_matrix
        , TRUE
        , c("a", "b")
        , NA
        , 10L
        , lgb.Dataset(data = numeric_matrix, params = list())
    )

    # each unsupported input must raise an error containing the expected message
    for (i in seq_along(unsupported_inputs)) {
        expect_error(
            lgb.convert_with_rules(unsupported_inputs[[i]])
            , regexp = "lgb.convert_with_rules: you provided"
            , fixed = TRUE
        )
    }
})

22
test_that("lgb.convert_with_rules() should work correctly for a dataset with only character columns", {
    # the same data is checked as a data.frame and as a data.table
    testDF <- data.frame(
        col1 = c("a", "b", "c")
        , col2 = c("green", "green", "red")
        , stringsAsFactors = FALSE
    )
    testDT <- data.table::as.data.table(testDF)
    for (input_data in list(testDF, testDT)) {
        conversion_result <- lgb.convert_with_rules(input_data)

        # dataset should have been converted to integer,
        # and the container class should be preserved
        converted_dataset <- conversion_result[["data"]]
        expect_identical(class(input_data), class(converted_dataset))
        expect_identical(class(converted_dataset[["col1"]]), "integer")
        expect_identical(class(converted_dataset[["col2"]]), "integer")
        expect_identical(converted_dataset[["col1"]], c(1L, 2L, 3L))
        expect_identical(converted_dataset[["col2"]], c(1L, 1L, 2L))

        # rules should be returned and correct: one named integer vector
        # per converted column
        rules <- conversion_result$rules
        # expect_type() is used because expect_is() is deprecated in testthat
        expect_type(rules, "list")
        expect_length(rules, ncol(input_data))
        expect_identical(rules[["col1"]], c("a" = 1L, "b" = 2L, "c" = 3L))
        expect_identical(rules[["col2"]], c("green" = 1L, "red" = 2L))
    }
})

47
test_that("lgb.convert_with_rules() should work correctly for a dataset with only factor columns", {
    # the same data is checked as a data.frame and as a data.table
    testDF <- data.frame(
        col1 = as.factor(c("a", "b", "c"))
        , col2 = as.factor(c("green", "green", "red"))
        , stringsAsFactors = FALSE
    )
    testDT <- data.table::as.data.table(testDF)
    for (input_data in list(testDF, testDT)) {
        conversion_result <- lgb.convert_with_rules(input_data)

        # dataset should have been converted to integer,
        # and the container class should be preserved
        converted_dataset <- conversion_result[["data"]]
        expect_identical(class(input_data), class(converted_dataset))
        expect_identical(class(converted_dataset[["col1"]]), "integer")
        expect_identical(class(converted_dataset[["col2"]]), "integer")
        expect_identical(converted_dataset[["col1"]], c(1L, 2L, 3L))
        expect_identical(converted_dataset[["col2"]], c(1L, 1L, 2L))

        # rules should be returned and correct: one named integer vector
        # per converted column
        rules <- conversion_result$rules
        # expect_type() is used because expect_is() is deprecated in testthat
        expect_type(rules, "list")
        expect_length(rules, ncol(input_data))
        expect_identical(rules[["col1"]], c("a" = 1L, "b" = 2L, "c" = 3L))
        expect_identical(rules[["col2"]], c("green" = 1L, "red" = 2L))
    }
})

72
test_that("lgb.convert_with_rules() should not change a dataset with only integer columns", {
    integer_df <- data.frame(
        col1 = 11L:15L
        , col2 = 16L:20L
        , stringsAsFactors = FALSE
    )
    integer_dt <- data.table::as.data.table(integer_df)

    # run the same checks on the data.frame and on the data.table
    for (original in list(integer_df, integer_dt)) {
        result <- lgb.convert_with_rules(original)

        # the returned data should be identical to what was passed in
        expect_identical(result[["data"]], original)

        # the returned rules should be an empty list
        expect_identical(result$rules, list())
    }
})

90
test_that("lgb.convert_with_rules() should work correctly for a dataset with numeric, factor, and character columns", {
    # the same data is checked as a data.frame and as a data.table
    testDF <- data.frame(
        character_col = c("a", "b", "c")
        , numeric_col = c(1.0, 9.0, 10.0)
        , factor_col = as.factor(c("n", "n", "y"))
        , stringsAsFactors = FALSE
    )
    testDT <- data.table::as.data.table(testDF)
    for (input_data in list(testDF, testDT)) {
        conversion_result <- lgb.convert_with_rules(input_data)

        # character and factor columns should have been converted to integer,
        # and the container class should be preserved
        converted_dataset <- conversion_result[["data"]]
        expect_identical(class(input_data), class(converted_dataset))
        expect_identical(class(converted_dataset[["character_col"]]), "integer")
        expect_identical(class(converted_dataset[["factor_col"]]), "integer")
        expect_identical(converted_dataset[["character_col"]], c(1L, 2L, 3L))
        expect_identical(converted_dataset[["factor_col"]], c(1L, 1L, 2L))

        # rules should be returned and correct: only the two converted
        # columns get an entry
        rules <- conversion_result$rules
        # expect_type() is used because expect_is() is deprecated in testthat
        expect_type(rules, "list")
        expect_length(rules, 2L)
        expect_identical(rules[["character_col"]], c("a" = 1L, "b" = 2L, "c" = 3L))
        expect_identical(rules[["factor_col"]], c("n" = 1L, "y" = 2L))

        # today, lgb.convert_with_rules() does not convert numeric columns
        expect_identical(class(converted_dataset[["numeric_col"]]), "numeric")
        expect_identical(converted_dataset[["numeric_col"]], c(1.0, 9.0, 10.0))
    }
})

120
test_that("lgb.convert_with_rules() should convert missing values to the expected value", {
    # one column per combination of type and missingness covered below
    testDF <- data.frame(
        character_col = c("a", NA_character_, "c")
        , na_col = rep(NA, 3L)
        , na_real_col = rep(NA_real_, 3L)
        , na_int_col = rep(NA_integer_, 3L)
        , na_character_col = rep(NA_character_, 3L)
        , numeric_col = c(1.0, 9.0, NA_real_)
        , factor_col = as.factor(c("n", "n", "y"))
        , integer_col = c(1L, 9L, NA_integer_)
        , stringsAsFactors = FALSE
    )
    testDT <- data.table::as.data.table(testDF)
    for (input_data in list(testDF, testDT)) {
        conversion_result <- lgb.convert_with_rules(input_data)

        # dataset should have been converted to integer,
        # and the container class should be preserved
        converted_dataset <- conversion_result[["data"]]
        expect_identical(class(input_data), class(converted_dataset))

        # a missing value in a character column becomes 0
        expect_identical(class(converted_dataset[["character_col"]]), "integer")
        expect_identical(converted_dataset[["character_col"]], c(1L, 0L, 2L))

        # does not try to fill 0s in for already-integer columns
        expect_identical(class(converted_dataset[["integer_col"]]), "integer")
        expect_identical(converted_dataset[["integer_col"]], c(1L, 9L, NA_integer_))
        expect_identical(class(converted_dataset[["na_int_col"]]), "integer")
        expect_identical(converted_dataset[["na_int_col"]], rep(NA_integer_, nrow(converted_dataset)))

        expect_identical(class(converted_dataset[["factor_col"]]), "integer")
        expect_identical(converted_dataset[["factor_col"]], c(1L, 1L, 2L))

        # NAs in character columns should be converted to 0
        expect_identical(class(converted_dataset[["na_character_col"]]), "integer")
        expect_identical(converted_dataset[["na_character_col"]], rep(0L, nrow(converted_dataset)))

        # logical should be converted to integer, with NA becoming -1
        expect_identical(class(converted_dataset[["na_col"]]), "integer")
        expect_identical(converted_dataset[["na_col"]], rep(-1L, 3L))

        # lgb.convert_with_rules() should not convert numeric columns to integer
        expect_identical(class(converted_dataset[["na_real_col"]]), "numeric")
        expect_identical(converted_dataset[["na_real_col"]], rep(NA_real_, nrow(converted_dataset)))
        expect_identical(class(converted_dataset[["numeric_col"]]), "numeric")
        expect_identical(converted_dataset[["numeric_col"]], c(1.0, 9.0, NA_real_))

        # rules should be returned and correct: exactly three columns
        # get an entry, and the entries checked below are the character,
        # factor and logical ones
        rules <- conversion_result$rules
        # expect_type() is used because expect_is() is deprecated in testthat
        expect_type(rules, "list")
        expect_length(rules, 3L)
        expect_identical(rules[["character_col"]], c("a" = 1L, "c" = 2L))
        expect_identical(rules[["factor_col"]], c("n" = 1L, "y" = 2L))
        expect_identical(rules[["na_col"]], stats::setNames(c(0L, 1L), c(FALSE, TRUE)))
    }
})

175
test_that("lgb.convert_with_rules() should work correctly if you provide your own well-formed rules", {
    # one column per combination of type and missingness covered below
    testDF <- data.frame(
        character_col = c("a", NA_character_, "c", "a", "a", "c")
        , na_col = rep(NA, 6L)
        , na_real_col = rep(NA_real_, 6L)
        , na_int_col = rep(NA_integer_, 6L)
        , na_character_col = rep(NA_character_, 6L)
        , numeric_col = c(1.0, 9.0, NA_real_, 10.0, 11.0, 12.0)
        , factor_col = as.factor(c("n", "n", "y", "y", "n", "n"))
        , integer_col = c(1L, 9L, NA_integer_, 1L, 1L, 1L)
        , stringsAsFactors = FALSE
    )
    testDT <- data.table::as.data.table(testDF)

    # value used by lgb.convert_with_rules() when it encounters a categorical
    # value that is not in the provided rules
    UNKNOWN_FACTOR_VALUE <- 0L
    # value expected below for an all-NA logical column
    UNKNOWN_LOGICAL_VALUE <- -1L

    # user-supplied mappings; the same rules are used for both input types,
    # so they are built once outside the loop
    custom_rules <- list(
        "character_col" = c(
            "a" = 5L
            , "c" = -10L
        )
        , "factor_col" = c(
            "n" = 65L
            , "y" = 66L
        )
    )

    for (input_data in list(testDF, testDT)) {
        conversion_result <- lgb.convert_with_rules(
            data = input_data
            , rules = custom_rules
        )

        # dataset should have been converted to integer,
        # and the container class should be preserved
        converted_dataset <- conversion_result[["data"]]
        expect_identical(class(input_data), class(converted_dataset))

        # values follow the custom mapping; the NA has no rule, so it gets
        # the unknown-value placeholder
        expect_identical(class(converted_dataset[["character_col"]]), "integer")
        expect_identical(converted_dataset[["character_col"]], c(5L, UNKNOWN_FACTOR_VALUE, -10L, 5L, 5L, -10L))

        expect_identical(class(converted_dataset[["factor_col"]]), "integer")
        expect_identical(converted_dataset[["factor_col"]], c(65L, 65L, 66L, 66L, 65L, 65L))

        # numeric and integer columns not specified in rules are left
        # untouched, even when every value is NA
        for (col in c("na_real_col", "na_int_col", "numeric_col", "integer_col")) {
            expect_identical(converted_dataset[[col]], input_data[[col]])
        }

        # non-numeric/integer columns that are all NA should have been filled in
        expect_identical(converted_dataset[["na_col"]], rep(UNKNOWN_LOGICAL_VALUE, 6L))
        expect_identical(converted_dataset[["na_character_col"]], rep(UNKNOWN_FACTOR_VALUE, 6L))

        # the rules you passed in should be returned unchanged
        rules <- conversion_result$rules
        expect_identical(rules, custom_rules)
    }
})

234
test_that("lgb.convert_with_rules() should modify data.tables in-place", {
    testDT <- data.table::data.table(
        character_col = c("a", NA_character_, "c")
        , na_col = rep(NA, 3L)
        , na_real_col = rep(NA_real_, 3L)
        , na_int_col = rep(NA_integer_, 3L)
        , na_character_col = rep(NA_character_, 3L)
        , numeric_col = c(1.0, 9.0, NA_real_)
        , factor_col = as.factor(c("n", "n", "y"))
        , integer_col = c(1L, 9L, NA_integer_)
    )
    conversion_result <- lgb.convert_with_rules(testDT)
    resultDT <- conversion_result[["data"]]

    # the returned data and the object that was passed in should be
    # indistinguishable
    expect_identical(resultDT, testDT)

    # the object that was passed in should itself hold the converted values,
    # so this test cannot pass if no conversion happened at all
    expect_identical(testDT[["character_col"]], c(1L, 0L, 2L))
})