# test_lgb.prepare.R
# testthat context label: the tests that follow in this file exercise lgb.prepare()
context("lgb.prepare()")

test_that("lgb.prepare() rejects inputs that are not a data.table or data.frame", {
    # one sample per unsupported kind of input: a matrix, a logical scalar,
    # a character vector, a bare NA, an integer scalar, and an lgb.Dataset
    unsupported_inputs <- list(
        matrix(1.0:10.0, 2L, 5L)
        , TRUE
        , c("a", "b")
        , NA
        , 10L
        , lgb.Dataset(
            data = matrix(1.0:10.0, 2L, 5L)
            , params = list()
        )
    )
    # each sample has to be refused with the lgb.prepare-specific message
    # (matched as a literal string, not as a regular expression)
    for (i in seq_along(unsupported_inputs)) {
        expect_error(
            lgb.prepare(unsupported_inputs[[i]])
            , regexp = "lgb.prepare: you provided"
            , fixed = TRUE
        )
    }
})

test_that("lgb.prepare() should work correctly for a dataset with only character columns", {
    char_df <- data.frame(
        col1 = c("a", "b", "c")
        , col2 =  c("green", "green", "red")
        , stringsAsFactors = FALSE
    )
    char_dt <- data.table::as.data.table(char_df)

    # values each column is expected to hold after conversion
    expected_columns <- list(
        col1 = c(1.0, 2.0, 3.0)
        , col2 = c(1.0, 1.0, 2.0)
    )

    # the same expectations apply to the data.frame and the data.table input
    for (input_data in list(char_df, char_dt)) {
        prepared <- lgb.prepare(input_data)

        # the container class must survive the conversion
        expect_identical(class(input_data), class(prepared))

        for (col_name in names(expected_columns)) {
            expect_identical(class(prepared[[col_name]]), "numeric")
            expect_identical(prepared[[col_name]], expected_columns[[col_name]])
        }
    }
})

test_that("lgb.prepare() should work correctly for a dataset with only factor columns", {
    factor_df <- data.frame(
        col1 = factor(c("a", "b", "c"))
        , col2 =  factor(c("green", "green", "red"))
        , stringsAsFactors = FALSE
    )
    factor_dt <- data.table::as.data.table(factor_df)

    # values each column is expected to hold after conversion
    expected_columns <- list(
        col1 = c(1.0, 2.0, 3.0)
        , col2 = c(1.0, 1.0, 2.0)
    )

    # the same expectations apply to the data.frame and the data.table input
    for (input_data in list(factor_df, factor_dt)) {
        prepared <- lgb.prepare(input_data)

        # the container class must survive the conversion
        expect_identical(class(input_data), class(prepared))

        for (col_name in names(expected_columns)) {
            expect_identical(class(prepared[[col_name]]), "numeric")
            expect_identical(prepared[[col_name]], expected_columns[[col_name]])
        }
    }
})

test_that("lgb.prepare() should not change a dataset with only numeric columns", {
    # The columns are spelled out with c() on purpose: `11.0:15.0` looks like a
    # sequence of doubles, but `:` returns an integer vector when both operands
    # are whole numbers, so double ("numeric"-class) columns would go untested.
    testDF <- data.frame(
        col1 = c(11.0, 12.0, 13.0, 14.0, 15.0)
        , col2 = c(16.0, 17.0, 18.0, 19.0, 20.0)
        , stringsAsFactors = FALSE
    )
    testDT <- data.table::as.data.table(testDF)
    for (input_data in list(testDF, testDT)) {
        converted_dataset <- lgb.prepare(input_data)

        # columns must still be doubles; this also guards against the fixture
        # silently turning back into integer columns
        expect_identical(class(converted_dataset[["col1"]]), "numeric")
        expect_identical(class(converted_dataset[["col2"]]), "numeric")

        # nothing about the dataset should have changed
        expect_identical(converted_dataset, input_data)
    }
})

test_that("lgb.prepare() should work correctly for a dataset with numeric, factor, and character columns", {
    mixed_df <- data.frame(
        character_col = c("a", "b", "c")
        , numeric_col = c(1.0, 9.0, 10.0)
        , factor_col = factor(c("n", "n", "y"))
        , stringsAsFactors = FALSE
    )
    mixed_dt <- data.table::as.data.table(mixed_df)

    # values each column is expected to hold after conversion
    expected_columns <- list(
        character_col = c(1.0, 2.0, 3.0)
        , numeric_col = c(1.0, 9.0, 10.0)
        , factor_col = c(1.0, 1.0, 2.0)
    )

    # the same expectations apply to the data.frame and the data.table input
    for (input_data in list(mixed_df, mixed_dt)) {
        prepared <- lgb.prepare(input_data)

        # the container class must survive the conversion
        expect_identical(class(input_data), class(prepared))

        for (col_name in names(expected_columns)) {
            expect_identical(class(prepared[[col_name]]), "numeric")
            expect_identical(prepared[[col_name]], expected_columns[[col_name]])
        }
    }
})

test_that("lgb.prepare() should work correctly for a dataset with missing values", {
    # fixture mixes partially-missing columns with columns that are entirely NA,
    # one for each NA flavor (logical, real, integer, character)
    testDF <- data.frame(
        character_col = c("a", NA_character_, "c")
        , na_col = rep(NA, 3L)
        , na_real_col = rep(NA_real_, 3L)
        , na_int_col = rep(NA_integer_,  3L)
        , na_character_col = rep(NA_character_, 3L)
        , numeric_col = c(1.0, 9.0, NA_real_)
        , factor_col = as.factor(c("n", "n", "y"))
        , integer_col = c(1L, 9L, NA_integer_)
        , stringsAsFactors = FALSE
    )
    testDT <- data.table::as.data.table(testDF)
    for (input_data in list(testDF, testDT)) {
        converted_dataset <- lgb.prepare(input_data)
        expect_identical(class(input_data), class(converted_dataset))

        # a missing character value stays missing; the remaining values are
        # numbered consecutively
        expect_identical(class(converted_dataset[["character_col"]]), "numeric")
        expect_identical(converted_dataset[["character_col"]], c(1.0, NA_real_, 2.0))

        expect_identical(class(converted_dataset[["numeric_col"]]), "numeric")
        expect_identical(converted_dataset[["numeric_col"]], c(1.0, 9.0, NA_real_))

        expect_identical(class(converted_dataset[["factor_col"]]), "numeric")
        expect_identical(converted_dataset[["factor_col"]], c(1.0, 1.0, 2.0))

        # all-NA real and character columns should come out as numeric NAs
        # (logical and integer NA columns are covered separately below)
        for (col in c("na_real_col", "na_character_col")) {
            expect_identical(class(converted_dataset[[col]]), "numeric")
            expect_identical(converted_dataset[[col]], rep(NA_real_, nrow(converted_dataset)))
        }

        # today, lgb.prepare() does not convert logical columns
        expect_identical(class(converted_dataset[["na_col"]]), "logical")
        expect_identical(converted_dataset[["na_col"]], rep(NA, nrow(converted_dataset)))

        # today, lgb.prepare() does not convert integer columns to numeric
        expect_identical(class(converted_dataset[["na_int_col"]]), "integer")
        expect_identical(converted_dataset[["na_int_col"]], rep(NA_integer_, nrow(converted_dataset)))
        expect_identical(class(converted_dataset[["integer_col"]]), "integer")
        expect_identical(converted_dataset[["integer_col"]], c(1L, 9L, NA_integer_))
    }
})

test_that("lgb.prepare() should modify data.tables in-place", {
    testDT <- data.table::data.table(
        character_col = c("a", NA_character_, "c")
        , na_col = rep(NA, 3L)
        , na_real_col = rep(NA_real_, 3L)
        , na_int_col = rep(NA_integer_,  3L)
        , na_character_col = rep(NA_character_, 3L)
        , numeric_col = c(1.0, 9.0, NA_real_)
        , factor_col = as.factor(c("n", "n", "y"))
        , integer_col = c(1L, 9L, NA_integer_)
    )
    resultDT <- lgb.prepare(testDT)

    # the returned table and the table that was passed in must agree
    expect_identical(resultDT, testDT)

    # ...and the table that was passed in must itself hold the converted columns.
    # Without these checks, an lgb.prepare() that returned its input untouched
    # would satisfy the comparison above.
    expect_identical(class(testDT[["character_col"]]), "numeric")
    expect_identical(testDT[["character_col"]], c(1.0, NA_real_, 2.0))
    expect_identical(class(testDT[["factor_col"]]), "numeric")
    expect_identical(testDT[["factor_col"]], c(1.0, 1.0, 2.0))
    expect_identical(class(testDT[["na_character_col"]]), "numeric")
    expect_identical(testDT[["na_character_col"]], rep(NA_real_, 3L))
})