context("testing lgb.Dataset functionality")

# shared fixture for the tests below: the first 100 rows of the
# agaricus test set shipped with the lightgbm package
data(agaricus.test, package = "lightgbm")
test_data <- agaricus.test$data[seq_len(100L), ]
test_label <- agaricus.test$label[seq_len(100L)]

test_that("lgb.Dataset: basic construction, saving, loading", {
  # construct from a sparse matrix and from its dense equivalent;
  # both should carry the same label
  dtest_sparse <- lgb.Dataset(test_data, label = test_label)
  dtest_dense <- lgb.Dataset(as.matrix(test_data), label = test_label)
  expect_equal(getinfo(dtest_sparse, "label"), getinfo(dtest_dense, "label"))

  # round-trip the Dataset through a binary file on disk
  tmp_file <- tempfile("lgb.Dataset_")
  lgb.Dataset.save(dtest_sparse, tmp_file)
  dtest_from_file <- lgb.Dataset(tmp_file)
  lgb.Dataset.construct(dtest_from_file)
  unlink(tmp_file)

  # the reloaded Dataset should preserve the label
  expect_equal(getinfo(dtest_sparse, "label"), getinfo(dtest_from_file, "label"))
})

test_that("lgb.Dataset: getinfo & setinfo", {
  dtest <- lgb.Dataset(test_data)
  dtest$construct()

  # a label set through setinfo() should come back unchanged from getinfo()
  setinfo(dtest, "label", test_label)
  expect_equal(test_label, getinfo(dtest, "label"))

  # fields that were never set come back zero-length
  expect_length(getinfo(dtest, "weight"), 0L)
  expect_length(getinfo(dtest, "init_score"), 0L)

  # any unrecognized field name should error
  expect_error(setinfo(dtest, "asdf", test_label))
})

test_that("lgb.Dataset: slice, dim", {
  dtest <- lgb.Dataset(test_data, label = test_label)
  lgb.Dataset.construct(dtest)

  # dimensions of the constructed Dataset mirror the input matrix
  expect_equal(dim(dtest), dim(test_data))

  # slicing keeps every column but only the requested rows
  first_42_rows <- slice(dtest, seq_len(42L))
  lgb.Dataset.construct(first_42_rows)
  expect_equal(nrow(first_42_rows), 42L)
  expect_equal(ncol(first_42_rows), ncol(test_data))
})

test_that("Dataset$slice() supports passing additional parameters through '...'", {
  dtest <- lgb.Dataset(test_data, label = test_label)
  dtest$construct()
  sliced <- slice(
    dataset = dtest
    , idxset = seq_len(42L)
    , feature_pre_filter = FALSE
  )
  sliced$construct()
  # only the slice should pick up the parameter passed through '...';
  # the parent Dataset's parameters stay untouched
  expect_identical(dtest$get_params(), list())
  expect_identical(sliced$get_params(), list(feature_pre_filter = FALSE))
})

test_that("Dataset$slice() supports passing Dataset attributes through '...'", {
  dtest <- lgb.Dataset(test_data, label = test_label)
  dtest$construct()
  num_subset_rows <- 51L
  init_score <- rnorm(n = num_subset_rows)
  dsub1 <- slice(
    dataset = dtest
    , idxset = seq_len(num_subset_rows)
    , init_score = init_score
  )
  dsub1$construct()
  # the attribute passed through '...' should be set on the slice only,
  # never propagated back to the parent Dataset
  expect_null(dtest$getinfo("init_score"))
  expect_identical(dsub1$getinfo("init_score"), init_score)
})

test_that("lgb.Dataset: colnames", {
  dtest <- lgb.Dataset(test_data, label = test_label)

  # column names are available both before and after construction
  expect_equal(colnames(dtest), colnames(test_data))
  lgb.Dataset.construct(dtest)
  expect_equal(colnames(dtest), colnames(test_data))

  # assigning the wrong number of names must fail
  expect_error({
    colnames(dtest) <- "asdf"
  })

  # assigning one valid name per column succeeds and is retrievable
  new_names <- make.names(seq_len(ncol(test_data)))
  expect_silent(colnames(dtest) <- new_names)
  expect_equal(colnames(dtest), new_names)
})

test_that("lgb.Dataset: nrow is correct for a very sparse matrix", {
  num_rows <- 1000L
  sparse_mat <- Matrix::rsparsematrix(num_rows, 100L, density = 0.0005)

  # density is chosen so low that the trailing rows hold no values;
  # nrow must still come from the matrix shape, not the last populated row
  expect_lt(max(sparse_mat@i), num_rows)
  dtest <- lgb.Dataset(sparse_mat)
  expect_equal(dim(dtest), dim(sparse_mat))
})
test_that("lgb.Dataset: Dataset should be able to construct from matrix and return non-null handle", {
  raw_data <- matrix(runif(1000L), ncol = 10L)
  ref_handle <- NULL
  # call straight into the C API to create the Dataset
  handle <- .Call(
    LGBM_DatasetCreateFromMat_R
    , raw_data
    , nrow(raw_data)
    , ncol(raw_data)
    , lightgbm:::lgb.params2str(params = list())
    , ref_handle
  )
  expect_false(is.null(handle))
  expect_is(handle, "externalptr")
  # release the handle explicitly so the Dataset does not leak
  .Call(LGBM_DatasetFree_R, handle)
  handle <- NULL
})
test_that("cpp errors should be raised as proper R errors", {
  data(agaricus.train, package = "lightgbm")
  train <- agaricus.train
  # init_score deliberately has fewer entries than the data has rows,
  # which the C++ side rejects at construction time
  dtrain <- lgb.Dataset(
    train$data
    , label = train$label
    , init_score = seq_len(10L)
  )
  expect_error({
    dtrain$construct()
  }, regexp = "Initial score size doesn't match data size")
})

test_that("lgb.Dataset$setinfo() should convert 'group' to integer", {
  ds <- lgb.Dataset(
    data = matrix(rnorm(100L), nrow = 50L, ncol = 2L)
    , label = sample(c(0L, 1L), size = 50L, replace = TRUE)
  )
  ds$construct()
  # no group information exists until it is set explicitly
  expect_null(ds$getinfo("group"))
  # doubles passed as 'group' should come back as integers
  group_as_numeric <- rep(25.0, 2L)
  ds$setinfo("group", group_as_numeric)
  expect_identical(ds$getinfo("group"), as.integer(group_as_numeric))
})
test_that("lgb.Dataset should throw an error if 'reference' is provided but of the wrong format", {
  data(agaricus.test, package = "lightgbm")
  local_data <- agaricus.test$data[seq_len(100L), ]
  local_label <- agaricus.test$label[seq_len(100L)]
  # a data.frame is not an acceptable 'reference' and should be rejected
  expect_error({
    dtest <- lgb.Dataset(
      data = local_data
      , label = local_label
      , reference = data.frame(x = seq_len(10L), y = seq_len(10L))
    )
  }, regexp = "reference must be a")
})

test_that("Dataset$new() should throw an error if 'predictor' is provided but of the wrong format", {
  data(agaricus.test, package = "lightgbm")
  local_data <- agaricus.test$data[seq_len(100L), ]
  local_label <- agaricus.test$label[seq_len(100L)]
  # a data.frame is not an acceptable 'predictor' and should be rejected
  expect_error({
    dtest <- Dataset$new(
      data = local_data
      , label = local_label
      , predictor = data.frame(x = seq_len(10L), y = seq_len(10L))
    )
  }, regexp = "predictor must be a", fixed = TRUE)
})
test_that("Dataset$get_params() successfully returns parameters if you passed them", {
  # one canonical parameter name (feature_pre_filter) plus one alias
  # (is_sparse), to check that aliases survive the round trip too
  dataset_params <- list(
    "feature_pre_filter" = TRUE
    , "is_sparse" = FALSE
  )
  ds <- lgb.Dataset(
    test_data
    , label = test_label
    , params = dataset_params
  )
  returned_params <- ds$get_params()
  expect_identical(class(returned_params), "list")
  expect_identical(length(dataset_params), length(returned_params))
  expect_identical(sort(names(dataset_params)), sort(names(returned_params)))
  # every parameter should come back with exactly the value that was set
  for (name in names(dataset_params)) {
    expect_identical(dataset_params[[name]], returned_params[[name]])
  }
})

test_that("Dataset$get_params() ignores irrelevant parameters", {
  # 'nonsense_parameter' is not a Dataset parameter and should be dropped
  dataset_params <- list(
    "feature_pre_filter" = TRUE
    , "is_sparse" = FALSE
    , "nonsense_parameter" = c(1.0, 2.0, 5.0)
  )
  ds <- lgb.Dataset(
    test_data
    , label = test_label
    , params = dataset_params
  )
  expect_false("nonsense_parameter" %in% names(ds$get_params()))
})

test_that("Dataset$update_params() does nothing for empty inputs", {
  ds <- lgb.Dataset(
    test_data
    , label = test_label
  )
  params_before <- ds$get_params()
  expect_identical(params_before, list())

  # update_params() returns "self", so calls can be chained
  result <- ds$update_params(params = list())
  expect_true(lgb.is.Dataset(result))

  # an empty update must leave the parameters unchanged
  expect_identical(ds$get_params(), params_before)
})

test_that("Dataset$update_params() works correctly for recognized Dataset parameters", {
  ds <- lgb.Dataset(
    test_data
    , label = test_label
  )
  expect_identical(ds$get_params(), list())

  new_params <- list(
    "data_random_seed" = 708L
    , "enable_bundle" = FALSE
  )
  # update_params() returns "self", so calls can be chained
  result <- ds$update_params(params = new_params)
  expect_true(lgb.is.Dataset(result))

  # every recognized parameter should now be visible via get_params()
  updated_params <- ds$get_params()
  for (name in names(new_params)) {
    expect_identical(new_params[[name]], updated_params[[name]])
  }
})
test_that("Dataset$finalize() should not fail on an already-finalized Dataset", {
  # peek at the R6 object's private handle to observe its lifecycle
  get_handle <- function(ds) ds$.__enclos_env__$private$handle

  dtest <- lgb.Dataset(
    data = test_data
    , label = test_label
  )
  expect_true(lgb.is.null.handle(get_handle(dtest)))

  dtest$construct()
  expect_false(lgb.is.null.handle(get_handle(dtest)))

  dtest$finalize()
  expect_true(lgb.is.null.handle(get_handle(dtest)))

  # a second finalize() on an already-released handle must be a no-op
  dtest$finalize()
  expect_true(lgb.is.null.handle(get_handle(dtest)))
})

test_that("lgb.Dataset: should be able to run lgb.train() immediately after using lgb.Dataset() on a file", {
  dtest <- lgb.Dataset(
    data = test_data
    , label = test_label
  )
  tmp_file <- tempfile(pattern = "lgb.Dataset_")
  lgb.Dataset.save(
    dataset = dtest
    , fname = tmp_file
  )

  # load the Dataset back from disk
  dtest_read_in <- lgb.Dataset(data = tmp_file)

  train_params <- list(
    objective = "binary"
    , metric = "binary_logloss"
    , num_leaves = 5L
    , learning_rate = 1.0
  )

  # training should succeed without an explicit construct() call first
  bst <- lgb.train(
    params = train_params
    , data = dtest_read_in
  )
  expect_true(lgb.is.Booster(x = bst))
})

test_that("lgb.Dataset: should be able to run lgb.cv() immediately after using lgb.Dataset() on a file", {
  dtest <- lgb.Dataset(
    data = test_data
    , label = test_label
  )
  tmp_file <- tempfile(pattern = "lgb.Dataset_")
  lgb.Dataset.save(
    dataset = dtest
    , fname = tmp_file
  )

  # load the Dataset back from disk
  dtest_read_in <- lgb.Dataset(data = tmp_file)

  cv_params <- list(
    objective = "binary"
    , metric = "binary_logloss"
    , num_leaves = 5L
    , learning_rate = 1.0
  )

  # cross-validation should succeed without an explicit construct() call first
  bst <- lgb.cv(
    params = cv_params
    , data = dtest_read_in
  )
  expect_is(bst, "lgb.CVBooster")
})
test_that("lgb.Dataset: should be able to use and retrieve long feature names", {
  # make one feature name longer than the default buffer size used
  # in LGBM_DatasetGetFeatureNames_R
  long_name <- strrep("a", 1000L)
  feature_names <- names(iris)
  feature_names[1L] <- long_name
  names(iris) <- feature_names

  # the long name must survive the trip from R to C++ and back
  dtrain <- lgb.Dataset(
    data = as.matrix(iris[, -5L])
    , label = as.numeric(iris$Species) - 1L
  )
  dtrain$construct()
  col_names <- dtrain$get_colnames()
  expect_equal(col_names[1L], long_name)
  expect_equal(nchar(col_names[1L]), 1000L)
})
test_that("lgb.Dataset: should be able to create a Dataset from a text file with a header", {
  csv_file <- tempfile(pattern = "train_", fileext = ".csv")
  write.table(
    data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
    , file = csv_file
    , sep = ","
    , col.names = TRUE
    , row.names = FALSE
    , quote = FALSE
  )

  ds <- lgb.Dataset(
    data = csv_file
    , params = list(header = TRUE)
  )
  ds$construct()
  # only the two feature columns remain, keeping their header names
  expect_identical(ds$get_colnames(), c("x1", "x2"))
  expect_identical(ds$get_params(), list(header = TRUE))
  expect_identical(ds$dim(), c(100L, 2L))
})

test_that("lgb.Dataset: should be able to create a Dataset from a text file without a header", {
  csv_file <- tempfile(pattern = "train_", fileext = ".csv")
  write.table(
    data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
    , file = csv_file
    , sep = ","
    , col.names = FALSE
    , row.names = FALSE
    , quote = FALSE
  )

  ds <- lgb.Dataset(
    data = csv_file
    , params = list(header = FALSE)
  )
  ds$construct()
  # without a header row, features get auto-generated "Column_N" names
  expect_identical(ds$get_colnames(), c("Column_0", "Column_1"))
  expect_identical(ds$get_params(), list(header = FALSE))
  expect_identical(ds$dim(), c(100L, 2L))
})