lgb.Dataset.R 44.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#' @name lgb_shared_dataset_params
#' @title Shared Dataset parameter docs
#' @description Parameter docs for fields used in \code{lgb.Dataset} construction
#' @param label vector of labels to use as the target variable
#' @param weight numeric vector of sample weights
#' @param init_score initial score is the base prediction lightgbm will boost from
#' @param group used for learning-to-rank tasks. An integer vector describing how to
#'              group rows together as ordered results from the same set of candidate results
#'              to be ranked. For example, if you have a 100-document dataset with
#'              \code{group = c(10, 20, 40, 10, 10, 10)}, that means that you have 6 groups,
#'              where the first 10 records are in the first group, records 11-30 are in the
#'              second group, etc.
#' @param info a list of information of the \code{lgb.Dataset} object. NOTE: use of \code{info}
#'             is deprecated as of v3.3.0. Use keyword arguments (e.g. \code{init_score = init_score})
#'             directly.
#' @keywords internal
NULL

19
20
21
22
23
24
25
26
# [description] List of valid keys for "info" arguments in lgb.Dataset.
#               Wrapped in a function to take advantage of lazy evaluation
#               (so it doesn't matter what order R sources files during installation).
# [return] A character vector of names.
.INFO_KEYS <- function() {
  # field names accepted by Dataset$get_field() and Dataset$set_field()
  valid_keys <- c("label", "weight", "init_score", "group")
  return(valid_keys)
}

James Lamb's avatar
James Lamb committed
27
#' @importFrom methods is
James Lamb's avatar
James Lamb committed
28
#' @importFrom R6 R6Class
29
#' @importFrom utils modifyList
James Lamb's avatar
James Lamb committed
30
31
Dataset <- R6::R6Class(

32
  classname = "lgb.Dataset",
33
  cloneable = FALSE,
Guolin Ke's avatar
Guolin Ke committed
34
  public = list(
James Lamb's avatar
James Lamb committed
35

36
    # Finalize will free up the handles
Guolin Ke's avatar
Guolin Ke committed
37
    # [description] Pass the stored handle to LGBM_DatasetFree_R and then clear
    #               the R-side reference to it.
    # [return] NULL, invisibly.
    finalize = function() {
      .Call(LGBM_DatasetFree_R, private$handle)
      private$handle <- NULL
      return(invisible(NULL))
    },
James Lamb's avatar
James Lamb committed
45

46
    # Initialize will create a starter dataset
Guolin Ke's avatar
Guolin Ke committed
47
    # [description] Validate the inputs and store them on the object. No handle is
    #               created here; that happens in Dataset$construct().
    # [param] data raw data; matrices are stored with storage mode "double"
    # [param] params list of parameters
    # [param] reference NULL or an lgb.Dataset
    # [param] colnames feature names, or NULL
    # [param] categorical_feature names or 1-based indices of categorical features, or NULL
    # [param] predictor NULL or an lgb.Predictor
    # [param] free_raw_data whether construct() should drop the raw data afterwards
    # [param] used_indices row indices for a subset (stored sorted ascending), or NULL
    # [param] info deprecated list of fields; NULL is treated as an empty list
    # [param] label,weight,group,init_score fields; when not NULL they overwrite
    #         the entry of the same name in 'info'
    # [return] NULL, invisibly.
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          info = list(),
                          label = NULL,
                          weight = NULL,
                          group = NULL,
                          init_score = NULL) {

      # validate inputs early to avoid unnecessary computation
      if (!(is.null(reference) || lgb.is.Dataset(reference))) {
        stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset"))
      }
      if (!(is.null(predictor) || lgb.is.Predictor(predictor))) {
        stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
      }

      # Dataset$slice() passes info = NULL. Assigning a length-one value into NULL
      # with `[[<-` produces an atomic vector instead of a list, and looking up a
      # missing name in an atomic vector with `[[` raises an error instead of
      # returning NULL. Always work with a list so later lookups behave uniformly.
      if (is.null(info)) {
        info <- list()
      }

      if (length(info) > 0L) {
        warning(paste0(
          "lgb.Dataset: found fields passed through 'info'. "
          , "As of v3.3.0, this behavior is deprecated, and support for it will be removed in a future release. "
          , "To suppress this warning, use keyword arguments 'label', 'weight', 'group', or 'init_score' directly"
        ))
      }

      # keyword arguments take precedence over same-named entries in 'info'
      if (!is.null(label)) {
        info[["label"]] <- label
      }
      if (!is.null(weight)) {
        info[["weight"]] <- weight
      }
      if (!is.null(group)) {
        info[["group"]] <- group
      }
      if (!is.null(init_score)) {
        info[["init_score"]] <- init_score
      }

      # matrices are always stored as "double"
      if (is.matrix(data)) {
        if (storage.mode(data) != "double") {
          storage.mode(data) <- "double"
        }
      }

      # Setup private attributes
      private$raw_data <- data
      private$params <- params
      private$reference <- reference
      private$colnames <- colnames
      private$categorical_feature <- categorical_feature
      private$predictor <- predictor
      private$free_raw_data <- free_raw_data
      private$used_indices <- sort(used_indices, decreasing = FALSE)
      private$info <- info
      private$version <- 0L

      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
114

115
116
    # [description] Create a new Dataset from 'data' that uses this Dataset as its
    #               reference and inherits its column names, categorical features,
    #               predictor and 'free_raw_data' setting.
    # [param] data raw data for the new Dataset
    # [param] info,label,weight,group,init_score forwarded unchanged to Dataset$new()
    # [param] params list of parameters; these win over values passed through '...'
    # [param] ... extra parameters (triggers a warning)
    # [return] the new Dataset, invisibly.
    create_valid = function(data,
                            info = list(),
                            label = NULL,
                            weight = NULL,
                            group = NULL,
                            init_score = NULL,
                            params = list(),
                            ...) {

      dots <- list(...)
      if (length(dots) > 0L) {
        warning(paste0(
          "Dataset$create_valid(): Found the following passed through '...': "
          , paste(names(dots), collapse = ", ")
          , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
          , "Add these to 'params' instead. "
          , "See ?lgb.Dataset.create.valid for documentation on how to call this function."
        ))
      }

      # precedence, lowest to highest: this Dataset's params, '...', 'params'
      call_params <- modifyList(dots, params)
      merged_params <- modifyList(self$get_params(), call_params)

      valid_set <- Dataset$new(
        data = data
        , params = merged_params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = NULL
        , info = info
        , label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )

      return(invisible(valid_set))

    },
James Lamb's avatar
James Lamb committed
161

162
    # Dataset constructor
Guolin Ke's avatar
Guolin Ke committed
163
    # [description] Create the Dataset handle if it does not exist yet. The handle is
    #               built from a file path, a matrix or a dgCMatrix, or, when
    #               'used_indices' is set, as a subset of the reference Dataset.
    #               Afterwards feature names and the stored fields are set on it,
    #               and the raw data is dropped if 'free_raw_data' is TRUE.
    # [return] self, invisibly. Raises an error if no raw data is available, the
    #          data type is unsupported, a categorical feature is unknown or out
    #          of range, or no label is set.
    construct = function() {

      # nothing to do if a handle already exists
      if (!lgb.is.null.handle(x = private$handle)) {
        return(invisible(self))
      }

      raw_data_is_matrix_like <- is.matrix(private$raw_data) ||
        methods::is(private$raw_data, "dgCMatrix")

      # Get feature names
      cnames <- NULL
      if (raw_data_is_matrix_like) {
        cnames <- colnames(private$raw_data)
      }

      # set feature names if they do not exist
      if (is.null(private$colnames) && !is.null(cnames)) {
        private$colnames <- as.character(cnames)
      }

      # Get categorical feature index
      if (!is.null(private$categorical_feature)) {

        if (is.character(private$categorical_feature)) {

          # names are resolved against the known feature names
          cate_positions <- match(private$categorical_feature, private$colnames)
          is_unknown <- is.na(cate_positions)
          if (any(is_unknown)) {
            stop(
              "lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
              , sQuote(private$categorical_feature[is_unknown])
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(cate_positions - 1L)

        } else {

          # Work out how many features there are. The column count of the raw data
          # is used when it is available, because data without column names would
          # otherwise look like it has zero features and every index would be
          # rejected. If the count cannot be determined here, the check is skipped.
          num_features <- NULL
          if (raw_data_is_matrix_like) {
            num_features <- ncol(private$raw_data)
          } else if (!is.null(private$colnames)) {
            num_features <- length(private$colnames)
          }

          if (!is.null(num_features) && max(private$categorical_feature) > num_features) {
            stop(
              "lgb.self.get.handle: supplied a too large value in categorical_feature: "
              , max(private$categorical_feature)
              , " but only "
              , num_features
              , " features"
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1L)

        }

        # Store indices for categorical features
        private$params$categorical_feature <- cate_indices

      }

      # Generate parameter str
      params_str <- lgb.params2str(params = private$params)

      # Get handle of reference dataset (constructing it if needed)
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }

      if (is.null(private$used_indices)) {

        # not subsetting, constructing from raw data
        if (is.null(private$raw_data)) {
          stop(paste0(
            "Attempting to create a Dataset without any raw data. "
            , "This can happen if you have called Dataset$finalize() or if this Dataset was saved with saveRDS(). "
            , "To avoid this error in the future, use lgb.Dataset.save() or "
            , "Dataset$save_binary() to save lightgbm Datasets."
          ))
        }

        if (is.character(private$raw_data)) {

          # raw data is a path to a data file
          handle <- .Call(
            LGBM_DatasetCreateFromFile_R
            , path.expand(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (is.matrix(private$raw_data)) {

          # raw data is a dense matrix
          handle <- .Call(
            LGBM_DatasetCreateFromMat_R
            , private$raw_data
            , nrow(private$raw_data)
            , ncol(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (methods::is(private$raw_data, "dgCMatrix")) {

          # raw data is a dgCMatrix (sparse matrix, column compressed)
          if (length(private$raw_data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }
          handle <- .Call(
            LGBM_DatasetCreateFromCSC_R
            , private$raw_data@p
            , private$raw_data@i
            , private$raw_data@x
            , length(private$raw_data@p)
            , length(private$raw_data@x)
            , nrow(private$raw_data)
            , params_str
            , ref_handle
          )

        } else {

          # Unknown data type
          stop(
            "lgb.Dataset.construct: does not support constructing from "
            , sQuote(class(private$raw_data))
          )

        }

      } else {

        # subsetting requires a reference Dataset to take the rows from
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        # Construct subset
        handle <- .Call(
          LGBM_DatasetGetSubset_R
          , ref_handle
          , c(private$used_indices) # Adding c() fixes issue in R v3.5
          , length(private$used_indices)
          , params_str
        )

      }

      if (lgb.is.null.handle(x = handle)) {
        stop("lgb.Dataset.construct: cannot create Dataset handle")
      }

      # Setup class and private type
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Set feature names
      if (!is.null(private$colnames)) {
        self$set_colnames(colnames = private$colnames)
      }

      # Load init score if requested (only when constructing from raw data)
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        init_score <- private$predictor$predict(
          data = private$raw_data
          , rawscore = TRUE
          , reshape = TRUE
        )

        # No transpose needed before flattening (original note: data is column-major)
        init_score <- as.vector(init_score)
        private$info$init_score <- init_score

      }

      # Should we free raw data?
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Push every stored field through set_field() now that a handle exists
      if (length(private$info) > 0L) {

        for (i in seq_along(private$info)) {

          p <- private$info[i]
          self$set_field(
            field_name = names(p)
            , data = p[[1L]]
          )

        }

      }

      # A label is mandatory
      if (is.null(self$get_field(field_name = "label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
364

365
    # Dimension function
Guolin Ke's avatar
Guolin Ke committed
366
    # [description] Dimensions of the Dataset. Uses the handle when one exists,
    #               otherwise the raw data if it is a matrix or dgCMatrix.
    # [return] vector with the number of rows and the number of columns. Raises
    #          an error when neither source is available.
    dim = function() {

      has_handle <- !lgb.is.null.handle(x = private$handle)

      if (has_handle) {

        # NOTE(review): these placeholders are passed to the native routines and
        # returned right after, so they are presumably filled in place. Keep them
        # as two separate assignments.
        num_row <- 0L
        num_col <- 0L

        .Call(
          LGBM_DatasetGetNumData_R
          , private$handle
          , num_row
        )
        .Call(
          LGBM_DatasetGetNumFeature_R
          , private$handle
          , num_col
        )
        return(c(num_row, num_col))

      }

      # no handle yet: fall back to the raw data when it has dimensions
      # NOTE: dgCMatrix requires the Matrix package
      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        return(dim(private$raw_data))
      }

      # Trying to work with unknown dimensions is not possible
      stop(
        "dim: cannot get dimensions before dataset has been constructed, "
        , "please call lgb.Dataset.construct explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
406

407
    # Get column names
Guolin Ke's avatar
Guolin Ke committed
408
    # [description] Feature names of the Dataset. With a handle, names are read
    #               through LGBM_DatasetGetFeatureNames_R and cached in
    #               private$colnames; otherwise they come from the raw data if it
    #               is a matrix or dgCMatrix.
    # [return] the feature names. Raises an error when neither source is available.
    get_colnames = function() {

      if (!lgb.is.null.handle(x = private$handle)) {
        feature_names <- .Call(
          LGBM_DatasetGetFeatureNames_R
          , private$handle
        )
        private$colnames <- feature_names
        return(private$colnames)
      }

      # no handle yet: use the column names of the raw data when it has any
      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        return(colnames(private$raw_data))
      }

      # Trying to work with unknown formats is not possible
      stop(
        "Dataset$get_colnames(): cannot get column names before dataset has been constructed, please call "
        , "lgb.Dataset.construct() explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
434

435
    # Set column names
Guolin Ke's avatar
Guolin Ke committed
436
    # [description] Store feature names and, if a handle exists, send them to it
    #               as a single tab-separated string.
    # [param] colnames feature names; NULL or a zero-length value is a no-op
    # [return] self, invisibly.
    set_colnames = function(colnames) {

      # nothing to set
      if (is.null(colnames)) {
        return(invisible(self))
      }
      new_names <- as.character(colnames)
      if (length(new_names) == 0L) {
        return(invisible(self))
      }

      private$colnames <- new_names

      if (!lgb.is.null.handle(x = private$handle)) {
        # Merge names with tab separation
        merged_name <- paste0(private$colnames, collapse = "\t")
        .Call(
          LGBM_DatasetSetFeatureNames_R
          , private$handle
          , merged_name
        )
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
466

Guolin Ke's avatar
Guolin Ke committed
467
    # [description] Deprecated alias for Dataset$get_field(). Always warns, then
    #               delegates.
    # [param] name field name, passed to get_field() as 'field_name'
    # [return] whatever Dataset$get_field() returns.
    getinfo = function(name) {
      warning(paste0(
        "Dataset$getinfo() is deprecated and will be removed in a future release. "
        , "Use Dataset$get_field() instead."
      ))
      field_value <- self$get_field(field_name = name)
      return(field_value)
    },

    # [description] Get one field of the Dataset. A value already stored in
    #               private$info is returned as-is; otherwise it is read through
    #               the handle and cached in private$info.
    # [param] field_name a single string, one of .INFO_KEYS()
    # [return] the field's values, or NULL if the field has size 0. Raises an error
    #          for an invalid name, or if the value is not cached and there is
    #          no handle.
    get_field = function(field_name) {

      # Check if attribute key is in the known attribute list
      is_valid_name <- is.character(field_name) &&
        length(field_name) == 1L &&
        field_name %in% .INFO_KEYS()
      if (!is_valid_name) {
        stop(
          "Dataset$get_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      # already known on the R side
      cached_value <- private$info[[field_name]]
      if (!is.null(cached_value)) {
        return(cached_value)
      }

      if (lgb.is.null.handle(x = private$handle)) {
        stop("Cannot perform Dataset$get_field() before constructing Dataset.")
      }

      # Get field size of info
      info_len <- 0L
      .Call(
        LGBM_DatasetGetFieldSize_R
        , private$handle
        , field_name
        , info_len
      )

      if (info_len > 0L) {

        # 'group' is an integer field, all other fields are numeric
        field_mode <- if (field_name == "group") "integer" else "numeric"
        field_values <- vector(mode = field_mode, length = info_len)

        .Call(
          LGBM_DatasetGetField_R
          , private$handle
          , field_name
          , field_values
        )

        private$info[[field_name]] <- field_values

      }

      return(private$info[[field_name]])

    },
James Lamb's avatar
James Lamb committed
531

532
    # [description] Set one field of the Dataset. The value is coerced (integer for
    #               "group", numeric otherwise) and stored in private$info. If a
    #               handle exists and the value is non-empty, it is also sent to
    #               the handle and private$version is incremented.
    # [param] field_name a single string, one of .INFO_KEYS()
    # [param] data values for the field
    # [return] self, invisibly. Raises an error for an invalid name.
    set_field = function(field_name, data) {

      # Check if attribute key is in the known attribute list
      is_valid_name <- is.character(field_name) &&
        length(field_name) == 1L &&
        field_name %in% .INFO_KEYS()
      if (!is_valid_name) {
        stop(
          "Dataset$set_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      # 'group' is an integer field, all other fields are numeric
      if (field_name == "group") {
        data <- as.integer(data)
      } else {
        data <- as.numeric(data)
      }

      # Store information privately
      private$info[[field_name]] <- data

      should_push <- !lgb.is.null.handle(x = private$handle) &&
        !is.null(data) &&
        length(data) > 0L

      if (should_push) {
        .Call(
          LGBM_DatasetSetField_R
          , private$handle
          , field_name
          , data
          , length(data)
        )
        private$version <- private$version + 1L
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
573

574
    # Slice dataset
Guolin Ke's avatar
Guolin Ke committed
575
    # [description] Create a new, not yet constructed Dataset describing a row
    #               subset of this one. This Dataset becomes the reference of the
    #               new one.
    # [param] idxset row indices to keep; they are sorted ascending
    # [param] ... deprecated; triggers a warning. 'group', 'init_score', 'label'
    #         and 'weight' are forwarded as fields, everything else is merged
    #         into the parameters.
    # [return] the new Dataset.
    slice = function(idxset, ...) {

      dots <- list(...)

      if (length(dots) > 0L) {
        warning(paste0(
          "Dataset$slice(): Found the following passed through '...': "
          , paste(names(dots), collapse = ", ")
          , ". These are ignored and should be removed. "
          , "To change the parameters of a Dataset produced by Dataset$slice(), use Dataset$set_params(). "
          , "To modify attributes like 'init_score', use Dataset$set_field(). "
          , "In future releases of lightgbm, this warning will become an error."
        ))
      }

      # pull Dataset fields out of '...'
      # (indexing a list with a name it does not contain yields NULL)
      label <- dots[["label"]]
      weight <- dots[["weight"]]
      group <- dots[["group"]]
      init_score <- dots[["init_score"]]

      # whatever remains after dropping the fields is treated as parameters
      param_args <- dots
      for (field_key in .INFO_KEYS()) {
        param_args[[field_key]] <- NULL
      }

      subset_params <- utils::modifyList(self$get_params(), param_args)

      sliced <- Dataset$new(
        data = NULL
        , params = subset_params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = sort(idxset, decreasing = FALSE)
        , info = NULL
        , group = group
        , init_score = init_score
        , label = label
        , weight = weight
      )
      return(sliced)

    },
James Lamb's avatar
James Lamb committed
623

624
625
626
    # [description] Update Dataset parameters. If it has not been constructed yet,
    #               this operation just happens on the R side (updating private$params).
    #               If it has been constructed, parameters will be updated on the C++ side.
627
    # [param] params list of parameters to merge into the current ones; an empty
    #         list is a no-op
    # [return] self, invisibly. Re-raises the error from the compatibility check if
    #          the check fails and the raw data is no longer available.
    update_params = function(params) {

      if (length(params) == 0L) {
        return(invisible(self))
      }

      new_params <- utils::modifyList(private$params, params)

      # not constructed yet: only the R-side parameters need to change
      if (lgb.is.null.handle(x = private$handle)) {
        private$params <- new_params
        return(invisible(self))
      }

      tryCatch({
        .Call(
          LGBM_DatasetUpdateParamChecking_R
          , lgb.params2str(params = private$params)
          , lgb.params2str(params = params)
        )
        # The check passed, so record the new values on the R side as well.
        # Without this, Dataset$get_params() keeps returning the old values.
        private$params <- new_params
      }, error = function(e) {
        # If updating failed but raw data is not available, raise an error because
        # achieving what the user asked for is not possible
        if (is.null(private$raw_data)) {
          stop(e)
        }

        # If updating failed but raw data is available, modify the params
        # on the R side and re-set ("deconstruct") the Dataset
        private$params <- new_params
        self$finalize()
      })

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
656

657
658
659
660
661
662
663
664
665
666
667
    # [description] Get the stored parameters whose names appear in
    #               .DATASET_PARAMETERS().
    # [return] a named list; empty if no stored parameter qualifies.
    get_params = function() {
      known_names <- unname(unlist(.DATASET_PARAMETERS()))
      stored_names <- names(private$params)
      kept_names <- stored_names[stored_names %in% known_names]

      selected <- list()
      for (key in kept_names) {
        selected[[key]] <- private$params[[key]]
      }
      return(selected)
    },

668
    # Set categorical feature parameter
669
    # [description] Replace the categorical features and finalize the Dataset so
    #               the handle is rebuilt on the next construct(). Requires the
    #               raw data to still be available.
    # [param] categorical_feature new categorical features
    # [return] self, invisibly. Raises an error if the value differs from the
    #          current one and the raw data is NULL.
    set_categorical_feature = function(categorical_feature) {

      # an identical value requires no changes
      is_unchanged <- identical(private$categorical_feature, categorical_feature)
      if (is_unchanged) {
        return(invisible(self))
      }

      # the raw data must still be present
      if (is.null(private$raw_data)) {
        stop(
          "set_categorical_feature: cannot set categorical feature after freeing raw data,\n          please set "
          , sQuote("free_raw_data = FALSE")
          , " when you construct lgb.Dataset"
        )
      }

      private$categorical_feature <- categorical_feature

      # drop the current handle
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
690

691
    # Set reference
Guolin Ke's avatar
Guolin Ke committed
692
    # [description] Use another Dataset as the reference. Categorical features,
    #               column names and the predictor are copied from it, then this
    #               Dataset is finalized so the handle is rebuilt on the next
    #               construct().
    # [param] reference an lgb.Dataset
    # [return] self, invisibly. Raises an error if the raw data is NULL or
    #          'reference' is not an lgb.Dataset.
    set_reference = function(reference) {

      # an identical reference requires no changes
      is_unchanged <- identical(private$reference, reference)
      if (is_unchanged) {
        return(invisible(self))
      }

      # changing the reference removes the Dataset object on the C++ side, so it should only
      # be done if you still have the raw_data available, so that the new Dataset can be reconstructed
      if (is.null(private$raw_data)) {
        stop(
          "set_reference: cannot set reference after freeing raw data,\n          please set "
          , sQuote("free_raw_data = FALSE")
          , " when you construct lgb.Dataset"
        )
      }

      if (!lgb.is.Dataset(reference)) {
        stop("set_reference: Can only use lgb.Dataset as a reference")
      }

      # copy settings from the reference
      reference_private <- reference$.__enclos_env__$private
      self$set_categorical_feature(categorical_feature = reference_private$categorical_feature)
      self$set_colnames(colnames = reference$get_colnames())
      private$set_predictor(predictor = reference_private$predictor)

      private$reference <- reference

      # drop the current handle
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
723

724
    # Save binary model
Guolin Ke's avatar
Guolin Ke committed
725
    save_binary = function(fname) {

      # [description] Write this Dataset to a binary file.
      # [param] fname path of the file to write
      # [return] this Dataset, invisibly

      # construct() is called first so that private$handle can be passed to the native routine
      self$construct()

      # the path is run through path.expand() on the R side before being handed over
      out_path <- path.expand(fname)
      .Call(LGBM_DatasetSaveBinary_R, private$handle, out_path)

      return(invisible(self))
    }
James Lamb's avatar
James Lamb committed
736

Guolin Ke's avatar
Guolin Ke committed
737
738
  ),
  private = list(
739
740
741
742
743
    handle = NULL,
    raw_data = NULL,
    params = list(),
    reference = NULL,
    colnames = NULL,
744
    categorical_feature = NULL,
745
746
747
748
    predictor = NULL,
    free_raw_data = TRUE,
    used_indices = NULL,
    info = NULL,
749
    version = 0L,
James Lamb's avatar
James Lamb committed
750

751
752
    # Get handle
    get_handle = function() {

      # [description] Return the handle of this Dataset, constructing the Dataset
      #               first if the handle is still null.
      # [return] the value of private$handle
      needs_construct <- lgb.is.null.handle(x = private$handle)
      if (needs_construct) {
        self$construct()
      }

      return(private$handle)

    },
James Lamb's avatar
James Lamb committed
761

762
    # Set predictor
Guolin Ke's avatar
Guolin Ke committed
763
    set_predictor = function(predictor) {

      # [description] Store a predictor on this Dataset.
      # [param] predictor an lgb.Predictor object, or NULL
      # [return] this Dataset, invisibly

      # nothing to do if this predictor is already the stored one
      if (identical(private$predictor, predictor)) {
        return(invisible(self))
      }

      # refuse to change the predictor once the raw data has been freed
      if (is.null(private$raw_data)) {
        # the message is assembled from single-line pieces so that it does not
        # pick up a line break and source indentation from the code layout
        stop(
          "set_predictor: cannot set predictor after freeing raw data, please set "
          , sQuote("free_raw_data = FALSE")
          , " when you construct lgb.Dataset"
        )
      }

      # NULL is accepted as-is; anything else has to be an lgb.Predictor
      if (!is.null(predictor) && !lgb.is.Predictor(predictor)) {
        stop("set_predictor: Can only use lgb.Predictor as predictor")
      }

      # Store predictor
      private$predictor <- predictor

      # Finalize and return self
      self$finalize()
      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
793

Guolin Ke's avatar
Guolin Ke committed
794
795
796
  )
)

797
798
799
#' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#'              or local file (that was created previously by saving an \code{lgb.Dataset}).
800
#' @inheritParams lgb_shared_dataset_params
801
802
803
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{lgb.Dataset} file
804
805
806
807
808
809
810
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values.
#' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning
#'                  continuous features into histograms. If you want to apply the same bin boundaries from an existing
#'                  dataset to new \code{data}, pass that existing Dataset to this argument.
Guolin Ke's avatar
Guolin Ke committed
811
#' @param colnames names of columns
812
813
814
815
816
817
818
819
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param free_raw_data LightGBM constructs its data format, called a "Dataset", from tabular data.
#'                      By default, that Dataset object on the R side does not keep a copy of the raw data.
#'                      This reduces LightGBM's memory consumption, but it means that the Dataset object
#'                      cannot be changed after it has been constructed. If you'd prefer to be able to
#'                      change the Dataset object after construction, set \code{free_raw_data = FALSE}.
820
#' @param ... other parameters passed to \code{params}
James Lamb's avatar
James Lamb committed
821
#'
Guolin Ke's avatar
Guolin Ke committed
822
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
823
#'
Guolin Ke's avatar
Guolin Ke committed
824
#' @examples
825
#' \donttest{
826
827
828
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
829
830
831
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
832
#' lgb.Dataset.construct(dtrain)
833
#' }
Guolin Ke's avatar
Guolin Ke committed
834
835
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        info = list(),
                        label = NULL,
                        weight = NULL,
                        group = NULL,
                        init_score = NULL,
                        ...) {

  # anything passed through '...' is merged into 'params'
  # (on a name clash, the value from '...' replaces the one in 'params')
  dots <- list(...)
  params <- utils::modifyList(params, dots)

  # passing parameters through '...' still works, but the caller is told to stop doing it
  if (length(dots) > 0L) {
    dots_msg <- paste0(
      "lgb.Dataset: Found the following passed through '...': "
      , paste(names(dots), collapse = ", ")
      , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
      , "Add these to 'params' instead. See ?lgb.Dataset for documentation on how to call this function."
    )
    warning(dots_msg)
  }

  # build the Dataset object; 'predictor' and 'used_indices' are always NULL
  # when a Dataset is created through this function
  new_dataset <- Dataset$new(
    data = data
    , params = params
    , reference = reference
    , colnames = colnames
    , categorical_feature = categorical_feature
    , predictor = NULL
    , free_raw_data = free_raw_data
    , used_indices = NULL
    , info = info
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
  )

  return(invisible(new_dataset))

}

881
882
883
#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
884
#' @inheritParams lgb_shared_dataset_params
Guolin Ke's avatar
Guolin Ke committed
885
#' @param dataset \code{lgb.Dataset} object, training data
886
887
888
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{lgb.Dataset} file
889
890
891
892
893
894
895
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values. If this is an empty list (the default), the validation Dataset
#'               will have the same parameters as the Dataset passed to argument \code{dataset}.
#' @param ... additional \code{lgb.Dataset} parameters.
#'            NOTE: As of v3.3.0, use of \code{...} is deprecated. Add parameters to \code{params} directly.
James Lamb's avatar
James Lamb committed
896
#'
Guolin Ke's avatar
Guolin Ke committed
897
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
898
#'
Guolin Ke's avatar
Guolin Ke committed
899
#' @examples
900
#' \donttest{
901
902
903
904
905
906
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
#'
#' # parameters can be changed between the training data and validation set,
#' # for example to account for training data in a text file with a header row
#' # and validation data in a text file without it
#' train_file <- tempfile(pattern = "train_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = train_file
#'   , sep = ","
#'   , col.names = TRUE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' valid_file <- tempfile(pattern = "valid_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = valid_file
#'   , sep = ","
#'   , col.names = FALSE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' dtrain <- lgb.Dataset(
#'   data = train_file
#'   , params = list(has_header = TRUE)
#' )
#' dtrain$construct()
#'
#' dvalid <- lgb.Dataset(
#'   data = valid_file
#'   , params = list(has_header = FALSE)
#' )
#' dvalid$construct()
942
#' }
943
#' @importFrom utils modifyList
Guolin Ke's avatar
Guolin Ke committed
944
#' @export
945
946
947
948
949
950
951
952
953
lgb.Dataset.create.valid <- function(dataset,
                                     data,
                                     info = list(),
                                     label = NULL,
                                     weight = NULL,
                                     group = NULL,
                                     init_score = NULL,
                                     params = list(),
                                     ...) {

  # the training data has to be an lgb.Dataset, since the validation
  # Dataset is created by calling a method on it
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # passing parameters through '...' still works, but the caller is told to stop doing it
  dots <- list(...)
  if (length(dots) > 0L) {
    dots_msg <- paste0(
      "lgb.Dataset.create.valid: Found the following passed through '...': "
      , paste(names(dots), collapse = ", ")
      , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
      , "Add these to 'params' instead. See ?lgb.Dataset.create.valid for documentation on how to call this function."
    )
    warning(dots_msg)
  }

  # hand everything over to the training Dataset; values from '...' are merged
  # into 'params' as part of the call
  valid_dataset <- dataset$create_valid(
    data = data
    , info = info
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
    , params = utils::modifyList(params, dots)
  )

  return(invisible(valid_dataset))

}
Guolin Ke's avatar
Guolin Ke committed
983

984
985
986
#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
Guolin Ke's avatar
Guolin Ke committed
987
#' @param dataset Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
988
#'
Guolin Ke's avatar
Guolin Ke committed
989
#' @examples
990
#' \donttest{
991
992
993
994
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
995
#' }
996
#' @return constructed dataset
Guolin Ke's avatar
Guolin Ke committed
997
998
#' @export
lgb.Dataset.construct <- function(dataset) {

  # only lgb.Dataset objects have a construct() method to call
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }

  # whatever construct() returns is passed back invisibly
  constructed <- dataset$construct()
  return(invisible(constructed))

}

1008
1009
#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
Guolin Ke's avatar
Guolin Ke committed
1010
#' @param x Object of class \code{lgb.Dataset}
1011
#' @param ... other parameters (ignored)
James Lamb's avatar
James Lamb committed
1012
#'
Guolin Ke's avatar
Guolin Ke committed
1013
#' @return a vector of numbers of rows and of columns
James Lamb's avatar
James Lamb committed
1014
#'
Guolin Ke's avatar
Guolin Ke committed
1015
1016
1017
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
James Lamb's avatar
James Lamb committed
1018
#'
Guolin Ke's avatar
Guolin Ke committed
1019
#' @examples
1020
#' \donttest{
1021
1022
1023
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
1024
#'
1025
1026
1027
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
1028
#' }
Guolin Ke's avatar
Guolin Ke committed
1029
1030
1031
#' @rdname dim
#' @export
dim.lgb.Dataset <- function(x, ...) {

  # nothing in '...' is used; tell the caller if something was passed anyway
  ignored_args <- list(...)
  if (length(ignored_args) > 0L) {
    ignored_msg <- paste0(
      "dim.lgb.Dataset: Found the following passed through '...': "
      , paste(names(ignored_args), collapse = ", ")
      , ". These are ignored. In future releases of lightgbm, this warning will become an error. "
      , "See ?dim.lgb.Dataset for documentation on how to call this function."
    )
    warning(ignored_msg)
  }

  is_dataset <- lgb.is.Dataset(x = x)
  if (!is_dataset) {
    stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # the Dataset object reports its own dimensions
  return(x$dim())

}

1051
1052
1053
#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}, thus setting of
#'              row names would have no effect and returned row names would be NULL.
Guolin Ke's avatar
Guolin Ke committed
1054
1055
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one is ignored
1056
#'              and the second one is column names
Guolin Ke's avatar
Guolin Ke committed
1057
1058
1059
1060
1061
1062
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
1063
#' \donttest{
1064
1065
1066
1067
1068
1069
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
1070
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
1071
#' print(dtrain, verbose = TRUE)
1072
#' }
Guolin Ke's avatar
Guolin Ke committed
1073
#' @rdname dimnames.lgb.Dataset
1074
#' @return A list with the dimension names of the dataset
Guolin Ke's avatar
Guolin Ke committed
1075
1076
#' @export
dimnames.lgb.Dataset <- function(x) {

  is_dataset <- lgb.is.Dataset(x = x)
  if (!is_dataset) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # first element (row names) is always NULL, second element is the column names
  column_names <- x$get_colnames()
  return(list(NULL, column_names))

}

#' @rdname dimnames.lgb.Dataset
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # 'value' has to be a plain list of exactly two elements: (row names, column names)
  is_two_element_list <- identical(class(value), "list") && length(value) == 2L
  if (!is_two_element_list) {
    stop("invalid ", sQuote("value"), " given: must be a list of two elements")
  }

  # row names cannot be set, so the first element has to be NULL
  if (!is.null(value[[1L]])) {
    stop("lgb.Dataset does not have rownames")
  }

  new_colnames <- value[[2L]]

  # NULL column names are passed through as-is; anything else
  # has to provide exactly one name per column
  if (!is.null(new_colnames) && ncol(x) != length(new_colnames)) {
    stop(
      "can't assign "
      , sQuote(length(new_colnames))
      , " colnames to an lgb.Dataset with "
      , sQuote(ncol(x))
      , " columns"
    )
  }

  # replacement functions return the object they were called on
  x$set_colnames(colnames = new_colnames)
  return(x)

}

1125
1126
1127
#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#'              original \code{lgb.Dataset} object
Nikita Titov's avatar
Nikita Titov committed
1128
#' @param dataset Object of class \code{lgb.Dataset}
1129
#' @param idxset an integer vector of indices of rows needed
Guolin Ke's avatar
Guolin Ke committed
1130
1131
#' @param ... other parameters (currently not used)
#' @return constructed sub dataset
James Lamb's avatar
James Lamb committed
1132
#'
Guolin Ke's avatar
Guolin Ke committed
1133
#' @examples
1134
#' \donttest{
1135
1136
1137
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
1138
#'
1139
#' dsub <- lightgbm::slice(dtrain, seq_len(42L))
1140
#' lgb.Dataset.construct(dsub)
1141
#' labels <- lightgbm::get_field(dsub, "label")
1142
#' }
Guolin Ke's avatar
Guolin Ke committed
1143
#' @export
1144
1145
1146
slice <- function(dataset, ...) {
  # S3 generic: the method is picked based on the class of 'dataset'
  UseMethod("slice")
}
Guolin Ke's avatar
Guolin Ke committed
1147
1148
1149
1150

#' @rdname slice
#' @export
slice.lgb.Dataset <- function(dataset, idxset, ...) {

  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # anything in '...' is forwarded unchanged to the Dataset's own slice() method
  sliced <- dataset$slice(idxset = idxset, ...)
  return(invisible(sliced))

}

1160
1161
1162
#' @name getinfo
#' @title Get information of an \code{lgb.Dataset} object
#' @description Get one attribute of a \code{lgb.Dataset}
Guolin Ke's avatar
Guolin Ke committed
1163
1164
#' @param dataset Object of class \code{lgb.Dataset}
#' @param name the name of the information field to get (see details)
1165
#' @param ... other parameters (ignored)
Guolin Ke's avatar
Guolin Ke committed
1166
#' @return info data
James Lamb's avatar
James Lamb committed
1167
#'
Guolin Ke's avatar
Guolin Ke committed
1168
1169
#' @details
#' The \code{name} field can be one of the following:
James Lamb's avatar
James Lamb committed
1170
#'
Guolin Ke's avatar
Guolin Ke committed
1171
1172
1173
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
1174
1175
1176
1177
1178
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
Nikita Titov's avatar
Nikita Titov committed
1179
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
Guolin Ke's avatar
Guolin Ke committed
1180
#' }
James Lamb's avatar
James Lamb committed
1181
#'
Guolin Ke's avatar
Guolin Ke committed
1182
#' @examples
1183
#' \donttest{
1184
1185
1186
1187
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
James Lamb's avatar
James Lamb committed
1188
#'
1189
#' labels <- lightgbm::getinfo(dtrain, "label")
1190
#' lightgbm::set_field(dtrain, "label", 1 - labels)
James Lamb's avatar
James Lamb committed
1191
#'
1192
1193
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
1194
#' }
Guolin Ke's avatar
Guolin Ke committed
1195
#' @export
1196
1197
1198
getinfo <- function(dataset, ...) {
  # S3 generic: the method is picked based on the class of 'dataset'
  UseMethod("getinfo")
}
Guolin Ke's avatar
Guolin Ke committed
1199
1200
1201
1202

#' @rdname getinfo
#' @export
getinfo.lgb.Dataset <- function(dataset, name, ...) {

  # this entry point is deprecated; every call warns, whatever the arguments are
  warning("Calling getinfo() on a lgb.Dataset is deprecated. Use get_field() instead.")

  # nothing in '...' is used; tell the caller if something was passed anyway
  ignored_args <- list(...)
  if (length(ignored_args) > 0L) {
    ignored_msg <- paste0(
      "getinfo.lgb.Dataset: Found the following passed through '...': "
      , paste(names(ignored_args), collapse = ", ")
      , ". These are ignored. In future releases of lightgbm, this warning will become an error. "
      , "See ?getinfo.lgb.Dataset for documentation on how to call this function."
    )
    warning(ignored_msg)
  }

  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("getinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # the lookup itself is done by the Dataset's get_field() method
  return(dataset$get_field(field_name = name))

}

1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317

#' @name get_field
#' @title Get one attribute of a \code{lgb.Dataset}
#' @description Get one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to get. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @return requested attribute
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
#' }
#' @export
get_field <- function(dataset, field_name) {
  # S3 generic: the method is picked based on the class of 'dataset'
  UseMethod("get_field")
}

#' @rdname get_field
#' @export
get_field.lgb.Dataset <- function(dataset, field_name) {

  # reject anything that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("get_field.lgb.Dataset(): input dataset should be an lgb.Dataset object")
  }

  # the lookup itself is done by the Dataset's get_field() method
  field_value <- dataset$get_field(field_name = field_name)
  return(field_value)

}

#' @name set_field
#' @title Set one attribute of a \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to set. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @param data The data for the field. See examples.
#' @return The \code{lgb.Dataset} you passed in.
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
#' }
#' @export
set_field <- function(dataset, field_name, data) {
  # S3 generic: the method is picked based on the class of 'dataset'
  UseMethod("set_field")
}

#' @rdname set_field
#' @export
set_field.lgb.Dataset <- function(dataset, field_name, data) {

  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("set_field.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # the update itself is done by the Dataset's set_field() method;
  # its result is passed back invisibly
  updated <- dataset$set_field(field_name = field_name, data = data)
  return(invisible(updated))

}

1320
1321
1322
1323
#' @name lgb.Dataset.set.categorical
#' @title Set categorical feature of \code{lgb.Dataset}
#' @description Set the categorical features of an \code{lgb.Dataset} object. Use this function
#'              to tell LightGBM which features should be treated as categorical.
1324
#' @param dataset object of class \code{lgb.Dataset}
1325
1326
1327
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
1328
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1329
#'
1330
#' @examples
1331
#' \donttest{
1332
1333
1334
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1335
1336
1337
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
1338
#' lgb.Dataset.set.categorical(dtrain, 1L:2L)
1339
#' }
1340
1341
1342
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # the update itself is done by the Dataset's set_categorical_feature() method;
  # its result is passed back invisibly
  updated <- dataset$set_categorical_feature(categorical_feature = categorical_feature)
  return(invisible(updated))

}

1352
1353
1354
#' @name lgb.Dataset.set.reference
#' @title Set reference of \code{lgb.Dataset}
#' @description If you want to use validation data, you should set reference to training data
Guolin Ke's avatar
Guolin Ke committed
1355
1356
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
1357
#'
1358
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1359
#'
Guolin Ke's avatar
Guolin Ke committed
1360
#' @examples
1361
#' \donttest{
1362
#' # create training Dataset
1363
1364
1365
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1366
1367
#'
#' # create a validation Dataset, using dtrain as a reference
1368
1369
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
1370
#' dtest <- lgb.Dataset(test$data, label = test$label)
1371
#' lgb.Dataset.set.reference(dtest, dtrain)
1372
#' }
Guolin Ke's avatar
Guolin Ke committed
1373
1374
1375
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # only 'dataset' is checked here; 'reference' is handed to set_reference() as-is
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")
  }

  updated <- dataset$set_reference(reference = reference)
  return(invisible(updated))
}

1384
1385
1386
1387
#' @name lgb.Dataset.save
#' @title Save \code{lgb.Dataset} to a binary file
#' @description Please note that \code{init_score} is not saved in binary file.
#'              If you need it, please set it again after loading Dataset.
Guolin Ke's avatar
Guolin Ke committed
1388
1389
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname character, path of the output file
James Lamb's avatar
James Lamb committed
1390
#'
1391
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1392
#'
Guolin Ke's avatar
Guolin Ke committed
1393
#' @examples
1394
#' \donttest{
1395
1396
1397
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1398
#' lgb.Dataset.save(dtrain, tempfile(fileext = ".bin"))
1399
#' }
Guolin Ke's avatar
Guolin Ke committed
1400
1401
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # Both checks report this function's own name, so that the error
  # points the caller at the call that actually failed.
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  # NOTE(review): the message mentions file connections, but only values for which
  # is.character() is TRUE get past this check. The wording is kept as it was.
  if (!is.character(fname)) {
    stop("lgb.Dataset.save: fname should be a character or a file connection")
  }

  # the file is written by the Dataset's save_binary() method;
  # its result is passed back invisibly
  return(invisible(dataset$save_binary(fname = fname)))
}