lgb.Dataset.R 41.1 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#' @name lgb_shared_dataset_params
#' @title Shared Dataset parameter docs
#' @description Parameter docs for fields used in \code{lgb.Dataset} construction
#' @param label vector of labels to use as the target variable
#' @param weight numeric vector of sample weights
#' @param init_score initial score is the base prediction lightgbm will boost from
#' @param group used for learning-to-rank tasks. An integer vector describing how to
#'              group rows together as ordered results from the same set of candidate results
#'              to be ranked. For example, if you have a 100-document dataset with
#'              \code{group = c(10, 20, 40, 10, 10, 10)}, that means that you have 6 groups,
#'              where the first 10 records are in the first group, records 11-30 are in the
#'              second group, etc.
#' @keywords internal
NULL

16
17
18
19
20
21
22
23
# [description] Names of the Dataset fields ("info" keys) accepted by lgb.Dataset.
#               Implemented as a function instead of a constant so that it is evaluated
#               lazily (the order in which R sources files during installation
#               therefore does not matter).
# [return] A character vector of names.
.INFO_KEYS <- function() {
  info_keys <- c("label", "weight", "init_score", "group")
  return(info_keys)
}

James Lamb's avatar
James Lamb committed
24
#' @importFrom methods is
James Lamb's avatar
James Lamb committed
25
#' @importFrom R6 R6Class
26
#' @importFrom utils modifyList
James Lamb's avatar
James Lamb committed
27
28
Dataset <- R6::R6Class(

29
  classname = "lgb.Dataset",
30
  cloneable = FALSE,
Guolin Ke's avatar
Guolin Ke committed
31
  public = list(
James Lamb's avatar
James Lamb committed
32

33
    # Finalize will free up the handles
Guolin Ke's avatar
Guolin Ke committed
34
    finalize = function() {
      # Free the handle through LGBM_DatasetFree_R, then clear the R-side
      # reference so later handle checks see this Dataset as not constructed
      .Call(LGBM_DatasetFree_R, private$handle)
      private$handle <- NULL
      invisible(NULL)
    },
James Lamb's avatar
James Lamb committed
42

43
    # Initialize will create a starter dataset
Guolin Ke's avatar
Guolin Ke committed
44
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          label = NULL,
                          weight = NULL,
                          group = NULL,
                          init_score = NULL) {

      # Reject a bad 'reference' or 'predictor' up front, before any other work
      if (!is.null(reference) && !lgb.is.Dataset(reference)) {
        stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset"))
      }
      if (!is.null(predictor) && !lgb.is.Predictor(predictor)) {
        stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
      }

      # Keep only the fields that were actually supplied, always in the order
      # label, weight, group, init_score
      supplied_fields <- list(
        label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )
      info <- list()
      for (field_name in names(supplied_fields)) {
        field_value <- supplied_fields[[field_name]]
        if (!is.null(field_value)) {
          info[[field_name]] <- field_value
        }
      }

      # Dense matrices are always stored as "double"
      if (is.matrix(data) && storage.mode(data) != "double") {
        storage.mode(data) <- "double"
      }

      # Record everything on the object. No handle is created here;
      # that happens in construct().
      private$raw_data <- data
      private$params <- params
      private$reference <- reference
      private$colnames <- colnames
      private$categorical_feature <- categorical_feature
      private$predictor <- predictor
      private$free_raw_data <- free_raw_data
      private$used_indices <- sort(used_indices, decreasing = FALSE)
      private$info <- info
      private$version <- 0L

      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
103

104
    create_valid = function(data,
                            label = NULL,
                            weight = NULL,
                            group = NULL,
                            init_score = NULL,
                            params = list(),
                            ...) {

      # Anything arriving through '...' is still honoured, but is deprecated
      dots <- list(...)
      if (length(dots) > 0L) {
        warning(paste0(
          "Dataset$create_valid(): Found the following passed through '...': "
          , paste(names(dots), collapse = ", ")
          , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
          , "Add these to 'params' instead. "
          , "See ?lgb.Dataset.create.valid for documentation on how to call this function."
        ))
      }

      # Precedence, lowest to highest:
      #   this Dataset's own parameters < values from '...' < values from 'params'
      call_params <- modifyList(dots, params)
      valid_params <- modifyList(self$get_params(), call_params)

      # The new Dataset uses this one as its reference and inherits its
      # column names, categorical features, predictor and free_raw_data setting
      valid_data <- Dataset$new(
        data = data
        , params = valid_params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = NULL
        , label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )

      return(invisible(valid_data))

    },
James Lamb's avatar
James Lamb committed
148

149
    # Dataset constructor
Guolin Ke's avatar
Guolin Ke committed
150
    # [description] Create the Dataset handle from the stored raw data (file path,
    #               dense matrix or dgCMatrix) or, when 'used_indices' is set, as a
    #               subset of the reference Dataset. Does nothing if a handle
    #               already exists.
    # [return] This Dataset, invisibly. Raises an error if the data type is not
    #          supported, raw data is missing, a categorical feature cannot be
    #          resolved, the handle cannot be created, or no label is available.
    construct = function() {

      # Already constructed: nothing to do
      if (!lgb.is.null.handle(x = private$handle)) {
        return(invisible(self))
      }

      raw_data_is_matrix <- is.matrix(private$raw_data) ||
        methods::is(private$raw_data, "dgCMatrix")

      # Take feature names from the raw data unless names were given explicitly
      cnames <- NULL
      if (raw_data_is_matrix) {
        cnames <- colnames(private$raw_data)
      }
      if (is.null(private$colnames) && !is.null(cnames)) {
        private$colnames <- as.character(cnames)
      }

      # Translate 'categorical_feature' into 0-based column indices
      if (!is.null(private$categorical_feature)) {

        if (is.character(private$categorical_feature)) {

          # Features given by name: look them up in the known column names
          cate_indices <- as.list(match(private$categorical_feature, private$colnames) - 1L)

          unknown_features <- is.na(cate_indices)
          if (sum(unknown_features) > 0L) {
            # collapse so that several unknown names are listed readably
            stop(
              "lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
              , paste(sQuote(private$categorical_feature[unknown_features]), collapse = ", ")
            )
          }

        } else {

          # Features given by 1-based position: bounds-check them against the number
          # of features known on the R side. Prefer the width of the raw data and
          # fall back to the number of column names. When neither is known (for
          # example a file path without 'colnames'), the check is skipped here.
          # NOTE(review): presumably the native library validates indices in that
          # case - confirm.
          num_features <- if (raw_data_is_matrix) {
            ncol(private$raw_data)
          } else if (length(private$colnames) > 0L) {
            length(private$colnames)
          } else {
            NA_integer_
          }

          if (!is.na(num_features) && max(private$categorical_feature) > num_features) {
            stop(
              "lgb.self.get.handle: supplied a too large value in categorical_feature: "
              , max(private$categorical_feature)
              , " but only "
              , num_features
              , " features"
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1L)

        }

        # Store indices for categorical features
        private$params$categorical_feature <- cate_indices

      }

      # Serialise parameters for the native calls
      params_str <- lgb.params2str(params = private$params)

      # Handle of the reference Dataset, if any (get_handle() constructs it on demand)
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }

      if (is.null(private$used_indices)) {

        # Not subsetting: build from raw data
        if (is.null(private$raw_data)) {
          stop(paste0(
            "Attempting to create a Dataset without any raw data. "
            , "This can happen if you have called Dataset$finalize() or if this Dataset was saved with saveRDS(). "
            , "To avoid this error in the future, use lgb.Dataset.save() or "
            , "Dataset$save_binary() to save lightgbm Datasets."
          ))
        }

        if (is.character(private$raw_data)) {

          # Raw data is a path to a data file
          handle <- .Call(
            LGBM_DatasetCreateFromFile_R
            , path.expand(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (is.matrix(private$raw_data)) {

          # Raw data is a dense matrix
          handle <- .Call(
            LGBM_DatasetCreateFromMat_R
            , private$raw_data
            , nrow(private$raw_data)
            , ncol(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (methods::is(private$raw_data, "dgCMatrix")) {

          # Raw data is a column-compressed sparse matrix
          if (length(private$raw_data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }
          handle <- .Call(
            LGBM_DatasetCreateFromCSC_R
            , private$raw_data@p
            , private$raw_data@i
            , private$raw_data@x
            , length(private$raw_data@p)
            , length(private$raw_data@x)
            , nrow(private$raw_data)
            , params_str
            , ref_handle
          )

        } else {

          # Unknown data type
          stop(
            "lgb.Dataset.construct: does not support constructing from "
            , sQuote(class(private$raw_data))
          )

        }

      } else {

        # Subsetting: rows are taken from the reference Dataset
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        handle <- .Call(
          LGBM_DatasetGetSubset_R
          , ref_handle
          , c(private$used_indices) # Adding c() fixes issue in R v3.5
          , length(private$used_indices)
          , params_str
        )

      }

      if (lgb.is.null.handle(x = handle)) {
        stop("lgb.Dataset.construct: cannot create Dataset handle")
      }

      # Tag the handle with its class and keep it
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Push feature names to the newly created handle
      if (!is.null(private$colnames)) {
        self$set_colnames(colnames = private$colnames)
      }

      # With a predictor (and when not subsetting), its raw-score predictions on
      # the raw data become the initial score
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        init_score <- private$predictor$predict(
          data = private$raw_data
          , rawscore = TRUE
          , reshape = TRUE
        )

        # Flattened as-is; per the original author's note no transpose is needed
        # because the result is column-major
        init_score <- as.vector(init_score)
        private$info$init_score <- init_score

      }

      # Drop the raw data if the user asked for that
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Push every stored field (label, weight, ...) to the handle
      if (length(private$info) > 0L) {

        for (i in seq_along(private$info)) {

          p <- private$info[i]
          self$set_field(
            field_name = names(p)
            , data = p[[1L]]
          )

        }

      }

      # A Dataset without a label is not usable
      if (is.null(self$get_field(field_name = "label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
351

352
    # Dimension function
Guolin Ke's avatar
Guolin Ke committed
353
    dim = function() {

      # Constructed Dataset: read both dimensions through the handle
      if (!lgb.is.null.handle(x = private$handle)) {

        # The native calls receive these variables as their last argument;
        # the values returned below are read back from them afterwards
        num_row <- 0L
        num_col <- 0L
        .Call(LGBM_DatasetGetNumData_R, private$handle, num_row)
        .Call(LGBM_DatasetGetNumFeature_R, private$handle, num_col)

        return(c(num_row, num_col))

      }

      # Not constructed, but the raw data is a dense or dgCMatrix matrix
      # NOTE: dgCMatrix requires the Matrix package
      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        return(dim(private$raw_data))
      }

      # Anything else has no known dimensions until it is constructed
      stop(
        "dim: cannot get dimensions before dataset has been constructed, "
        , "please call lgb.Dataset.construct explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
393

394
    # Get column names
Guolin Ke's avatar
Guolin Ke committed
395
    get_colnames = function() {

      # Constructed Dataset: ask the handle, and refresh the cached names
      if (!lgb.is.null.handle(x = private$handle)) {
        feature_names <- .Call(LGBM_DatasetGetFeatureNames_R, private$handle)
        private$colnames <- feature_names
        return(private$colnames)
      }

      # Not constructed, but the raw data is a dense or dgCMatrix matrix
      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        return(colnames(private$raw_data))
      }

      # Any other raw data format gives no way to know the names yet
      stop(
        "Dataset$get_colnames(): cannot get column names before dataset has been constructed, please call "
        , "lgb.Dataset.construct() explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
421

422
    # Set column names
Guolin Ke's avatar
Guolin Ke committed
423
    set_colnames = function(colnames) {

      # NULL and zero-length input both mean "nothing to set"
      if (length(colnames) == 0L) {
        return(invisible(self))
      }

      # Names are always kept as character
      private$colnames <- as.character(colnames)

      # If the Dataset is already constructed, push the names to the handle
      # as a single tab-separated string
      if (!lgb.is.null.handle(x = private$handle)) {
        merged_name <- paste(private$colnames, collapse = "\t")
        .Call(LGBM_DatasetSetFeatureNames_R, private$handle, merged_name)
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
453

454
    get_field = function(field_name) {

      # Only a single, known field name is accepted
      is_known_field <- is.character(field_name) &&
        length(field_name) == 1L &&
        field_name %in% .INFO_KEYS()
      if (!is_known_field) {
        stop(
          "Dataset$get_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      # Serve from the R-side cache whenever the field is already there
      cached_value <- private$info[[field_name]]
      if (!is.null(cached_value)) {
        return(cached_value)
      }

      # Otherwise the field has to come from the handle
      if (lgb.is.null.handle(x = private$handle)) {
        stop("Cannot perform Dataset$get_field() before constructing Dataset.")
      }

      # Ask how long the field is; the native call receives 'info_len' as its
      # last argument and the value is read back from it below
      info_len <- 0L
      .Call(LGBM_DatasetGetFieldSize_R, private$handle, field_name, info_len)

      if (info_len > 0L) {

        # "group" is integer, every other field is numeric
        field_values <- if (field_name == "group") {
          integer(info_len)
        } else {
          numeric(info_len)
        }

        # The buffer is handed to the native call and then cached
        .Call(LGBM_DatasetGetField_R, private$handle, field_name, field_values)
        private$info[[field_name]] <- field_values

      }

      # Still NULL here when the handle holds no data for this field
      return(private$info[[field_name]])

    },
James Lamb's avatar
James Lamb committed
506

507
    set_field = function(field_name, data) {

      # Only a single, known field name is accepted
      is_known_field <- is.character(field_name) &&
        length(field_name) == 1L &&
        field_name %in% .INFO_KEYS()
      if (!is_known_field) {
        stop(
          "Dataset$set_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      # "group" is stored as integer, every other field as numeric
      coerce_field <- if (field_name == "group") as.integer else as.numeric
      data <- coerce_field(data)

      # Always remember the value on the R side
      private$info[[field_name]] <- data

      # Push non-empty data to the handle when the Dataset is already constructed,
      # and bump the version counter to record the change
      dataset_is_constructed <- !lgb.is.null.handle(x = private$handle)
      if (dataset_is_constructed && !is.null(data) && length(data) > 0L) {

        .Call(
          LGBM_DatasetSetField_R
          , private$handle
          , field_name
          , data
          , length(data)
        )
        private$version <- private$version + 1L

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
548

549
    # Slice dataset
Guolin Ke's avatar
Guolin Ke committed
550
    slice = function(idxset, ...) {

      dots <- list(...)

      # NOTE(review): the warning text says these arguments "are ignored", but the
      # code below still uses them (fields are forwarded, the rest is merged into
      # params). It also mentions Dataset$set_params(), while the method visible
      # in this class is update_params(). Confirm the intended wording.
      if (length(dots) > 0L) {
        warning(paste0(
          "Dataset$slice(): Found the following passed through '...': "
          , paste(names(dots), collapse = ", ")
          , ". These are ignored and should be removed. "
          , "To change the parameters of a Dataset produced by Dataset$slice(), use Dataset$set_params(). "
          , "To modify attributes like 'init_score', use Dataset$set_field(). "
          , "In future releases of lightgbm, this warning will become an error."
        ))
      }

      # Pull Dataset fields out of '...'.
      # `[[` on a list gives NULL for names that were not supplied.
      label <- dots[["label"]]
      weight <- dots[["weight"]]
      group <- dots[["group"]]
      init_score <- dots[["init_score"]]

      # Whatever is left in '...' after dropping the fields is treated as params
      for (info_key in .INFO_KEYS()) {
        dots[[info_key]] <- NULL
      }

      # The subset has no raw data of its own: it references this Dataset and
      # carries the (sorted) row indices to keep
      subset_dataset <- Dataset$new(
        data = NULL
        , params = utils::modifyList(self$get_params(), dots)
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = sort(idxset, decreasing = FALSE)
        , group = group
        , init_score = init_score
        , label = label
        , weight = weight
      )

      return(subset_dataset)

    },
James Lamb's avatar
James Lamb committed
597

598
599
600
    # [description] Update Dataset parameters.
    #               If the Dataset has not been constructed yet, the new values are
    #               simply merged into private$params.
    #               If it has been constructed, the change is first checked with
    #               LGBM_DatasetUpdateParamChecking_R. When the check passes, the new
    #               values are merged into private$params. When it fails and raw data
    #               is still available, the values are merged and the Dataset is
    #               finalized ("deconstructed"). When it fails and raw data is gone,
    #               the error is re-raised.
    # [param] params A named list of parameters. An empty list is a no-op.
    # [return] This Dataset, invisibly.
    update_params = function(params) {
      if (length(params) == 0L) {
        return(invisible(self))
      }
      if (lgb.is.null.handle(x = private$handle)) {
        private$params <- utils::modifyList(private$params, params)
      } else {
        tryCatch({
          .Call(
            LGBM_DatasetUpdateParamChecking_R
            , lgb.params2str(params = private$params)
            , lgb.params2str(params = params)
          )
          # The check raised no error, so the existing handle can be kept.
          # Record the new values on the R side; without this the update would be
          # silently lost and get_params() would keep returning the old values.
          private$params <- utils::modifyList(private$params, params)
        }, error = function(e) {
          # If updating failed but raw data is not available, raise an error because
          # achieving what the user asked for is not possible
          if (is.null(private$raw_data)) {
            stop(e)
          }

          # If updating failed but raw data is available, modify the params
          # on the R side and re-set ("deconstruct") the Dataset
          private$params <- utils::modifyList(private$params, params)
          self$finalize()
        })
      }
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
630

631
632
633
634
635
636
637
638
639
640
641
    # [description] Return the stored parameters whose names appear in
    #               .DATASET_PARAMETERS(); all other stored parameters are left out.
    # [return] A list of parameters (empty if none qualify).
    get_params = function() {
      known_dataset_params <- unname(unlist(.DATASET_PARAMETERS()))
      matching_keys <- intersect(names(private$params), known_dataset_params)
      ret <- list()
      for (param_key in matching_keys) {
        ret[[param_key]] <- private$params[[param_key]]
      }
      return(ret)
    },

642
    # Set categorical feature parameter
643
    set_categorical_feature = function(categorical_feature) {

      # Same value as before: nothing changes
      nothing_to_change <- identical(private$categorical_feature, categorical_feature)
      if (nothing_to_change) {
        return(invisible(self))
      }

      # This method finalizes the Dataset below, so the raw data must still be
      # around for it to be constructed again
      if (is.null(private$raw_data)) {
        stop("set_categorical_feature: cannot set categorical feature after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # Remember the new setting, then drop the current handle
      private$categorical_feature <- categorical_feature
      self$finalize()

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
664

665
    # Set reference
Guolin Ke's avatar
Guolin Ke committed
666
    set_reference = function(reference) {

      # Same reference as before: nothing changes
      if (identical(private$reference, reference)) {
        return(invisible(self))
      }

      # Switching the reference drops the C++-side Dataset, so it is only allowed
      # while the raw data is still available to construct a new one from
      if (is.null(private$raw_data)) {
        stop("set_reference: cannot set reference after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      if (!lgb.is.Dataset(reference)) {
        stop("set_reference: Can only use lgb.Dataset as a reference")
      }

      # Adopt the reference's categorical features, column names and predictor
      reference_private <- reference$.__enclos_env__$private
      self$set_categorical_feature(categorical_feature = reference_private$categorical_feature)
      self$set_colnames(colnames = reference$get_colnames())
      private$set_predictor(predictor = reference_private$predictor)

      # Remember the reference, then drop the current handle
      private$reference <- reference
      self$finalize()

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
697

698
    # Save binary model
Guolin Ke's avatar
Guolin Ke committed
699
    save_binary = function(fname) {

      # Make sure a handle exists before writing it out
      self$construct()

      # '~' in the file name is expanded before it is handed to the native call
      expanded_path <- path.expand(fname)
      .Call(LGBM_DatasetSaveBinary_R, private$handle, expanded_path)

      return(invisible(self))
    }
James Lamb's avatar
James Lamb committed
710

Guolin Ke's avatar
Guolin Ke committed
711
712
  ),
  private = list(
713
714
715
716
717
    # Native Dataset handle: freed and reset to NULL by finalize(),
    # (re)created on demand through get_handle()
    handle = NULL,
    # Raw input data; set_predictor() refuses to run once this is NULL
    raw_data = NULL,
    params = list(),
    # lgb.Dataset stored by set_reference()
    reference = NULL,
    colnames = NULL,
    categorical_feature = NULL,
    # Either NULL or an lgb.Predictor, stored by set_predictor()
    predictor = NULL,
    free_raw_data = TRUE,
    used_indices = NULL,
    info = NULL,
    version = 0L,
James Lamb's avatar
James Lamb committed
724

725
726
    # Get handle
    get_handle = function() {
      # Returns the stored handle, constructing the Dataset first
      # when no usable handle is available yet.
      handle_missing <- lgb.is.null.handle(x = private$handle)
      if (handle_missing) {
        self$construct()
      }

      # Read the field again: construct() may just have replaced it
      return(private$handle)
    },
James Lamb's avatar
James Lamb committed
735

736
    # Set predictor
Guolin Ke's avatar
Guolin Ke committed
737
    set_predictor = function(predictor) {
      # Stores 'predictor' (NULL or an lgb.Predictor) on this Dataset and
      # returns the Dataset itself, invisibly.

      # Nothing to do when the requested predictor is already the stored one;
      # returning here also skips the finalize() call below
      if (identical(private$predictor, predictor)) {
        return(invisible(self))
      }

      # A different predictor can only be set while the raw data is still held
      if (is.null(private$raw_data)) {
        stop("set_predictor: cannot set predictor after free raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # NULL is accepted as-is; any other value has to be an lgb.Predictor
      if (!is.null(predictor) && !lgb.is.Predictor(predictor)) {
        stop("set_predictor: Can only use lgb.Predictor as predictor")
      }

      private$predictor <- predictor

      # finalize() frees the native handle and resets it to NULL, so the next
      # get_handle() call constructs the Dataset again
      # (presumably so the new predictor is taken into account -- confirm in construct())
      self$finalize()
      return(invisible(self))
    }
James Lamb's avatar
James Lamb committed
767

Guolin Ke's avatar
Guolin Ke committed
768
769
770
  )
)

771
772
773
#' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#'              or local file (that was created previously by saving an \code{lgb.Dataset}).
774
#' @inheritParams lgb_shared_dataset_params
775
776
777
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{lgb.Dataset} file
778
779
780
781
782
783
784
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values.
#' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning
#'                  continuous features into histograms. If you want to apply the same bin boundaries from an existing
#'                  dataset to new \code{data}, pass that existing Dataset to this argument.
Guolin Ke's avatar
Guolin Ke committed
785
#' @param colnames names of columns
786
787
788
789
790
791
792
793
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param free_raw_data LightGBM constructs its data format, called a "Dataset", from tabular data.
#'                      By default, that Dataset object on the R side does not keep a copy of the raw data.
#'                      This reduces LightGBM's memory consumption, but it means that the Dataset object
#'                      cannot be changed after it has been constructed. If you'd prefer to be able to
#'                      change the Dataset object after construction, set \code{free_raw_data = FALSE}.
794
#' @param ... other parameters passed to \code{params}
James Lamb's avatar
James Lamb committed
795
#'
Guolin Ke's avatar
Guolin Ke committed
796
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
797
#'
Guolin Ke's avatar
Guolin Ke committed
798
#' @examples
799
#' \donttest{
800
801
802
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
803
804
805
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
806
#' lgb.Dataset.construct(dtrain)
807
#' }
Guolin Ke's avatar
Guolin Ke committed
808
809
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        label = NULL,
                        weight = NULL,
                        group = NULL,
                        init_score = NULL,
                        ...) {

  # Anything supplied through '...' is folded into 'params';
  # on a name clash the value from '...' replaces the one in 'params'
  dots <- list(...)
  params <- modifyList(params, dots)

  # Passing parameters through '...' is deprecated: name the offending
  # arguments so the caller knows what to move into 'params'
  if (length(dots) > 0L) {
    dot_names <- paste(names(dots), collapse = ", ")
    warning(paste0(
      "lgb.Dataset: Found the following passed through '...': "
      , dot_names
      , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
      , "Add these to 'params' instead. See ?lgb.Dataset for documentation on how to call this function."
    ))
  }

  # A Dataset created here never starts with a predictor or a row subset
  new_dataset <- Dataset$new(
    data = data
    , params = params
    , reference = reference
    , colnames = colnames
    , categorical_feature = categorical_feature
    , predictor = NULL
    , free_raw_data = free_raw_data
    , used_indices = NULL
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
  )

  return(invisible(new_dataset))

}

853
854
855
#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
856
#' @inheritParams lgb_shared_dataset_params
Guolin Ke's avatar
Guolin Ke committed
857
#' @param dataset \code{lgb.Dataset} object, training data
858
859
860
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{Dataset} file
861
862
863
864
865
866
867
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values. If this is an empty list (the default), the validation Dataset
#'               will have the same parameters as the Dataset passed to argument \code{dataset}.
#' @param ... additional \code{lgb.Dataset} parameters.
#'            NOTE: As of v3.3.0, use of \code{...} is deprecated. Add parameters to \code{params} directly.
James Lamb's avatar
James Lamb committed
868
#'
Guolin Ke's avatar
Guolin Ke committed
869
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
870
#'
Guolin Ke's avatar
Guolin Ke committed
871
#' @examples
872
#' \donttest{
873
874
875
876
877
878
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
#'
#' # parameters can be changed between the training data and validation set,
#' # for example to account for training data in a text file with a header row
#' # and validation data in a text file without it
#' train_file <- tempfile(pattern = "train_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = train_file
#'   , sep = ","
#'   , col.names = TRUE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' valid_file <- tempfile(pattern = "valid_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = valid_file
#'   , sep = ","
#'   , col.names = FALSE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' dtrain <- lgb.Dataset(
#'   data = train_file
#'   , params = list(has_header = TRUE)
#' )
#' dtrain$construct()
#'
#' dvalid <- lgb.Dataset(
#'   data = valid_file
#'   , params = list(has_header = FALSE)
#' )
#' dvalid$construct()
914
#' }
915
#' @importFrom utils modifyList
Guolin Ke's avatar
Guolin Ke committed
916
#' @export
917
918
919
920
921
922
923
924
lgb.Dataset.create.valid <- function(dataset,
                                     data,
                                     label = NULL,
                                     weight = NULL,
                                     group = NULL,
                                     init_score = NULL,
                                     params = list(),
                                     ...) {

  # The training data must already be an lgb.Dataset
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # Passing parameters through '...' is deprecated: warn and list their names
  dots <- list(...)
  if (length(dots) > 0L) {
    dot_names <- paste(names(dots), collapse = ", ")
    warning(paste0(
      "lgb.Dataset.create.valid: Found the following passed through '...': "
      , dot_names
      , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
      , "Add these to 'params' instead. See ?lgb.Dataset.create.valid for documentation on how to call this function."
    ))
  }

  # Delegate to the Dataset's own method; values from '...' override
  # same-named entries of 'params'
  return(invisible(
    dataset$create_valid(
      data = data
      , label = label
      , weight = weight
      , group = group
      , init_score = init_score
      , params = utils::modifyList(params, dots)
    )
  ))

}
Guolin Ke's avatar
Guolin Ke committed
953

954
955
956
#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
Guolin Ke's avatar
Guolin Ke committed
957
#' @param dataset Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
958
#'
Guolin Ke's avatar
Guolin Ke committed
959
#' @examples
960
#' \donttest{
961
962
963
964
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
965
#' }
966
#' @return constructed dataset
Guolin Ke's avatar
Guolin Ke committed
967
968
#' @export
lgb.Dataset.construct <- function(dataset) {

  # Only lgb.Dataset objects expose the construct() method used below
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }

  # Whatever construct() returns is passed on invisibly
  constructed <- dataset$construct()
  return(invisible(constructed))

}

978
979
#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
Guolin Ke's avatar
Guolin Ke committed
980
#' @param x Object of class \code{lgb.Dataset}
981
#' @param ... other parameters (ignored)
James Lamb's avatar
James Lamb committed
982
#'
Guolin Ke's avatar
Guolin Ke committed
983
#' @return a vector of numbers of rows and of columns
James Lamb's avatar
James Lamb committed
984
#'
Guolin Ke's avatar
Guolin Ke committed
985
986
987
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
James Lamb's avatar
James Lamb committed
988
#'
Guolin Ke's avatar
Guolin Ke committed
989
#' @examples
990
#' \donttest{
991
992
993
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
994
#'
995
996
997
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
998
#' }
Guolin Ke's avatar
Guolin Ke committed
999
1000
1001
#' @rdname dim
#' @export
dim.lgb.Dataset <- function(x, ...) {

  # Extra arguments have no effect; warn (naming them) before validating 'x'
  extra_args <- list(...)
  if (length(extra_args) > 0L) {
    extra_names <- paste(names(extra_args), collapse = ", ")
    warning(paste0(
      "dim.lgb.Dataset: Found the following passed through '...': "
      , extra_names
      , ". These are ignored. In future releases of lightgbm, this warning will become an error. "
      , "See ?dim.lgb.Dataset for documentation on how to call this function."
    ))
  }

  if (!lgb.is.Dataset(x = x)) {
    stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # The Dataset's own dim() method supplies the result
  return(x$dim())

}

1021
1022
1023
#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}, thus setting of
#'              row names would have no effect and returned row names would be NULL.
Guolin Ke's avatar
Guolin Ke committed
1024
1025
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one is ignored
1026
#'              and the second one is column names
Guolin Ke's avatar
Guolin Ke committed
1027
1028
1029
1030
1031
1032
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
1033
#' \donttest{
1034
1035
1036
1037
1038
1039
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
1040
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
1041
#' print(dtrain, verbose = TRUE)
1042
#' }
Guolin Ke's avatar
Guolin Ke committed
1043
#' @rdname dimnames.lgb.Dataset
1044
#' @return A list with the dimension names of the dataset
Guolin Ke's avatar
Guolin Ke committed
1045
1046
#' @export
dimnames.lgb.Dataset <- function(x) {

  if (!lgb.is.Dataset(x = x)) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # First element (row names) is always NULL; second holds the column names
  column_names <- x$get_colnames()
  return(list(NULL, column_names))

}

#' @rdname dimnames.lgb.Dataset
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # 'value' has to be a plain list of exactly two elements: (row names, column names)
  is_valid_shape <- identical(class(value), "list") && length(value) == 2L
  if (!is_valid_shape) {
    stop("invalid ", sQuote("value"), " given: must be a list of two elements")
  }

  # Row names are not supported, so the first element must be NULL
  if (!is.null(value[[1L]])) {
    stop("lgb.Dataset does not have rownames")
  }

  new_colnames <- value[[2L]]

  # NULL column names are forwarded without a length check;
  # otherwise there must be exactly one name per column
  if (!is.null(new_colnames)) {
    n_columns <- ncol(x)
    if (n_columns != length(new_colnames)) {
      stop(
        "can't assign "
        , sQuote(length(new_colnames))
        , " colnames to an lgb.Dataset with "
        , sQuote(n_columns)
        , " columns"
      )
    }
  }

  x$set_colnames(colnames = new_colnames)
  return(x)

}

1095
1096
1097
#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#'              original \code{lgb.Dataset} object
Nikita Titov's avatar
Nikita Titov committed
1098
#' @param dataset Object of class \code{lgb.Dataset}
1099
#' @param idxset an integer vector of indices of rows needed
Guolin Ke's avatar
Guolin Ke committed
1100
1101
#' @param ... other parameters (currently not used)
#' @return constructed sub dataset
James Lamb's avatar
James Lamb committed
1102
#'
Guolin Ke's avatar
Guolin Ke committed
1103
#' @examples
1104
#' \donttest{
1105
1106
1107
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
1108
#'
1109
#' dsub <- lightgbm::slice(dtrain, seq_len(42L))
1110
#' lgb.Dataset.construct(dsub)
1111
#' labels <- lightgbm::get_field(dsub, "label")
1112
#' }
Guolin Ke's avatar
Guolin Ke committed
1113
#' @export
1114
1115
1116
slice <- function(dataset, ...) {
  # S3 generic: the method is selected from the class of 'dataset'
  UseMethod("slice", dataset)
}
Guolin Ke's avatar
Guolin Ke committed
1117
1118
1119
1120

#' @rdname slice
#' @export
slice.lgb.Dataset <- function(dataset, idxset, ...) {

  if (!lgb.is.Dataset(x = dataset)) {
    stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's own slice() method, forwarding '...' unchanged;
  # the result is returned invisibly
  subset_dataset <- dataset$slice(idxset = idxset, ...)
  return(invisible(subset_dataset))

}

1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
#' @name get_field
#' @title Get one attribute of a \code{lgb.Dataset}
#' @description Get one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to get. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @return requested attribute
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
#' }
#' @export
get_field <- function(dataset, field_name) {
  # S3 generic: the method is selected from the class of 'dataset'
  UseMethod("get_field", dataset)
}

#' @rdname get_field
#' @export
get_field.lgb.Dataset <- function(dataset, field_name) {

  # Reject anything that is not an lgb.Dataset before touching its methods
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("get_field.lgb.Dataset(): input dataset should be an lgb.Dataset object")
  }

  # The lookup itself is done by the Dataset's own get_field() method
  return(dataset$get_field(field_name = field_name))

}

#' @name set_field
#' @title Set one attribute of a \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to set. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @param data The data for the field. See examples.
#' @return The \code{lgb.Dataset} you passed in.
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
#' }
#' @export
set_field <- function(dataset, field_name, data) {
  # S3 generic: the method is selected from the class of 'dataset'
  UseMethod("set_field", dataset)
}

#' @rdname set_field
#' @export
set_field.lgb.Dataset <- function(dataset, field_name, data) {

  if (!lgb.is.Dataset(x = dataset)) {
    stop("set_field.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # The update is done by the Dataset's own set_field() method;
  # its result is passed on invisibly
  updated <- dataset$set_field(field_name = field_name, data = data)
  return(invisible(updated))

}

1225
1226
1227
1228
#' @name lgb.Dataset.set.categorical
#' @title Set categorical feature of \code{lgb.Dataset}
#' @description Set the categorical features of an \code{lgb.Dataset} object. Use this function
#'              to tell LightGBM which features should be treated as categorical.
1229
#' @param dataset object of class \code{lgb.Dataset}
1230
1231
1232
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
1233
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1234
#'
1235
#' @examples
1236
#' \donttest{
1237
1238
1239
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1240
1241
1242
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
1243
#' lgb.Dataset.set.categorical(dtrain, 1L:2L)
1244
#' }
1245
1246
1247
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's set_categorical_feature() method and
  # return its result invisibly
  updated <- dataset$set_categorical_feature(categorical_feature = categorical_feature)
  return(invisible(updated))

}

1257
1258
1259
#' @name lgb.Dataset.set.reference
#' @title Set reference of \code{lgb.Dataset}
#' @description If you want to use validation data, you should set reference to training data
Guolin Ke's avatar
Guolin Ke committed
1260
1261
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
1262
#'
1263
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1264
#'
Guolin Ke's avatar
Guolin Ke committed
1265
#' @examples
1266
#' \donttest{
1267
#' # create training Dataset
1268
1269
1270
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1271
1272
#'
#' # create a validation Dataset, using dtrain as a reference
1273
1274
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
1275
#' dtest <- lgb.Dataset(test$data, label = test$label)
1276
#' lgb.Dataset.set.reference(dtest, dtrain)
1277
#' }
Guolin Ke's avatar
Guolin Ke committed
1278
1279
1280
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # Only 'dataset' is validated here; 'reference' is passed through as-is
  # to the Dataset's set_reference() method
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")
  }

  updated <- dataset$set_reference(reference = reference)
  return(invisible(updated))
}

1289
1290
1291
1292
#' @name lgb.Dataset.save
#' @title Save \code{lgb.Dataset} to a binary file
#' @description Please note that \code{init_score} is not saved in binary file.
#'              If you need it, please set it again after loading Dataset.
Guolin Ke's avatar
Guolin Ke committed
1293
1294
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname path of the binary file to write the \code{lgb.Dataset} to
James Lamb's avatar
James Lamb committed
1295
#'
1296
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1297
#'
Guolin Ke's avatar
Guolin Ke committed
1298
#' @examples
1299
#' \donttest{
1300
1301
1302
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1303
#' lgb.Dataset.save(dtrain, tempfile(fileext = ".bin"))
1304
#' }
Guolin Ke's avatar
Guolin Ke committed
1305
1306
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # Error messages are prefixed with this function's own name so that a
  # failure can be traced back to the call that raised it
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  if (!is.character(fname)) {
    stop("lgb.Dataset.save: fname should be a character or a file connection")
  }

  # The file is written by the Dataset's save_binary() method;
  # its result is returned invisibly
  saved <- dataset$save_binary(fname = fname)
  return(invisible(saved))
}