lgb.Dataset.R 42 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#' @name lgb_shared_dataset_params
#' @title Shared Dataset parameter docs
#' @description Parameter docs for fields used in \code{lgb.Dataset} construction
#' @param label vector of labels to use as the target variable
#' @param weight numeric vector of sample weights
#' @param init_score initial score is the base prediction lightgbm will boost from
#' @param group used for learning-to-rank tasks. An integer vector describing how to
#'              group rows together as ordered results from the same set of candidate results
#'              to be ranked. For example, if you have a 100-document dataset with
#'              \code{group = c(10, 20, 40, 10, 10, 10)}, that means that you have 6 groups,
#'              where the first 10 records are in the first group, records 11-30 are in the
#'              second group, etc.
#' @param info a list of information of the \code{lgb.Dataset} object. NOTE: use of \code{info}
#'             is deprecated as of v3.3.0. Use keyword arguments (e.g. \code{init_score = init_score})
#'             directly.
#' @keywords internal
NULL

19
20
21
22
23
24
25
26
# [description] Names accepted as keys of the "info" argument of lgb.Dataset.
#               Exposed through a function rather than a constant so the value
#               is only evaluated when called, which makes it independent of the
#               order in which R sources files during installation.
# [return] A character vector of the valid key names.
.INFO_KEYS <- function() {
  valid_keys <- c("label", "weight", "init_score", "group")
  return(valid_keys)
}

James Lamb's avatar
James Lamb committed
27
#' @importFrom methods is
James Lamb's avatar
James Lamb committed
28
#' @importFrom R6 R6Class
29
#' @importFrom utils modifyList
James Lamb's avatar
James Lamb committed
30
31
Dataset <- R6::R6Class(

32
  classname = "lgb.Dataset",
33
  cloneable = FALSE,
Guolin Ke's avatar
Guolin Ke committed
34
  public = list(
James Lamb's avatar
James Lamb committed
35

36
    # Finalize will free up the handles
Guolin Ke's avatar
Guolin Ke committed
37
    finalize = function() {
      # Hand the current handle to the native "free" routine, then forget it
      # on the R side so later null-handle checks see an unconstructed Dataset.
      .Call(LGBM_DatasetFree_R, private$handle)
      private$handle <- NULL

      # Called for its side effect only
      return(invisible(NULL))
    },
James Lamb's avatar
James Lamb committed
45

46
    # Initialize will create a starter dataset
Guolin Ke's avatar
Guolin Ke committed
47
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          info = list(),
                          label = NULL,
                          weight = NULL,
                          group = NULL,
                          init_score = NULL) {

      # Reject invalid 'reference' / 'predictor' up front, before any other work
      reference_ok <- is.null(reference) || lgb.is.Dataset(reference)
      if (!reference_ok) {
          stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset"))
      }
      predictor_ok <- is.null(predictor) || lgb.is.Predictor(predictor)
      if (!predictor_ok) {
          stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
      }

      # Passing fields through 'info' still works but is deprecated
      if (length(info) > 0L) {
        warning(paste0(
          "lgb.Dataset: found fields passed through 'info'. "
          , "As of v3.3.0, this behavior is deprecated, and support for it will be removed in a future release. "
          , "To suppress this warning, use keyword arguments 'label', 'weight', 'group', or 'init_score' directly"
        ))
      }

      # Keyword arguments take precedence over same-named entries in 'info'.
      # The order below (label, weight, group, init_score) determines the
      # order in which new entries are appended to 'info'.
      keyword_fields <- list(
        label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )
      for (field_name in names(keyword_fields)) {
        field_value <- keyword_fields[[field_name]]
        if (!is.null(field_value)) {
          info[[field_name]] <- field_value
        }
      }

      # Matrices are stored with storage mode "double"
      if (is.matrix(data) && storage.mode(data) != "double") {
        storage.mode(data) <- "double"
      }

      # Record everything on the object; nothing is constructed natively here
      private$raw_data <- data
      private$params <- params
      private$reference <- reference
      private$colnames <- colnames
      private$categorical_feature <- categorical_feature
      private$predictor <- predictor
      private$free_raw_data <- free_raw_data
      private$used_indices <- sort(used_indices, decreasing = FALSE)
      private$info <- info
      private$version <- 0L

      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
114

115
116
    create_valid = function(data,
                            info = list(),
                            label = NULL,
                            weight = NULL,
                            group = NULL,
                            init_score = NULL,
                            params = list(),
                            ...) {

      # Anything arriving through '...' is still honoured, but warned about
      extra_args <- list(...)
      if (length(extra_args) > 0L) {
        warning(paste0(
          "Dataset$create_valid(): Found the following passed through '...': "
          , paste(names(extra_args), collapse = ", ")
          , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
          , "Add these to 'params' instead. "
          , "See ?lgb.Dataset.create.valid for documentation on how to call this function."
        ))
      }

      # Precedence, lowest to highest:
      #   this Dataset's own parameters < values from '...' < values from 'params'
      call_params <- modifyList(extra_args, params)
      merged_params <- modifyList(self$get_params(), call_params)

      # Build the validation Dataset with this Dataset as its reference,
      # reusing this Dataset's column names, categorical features, predictor
      # and free_raw_data setting
      valid_set <- Dataset$new(
        data = data
        , params = merged_params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = NULL
        , info = info
        , label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )

      return(invisible(valid_set))

    },
James Lamb's avatar
James Lamb committed
161

162
    # Dataset constructor
Guolin Ke's avatar
Guolin Ke committed
163
    construct = function() {

      # Nothing to do when a handle already exists
      if (!lgb.is.null.handle(x = private$handle)) {
        return(invisible(self))
      }

      raw_data_is_matrix_like <- is.matrix(private$raw_data) ||
        methods::is(private$raw_data, "dgCMatrix")

      # Fall back to the raw data's own column names when none were supplied
      if (is.null(private$colnames) && raw_data_is_matrix_like) {
        cnames <- colnames(private$raw_data)
        if (!is.null(cnames)) {
          private$colnames <- as.character(cnames)
        }
      }

      # Translate categorical_feature (names or 1-based positions) into
      # 0-based indices stored in the parameters
      if (!is.null(private$categorical_feature)) {

        if (is.character(private$categorical_feature)) {

          # Features given by name: every name must be a known column name
          matched <- match(private$categorical_feature, private$colnames)
          unknown <- is.na(matched)
          if (any(unknown)) {
            stop(
              "lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
              , paste(sQuote(private$categorical_feature[unknown]), collapse = ", ")
            )
          }
          cate_indices <- as.list(matched - 1L)

        } else {

          # Features given by position: check them against the number of
          # features when that number is known. Column names are used when
          # available; otherwise the column count of in-memory data is used.
          # If neither is known (e.g. a file path without column names), the
          # check is skipped instead of comparing against zero.
          num_features <- NULL
          if (!is.null(private$colnames)) {
            num_features <- length(private$colnames)
          } else if (raw_data_is_matrix_like) {
            num_features <- ncol(private$raw_data)
          }

          if (!is.null(num_features) && max(private$categorical_feature) > num_features) {
            stop(
              "lgb.self.get.handle: supplied a too large value in categorical_feature: "
              , max(private$categorical_feature)
              , " but only "
              , num_features
              , " features"
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1L)

        }

        private$params$categorical_feature <- cate_indices

      }

      # Serialize parameters for the native calls
      params_str <- lgb.params2str(params = private$params)

      # Handle of the reference Dataset, if any (obtained via its get_handle())
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }

      if (is.null(private$used_indices)) {

        # Not a subset: build from the raw data according to its type
        if (is.character(private$raw_data)) {

          # Character raw data is passed to the from-file routine
          handle <- .Call(
            LGBM_DatasetCreateFromFile_R
            , private$raw_data
            , params_str
            , ref_handle
          )

        } else if (is.matrix(private$raw_data)) {

          # Dense matrix
          handle <- .Call(
            LGBM_DatasetCreateFromMat_R
            , private$raw_data
            , nrow(private$raw_data)
            , ncol(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (methods::is(private$raw_data, "dgCMatrix")) {

          # Sparse column-compressed matrix
          if (length(private$raw_data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }
          handle <- .Call(
            LGBM_DatasetCreateFromCSC_R
            , private$raw_data@p
            , private$raw_data@i
            , private$raw_data@x
            , length(private$raw_data@p)
            , length(private$raw_data@x)
            , nrow(private$raw_data)
            , params_str
            , ref_handle
          )

        } else {

          # Unknown data type
          stop(
            "lgb.Dataset.construct: does not support constructing from "
            , sQuote(class(private$raw_data))
          )

        }

      } else {

        # Subset of another Dataset: a reference is mandatory
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        handle <- .Call(
          LGBM_DatasetGetSubset_R
          , ref_handle
          , c(private$used_indices) # Adding c() fixes issue in R v3.5
          , length(private$used_indices)
          , params_str
        )

      }

      if (lgb.is.null.handle(x = handle)) {
        stop("lgb.Dataset.construct: cannot create Dataset handle")
      }

      # Tag the handle and keep it on the object
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Push column names to the newly created Dataset
      if (!is.null(private$colnames)) {
        self$set_colnames(colnames = private$colnames)
      }

      # When a predictor is attached (and this is not a subset), use its raw
      # predictions on the raw data as the initial score
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        init_score <- private$predictor$predict(
          data = private$raw_data
          , rawscore = TRUE
          , reshape = TRUE
        )

        # No transpose needed, since the data is column-major
        init_score <- as.vector(init_score)
        private$info$init_score <- init_score

      }

      # Drop the R-side copy of the data if requested
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Push every stored info field (label, weight, ...) via setinfo()
      for (i in seq_along(private$info)) {
        p <- private$info[i]
        self$setinfo(name = names(p), info = p[[1L]])
      }

      # A Dataset without a label is not usable
      if (is.null(self$getinfo(name = "label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
352

353
    # Dimension function
Guolin Ke's avatar
Guolin Ke committed
354
    dim = function() {

      # Constructed Dataset: query the native side
      if (!lgb.is.null.handle(x = private$handle)) {

        # Placeholders handed to the native routines. Their return values are
        # not used, so the calls are expected to fill these in place.
        n_rows <- 0L
        n_cols <- 0L

        .Call(LGBM_DatasetGetNumData_R, private$handle, n_rows)
        .Call(LGBM_DatasetGetNumFeature_R, private$handle, n_cols)

        return(c(n_rows, n_cols))

      }

      # Not constructed, but the raw data is a matrix or dgCMatrix:
      # its own dimensions can be used.
      # NOTE: requires Matrix package
      raw_has_dims <- is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")
      if (raw_has_dims) {
        return(dim(private$raw_data))
      }

      # Otherwise the dimensions are unknown
      stop(
        "dim: cannot get dimensions before dataset has been constructed, "
        , "please call lgb.Dataset.construct explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
394

395
    # Get column names
Guolin Ke's avatar
Guolin Ke committed
396
    get_colnames = function() {

      if (!lgb.is.null.handle(x = private$handle)) {

        # Constructed Dataset: read the feature names from the native side
        # and refresh the R-side copy
        private$colnames <- .Call(
          LGBM_DatasetGetFeatureNames_R
          , private$handle
        )
        return(private$colnames)

      } else if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {

        # Not constructed yet: use the column names of the raw matrix / dgCMatrix
        return(colnames(private$raw_data))

      } else {

        # Column names cannot be determined before construction for other
        # kinds of raw data
        stop(
          "get_colnames: cannot get column names before dataset has been constructed, please call "
          , "lgb.Dataset.construct explicitly"
        )

      }

    },
James Lamb's avatar
James Lamb committed
422

423
    # Set column names
Guolin Ke's avatar
Guolin Ke committed
424
    set_colnames = function(colnames) {

      # NULL means "nothing to set"
      if (is.null(colnames)) {
        return(invisible(self))
      }

      # Likewise for an empty set of names
      new_names <- as.character(colnames)
      if (length(new_names) == 0L) {
        return(invisible(self))
      }

      # Always remember the names on the R side
      private$colnames <- new_names

      # If the Dataset is already constructed, also push the names to the
      # native side as a single tab-separated string
      dataset_is_constructed <- !lgb.is.null.handle(x = private$handle)
      if (dataset_is_constructed) {
        tab_joined <- paste0(private$colnames, collapse = "\t")
        .Call(
          LGBM_DatasetSetFeatureNames_R
          , private$handle
          , tab_joined
        )
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
454

455
    # Get information
Guolin Ke's avatar
Guolin Ke committed
456
    getinfo = function(name) {

      # 'name' must be a single string naming one of the known info fields
      if (!is.character(name) || length(name) != 1L || !name %in% .INFO_KEYS()) {
        stop("getinfo: name must one of the following: ", paste0(sQuote(.INFO_KEYS()), collapse = ", "))
      }

      # Serve the value kept on the R side when there is one
      cached_value <- private$info[[name]]
      if (!is.null(cached_value)) {
        return(cached_value)
      }

      # Otherwise the value has to come from the constructed Dataset
      if (lgb.is.null.handle(x = private$handle)) {
        stop("Cannot perform getinfo before constructing Dataset.")
      }

      # Ask for the field size first. The return value of the call is not
      # used, so it is expected to write the size into 'info_len'.
      info_len <- 0L
      .Call(
        LGBM_DatasetGetFieldSize_R
        , private$handle
        , name
        , info_len
      )

      if (info_len > 0L) {

        # Pre-allocate a buffer of the right type: integer for "group",
        # numeric for everything else
        field_values <- if (name == "group") {
          integer(info_len)
        } else {
          numeric(info_len)
        }

        .Call(
          LGBM_DatasetGetField_R
          , private$handle
          , name
          , field_values
        )

        # Keep the value on the R side for later lookups
        private$info[[name]] <- field_values

      }

      # NULL when the field is empty
      return(private$info[[name]])

    },
James Lamb's avatar
James Lamb committed
505

506
    # Set information
Guolin Ke's avatar
Guolin Ke committed
507
    setinfo = function(name, info) {

      # 'name' must be a single string naming one of the known info fields
      if (!is.character(name) || length(name) != 1L || !name %in% .INFO_KEYS()) {
        stop("setinfo: name must one of the following: ", paste0(sQuote(.INFO_KEYS()), collapse = ", "))
      }

      # "group" is stored as integer, every other field as numeric
      coerced <- if (name == "group") {
        as.integer(info)
      } else {
        as.numeric(info)
      }

      # Always remember the value on the R side
      private$info[[name]] <- coerced

      # Push non-empty values to the native side when the Dataset has already
      # been constructed, and bump the version counter
      dataset_is_constructed <- !lgb.is.null.handle(x = private$handle)
      has_values <- !is.null(coerced) && length(coerced) > 0L
      if (dataset_is_constructed && has_values) {

        .Call(
          LGBM_DatasetSetField_R
          , private$handle
          , name
          , coerced
          , length(coerced)
        )

        private$version <- private$version + 1L

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
545

546
    # Slice dataset
Guolin Ke's avatar
Guolin Ke committed
547
    slice = function(idxset, ...) {

      extra_args <- list(...)

      # NOTE(review): the warning text says these arguments are ignored, but the
      # code below still forwards them to the new Dataset - confirm which is intended.
      if (length(extra_args) > 0L) {
        warning(paste0(
          "Dataset$slice(): Found the following passed through '...': "
          , paste(names(extra_args), collapse = ", ")
          , ". These are ignored and should be removed. "
          , "To change the parameters of a Dataset produced by Dataset$slice(), use Dataset$set_params(). "
          , "To modify attributes like 'init_score', use Dataset$setinfo(). "
          , "In future releases of lightgbm, this warning will become an error."
        ))
      }

      # Pull Dataset attributes out of '...'
      # (a key that is absent yields NULL when extracted with [[ ]])
      sliced_label <- extra_args[["label"]]
      sliced_weight <- extra_args[["weight"]]
      sliced_group <- extra_args[["group"]]
      sliced_init_score <- extra_args[["init_score"]]

      # Whatever remains after removing the attribute keys is treated as params
      for (info_key in .INFO_KEYS()) {
        extra_args[[info_key]] <- NULL
      }
      sliced_params <- utils::modifyList(self$get_params(), extra_args)

      # The new Dataset holds no raw data; it references this Dataset and
      # records the (sorted) row indices to keep
      sliced_dataset <- Dataset$new(
        data = NULL
        , params = sliced_params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = sort(idxset, decreasing = FALSE)
        , info = NULL
        , group = sliced_group
        , init_score = sliced_init_score
        , label = sliced_label
        , weight = sliced_weight
      )
      return(sliced_dataset)

    },
James Lamb's avatar
James Lamb committed
595

596
597
598
    # [description] Update Dataset parameters. If it has not been constructed yet,
    #               this operation just happens on the R side (updating private$params).
    #               If it has been constructed, parameters will be updated on the C++ side.
599
    update_params = function(params) {

      # Nothing to update
      if (length(params) == 0L) {
        return(invisible(self))
      }

      if (lgb.is.null.handle(x = private$handle)) {

        # Not constructed yet: only the R-side parameters need to change
        private$params <- utils::modifyList(private$params, params)

      } else {

        tryCatch({
          # Ask the native side whether the new parameters are compatible
          # with the already-constructed Dataset (signals an error if not)
          .Call(
            LGBM_DatasetUpdateParamChecking_R
            , lgb.params2str(params = private$params)
            , lgb.params2str(params = params)
          )

          # The check passed: record the new values so that the stored
          # parameters (and get_params()) reflect the update
          private$params <- utils::modifyList(private$params, params)
        }, error = function(e) {
          # If updating failed but raw data is not available, raise an error because
          # achieving what the user asked for is not possible
          if (is.null(private$raw_data)) {
            stop(e)
          }

          # If updating failed but raw data is available, modify the params
          # on the R side and re-set ("deconstruct") the Dataset
          private$params <- utils::modifyList(private$params, params)
          self$finalize()
        })

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
628

629
630
631
632
633
634
635
636
637
638
639
    get_params = function() {
      # Names listed by .DATASET_PARAMETERS(), flattened into one vector
      dataset_params <- unname(unlist(.DATASET_PARAMETERS()))

      # Keep only the stored parameters whose name is in that vector,
      # in the order in which they are stored
      kept_keys <- intersect(names(private$params), dataset_params)
      ret <- list()
      for (kept_key in kept_keys) {
        ret[[kept_key]] <- private$params[[kept_key]]
      }
      return(ret)
    },

640
    # Set categorical feature parameter
641
    set_categorical_feature = function(categorical_feature) {

      # Same value as before: nothing to change
      unchanged <- identical(private$categorical_feature, categorical_feature)
      if (unchanged) {
        return(invisible(self))
      }

      # A change requires the raw data to still be available
      if (is.null(private$raw_data)) {
        stop("set_categorical_feature: cannot set categorical feature after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # Record the new value, then free the current handle via finalize()
      private$categorical_feature <- categorical_feature
      self$finalize()

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
662

663
    # Set reference
Guolin Ke's avatar
Guolin Ke committed
664
    set_reference = function(reference) {

      # Validate before touching anything: the statements below dereference
      # 'reference', so a NULL or non-Dataset value must be rejected first
      # instead of failing later with an unrelated error or after partial changes
      if (is.null(reference) || !lgb.is.Dataset(reference)) {
        stop("set_reference: Can only use lgb.Dataset as a reference")
      }

      # Align categorical features, column names and predictor with the reference
      self$set_categorical_feature(categorical_feature = reference$.__enclos_env__$private$categorical_feature)
      self$set_colnames(colnames = reference$get_colnames())
      private$set_predictor(predictor = reference$.__enclos_env__$private$predictor)

      # Same reference as before: nothing more to do
      if (identical(private$reference, reference)) {
        return(invisible(self))
      }

      # Changing the reference requires the raw data to still be available
      if (is.null(private$raw_data)) {

        stop("set_reference: cannot set reference after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")

      }

      # Store reference
      private$reference <- reference

      # Free the current handle via finalize() and return self
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
702

703
    # Save binary model
Guolin Ke's avatar
Guolin Ke committed
704
    save_binary = function(fname) {

      # Make sure a handle exists, then pass it and 'fname' to the native
      # save routine
      self$construct()
      .Call(LGBM_DatasetSaveBinary_R, private$handle, fname)

      return(invisible(self))
    }
James Lamb's avatar
James Lamb committed
715

Guolin Ke's avatar
Guolin Ke committed
716
717
  ),
  private = list(
718
719
720
721
722
    handle = NULL,
    raw_data = NULL,
    params = list(),
    reference = NULL,
    colnames = NULL,
723
    categorical_feature = NULL,
724
725
726
727
    predictor = NULL,
    free_raw_data = TRUE,
    used_indices = NULL,
    info = NULL,
728
    version = 0L,
James Lamb's avatar
James Lamb committed
729

730
731
    # Get handle
    get_handle = function() {

      # Build the Dataset on demand when no usable handle is stored yet
      handle_is_missing <- lgb.is.null.handle(x = private$handle)
      if (handle_is_missing) {
        self$construct()
      }

      # Read the field again here: it is presumably assigned inside construct()
      return(private$handle)

    },
James Lamb's avatar
James Lamb committed
740

741
    # Set predictor
Guolin Ke's avatar
Guolin Ke committed
742
    set_predictor = function(predictor) {

      # Nothing to do when the requested predictor is already the stored one
      already_stored <- identical(private$predictor, predictor)
      if (already_stored) {
        return(invisible(self))
      }

      # Switching predictors is refused once the raw data is no longer held
      if (is.null(private$raw_data)) {
        stop("set_predictor: cannot set predictor after free raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # NULL is accepted as-is; any other value has to be an lgb.Predictor
      if (!is.null(predictor) && !lgb.is.Predictor(predictor)) {
        stop("set_predictor: Can only use lgb.Predictor as predictor")
      }

      private$predictor <- predictor

      # finalize() is called after every change of predictor
      self$finalize()
      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
772

Guolin Ke's avatar
Guolin Ke committed
773
774
775
  )
)

776
777
778
#' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#'              or local file (that was created previously by saving an \code{lgb.Dataset}).
779
#' @inheritParams lgb_shared_dataset_params
780
781
782
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{lgb.Dataset} file
783
784
785
786
787
788
789
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values.
#' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning
#'                  continuous features into histograms. If you want to apply the same bin boundaries from an existing
#'                  dataset to new \code{data}, pass that existing Dataset to this argument.
Guolin Ke's avatar
Guolin Ke committed
790
#' @param colnames names of columns
791
792
793
794
795
796
797
798
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param free_raw_data LightGBM constructs its data format, called a "Dataset", from tabular data.
#'                      By default, that Dataset object on the R side does not keep a copy of the raw data.
#'                      This reduces LightGBM's memory consumption, but it means that the Dataset object
#'                      cannot be changed after it has been constructed. If you'd prefer to be able to
#'                      change the Dataset object after construction, set \code{free_raw_data = FALSE}.
799
#' @param ... other parameters passed to \code{params}
James Lamb's avatar
James Lamb committed
800
#'
Guolin Ke's avatar
Guolin Ke committed
801
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
802
#'
Guolin Ke's avatar
Guolin Ke committed
803
#' @examples
804
#' \donttest{
805
806
807
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
808
809
810
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
811
#' lgb.Dataset.construct(dtrain)
812
#' }
Guolin Ke's avatar
Guolin Ke committed
813
814
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        info = list(),
                        label = NULL,
                        weight = NULL,
                        group = NULL,
                        init_score = NULL,
                        ...) {

  # Anything passed through '...' is merged into 'params'. On a name clash the
  # value from '...' replaces the one in 'params' (it is the second argument
  # to modifyList()).
  dots <- list(...)
  params <- modifyList(params, dots)

  # Passing parameters through '...' still works, but is deprecated
  if (length(dots) > 0L) {
    dot_names <- paste(names(dots), collapse = ", ")
    deprecation_msg <- paste0(
      "lgb.Dataset: Found the following passed through '...': "
      , dot_names
      , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
      , "Add these to 'params' instead. See ?lgb.Dataset for documentation on how to call this function."
    )
    warning(deprecation_msg)
  }

  # 'predictor' and 'used_indices' are always NULL for a Dataset created here
  new_dataset <- Dataset$new(
    data = data
    , params = params
    , reference = reference
    , colnames = colnames
    , categorical_feature = categorical_feature
    , predictor = NULL
    , free_raw_data = free_raw_data
    , used_indices = NULL
    , info = info
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
  )

  return(invisible(new_dataset))

}

860
861
862
#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
863
#' @inheritParams lgb_shared_dataset_params
Guolin Ke's avatar
Guolin Ke committed
864
#' @param dataset \code{lgb.Dataset} object, training data
865
866
867
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{Dataset} file
868
869
870
871
872
873
874
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values. If this is an empty list (the default), the validation Dataset
#'               will have the same parameters as the Dataset passed to argument \code{dataset}.
#' @param ... additional \code{lgb.Dataset} parameters.
#'            NOTE: As of v3.3.0, use of \code{...} is deprecated. Add parameters to \code{params} directly.
James Lamb's avatar
James Lamb committed
875
#'
Guolin Ke's avatar
Guolin Ke committed
876
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
877
#'
Guolin Ke's avatar
Guolin Ke committed
878
#' @examples
879
#' \donttest{
880
881
882
883
884
885
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
#'
#' # parameters can be changed between the training data and validation set,
#' # for example to account for training data in a text file with a header row
#' # and validation data in a text file without it
#' train_file <- tempfile(pattern = "train_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = train_file
#'   , sep = ","
#'   , col.names = TRUE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' valid_file <- tempfile(pattern = "valid_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = valid_file
#'   , sep = ","
#'   , col.names = FALSE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' dtrain <- lgb.Dataset(
#'   data = train_file
#'   , params = list(has_header = TRUE)
#' )
#' dtrain$construct()
#'
#' dvalid <- lgb.Dataset(
#'   data = valid_file
#'   , params = list(has_header = FALSE)
#' )
#' dvalid$construct()
921
#' }
922
#' @importFrom utils modifyList
Guolin Ke's avatar
Guolin Ke committed
923
#' @export
924
925
926
927
928
929
930
931
932
lgb.Dataset.create.valid <- function(dataset,
                                     data,
                                     info = list(),
                                     label = NULL,
                                     weight = NULL,
                                     group = NULL,
                                     init_score = NULL,
                                     params = list(),
                                     ...) {

  # The training Dataset does the actual work, so it has to be an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # Parameters given through '...' are still honoured, but deprecated
  dots <- list(...)
  if (length(dots) > 0L) {
    dot_names <- paste(names(dots), collapse = ", ")
    deprecation_msg <- paste0(
      "lgb.Dataset.create.valid: Found the following passed through '...': "
      , dot_names
      , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
      , "Add these to 'params' instead. See ?lgb.Dataset.create.valid for documentation on how to call this function."
    )
    warning(deprecation_msg)
  }

  # Delegate to the training Dataset; values from '...' replace same-named
  # entries of 'params'
  valid_dataset <- dataset$create_valid(
    data = data
    , info = info
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
    , params = utils::modifyList(params, dots)
  )

  return(invisible(valid_dataset))

}
Guolin Ke's avatar
Guolin Ke committed
962

963
964
965
#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
Guolin Ke's avatar
Guolin Ke committed
966
#' @param dataset Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
967
#'
Guolin Ke's avatar
Guolin Ke committed
968
#' @examples
969
#' \donttest{
970
971
972
973
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
974
#' }
975
#' @return constructed dataset
Guolin Ke's avatar
Guolin Ke committed
976
977
#' @export
lgb.Dataset.construct <- function(dataset) {

  # Reject anything that is not an lgb.Dataset before calling its method
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }

  # Whatever construct() returns is handed back invisibly
  constructed <- dataset$construct()
  return(invisible(constructed))

}

987
988
#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
Guolin Ke's avatar
Guolin Ke committed
989
#' @param x Object of class \code{lgb.Dataset}
990
#' @param ... other parameters (ignored)
James Lamb's avatar
James Lamb committed
991
#'
Guolin Ke's avatar
Guolin Ke committed
992
#' @return a vector of numbers of rows and of columns
James Lamb's avatar
James Lamb committed
993
#'
Guolin Ke's avatar
Guolin Ke committed
994
995
996
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
James Lamb's avatar
James Lamb committed
997
#'
Guolin Ke's avatar
Guolin Ke committed
998
#' @examples
999
#' \donttest{
1000
1001
1002
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
1003
#'
1004
1005
1006
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
1007
#' }
Guolin Ke's avatar
Guolin Ke committed
1008
1009
1010
#' @rdname dim
#' @export
dim.lgb.Dataset <- function(x, ...) {

  # Extra arguments are accepted for now but have no effect; tell the caller
  ignored_args <- list(...)
  if (length(ignored_args) > 0L) {
    ignored_names <- paste(names(ignored_args), collapse = ", ")
    ignored_msg <- paste0(
      "dim.lgb.Dataset: Found the following passed through '...': "
      , ignored_names
      , ". These are ignored. In future releases of lightgbm, this warning will become an error. "
      , "See ?dim.lgb.Dataset for documentation on how to call this function."
    )
    warning(ignored_msg)
  }

  is_dataset <- lgb.is.Dataset(x = x)
  if (!is_dataset) {
    stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # The Dataset object reports its own dimensions
  return(x$dim())

}

1030
1031
1032
#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}, thus setting of
#'              row names would have no effect and returned row names would be NULL.
Guolin Ke's avatar
Guolin Ke committed
1033
1034
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one is ignored
1035
#'              and the second one is column names
Guolin Ke's avatar
Guolin Ke committed
1036
1037
1038
1039
1040
1041
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
1042
#' \donttest{
1043
1044
1045
1046
1047
1048
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
1049
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
1050
#' print(dtrain, verbose = TRUE)
1051
#' }
Guolin Ke's avatar
Guolin Ke committed
1052
#' @rdname dimnames.lgb.Dataset
1053
#' @return A list with the dimension names of the dataset
Guolin Ke's avatar
Guolin Ke committed
1054
1055
#' @export
dimnames.lgb.Dataset <- function(x) {

  is_dataset <- lgb.is.Dataset(x = x)
  if (!is_dataset) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # Row names are never available, so the first element is always NULL
  column_names <- x$get_colnames()
  return(list(NULL, column_names))

}

#' @rdname dimnames.lgb.Dataset
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # 'value' has to be a plain list holding exactly (row names, column names)
  is_two_element_list <- identical(class(value), "list") && length(value) == 2L
  if (!is_two_element_list) {
    stop("invalid ", sQuote("value"), " given: must be a list of two elements")
  }

  # Row names are not supported, so the first element must be NULL
  if (!is.null(value[[1L]])) {
    stop("lgb.Dataset does not have rownames")
  }

  new_colnames <- value[[2L]]

  # NULL column names skip the size check and are passed on as-is;
  # anything else needs one name per column
  if (!is.null(new_colnames)) {
    if (ncol(x) != length(new_colnames)) {
      stop(
        "can't assign "
        , sQuote(length(new_colnames))
        , " colnames to an lgb.Dataset with "
        , sQuote(ncol(x))
        , " columns"
      )
    }
  }

  x$set_colnames(colnames = new_colnames)
  return(x)

}

1104
1105
1106
#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#'              original \code{lgb.Dataset} object
Nikita Titov's avatar
Nikita Titov committed
1107
#' @param dataset Object of class \code{lgb.Dataset}
1108
#' @param idxset an integer vector of indices of rows needed
Guolin Ke's avatar
Guolin Ke committed
1109
1110
#' @param ... other parameters (currently not used)
#' @return constructed sub dataset
James Lamb's avatar
James Lamb committed
1111
#'
Guolin Ke's avatar
Guolin Ke committed
1112
#' @examples
1113
#' \donttest{
1114
1115
1116
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
1117
#'
1118
#' dsub <- lightgbm::slice(dtrain, seq_len(42L))
1119
#' lgb.Dataset.construct(dsub)
1120
#' labels <- lightgbm::getinfo(dsub, "label")
1121
#' }
Guolin Ke's avatar
Guolin Ke committed
1122
#' @export
1123
1124
1125
slice <- function(dataset, ...) {
  # S3 generic: hands the call to the "slice" method found for 'dataset'
  UseMethod("slice")
}
Guolin Ke's avatar
Guolin Ke committed
1126
1127
1128
1129

#' @rdname slice
#' @export
slice.lgb.Dataset <- function(dataset, idxset, ...) {

  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Row indices and any extra arguments are forwarded to the Dataset's own
  # slice() method; its result is returned invisibly
  subset_dataset <- dataset$slice(idxset = idxset, ...)
  return(invisible(subset_dataset))

}

1139
1140
1141
#' @name getinfo
#' @title Get information of an \code{lgb.Dataset} object
#' @description Get one attribute of a \code{lgb.Dataset}
Guolin Ke's avatar
Guolin Ke committed
1142
1143
#' @param dataset Object of class \code{lgb.Dataset}
#' @param name the name of the information field to get (see details)
1144
#' @param ... other parameters (ignored)
Guolin Ke's avatar
Guolin Ke committed
1145
#' @return info data
James Lamb's avatar
James Lamb committed
1146
#'
Guolin Ke's avatar
Guolin Ke committed
1147
1148
#' @details
#' The \code{name} field can be one of the following:
James Lamb's avatar
James Lamb committed
1149
#'
Guolin Ke's avatar
Guolin Ke committed
1150
1151
1152
#' \itemize{
#'     \item \code{label}: vector of labels to use as the target variable;
#'     \item \code{weight}: numeric vector of sample weights;
1153
1154
1155
1156
1157
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
Nikita Titov's avatar
Nikita Titov committed
1158
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
Guolin Ke's avatar
Guolin Ke committed
1159
#' }
James Lamb's avatar
James Lamb committed
1160
#'
Guolin Ke's avatar
Guolin Ke committed
1161
#' @examples
1162
#' \donttest{
1163
1164
1165
1166
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
James Lamb's avatar
James Lamb committed
1167
#'
1168
1169
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
James Lamb's avatar
James Lamb committed
1170
#'
1171
1172
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
1173
#' }
Guolin Ke's avatar
Guolin Ke committed
1174
#' @export
1175
1176
1177
getinfo <- function(dataset, ...) {
  # S3 generic: hands the call to the "getinfo" method found for 'dataset'
  UseMethod("getinfo")
}
Guolin Ke's avatar
Guolin Ke committed
1178
1179
1180
1181

#' @rdname getinfo
#' @export
getinfo.lgb.Dataset <- function(dataset, name, ...) {

  # Extra arguments are accepted for now but have no effect; tell the caller
  ignored_args <- list(...)
  if (length(ignored_args) > 0L) {
    ignored_names <- paste(names(ignored_args), collapse = ", ")
    ignored_msg <- paste0(
      "getinfo.lgb.Dataset: Found the following passed through '...': "
      , ignored_names
      , ". These are ignored. In future releases of lightgbm, this warning will become an error. "
      , "See ?getinfo.lgb.Dataset for documentation on how to call this function."
    )
    warning(ignored_msg)
  }

  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("getinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Look the field up through the Dataset's own getinfo() method
  return(dataset$getinfo(name = name))

}

1201
1202
1203
#' @name setinfo
#' @title Set information of an \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
Nikita Titov's avatar
Nikita Titov committed
1204
#' @param dataset Object of class \code{lgb.Dataset}
Guolin Ke's avatar
Guolin Ke committed
1205
1206
#' @param name the name of the field to set
#' @param info the specific field of information to set
1207
#' @param ... other parameters (ignored)
1208
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1209
#'
Guolin Ke's avatar
Guolin Ke committed
1210
1211
#' @details
#' The \code{name} field can be one of the following:
James Lamb's avatar
James Lamb committed
1212
#'
Guolin Ke's avatar
Guolin Ke committed
1213
#' \itemize{
1214
1215
1216
1217
1218
#'     \item{\code{label}: vector of labels to use as the target variable}
#'     \item{\code{weight}: to do a weight rescale}
#'     \item{\code{init_score}: initial score is the base prediction lightgbm will boost from}
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
1219
1220
1221
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
Guolin Ke's avatar
Guolin Ke committed
1222
#' }
James Lamb's avatar
James Lamb committed
1223
#'
Guolin Ke's avatar
Guolin Ke committed
1224
#' @examples
1225
#' \donttest{
1226
1227
1228
1229
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
James Lamb's avatar
James Lamb committed
1230
#'
1231
1232
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
James Lamb's avatar
James Lamb committed
1233
#'
1234
1235
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
1236
#' }
Guolin Ke's avatar
Guolin Ke committed
1237
#' @export
1238
1239
1240
setinfo <- function(dataset, ...) {
  # S3 generic: hands the call to the "setinfo" method found for 'dataset'
  UseMethod("setinfo")
}
Guolin Ke's avatar
Guolin Ke committed
1241
1242
1243
1244

#' @rdname setinfo
#' @export
setinfo.lgb.Dataset <- function(dataset, name, info, ...) {

  # Extra arguments are accepted for now but have no effect; tell the caller
  ignored_args <- list(...)
  if (length(ignored_args) > 0L) {
    ignored_names <- paste(names(ignored_args), collapse = ", ")
    ignored_msg <- paste0(
      "setinfo.lgb.Dataset: Found the following passed through '...': "
      , ignored_names
      , ". These are ignored. In future releases of lightgbm, this warning will become an error. "
      , "See ?setinfo.lgb.Dataset for documentation on how to call this function."
    )
    warning(ignored_msg)
  }

  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("setinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Store the field through the Dataset's own setinfo() method and hand its
  # result back invisibly
  updated <- dataset$setinfo(name = name, info = info)
  return(invisible(updated))

}

1263
1264
1265
1266
#' @name lgb.Dataset.set.categorical
#' @title Set categorical feature of \code{lgb.Dataset}
#' @description Set the categorical features of an \code{lgb.Dataset} object. Use this function
#'              to tell LightGBM which features should be treated as categorical.
1267
#' @param dataset object of class \code{lgb.Dataset}
1268
1269
1270
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
1271
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1272
#'
1273
#' @examples
1274
#' \donttest{
1275
1276
1277
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1278
1279
1280
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
1281
#' lgb.Dataset.set.categorical(dtrain, 1L:2L)
1282
#' }
1283
1284
1285
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # The Dataset's own method records the categorical features; its result is
  # returned invisibly
  updated <- dataset$set_categorical_feature(categorical_feature = categorical_feature)
  return(invisible(updated))

}

1295
1296
1297
#' @name lgb.Dataset.set.reference
#' @title Set reference of \code{lgb.Dataset}
#' @description If you want to use validation data, you should set reference to training data
Guolin Ke's avatar
Guolin Ke committed
1298
1299
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
1300
#'
1301
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1302
#'
Guolin Ke's avatar
Guolin Ke committed
1303
#' @examples
1304
#' \donttest{
1305
#' # create training Dataset
1306
1307
1308
#' data(agaricus.train, package ="lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1309
1310
#'
#' # create a validation Dataset, using dtrain as a reference
1311
1312
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
1313
#' dtest <- lgb.Dataset(test$data, label = test$label)
1314
#' lgb.Dataset.set.reference(dtest, dtrain)
1315
#' }
Guolin Ke's avatar
Guolin Ke committed
1316
1317
1318
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # Only 'dataset' is checked here; 'reference' is passed through unchecked to
  # the Dataset's set_reference() method
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")
  }

  updated <- dataset$set_reference(reference = reference)
  return(invisible(updated))

}

1327
1328
1329
1330
#' @name lgb.Dataset.save
#' @title Save \code{lgb.Dataset} to a binary file
#' @description Please note that \code{init_score} is not saved in binary file.
#'              If you need it, please set it again after loading Dataset.
Guolin Ke's avatar
Guolin Ke committed
1331
1332
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname object filename of output file
James Lamb's avatar
James Lamb committed
1333
#'
1334
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1335
#'
Guolin Ke's avatar
Guolin Ke committed
1336
#' @examples
1337
#' \donttest{
1338
1339
1340
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1341
#' lgb.Dataset.save(dtrain, tempfile(fileext = ".bin"))
1342
#' }
Guolin Ke's avatar
Guolin Ke committed
1343
1344
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # Both error messages are prefixed with this function's own name so that a
  # failure can be traced back to the call that raised it
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  # NOTE(review): the message mentions a file connection, but this check only
  # lets character values through -- confirm which of the two is intended
  if (!is.character(fname)) {
    stop("lgb.Dataset.save: fname should be a character or a file connection")
  }

  # The Dataset writes itself; whatever save_binary() returns is handed back
  # invisibly
  return(invisible(dataset$save_binary(fname = fname)))

}