lgb.Dataset.R 46.6 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#' @name lgb_shared_dataset_params
#' @title Shared Dataset parameter docs
#' @description Parameter docs for fields used in \code{lgb.Dataset} construction
#' @param label vector of labels to use as the target variable
#' @param weight numeric vector of sample weights
#' @param init_score initial score is the base prediction lightgbm will boost from
#' @param group used for learning-to-rank tasks. An integer vector describing how to
#'              group rows together as ordered results from the same set of candidate results
#'              to be ranked. For example, if you have a 100-document dataset with
#'              \code{group = c(10, 20, 40, 10, 10, 10)}, that means that you have 6 groups,
#'              where the first 10 records are in the first group, records 11-30 are in the
#'              second group, etc.
#' @param info a list of information of the \code{lgb.Dataset} object. NOTE: use of \code{info}
#'             is deprecated as of v3.3.0. Use keyword arguments (e.g. \code{init_score = init_score})
#'             directly.
#' @keywords internal
NULL

19
20
21
22
23
24
25
26
# [description] List of valid keys for "info" arguments in lgb.Dataset.
#               Wrapped in a function to take advantage of lazy evaluation
#               (so it doesn't matter what order R sources files during installation).
# [return] A character vector of names.
.INFO_KEYS <- function() {
  # Field names accepted by Dataset$get_field() / Dataset$set_field()
  # and by the deprecated 'info' argument.
  info_keys <- c("label", "weight", "init_score", "group")
  return(info_keys)
}

James Lamb's avatar
James Lamb committed
27
#' @importFrom methods is
James Lamb's avatar
James Lamb committed
28
#' @importFrom R6 R6Class
29
#' @importFrom utils modifyList
James Lamb's avatar
James Lamb committed
30
31
Dataset <- R6::R6Class(

32
  classname = "lgb.Dataset",
33
  cloneable = FALSE,
Guolin Ke's avatar
Guolin Ke committed
34
  public = list(
James Lamb's avatar
James Lamb committed
35

36
    # Finalize will free up the handles
Guolin Ke's avatar
Guolin Ke committed
37
    finalize = function() {
      # [description] Ask the native library to free the Dataset behind the stored
      #               handle, then clear the handle kept on the R side.
      # [return] NULL, invisibly.
      .Call(LGBM_DatasetFree_R, private$handle)
      private$handle <- NULL
      invisible(NULL)
    },
James Lamb's avatar
James Lamb committed
45

46
    # Initialize will create a starter dataset
Guolin Ke's avatar
Guolin Ke committed
47
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          info = list(),
                          label = NULL,
                          weight = NULL,
                          group = NULL,
                          init_score = NULL) {

      # [description] Store everything needed to build the Dataset later. Nothing is
      #               created on the native side here; that happens in construct().
      # [return] NULL, invisibly.

      # validate inputs early to avoid unnecessary computation
      if (!(is.null(reference) || lgb.is.Dataset(reference))) {
        stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset"))
      }
      if (!(is.null(predictor) || lgb.is.Predictor(predictor))) {
        stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
      }

      # Dataset$slice() passes 'info = NULL'. Normalize that to an empty list so the
      # field assignments below always operate on a list: '[[<-' applied to NULL does
      # not produce a list for length-1 values on every R version.
      if (is.null(info)) {
        info <- list()
      }

      if (length(info) > 0L) {
        warning(paste0(
          "lgb.Dataset: found fields passed through 'info'. "
          , "As of v3.3.0, this behavior is deprecated, and support for it will be removed in a future release. "
          , "To suppress this warning, use keyword arguments 'label', 'weight', 'group', or 'init_score' directly"
        ))
      }

      # keyword arguments take precedence over same-named entries in 'info'
      keyword_fields <- list(
        label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )
      for (field_name in names(keyword_fields)) {
        if (!is.null(keyword_fields[[field_name]])) {
          info[[field_name]] <- keyword_fields[[field_name]]
        }
      }

      # matrices are stored as "double" before being kept as raw data
      if (is.matrix(data) && storage.mode(data) != "double") {
        storage.mode(data) <- "double"
      }

      # Setup private attributes
      private$raw_data <- data
      private$params <- params
      private$reference <- reference
      private$colnames <- colnames
      private$categorical_feature <- categorical_feature
      private$predictor <- predictor
      private$free_raw_data <- free_raw_data
      private$used_indices <- sort(used_indices, decreasing = FALSE)
      private$info <- info
      private$version <- 0L

      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
114

115
116
    create_valid = function(data,
                            info = list(),
                            label = NULL,
                            weight = NULL,
                            group = NULL,
                            init_score = NULL,
                            params = list(),
                            ...) {

      # [description] Create a new Dataset that uses this Dataset as its reference and
      #               inherits its column names, categorical features, predictor and
      #               'free_raw_data' setting.
      # [return] The new Dataset, invisibly.

      dot_args <- list(...)
      if (length(dot_args) > 0L) {
        warning(paste0(
          "Dataset$create_valid(): Found the following passed through '...': "
          , paste(names(dot_args), collapse = ", ")
          , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
          , "Add these to 'params' instead. "
          , "See ?lgb.Dataset.create.valid for documentation on how to call this function."
        ))
      }

      # precedence, lowest to highest:
      # this Dataset's own parameters < values from '...' < values from 'params'
      explicit_params <- modifyList(dot_args, params)
      merged_params <- modifyList(self$get_params(), explicit_params)

      validation_set <- Dataset$new(
        data = data
        , params = merged_params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = NULL
        , info = info
        , label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )

      return(invisible(validation_set))

    },
James Lamb's avatar
James Lamb committed
161

162
    # Dataset constructor
Guolin Ke's avatar
Guolin Ke committed
163
    construct = function() {

      # [description] Create the native Dataset for this object if it does not exist
      #               yet, then push feature names and fields (label, weight, ...) to it.
      #               Raises an error if the raw data type is unsupported, if a subset
      #               is requested without a reference, or if no label is available.
      # [return] This Dataset, invisibly.

      # Nothing to do if a handle already exists
      if (!lgb.is.null.handle(x = private$handle)) {
        return(invisible(self))
      }

      raw_is_matrix_like <- (
        is.matrix(private$raw_data)
        || methods::is(private$raw_data, "dgCMatrix")
      )

      # Fall back to the raw data's column names if none were set explicitly
      if (is.null(private$colnames) && raw_is_matrix_like) {
        cnames <- colnames(private$raw_data)
        if (!is.null(cnames)) {
          private$colnames <- as.character(cnames)
        }
      }

      # Translate categorical features into 0-based indices
      if (!is.null(private$categorical_feature)) {

        if (is.character(private$categorical_feature)) {

          # Names are resolved against the known feature names
          cate_indices <- as.list(match(private$categorical_feature, private$colnames) - 1L)

          # Names that could not be matched show up as NA
          if (sum(is.na(cate_indices)) > 0L) {
            stop(
              "lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
              , sQuote(private$categorical_feature[is.na(cate_indices)])
            )
          }

        } else {

          # Work out how many features there are. Prefer the dimensions of the raw
          # data: a matrix without column names has no entries in private$colnames,
          # which previously made every index look out of range.
          num_features <- NULL
          if (raw_is_matrix_like) {
            num_features <- ncol(private$raw_data)
          } else if (!is.null(private$colnames)) {
            num_features <- length(private$colnames)
          }

          # If the feature count is unknown on the R side (e.g. data in a file and no
          # column names given), the range check is skipped here.
          if (!is.null(num_features) && max(private$categorical_feature) > num_features) {
            stop(
              "lgb.self.get.handle: supplied a too large value in categorical_feature: "
              , max(private$categorical_feature)
              , " but only "
              , num_features
              , " features"
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1L)

        }

        # Store indices for categorical features
        private$params$categorical_feature <- cate_indices

      }

      # Generate parameter str
      params_str <- lgb.params2str(params = private$params)

      # Get handle of reference dataset (constructing it first if necessary)
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }

      if (is.null(private$used_indices)) {

        # Not subsetting: build from the raw data
        if (is.character(private$raw_data)) {

          # raw data is a path to a data file
          handle <- .Call(
            LGBM_DatasetCreateFromFile_R
            , private$raw_data
            , params_str
            , ref_handle
          )

        } else if (is.matrix(private$raw_data)) {

          # dense matrix
          handle <- .Call(
            LGBM_DatasetCreateFromMat_R
            , private$raw_data
            , nrow(private$raw_data)
            , ncol(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (methods::is(private$raw_data, "dgCMatrix")) {

          # dgCMatrix (sparse matrix, column compressed)
          if (length(private$raw_data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }
          handle <- .Call(
            LGBM_DatasetCreateFromCSC_R
            , private$raw_data@p
            , private$raw_data@i
            , private$raw_data@x
            , length(private$raw_data@p)
            , length(private$raw_data@x)
            , nrow(private$raw_data)
            , params_str
            , ref_handle
          )

        } else {

          # Unknown data type
          stop(
            "lgb.Dataset.construct: does not support constructing from "
            , sQuote(class(private$raw_data))
          )

        }

      } else {

        # Subsetting: rows are taken from the reference Dataset
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        handle <- .Call(
          LGBM_DatasetGetSubset_R
          , ref_handle
          , c(private$used_indices) # Adding c() fixes issue in R v3.5
          , length(private$used_indices)
          , params_str
        )

      }

      if (lgb.is.null.handle(x = handle)) {
        stop("lgb.Dataset.construct: cannot create Dataset handle")
      }

      # Tag the handle and keep it
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Set feature names
      if (!is.null(private$colnames)) {
        self$set_colnames(colnames = private$colnames)
      }

      # Use the predictor's raw scores on the raw data as init_score (not for subsets)
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        init_score <- private$predictor$predict(
          data = private$raw_data
          , rawscore = TRUE
          , reshape = TRUE
        )

        # as.vector() flattens column by column, so no transpose is needed
        private$info$init_score <- as.vector(init_score)

      }

      # Should we free raw data?
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Push every stored field to the newly created Dataset
      if (length(private$info) > 0L) {

        for (i in seq_along(private$info)) {

          p <- private$info[i]
          self$set_field(
            field_name = names(p)
            , data = p[[1L]]
          )

        }

      }

      # A label is mandatory
      if (is.null(self$get_field(field_name = "label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
355

356
    # Dimension function
Guolin Ke's avatar
Guolin Ke committed
357
    dim = function() {

      # [description] Dimensions of the Dataset. Uses the constructed Dataset when a
      #               handle exists, otherwise the raw matrix / dgCMatrix.
      # [return] Vector of the form c(number of rows, number of columns).

      if (!lgb.is.null.handle(x = private$handle)) {

        # Output buffers. The return values of the calls below are discarded, so the
        # native routines presumably write into these - NOTE(review): confirm.
        # They are deliberately created as two separate objects.
        n_rows <- 0L
        n_cols <- 0L

        .Call(
          LGBM_DatasetGetNumData_R
          , private$handle
          , n_rows
        )
        .Call(
          LGBM_DatasetGetNumFeature_R
          , private$handle
          , n_cols
        )
        return(c(n_rows, n_cols))

      }

      # Not constructed yet: fall back to the raw data if it has dimensions
      # NOTE: dgCMatrix requires the Matrix package
      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        return(dim(private$raw_data))
      }

      # Trying to work with unknown dimensions is not possible
      stop(
        "dim: cannot get dimensions before dataset has been constructed, "
        , "please call lgb.Dataset.construct explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
397

398
    # Get column names
Guolin Ke's avatar
Guolin Ke committed
399
    get_colnames = function() {

      # [description] Feature names of the Dataset. When a handle exists they are read
      #               from the native Dataset and cached in private$colnames; otherwise
      #               the column names of the raw matrix / dgCMatrix are returned.
      # [return] Feature names (may be NULL when the raw data has no column names).

      if (!lgb.is.null.handle(x = private$handle)) {
        private$colnames <- .Call(LGBM_DatasetGetFeatureNames_R, private$handle)
        return(private$colnames)
      }

      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        return(colnames(private$raw_data))
      }

      # Trying to work with unknown formats is not possible
      stop(
        "Dataset$get_colnames(): cannot get column names before dataset has been constructed, please call "
        , "lgb.Dataset.construct() explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
425

426
    # Set column names
Guolin Ke's avatar
Guolin Ke committed
427
    set_colnames = function(colnames) {

      # [description] Store new feature names and, when a handle exists, also pass
      #               them to the native Dataset. NULL or empty input is a no-op.
      # [return] This Dataset, invisibly.

      if (is.null(colnames)) {
        return(invisible(self))
      }

      new_names <- as.character(colnames)
      if (length(new_names) == 0L) {
        return(invisible(self))
      }

      private$colnames <- new_names

      if (!lgb.is.null.handle(x = private$handle)) {

        # The native call takes all names as one tab-separated string
        tab_separated <- paste(private$colnames, collapse = "\t")
        .Call(
          LGBM_DatasetSetFeatureNames_R
          , private$handle
          , tab_separated
        )

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
457

Guolin Ke's avatar
Guolin Ke committed
458
    getinfo = function(name) {
      # Deprecated alias: warns, then delegates to get_field()
      warning(paste0(
        "Dataset$getinfo() is deprecated and will be removed in a future release. "
        , "Use Dataset$get_field() instead."
      ))
      field_value <- self$get_field(field_name = name)
      return(field_value)
    },

    get_field = function(field_name) {

      # [description] Get one field ("label", "weight", "init_score" or "group").
      #               A value already held in private$info is returned directly;
      #               otherwise it is read from the constructed Dataset and cached.
      # [return] The field's value, or NULL if the Dataset holds no data for it.

      name_is_valid <- (
        is.character(field_name)
        && length(field_name) == 1L
        && field_name %in% .INFO_KEYS()
      )
      if (!name_is_valid) {
        stop(
          "Dataset$get_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      # Cached on the R side already
      if (!is.null(private$info[[field_name]])) {
        return(private$info[[field_name]])
      }

      if (lgb.is.null.handle(x = private$handle)) {
        stop("Cannot perform Dataset$get_field() before constructing Dataset.")
      }

      # Output buffer for the field's size. The call's return value is discarded, so
      # the native routine presumably writes into it - NOTE(review): confirm.
      info_len <- 0L
      .Call(
        LGBM_DatasetGetFieldSize_R
        , private$handle
        , field_name
        , info_len
      )

      if (info_len > 0L) {

        # "group" is an integer field, every other field is numeric
        field_buffer <- if (field_name == "group") {
          integer(info_len)
        } else {
          numeric(info_len)
        }

        .Call(
          LGBM_DatasetGetField_R
          , private$handle
          , field_name
          , field_buffer
        )

        private$info[[field_name]] <- field_buffer

      }

      return(private$info[[field_name]])

    },
James Lamb's avatar
James Lamb committed
522

Guolin Ke's avatar
Guolin Ke committed
523
    setinfo = function(name, info) {
      # Deprecated alias: warns, then delegates to set_field()
      warning(paste0(
        "Dataset$setinfo() is deprecated and will be removed in a future release. "
        , "Use Dataset$set_field() instead."
      ))
      result <- self$set_field(field_name = name, data = info)
      return(result)
    },

    set_field = function(field_name, data) {

      # [description] Set one field ("label", "weight", "init_score" or "group").
      #               The value is always stored in private$info; when a handle exists
      #               and the value is non-empty it is also passed to the native Dataset.
      # [return] This Dataset, invisibly.

      name_is_valid <- (
        is.character(field_name)
        && length(field_name) == 1L
        && field_name %in% .INFO_KEYS()
      )
      if (!name_is_valid) {
        stop(
          "Dataset$set_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      # "group" is stored as integer, every other field as numeric
      if (field_name == "group") {
        data <- as.integer(data)
      } else {
        data <- as.numeric(data)
      }

      # Store information privately
      private$info[[field_name]] <- data

      has_handle <- !lgb.is.null.handle(x = private$handle)
      if (has_handle && !is.null(data) && length(data) > 0L) {

        .Call(
          LGBM_DatasetSetField_R
          , private$handle
          , field_name
          , data
          , length(data)
        )

        # counts how many times a field has been pushed to the native Dataset
        private$version <- private$version + 1L

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
577

578
    # Slice dataset
Guolin Ke's avatar
Guolin Ke committed
579
    slice = function(idxset, ...) {

      # [description] Create a new Dataset holding the rows 'idxset' of this one.
      #               The new Dataset has no raw data of its own and uses this
      #               Dataset as its reference.
      # [return] The new Dataset.

      dot_args <- list(...)

      # NOTE: despite the wording of this warning, values passed through '...' are
      #       still used below (as fields or as parameters).
      if (length(dot_args) > 0L) {
        warning(paste0(
          "Dataset$slice(): Found the following passed through '...': "
          , paste(names(dot_args), collapse = ", ")
          , ". These are ignored and should be removed. "
          , "To change the parameters of a Dataset produced by Dataset$slice(), use Dataset$set_params(). "
          , "To modify attributes like 'init_score', use Dataset$set_field(). "
          , "In future releases of lightgbm, this warning will become an error."
        ))
      }

      # Pull the Dataset fields out of '...'. Keys that were not supplied
      # come back as NULL.
      group_arg <- dot_args[["group"]]
      init_score_arg <- dot_args[["init_score"]]
      label_arg <- dot_args[["label"]]
      weight_arg <- dot_args[["weight"]]

      # Whatever remains after dropping the fields is treated as parameters
      for (field_key in .INFO_KEYS()) {
        dot_args[[field_key]] <- NULL
      }

      sliced <- Dataset$new(
        data = NULL
        , params = utils::modifyList(self$get_params(), dot_args)
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = sort(idxset, decreasing = FALSE)
        , info = NULL
        , group = group_arg
        , init_score = init_score_arg
        , label = label_arg
        , weight = weight_arg
      )
      return(sliced)

    },
James Lamb's avatar
James Lamb committed
627

628
629
630
    # [description] Update Dataset parameters. If it has not been constructed yet,
    #               this operation just happens on the R side (updating private$params).
    #               If it has been constructed, parameters will be updated on the C++ side.
631
    update_params = function(params) {

      # [return] This Dataset, invisibly.

      if (length(params) == 0L) {
        return(invisible(self))
      }

      if (lgb.is.null.handle(x = private$handle)) {

        # Not constructed yet: only the R-side parameters need to change
        private$params <- utils::modifyList(private$params, params)

      } else {

        tryCatch({
          .Call(
            LGBM_DatasetUpdateParamChecking_R
            , lgb.params2str(params = private$params)
            , lgb.params2str(params = params)
          )

          # The check passed, so record the new values on the R side as well.
          # Without this, get_params() (and therefore create_valid() and slice())
          # would keep reporting the old parameters.
          private$params <- utils::modifyList(private$params, params)

        }, error = function(e) {

          # If updating failed but raw data is not available, raise an error because
          # achieving what the user asked for is not possible
          if (is.null(private$raw_data)) {
            stop(e)
          }

          # If updating failed but raw data is available, modify the params
          # on the R side and re-set ("deconstruct") the Dataset
          private$params <- utils::modifyList(private$params, params)
          self$finalize()

        })

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
660

661
662
663
664
665
666
667
668
669
670
671
    get_params = function() {
      # [description] Subset of private$params whose names appear in
      #               .DATASET_PARAMETERS().
      # [return] A list (empty if nothing matches).
      known_names <- unname(unlist(.DATASET_PARAMETERS()))
      selected <- list()
      for (param_name in intersect(names(private$params), known_names)) {
        selected[[param_name]] <- private$params[[param_name]]
      }
      return(selected)
    },

672
    # Set categorical feature parameter
673
    set_categorical_feature = function(categorical_feature) {

      # [description] Change the categorical features. Because this calls finalize(),
      #               it is only allowed while the raw data is still available.
      # [return] This Dataset, invisibly.

      # Same value as before: nothing to do
      already_set <- identical(private$categorical_feature, categorical_feature)
      if (already_set) {
        return(invisible(self))
      }

      raw_data_freed <- is.null(private$raw_data)
      if (raw_data_freed) {
        stop("set_categorical_feature: cannot set categorical feature after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      private$categorical_feature <- categorical_feature

      # Drop the existing handle
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
694

695
    # Set reference
Guolin Ke's avatar
Guolin Ke committed
696
    set_reference = function(reference) {

      # [description] Use another lgb.Dataset as this Dataset's reference, taking over
      #               its categorical features, column names and predictor.
      # [return] This Dataset, invisibly.

      # setting reference to this same Dataset object doesn't require any changes
      same_reference <- identical(private$reference, reference)
      if (same_reference) {
        return(invisible(self))
      }

      # changing the reference calls finalize() on this Dataset, so it should only be
      # done while the raw_data is still available
      raw_data_freed <- is.null(private$raw_data)
      if (raw_data_freed) {
        stop("set_reference: cannot set reference after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      if (!lgb.is.Dataset(reference)) {
        stop("set_reference: Can only use lgb.Dataset as a reference")
      }

      # Take over settings from the reference
      reference_private <- reference$.__enclos_env__$private
      self$set_categorical_feature(categorical_feature = reference_private$categorical_feature)
      self$set_colnames(colnames = reference$get_colnames())
      private$set_predictor(predictor = reference_private$predictor)

      # Store reference
      private$reference <- reference

      # Drop the existing handle
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
727

728
    # Save binary model
Guolin Ke's avatar
Guolin Ke committed
729
    save_binary = function(fname) {

      # [description] Construct this Dataset, then pass its handle and 'fname' to the
      #               LGBM_DatasetSaveBinary_R routine.
      # [return] This Dataset, invisibly.
      self$construct()

      # read the handle only after construct() has run
      dataset_handle <- private$handle
      .Call(
        LGBM_DatasetSaveBinary_R
        , dataset_handle
        , fname
      )

      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
740

Guolin Ke's avatar
Guolin Ke committed
741
742
  ),
  private = list(
743
744
745
746
747
    handle = NULL,
    raw_data = NULL,
    params = list(),
    reference = NULL,
    colnames = NULL,
748
    categorical_feature = NULL,
749
750
751
752
    predictor = NULL,
    free_raw_data = TRUE,
    used_indices = NULL,
    info = NULL,
753
    version = 0L,
James Lamb's avatar
James Lamb committed
754

755
756
    # Get handle
    get_handle = function() {

      # [description] Return the stored handle, calling self$construct() first
      #               when lgb.is.null.handle() reports the handle as null.
      # [return] The value of private$handle.
      handle_missing <- lgb.is.null.handle(x = private$handle)
      if (handle_missing) {
        self$construct()
      }

      return(private$handle)

    },
James Lamb's avatar
James Lamb committed
765

766
    # Set predictor
Guolin Ke's avatar
Guolin Ke committed
767
    set_predictor = function(predictor) {

      # [description] Store a predictor (or NULL) on this Dataset and finalize it.
      # [return] This Dataset, invisibly.

      # the same predictor is already stored: nothing to change
      unchanged <- identical(private$predictor, predictor)
      if (unchanged) {
        return(invisible(self))
      }

      # the predictor can only be swapped while the raw data is still held
      raw_data_freed <- is.null(private$raw_data)
      if (raw_data_freed) {
        stop("set_predictor: cannot set predictor after free raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # NULL is accepted as-is; anything else has to be an lgb.Predictor
      if (!is.null(predictor) && !lgb.is.Predictor(predictor)) {
        stop("set_predictor: Can only use lgb.Predictor as predictor")
      }

      # store the predictor, then finalize
      private$predictor <- predictor
      self$finalize()

      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
797

Guolin Ke's avatar
Guolin Ke committed
798
799
800
  )
)

801
802
803
#' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#'              or local file (that was created previously by saving an \code{lgb.Dataset}).
804
#' @inheritParams lgb_shared_dataset_params
805
806
807
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{lgb.Dataset} file
808
809
810
811
812
813
814
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values.
#' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning
#'                  continuous features into histograms. If you want to apply the same bin boundaries from an existing
#'                  dataset to new \code{data}, pass that existing Dataset to this argument.
Guolin Ke's avatar
Guolin Ke committed
815
#' @param colnames names of columns
816
817
818
819
820
821
822
823
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param free_raw_data LightGBM constructs its data format, called a "Dataset", from tabular data.
#'                      By default, that Dataset object on the R side does not keep a copy of the raw data.
#'                      This reduces LightGBM's memory consumption, but it means that the Dataset object
#'                      cannot be changed after it has been constructed. If you'd prefer to be able to
#'                      change the Dataset object after construction, set \code{free_raw_data = FALSE}.
824
#' @param ... other parameters passed to \code{params}
James Lamb's avatar
James Lamb committed
825
#'
Guolin Ke's avatar
Guolin Ke committed
826
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
827
#'
Guolin Ke's avatar
Guolin Ke committed
828
#' @examples
829
#' \donttest{
830
831
832
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
833
834
835
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
836
#' lgb.Dataset.construct(dtrain)
837
#' }
Guolin Ke's avatar
Guolin Ke committed
838
839
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        info = list(),
                        label = NULL,
                        weight = NULL,
                        group = NULL,
                        init_score = NULL,
                        ...) {

  # anything passed through '...' is folded into 'params'
  # (modifyList() lets the entries from '...' replace same-named entries of 'params')
  dots_params <- list(...)
  merged_params <- modifyList(params, dots_params)

  # passing parameters through '...' still works, but callers are warned about it
  if (length(dots_params) > 0L) {
    deprecation_msg <- paste0(
      "lgb.Dataset: Found the following passed through '...': "
      , paste(names(dots_params), collapse = ", ")
      , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
      , "Add these to 'params' instead. See ?lgb.Dataset for documentation on how to call this function."
    )
    warning(deprecation_msg)
  }

  # create the new Dataset object; 'predictor' and 'used_indices' are always NULL here
  new_dataset <- Dataset$new(
    data = data
    , params = merged_params
    , reference = reference
    , colnames = colnames
    , categorical_feature = categorical_feature
    , predictor = NULL
    , free_raw_data = free_raw_data
    , used_indices = NULL
    , info = info
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
  )

  return(invisible(new_dataset))

}

885
886
887
#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
888
#' @inheritParams lgb_shared_dataset_params
Guolin Ke's avatar
Guolin Ke committed
889
#' @param dataset \code{lgb.Dataset} object, training data
890
891
892
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{Dataset} file
893
894
895
896
897
898
899
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values. If this is an empty list (the default), the validation Dataset
#'               will have the same parameters as the Dataset passed to argument \code{dataset}.
#' @param ... additional \code{lgb.Dataset} parameters.
#'            NOTE: As of v3.3.0, use of \code{...} is deprecated. Add parameters to \code{params} directly.
James Lamb's avatar
James Lamb committed
900
#'
Guolin Ke's avatar
Guolin Ke committed
901
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
902
#'
Guolin Ke's avatar
Guolin Ke committed
903
#' @examples
904
#' \donttest{
905
906
907
908
909
910
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
#'
#' # parameters can be changed between the training data and validation set,
#' # for example to account for training data in a text file with a header row
#' # and validation data in a text file without it
#' train_file <- tempfile(pattern = "train_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = train_file
#'   , sep = ","
#'   , col.names = TRUE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' valid_file <- tempfile(pattern = "valid_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = valid_file
#'   , sep = ","
#'   , col.names = FALSE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' dtrain <- lgb.Dataset(
#'   data = train_file
#'   , params = list(has_header = TRUE)
#' )
#' dtrain$construct()
#'
#' dvalid <- lgb.Dataset(
#'   data = valid_file
#'   , params = list(has_header = FALSE)
#' )
#' dvalid$construct()
946
#' }
947
#' @importFrom utils modifyList
Guolin Ke's avatar
Guolin Ke committed
948
#' @export
949
950
951
952
953
954
955
956
957
lgb.Dataset.create.valid <- function(dataset,
                                     data,
                                     info = list(),
                                     label = NULL,
                                     weight = NULL,
                                     group = NULL,
                                     init_score = NULL,
                                     params = list(),
                                     ...) {

  # the training data has to be an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # parameters passed through '...' are still honored, but callers are warned about it
  dots_params <- list(...)
  if (length(dots_params) > 0L) {
    deprecation_msg <- paste0(
      "lgb.Dataset.create.valid: Found the following passed through '...': "
      , paste(names(dots_params), collapse = ", ")
      , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
      , "Add these to 'params' instead. See ?lgb.Dataset.create.valid for documentation on how to call this function."
    )
    warning(deprecation_msg)
  }

  # create the validation Dataset from the training Dataset,
  # with the entries from '...' merged into 'params'
  valid_dataset <- dataset$create_valid(
    data = data
    , info = info
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
    , params = utils::modifyList(params, dots_params)
  )

  return(invisible(valid_dataset))

}
Guolin Ke's avatar
Guolin Ke committed
987

988
989
990
#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
Guolin Ke's avatar
Guolin Ke committed
991
#' @param dataset Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
992
#'
Guolin Ke's avatar
Guolin Ke committed
993
#' @examples
994
#' \donttest{
995
996
997
998
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
999
#' }
1000
#' @return constructed dataset
Guolin Ke's avatar
Guolin Ke committed
1001
1002
#' @export
lgb.Dataset.construct <- function(dataset) {

  # only lgb.Dataset objects can be constructed
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }

  # delegate to the object's own construct() method and hand back its result invisibly
  constructed <- dataset$construct()
  return(invisible(constructed))

}

1012
1013
#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
Guolin Ke's avatar
Guolin Ke committed
1014
#' @param x Object of class \code{lgb.Dataset}
1015
#' @param ... other parameters (ignored)
James Lamb's avatar
James Lamb committed
1016
#'
Guolin Ke's avatar
Guolin Ke committed
1017
#' @return a vector of numbers of rows and of columns
James Lamb's avatar
James Lamb committed
1018
#'
Guolin Ke's avatar
Guolin Ke committed
1019
1020
1021
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
James Lamb's avatar
James Lamb committed
1022
#'
Guolin Ke's avatar
Guolin Ke committed
1023
#' @examples
1024
#' \donttest{
1025
1026
1027
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
1028
#'
1029
1030
1031
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
1032
#' }
Guolin Ke's avatar
Guolin Ke committed
1033
1034
1035
#' @rdname dim
#' @export
dim.lgb.Dataset <- function(x, ...) {

  # arguments passed through '...' are not used; warn the caller when any were given
  ignored_args <- list(...)
  if (length(ignored_args) > 0L) {
    ignored_msg <- paste0(
      "dim.lgb.Dataset: Found the following passed through '...': "
      , paste(names(ignored_args), collapse = ", ")
      , ". These are ignored. In future releases of lightgbm, this warning will become an error. "
      , "See ?dim.lgb.Dataset for documentation on how to call this function."
    )
    warning(ignored_msg)
  }

  is_dataset <- lgb.is.Dataset(x = x)
  if (!is_dataset) {
    stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # the dimensions come from the object's own dim() method
  dataset_dims <- x$dim()
  return(dataset_dims)

}

1055
1056
1057
#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}, thus setting of
#'              row names would have no effect and returned row names would be NULL.
Guolin Ke's avatar
Guolin Ke committed
1058
1059
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one must be \code{NULL} (row names
#'              are not supported) and the second one is column names
Guolin Ke's avatar
Guolin Ke committed
1061
1062
1063
1064
1065
1066
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
1067
#' \donttest{
1068
1069
1070
1071
1072
1073
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
1074
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
1075
#' print(dtrain, verbose = TRUE)
1076
#' }
Guolin Ke's avatar
Guolin Ke committed
1077
#' @rdname dimnames.lgb.Dataset
1078
#' @return A list with the dimension names of the dataset
Guolin Ke's avatar
Guolin Ke committed
1079
1080
#' @export
dimnames.lgb.Dataset <- function(x) {

  is_dataset <- lgb.is.Dataset(x = x)
  if (!is_dataset) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # row names are always NULL; only the column names come from the Dataset
  column_names <- x$get_colnames()
  return(list(NULL, column_names))

}

#' @rdname dimnames.lgb.Dataset
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # 'value' has to be a plain list holding exactly two elements
  is_two_element_list <- identical(class(value), "list") && length(value) == 2L
  if (!is_two_element_list) {
    stop("invalid ", sQuote("value"), " given: must be a list of two elements")
  }

  new_rownames <- value[[1L]]
  new_colnames <- value[[2L]]

  # row names are not supported, so the first element has to be NULL
  if (!is.null(new_rownames)) {
    stop("lgb.Dataset does not have rownames")
  }

  # NULL column names are passed straight through, skipping the length check
  if (is.null(new_colnames)) {
    x$set_colnames(colnames = NULL)
    return(x)
  }

  # the number of names has to match the number of columns
  n_new_colnames <- length(new_colnames)
  if (ncol(x) != n_new_colnames) {
    stop(
      "can't assign "
      , sQuote(n_new_colnames)
      , " colnames to an lgb.Dataset with "
      , sQuote(ncol(x))
      , " columns"
    )
  }

  # set the column names and return the Dataset
  x$set_colnames(colnames = new_colnames)
  return(x)

}

1129
1130
1131
#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#'              original \code{lgb.Dataset} object
Nikita Titov's avatar
Nikita Titov committed
1132
#' @param dataset Object of class \code{lgb.Dataset}
1133
#' @param idxset an integer vector of indices of rows needed
Guolin Ke's avatar
Guolin Ke committed
1134
1135
#' @param ... other parameters (currently not used)
#' @return constructed sub dataset
James Lamb's avatar
James Lamb committed
1136
#'
Guolin Ke's avatar
Guolin Ke committed
1137
#' @examples
1138
#' \donttest{
1139
1140
1141
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
1142
#'
1143
#' dsub <- lightgbm::slice(dtrain, seq_len(42L))
1144
#' lgb.Dataset.construct(dsub)
1145
#' labels <- lightgbm::get_field(dsub, "label")
1146
#' }
Guolin Ke's avatar
Guolin Ke committed
1147
#' @export
1148
1149
1150
slice <- function(dataset, ...) {
  # S3 generic: dispatches on the class of 'dataset' to the matching slice.<class> method
  UseMethod("slice")
}
Guolin Ke's avatar
Guolin Ke committed
1151
1152
1153
1154

#' @rdname slice
#' @export
slice.lgb.Dataset <- function(dataset, idxset, ...) {

  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # delegate to the object's slice() method, forwarding 'idxset' and anything in '...'
  sliced <- dataset$slice(idxset = idxset, ...)
  return(invisible(sliced))

}

1164
1165
1166
#' @name getinfo
#' @title Get information of an \code{lgb.Dataset} object
#' @description Get one attribute of a \code{lgb.Dataset}
Guolin Ke's avatar
Guolin Ke committed
1167
1168
#' @param dataset Object of class \code{lgb.Dataset}
#' @param name the name of the information field to get (see details)
1169
#' @param ... other parameters (ignored)
Guolin Ke's avatar
Guolin Ke committed
1170
#' @return info data
James Lamb's avatar
James Lamb committed
1171
#'
Guolin Ke's avatar
Guolin Ke committed
1172
1173
#' @details
#' The \code{name} field can be one of the following:
James Lamb's avatar
James Lamb committed
1174
#'
Guolin Ke's avatar
Guolin Ke committed
1175
1176
1177
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
1178
1179
1180
1181
1182
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
Nikita Titov's avatar
Nikita Titov committed
1183
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
Guolin Ke's avatar
Guolin Ke committed
1184
#' }
James Lamb's avatar
James Lamb committed
1185
#'
Guolin Ke's avatar
Guolin Ke committed
1186
#' @examples
1187
#' \donttest{
1188
1189
1190
1191
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
James Lamb's avatar
James Lamb committed
1192
#'
1193
1194
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
James Lamb's avatar
James Lamb committed
1195
#'
1196
1197
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
1198
#' }
Guolin Ke's avatar
Guolin Ke committed
1199
#' @export
1200
1201
1202
getinfo <- function(dataset, ...) {
  # S3 generic: dispatches on the class of 'dataset' to the matching getinfo.<class> method
  UseMethod("getinfo")
}
Guolin Ke's avatar
Guolin Ke committed
1203
1204
1205
1206

#' @rdname getinfo
#' @export
getinfo.lgb.Dataset <- function(dataset, name, ...) {

  # getinfo() is deprecated: it only forwards to the Dataset's get_field() method
  warning("Calling getinfo() on a lgb.Dataset is deprecated. Use get_field() instead.")

  # arguments passed through '...' are not used; warn the caller when any were given
  ignored_args <- list(...)
  if (length(ignored_args) > 0L) {
    ignored_msg <- paste0(
      "getinfo.lgb.Dataset: Found the following passed through '...': "
      , paste(names(ignored_args), collapse = ", ")
      , ". These are ignored. In future releases of lightgbm, this warning will become an error. "
      , "See ?getinfo.lgb.Dataset for documentation on how to call this function."
    )
    warning(ignored_msg)
  }

  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("getinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  field_value <- dataset$get_field(field_name = name)
  return(field_value)

}

1228
1229
1230
#' @name setinfo
#' @title Set information of an \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
Nikita Titov's avatar
Nikita Titov committed
1231
#' @param dataset Object of class \code{lgb.Dataset}
Guolin Ke's avatar
Guolin Ke committed
1232
1233
#' @param name the name of the field to set
#' @param info the specific field of information to set
1234
#' @param ... other parameters (ignored)
1235
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1236
#'
Guolin Ke's avatar
Guolin Ke committed
1237
1238
#' @details
#' The \code{name} field can be one of the following:
James Lamb's avatar
James Lamb committed
1239
#'
Guolin Ke's avatar
Guolin Ke committed
1240
#' \itemize{
1241
1242
1243
1244
1245
#'     \item{\code{label}: vector of labels to use as the target variable}
#'     \item{\code{weight}: to do a weight rescale}
#'     \item{\code{init_score}: initial score is the base prediction lightgbm will boost from}
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
1246
1247
1248
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
Guolin Ke's avatar
Guolin Ke committed
1249
#' }
James Lamb's avatar
James Lamb committed
1250
#'
Guolin Ke's avatar
Guolin Ke committed
1251
#' @examples
1252
#' \donttest{
1253
1254
1255
1256
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
James Lamb's avatar
James Lamb committed
1257
#'
1258
1259
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
James Lamb's avatar
James Lamb committed
1260
#'
1261
1262
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
1263
#' }
Guolin Ke's avatar
Guolin Ke committed
1264
#' @export
1265
1266
1267
setinfo <- function(dataset, ...) {
  # S3 generic: dispatches on the class of 'dataset' to the matching setinfo.<class> method
  UseMethod("setinfo")
}
Guolin Ke's avatar
Guolin Ke committed
1268
1269
1270
1271

#' @rdname setinfo
#' @export
setinfo.lgb.Dataset <- function(dataset, name, info, ...) {

  # setinfo() is deprecated: it only forwards to the Dataset's set_field() method
  warning("Calling setinfo() on a lgb.Dataset is deprecated. Use set_field() instead.")

  # arguments passed through '...' are not used; warn the caller when any were given
  ignored_args <- list(...)
  if (length(ignored_args) > 0L) {
    ignored_msg <- paste0(
      "setinfo.lgb.Dataset: Found the following passed through '...': "
      , paste(names(ignored_args), collapse = ", ")
      , ". These are ignored. In future releases of lightgbm, this warning will become an error. "
      , "See ?setinfo.lgb.Dataset for documentation on how to call this function."
    )
    warning(ignored_msg)
  }

  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("setinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # 'name' selects the field and 'info' carries the data to store in it
  set_result <- dataset$set_field(field_name = name, data = info)
  return(invisible(set_result))
}

#' @name get_field
#' @title Get one attribute of a \code{lgb.Dataset}
#' @description Get one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to get. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @return requested attribute
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
#' }
#' @export
get_field <- function(dataset, field_name) {
  # S3 generic: dispatches on the class of 'dataset' to the matching get_field.<class> method
  UseMethod("get_field")
}

#' @rdname get_field
#' @export
get_field.lgb.Dataset <- function(dataset, field_name) {

  # refuse anything that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("get_field.lgb.Dataset(): input dataset should be an lgb.Dataset object")
  }

  # delegate to the object's own get_field() method
  field_value <- dataset$get_field(field_name = field_name)
  return(field_value)

}

#' @name set_field
#' @title Set one attribute of a \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to set. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @param data The data for the field. See examples.
#' @return The \code{lgb.Dataset} you passed in.
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
#' }
#' @export
set_field <- function(dataset, field_name, data) {
  # S3 generic: dispatches on the class of 'dataset' to the matching set_field.<class> method
  UseMethod("set_field")
}

#' @rdname set_field
#' @export
set_field.lgb.Dataset <- function(dataset, field_name, data) {

  # refuse anything that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("set_field.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # delegate to the object's own set_field() method and hand back its result invisibly
  set_result <- dataset$set_field(field_name = field_name, data = data)
  return(invisible(set_result))
}

#' @name lgb.Dataset.set.categorical
#' @title Set categorical feature of \code{lgb.Dataset}
#' @description Set the categorical features of an \code{lgb.Dataset} object. Use this function
#'              to tell LightGBM which features should be treated as categorical.
#' @param dataset object of class \code{lgb.Dataset}
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @return the dataset you passed in
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
#' lgb.Dataset.set.categorical(dtrain, 1L:2L)
#' }
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  # Fail early with a function-specific message if `dataset` is the wrong type.
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the object's set_categorical_feature() method; the result is
  # returned invisibly so that nothing is auto-printed at the console.
  return(invisible(dataset$set_categorical_feature(categorical_feature = categorical_feature)))

}

#' @name lgb.Dataset.set.reference
#' @title Set reference of \code{lgb.Dataset}
#' @description If you want to use validation data, you should set reference to training data
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
#'
#' @return the dataset you passed in
#'
#' @examples
#' \donttest{
#' # create training Dataset
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' # create a validation Dataset, using dtrain as a reference
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset(test$data, label = test$label)
#' lgb.Dataset.set.reference(dtest, dtrain)
#' }
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # Only `dataset` is validated here; `reference` is passed through unchecked
  # to the object's set_reference() method.
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")
  }

  # Result is returned invisibly so that nothing is auto-printed at the console.
  return(invisible(dataset$set_reference(reference = reference)))

}

#' @name lgb.Dataset.save
#' @title Save \code{lgb.Dataset} to a binary file
#' @description Please note that \code{init_score} is not saved in binary file.
#'              If you need it, please set it again after loading Dataset.
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname object filename of output file
#'
#' @return the dataset you passed in
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.save(dtrain, tempfile(fileext = ".bin"))
#' }
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # Error messages are prefixed with this function's own name so that a
  # failure is attributed to lgb.Dataset.save() and not to another function.
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  # NOTE: despite the wording of the message, only character input passes this
  # check; a connection object is not a character vector and is rejected.
  if (!is.character(fname)) {
    stop("lgb.Dataset.save: fname should be a character or a file connection")
  }

  # Delegate to the object's save_binary() method; the result is returned
  # invisibly so that nothing is auto-printed at the console.
  return(invisible(dataset$save_binary(fname = fname)))

}