lgb.Dataset.R 41.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#' @name lgb_shared_dataset_params
#' @title Shared Dataset parameter docs
#' @description Parameter docs for fields used in \code{lgb.Dataset} construction
#' @param label vector of labels to use as the target variable
#' @param weight numeric vector of sample weights
#' @param init_score initial score is the base prediction lightgbm will boost from
#' @param group used for learning-to-rank tasks. An integer vector describing how to
#'              group rows together as ordered results from the same set of candidate results
#'              to be ranked. For example, if you have a 100-document dataset with
#'              \code{group = c(10, 20, 40, 10, 10, 10)}, that means that you have 6 groups,
#'              where the first 10 records are in the first group, records 11-30 are in the
#'              second group, etc.
#' @param info a list of information of the \code{lgb.Dataset} object. NOTE: use of \code{info}
#'             is deprecated as of v3.3.0. Use keyword arguments (e.g. \code{init_score = init_score})
#'             directly.
#' @keywords internal
NULL

19
20
21
22
23
24
25
26
# [description] List of valid keys for "info" arguments in lgb.Dataset.
#               Wrapped in a function to take advantage of lazy evaluation
#               (so it doesn't matter what order R sources files during installation).
# [return] A character vector of names.
.INFO_KEYS <- function() {
  # Names of the Dataset fields that may be passed through "info"
  # or used with Dataset$get_field() / Dataset$set_field().
  valid_fields <- c("label", "weight", "init_score", "group")
  return(valid_fields)
}

James Lamb's avatar
James Lamb committed
27
#' @importFrom methods is
James Lamb's avatar
James Lamb committed
28
#' @importFrom R6 R6Class
29
#' @importFrom utils modifyList
James Lamb's avatar
James Lamb committed
30
31
Dataset <- R6::R6Class(

32
  classname = "lgb.Dataset",
33
  cloneable = FALSE,
Guolin Ke's avatar
Guolin Ke committed
34
  public = list(
James Lamb's avatar
James Lamb committed
35

36
    # Finalize will free up the handles
Guolin Ke's avatar
Guolin Ke committed
37
    finalize = function() {
      # Pass the current handle to the Dataset-free routine, then forget it on the R side
      # so that the null-handle checks used by the other methods treat this Dataset as
      # not constructed.
      .Call(LGBM_DatasetFree_R, private$handle)
      private$handle <- NULL
      return(invisible(NULL))
    },
James Lamb's avatar
James Lamb committed
45

46
    # Initialize will create a starter dataset
Guolin Ke's avatar
Guolin Ke committed
47
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          info = list(),
                          label = NULL,
                          weight = NULL,
                          group = NULL,
                          init_score = NULL) {

      # [description] Record everything needed to build the Dataset later. Nothing is
      #               created on the C++ side here; that happens in construct().
      #
      # data                 raw input, stored as-is (a matrix is coerced to storage mode "double")
      # params               list of parameters
      # reference            NULL or another lgb.Dataset
      # colnames             feature names (optional)
      # categorical_feature  names or positions of categorical features (optional)
      # predictor            NULL or an lgb.Predictor
      # free_raw_data        whether construct() should drop the raw data afterwards
      # used_indices         row indices for a subset (stored sorted), NULL otherwise
      # info                 deprecated list of fields; see label / weight / group / init_score
      #
      # [return] invisible(NULL)

      # validate inputs early to avoid unnecessary computation
      if (!(is.null(reference) || lgb.is.Dataset(reference))) {
        stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset"))
      }
      if (!(is.null(predictor) || lgb.is.Predictor(predictor))) {
        stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
      }

      # 'info' can arrive as NULL (Dataset$slice() passes info = NULL). Assigning a length-one
      # value into NULL with `[[<-` yields an atomic vector rather than a list, which then
      # breaks later multi-element assignments and lookups of missing fields.
      # Normalise to an empty list so 'info' is always a list.
      if (is.null(info)) {
        info <- list()
      }

      if (length(info) > 0L) {
        warning(paste0(
          "lgb.Dataset: found fields passed through 'info'. "
          , "As of v3.3.0, this behavior is deprecated, and support for it will be removed in a future release. "
          , "To suppress this warning, use keyword arguments 'label', 'weight', 'group', or 'init_score' directly"
        ))
      }

      # keyword arguments take precedence over same-named entries in 'info'
      if (!is.null(label)) {
        info[["label"]] <- label
      }
      if (!is.null(weight)) {
        info[["weight"]] <- weight
      }
      if (!is.null(group)) {
        info[["group"]] <- group
      }
      if (!is.null(init_score)) {
        info[["init_score"]] <- init_score
      }

      # matrices are stored with storage mode "double"
      if (is.matrix(data) && storage.mode(data) != "double") {
        storage.mode(data) <- "double"
      }

      # Setup private attributes
      private$raw_data <- data
      private$params <- params
      private$reference <- reference
      private$colnames <- colnames
      private$categorical_feature <- categorical_feature
      private$predictor <- predictor
      private$free_raw_data <- free_raw_data
      private$used_indices <- sort(used_indices, decreasing = FALSE)
      private$info <- info
      private$version <- 0L

      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
114

115
116
    create_valid = function(data,
                            info = list(),
                            label = NULL,
                            weight = NULL,
                            group = NULL,
                            init_score = NULL,
                            params = list(),
                            ...) {

      # [description] Build a new, not-yet-constructed Dataset that uses this Dataset as
      #               its reference and inherits its column names, categorical features,
      #               predictor and free_raw_data setting.
      # [return] the new Dataset, invisibly

      dots <- list(...)
      if (length(dots) > 0L) {
        dots_names <- paste(names(dots), collapse = ", ")
        warning(paste0(
          "Dataset$create_valid(): Found the following passed through '...': "
          , dots_names
          , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
          , "Add these to 'params' instead. "
          , "See ?lgb.Dataset.create.valid for documentation on how to call this function."
        ))
      }

      # Precedence, lowest to highest: this Dataset's own parameters, values from '...',
      # values from 'params'.
      call_params <- modifyList(dots, params)
      valid_params <- modifyList(self$get_params(), call_params)

      valid_dataset <- Dataset$new(
        data = data
        , params = valid_params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = NULL
        , info = info
        , label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )

      return(invisible(valid_dataset))

    },
James Lamb's avatar
James Lamb committed
161

162
    # Dataset constructor
Guolin Ke's avatar
Guolin Ke committed
163
    construct = function() {

      # [description] Create the Dataset handle from the stored raw data (file path, matrix
      #               or dgCMatrix) or, when used_indices is set, as a subset of the reference
      #               Dataset. Afterwards push column names and stored fields to the new handle.
      #               Does nothing if a handle already exists.
      # [return] self, invisibly

      # Already constructed: nothing to do
      if (!lgb.is.null.handle(x = private$handle)) {
        return(invisible(self))
      }

      # In-memory inputs (base matrix or dgCMatrix) expose column names and a column count
      raw_is_matrix_like <- is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")

      # Get feature names
      cnames <- NULL
      if (raw_is_matrix_like) {
        cnames <- colnames(private$raw_data)
      }

      # set feature names if they do not exist
      if (is.null(private$colnames) && !is.null(cnames)) {
        private$colnames <- as.character(cnames)
      }

      # Translate categorical_feature (names or 1-based positions) into 0-based indices
      if (!is.null(private$categorical_feature)) {

        if (is.character(private$categorical_feature)) {

          # Names are resolved against the known column names
          matched_positions <- match(private$categorical_feature, private$colnames)
          is_unknown <- is.na(matched_positions)
          if (any(is_unknown)) {
            stop(
              "lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
              , paste(sQuote(private$categorical_feature[is_unknown]), collapse = ", ")
            )
          }
          cate_indices <- as.list(matched_positions - 1L)

        } else {

          # Work out how many features there are. Prefer the actual column count of
          # in-memory data: counting column names alone gives 0 for an unnamed matrix
          # and would reject every index. If the count cannot be determined here
          # (for example a file path without column names), skip the range check.
          num_features <- NA_integer_
          if (raw_is_matrix_like) {
            num_features <- ncol(private$raw_data)
          } else if (!is.null(private$colnames)) {
            num_features <- length(private$colnames)
          }

          if (!is.na(num_features) && max(private$categorical_feature) > num_features) {
            stop(
              "lgb.self.get.handle: supplied a too large value in categorical_feature: "
              , max(private$categorical_feature)
              , " but only "
              , num_features
              , " features"
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1L)

        }

        # Store indices for categorical features
        private$params$categorical_feature <- cate_indices

      }

      # Generate parameter str
      params_str <- lgb.params2str(params = private$params)

      # Get handle of reference dataset
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }

      # not subsetting, constructing from raw data
      if (is.null(private$used_indices)) {

        if (is.null(private$raw_data)) {
          stop(paste0(
            "Attempting to create a Dataset without any raw data. "
            , "This can happen if you have called Dataset$finalize() or if this Dataset was saved with saveRDS(). "
            , "To avoid this error in the future, use lgb.Dataset.save() or "
            , "Dataset$save_binary() to save lightgbm Datasets."
          ))
        }

        if (is.character(private$raw_data)) {

          # raw data is a path to a data file
          handle <- .Call(
            LGBM_DatasetCreateFromFile_R
            , path.expand(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (is.matrix(private$raw_data)) {

          # raw data is a dense matrix
          handle <- .Call(
            LGBM_DatasetCreateFromMat_R
            , private$raw_data
            , nrow(private$raw_data)
            , ncol(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (methods::is(private$raw_data, "dgCMatrix")) {

          # raw data is a dgCMatrix (column-compressed sparse matrix)
          if (length(private$raw_data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }
          handle <- .Call(
            LGBM_DatasetCreateFromCSC_R
            , private$raw_data@p
            , private$raw_data@i
            , private$raw_data@x
            , length(private$raw_data@p)
            , length(private$raw_data@x)
            , nrow(private$raw_data)
            , params_str
            , ref_handle
          )

        } else {

          # Unknown data type
          stop(
            "lgb.Dataset.construct: does not support constructing from "
            , sQuote(class(private$raw_data))
          )

        }

      } else {

        # A subset can only be taken from a reference Dataset
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        # Construct subset
        handle <- .Call(
          LGBM_DatasetGetSubset_R
          , ref_handle
          , c(private$used_indices) # Adding c() fixes issue in R v3.5
          , length(private$used_indices)
          , params_str
        )

      }

      if (lgb.is.null.handle(x = handle)) {
        stop("lgb.Dataset.construct: cannot create Dataset handle")
      }

      # Setup class and private type
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Set feature names
      if (!is.null(private$colnames)) {
        self$set_colnames(colnames = private$colnames)
      }

      # Load init score if requested (only when constructing from raw data, not for subsets)
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        init_score <- private$predictor$predict(
          data = private$raw_data
          , rawscore = TRUE
          , reshape = TRUE
        )

        # No transpose needed: as.vector() flattens a matrix in column-major order
        init_score <- as.vector(init_score)
        private$info$init_score <- init_score

      }

      # Should we free raw data?
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Push every stored field to the newly created handle
      for (i in seq_along(private$info)) {
        p <- private$info[i]
        self$set_field(
          field_name = names(p)
          , data = p[[1L]]
        )
      }

      # A label is mandatory
      if (is.null(self$get_field(field_name = "label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
364

365
    # Dimension function
Guolin Ke's avatar
Guolin Ke committed
366
    dim = function() {

      # [description] Number of rows and columns of the Dataset.
      # [return] c(rows, columns): queried through the handle when one exists,
      #          otherwise taken from in-memory raw data (matrix or dgCMatrix)

      # Constructed Dataset: ask through the handle
      if (!lgb.is.null.handle(x = private$handle)) {

        # Placeholders handed to the two calls below. The return values of .Call() are not
        # used, so presumably the results are written into these objects -- confirm against
        # the C++ side before changing how they are created.
        num_row <- 0L
        num_col <- 0L
        .Call(LGBM_DatasetGetNumData_R, private$handle, num_row)
        .Call(LGBM_DatasetGetNumFeature_R, private$handle, num_col)
        return(c(num_row, num_col))

      }

      # Not constructed, but raw data is a matrix or dgCMatrix
      # NOTE: dgCMatrix requires the Matrix package
      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        return(dim(private$raw_data))
      }

      # Trying to work with unknown dimensions is not possible
      stop(
        "dim: cannot get dimensions before dataset has been constructed, "
        , "please call lgb.Dataset.construct explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
406

407
    # Get column names
Guolin Ke's avatar
Guolin Ke committed
408
    get_colnames = function() {

      # [description] Feature names of the Dataset.
      # [return] names fetched through the handle when one exists (also cached in
      #          private$colnames), otherwise colnames() of in-memory raw data

      # Constructed Dataset: fetch the names through the handle and cache them
      if (!lgb.is.null.handle(x = private$handle)) {
        fetched_names <- .Call(LGBM_DatasetGetFeatureNames_R, private$handle)
        private$colnames <- fetched_names
        return(private$colnames)
      }

      # Not constructed, but raw data is a matrix or dgCMatrix
      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        return(colnames(private$raw_data))
      }

      # Trying to work with unknown formats is not possible
      stop(
        "Dataset$get_colnames(): cannot get column names before dataset has been constructed, please call "
        , "lgb.Dataset.construct() explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
434

435
    # Set column names
Guolin Ke's avatar
Guolin Ke committed
436
    set_colnames = function(colnames) {

      # [description] Store new feature names and, if the Dataset has a handle,
      #               send them to it as a single tab-separated string.
      # [return] self, invisibly

      # NULL and zero-length input are both no-ops; as.character(NULL) is character(0),
      # so a single length check covers the two cases
      new_names <- as.character(colnames)
      if (length(new_names) == 0L) {
        return(invisible(self))
      }

      private$colnames <- new_names

      if (!lgb.is.null.handle(x = private$handle)) {
        # Merge names with tab separation
        tab_separated <- paste0(new_names, collapse = "\t")
        .Call(LGBM_DatasetSetFeatureNames_R, private$handle, tab_separated)
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
466

467
    get_field = function(field_name) {

      # [description] Get one Dataset field ("label", "weight", "init_score" or "group").
      #               A value already stored on the R side is returned directly; otherwise
      #               it is read through the handle and cached in private$info.
      # [return] the field's value, or NULL if the field is empty

      # field_name must be a single string naming a known field
      name_is_valid <- is.character(field_name) &&
        length(field_name) == 1L &&
        field_name %in% .INFO_KEYS()
      if (!name_is_valid) {
        stop(
          "Dataset$get_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      # Serve from the R-side cache when possible
      cached_value <- private$info[[field_name]]
      if (!is.null(cached_value)) {
        return(cached_value)
      }

      if (lgb.is.null.handle(x = private$handle)) {
        stop("Cannot perform Dataset$get_field() before constructing Dataset.")
      }

      # Ask for the field's length. The return value of .Call() is not used, so presumably
      # the size is written into 'info_len' -- confirm against the C++ side.
      info_len <- 0L
      .Call(LGBM_DatasetGetFieldSize_R, private$handle, field_name, info_len)

      if (info_len > 0L) {

        # "group" is an integer field, every other field is numeric
        field_values <- if (field_name == "group") {
          integer(info_len)
        } else {
          numeric(info_len)
        }

        # same pattern as above: 'field_values' is handed over as the output buffer
        .Call(LGBM_DatasetGetField_R, private$handle, field_name, field_values)

        private$info[[field_name]] <- field_values

      }

      return(private$info[[field_name]])

    },
James Lamb's avatar
James Lamb committed
519

520
    set_field = function(field_name, data) {

      # [description] Set one Dataset field ("label", "weight", "init_score" or "group").
      #               The value is always stored on the R side; it is also sent to the
      #               handle when one exists and the value is non-empty.
      # [return] self, invisibly

      # field_name must be a single string naming a known field
      name_is_valid <- is.character(field_name) &&
        length(field_name) == 1L &&
        field_name %in% .INFO_KEYS()
      if (!name_is_valid) {
        stop(
          "Dataset$set_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      # "group" is stored as integer, every other field as numeric
      field_values <- if (field_name == "group") {
        as.integer(data)
      } else {
        as.numeric(data)
      }

      # Store information privately
      private$info[[field_name]] <- field_values

      has_handle <- !lgb.is.null.handle(x = private$handle)
      if (has_handle && !is.null(field_values) && length(field_values) > 0L) {

        .Call(
          LGBM_DatasetSetField_R
          , private$handle
          , field_name
          , field_values
          , length(field_values)
        )

        # count every change pushed to a constructed Dataset
        private$version <- private$version + 1L

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
561

562
    # Slice dataset
Guolin Ke's avatar
Guolin Ke committed
563
    slice = function(idxset, ...) {

      # [description] Create a new, not-yet-constructed Dataset that is a row subset of
      #               this one. This Dataset becomes the reference of the new one.
      # [return] the new Dataset

      dots <- list(...)

      if (length(dots) > 0L) {
        dots_names <- paste(names(dots), collapse = ", ")
        warning(paste0(
          "Dataset$slice(): Found the following passed through '...': "
          , dots_names
          , ". These are ignored and should be removed. "
          , "To change the parameters of a Dataset produced by Dataset$slice(), use Dataset$set_params(). "
          , "To modify attributes like 'init_score', use Dataset$set_field(). "
          , "In future releases of lightgbm, this warning will become an error."
        ))
      }

      # Pull Dataset fields out of '...'
      #
      # NOTE: list[["non-existent-key"]] returns NULL, so absent fields simply come back as NULL
      sliced_group <- dots[["group"]]
      sliced_init_score <- dots[["init_score"]]
      sliced_label <- dots[["label"]]
      sliced_weight <- dots[["weight"]]

      # whatever remains after dropping the fields is treated as parameters
      param_overrides <- dots
      for (field_key in .INFO_KEYS()) {
        param_overrides[[field_key]] <- NULL
      }

      # Perform slicing
      sliced_dataset <- Dataset$new(
        data = NULL
        , params = utils::modifyList(self$get_params(), param_overrides)
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = sort(idxset, decreasing = FALSE)
        , info = NULL
        , group = sliced_group
        , init_score = sliced_init_score
        , label = sliced_label
        , weight = sliced_weight
      )
      return(sliced_dataset)

    },
James Lamb's avatar
James Lamb committed
611

612
613
614
    # [description] Update Dataset parameters. If it has not been constructed yet,
    #               this operation just happens on the R side (updating private$params).
    #               If it has been constructed, parameters will be updated on the C++ side.
615
    update_params = function(params) {

      # [description] Merge 'params' into this Dataset's parameters.
      #                 * no handle yet: only the R-side parameter list changes
      #                 * handle exists: the current and the merged parameters are passed to
      #                   the update check; if it succeeds the merged parameters are stored
      #                 * check fails and raw data is still available: store the merged
      #                   parameters and finalize() the Dataset so it is rebuilt later
      #                 * check fails and raw data is gone: re-raise the error
      # [return] self, invisibly

      if (length(params) == 0L) {
        return(invisible(self))
      }

      # The parameter set the Dataset should end up with. The check below compares against
      # this full set; comparing against 'params' alone would treat every parameter the
      # caller did not mention as if it had been changed.
      merged_params <- utils::modifyList(private$params, params)

      if (lgb.is.null.handle(x = private$handle)) {
        private$params <- merged_params
        return(invisible(self))
      }

      tryCatch({
        .Call(
          LGBM_DatasetUpdateParamChecking_R
          , lgb.params2str(params = private$params)
          , lgb.params2str(params = merged_params)
        )
        # The check passed: keep the R-side parameters in sync, otherwise
        # get_params() would keep returning the old values.
        private$params <- merged_params
      }, error = function(e) {
        # If updating failed but raw data is not available, raise an error because
        # achieving what the user asked for is not possible
        if (is.null(private$raw_data)) {
          stop(e)
        }

        # If updating failed but raw data is available, modify the params
        # on the R side and re-set ("deconstruct") the Dataset
        private$params <- merged_params
        self$finalize()
      })

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
644

645
646
647
648
649
650
651
652
653
654
655
    get_params = function() {
      # [description] Subset of this Dataset's parameters whose names appear in
      #               .DATASET_PARAMETERS().
      # [return] a named list (empty if none match)
      dataset_param_names <- unname(unlist(.DATASET_PARAMETERS()))
      stored_names <- names(private$params)
      matching_names <- stored_names[stored_names %in% dataset_param_names]

      kept_params <- list()
      for (param_name in matching_names) {
        kept_params[[param_name]] <- private$params[[param_name]]
      }
      return(kept_params)
    },

656
    # Set categorical feature parameter
657
    set_categorical_feature = function(categorical_feature) {

      # [description] Replace the categorical feature specification. The Dataset is
      #               finalized afterwards, so the raw data must still be available.
      # [return] self, invisibly

      # Same value as before: nothing to change
      no_change <- identical(private$categorical_feature, categorical_feature)
      if (no_change) {
        return(invisible(self))
      }

      # The raw data is required once the Dataset has been finalized
      if (is.null(private$raw_data)) {
        stop("set_categorical_feature: cannot set categorical feature after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      private$categorical_feature <- categorical_feature

      # Drop the current handle and return self
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
678

679
    # Set reference
Guolin Ke's avatar
Guolin Ke committed
680
    set_reference = function(reference) {

      # [description] Use another lgb.Dataset as reference. Categorical features, column
      #               names and predictor are copied from it, then this Dataset is finalized.
      # [return] self, invisibly

      # the reference is already set to this object: no changes needed
      already_set <- identical(private$reference, reference)
      if (already_set) {
        return(invisible(self))
      }

      # changing the reference removes the Dataset object on the C++ side, so the raw data
      # must still be around for the Dataset to be reconstructed
      if (is.null(private$raw_data)) {
        stop("set_reference: cannot set reference after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      if (!lgb.is.Dataset(reference)) {
        stop("set_reference: Can only use lgb.Dataset as a reference")
      }

      # Copy settings over from the reference
      reference_private <- reference$.__enclos_env__$private
      self$set_categorical_feature(categorical_feature = reference_private$categorical_feature)
      self$set_colnames(colnames = reference$get_colnames())
      private$set_predictor(predictor = reference_private$predictor)

      # Store reference
      private$reference <- reference

      # Drop the current handle and return self
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
711

712
    # Save the Dataset to a binary file
Guolin Ke's avatar
Guolin Ke committed
713
    save_binary = function(fname) {

      # [description] Write this Dataset to a binary file at 'fname'.
      # [return] this Dataset, invisibly

      # construct() is called first so that private$handle can be passed to the C++ side
      self$construct()

      # '~' in the path is expanded before the path is handed over
      .Call(LGBM_DatasetSaveBinary_R, private$handle, path.expand(fname))

      return(invisible(self))
    }
James Lamb's avatar
James Lamb committed
724

Guolin Ke's avatar
Guolin Ke committed
725
726
  ),
  private = list(
727
728
729
730
731
    handle = NULL,
    raw_data = NULL,
    params = list(),
    reference = NULL,
    colnames = NULL,
732
    categorical_feature = NULL,
733
734
735
736
    predictor = NULL,
    free_raw_data = TRUE,
    used_indices = NULL,
    info = NULL,
737
    version = 0L,
James Lamb's avatar
James Lamb committed
738

739
740
    # Get handle
    get_handle = function() {

      # [description] Return the stored handle, constructing the Dataset first
      #               when the stored handle is a null handle.
      handle_is_null <- lgb.is.null.handle(x = private$handle)
      if (handle_is_null) {
        self$construct()
      }

      return(private$handle)

    },
James Lamb's avatar
James Lamb committed
749

750
    # Set predictor
Guolin Ke's avatar
Guolin Ke committed
751
    set_predictor = function(predictor) {

      # [description] Store a predictor on this Dataset.
      # [param] predictor an lgb.Predictor object, or NULL
      # [return] this Dataset, invisibly

      # nothing to do when the very same predictor is already stored
      if (identical(private$predictor, predictor)) {
        return(invisible(self))
      }

      # the predictor can only be changed while the raw data is still held
      if (is.null(private$raw_data)) {
        # the message is assembled from single-line pieces so that no newline or
        # source indentation leaks into the text shown to the user
        stop(
          "set_predictor: cannot set predictor after free raw data, please set "
          , sQuote("free_raw_data = FALSE")
          , " when you construct lgb.Dataset"
        )
      }

      # NULL is accepted as-is; anything else has to be an lgb.Predictor
      if (!is.null(predictor) && !lgb.is.Predictor(predictor)) {
        stop("set_predictor: Can only use lgb.Predictor as predictor")
      }

      # Store predictor
      private$predictor <- predictor

      # Finalize and return self
      self$finalize()
      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
781

Guolin Ke's avatar
Guolin Ke committed
782
783
784
  )
)

785
786
787
#' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#'              or local file (that was created previously by saving an \code{lgb.Dataset}).
788
#' @inheritParams lgb_shared_dataset_params
789
790
791
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{lgb.Dataset} file
792
793
794
795
796
797
798
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values.
#' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning
#'                  continuous features into histograms. If you want to apply the same bin boundaries from an existing
#'                  dataset to new \code{data}, pass that existing Dataset to this argument.
Guolin Ke's avatar
Guolin Ke committed
799
#' @param colnames names of columns
800
801
802
803
804
805
806
807
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param free_raw_data LightGBM constructs its data format, called a "Dataset", from tabular data.
#'                      By default, that Dataset object on the R side does not keep a copy of the raw data.
#'                      This reduces LightGBM's memory consumption, but it means that the Dataset object
#'                      cannot be changed after it has been constructed. If you'd prefer to be able to
#'                      change the Dataset object after construction, set \code{free_raw_data = FALSE}.
808
#' @param ... other parameters passed to \code{params}
James Lamb's avatar
James Lamb committed
809
#'
Guolin Ke's avatar
Guolin Ke committed
810
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
811
#'
Guolin Ke's avatar
Guolin Ke committed
812
#' @examples
813
#' \donttest{
814
815
816
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
817
818
819
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
820
#' lgb.Dataset.construct(dtrain)
821
#' }
Guolin Ke's avatar
Guolin Ke committed
822
823
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        info = list(),
                        label = NULL,
                        weight = NULL,
                        group = NULL,
                        init_score = NULL,
                        ...) {

  # anything passed through '...' is merged into 'params'
  # (values from '...' win when the same name appears in both)
  dots <- list(...)
  params <- utils::modifyList(params, dots)

  # passing parameters through '...' still works, but is announced as deprecated
  if (length(dots) > 0L) {
    dots_warning <- paste0(
      "lgb.Dataset: Found the following passed through '...': "
      , paste(names(dots), collapse = ", ")
      , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
      , "Add these to 'params' instead. See ?lgb.Dataset for documentation on how to call this function."
    )
    warning(dots_warning)
  }

  # build the R6 object; 'predictor' and 'used_indices' are always NULL when
  # a Dataset is created through this function
  new_dataset <- Dataset$new(
    data = data
    , params = params
    , reference = reference
    , colnames = colnames
    , categorical_feature = categorical_feature
    , predictor = NULL
    , free_raw_data = free_raw_data
    , used_indices = NULL
    , info = info
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
  )

  return(invisible(new_dataset))

}

869
870
871
#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
872
#' @inheritParams lgb_shared_dataset_params
Guolin Ke's avatar
Guolin Ke committed
873
#' @param dataset \code{lgb.Dataset} object, training data
874
875
876
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{Dataset} file
877
878
879
880
881
882
883
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values. If this is an empty list (the default), the validation Dataset
#'               will have the same parameters as the Dataset passed to argument \code{dataset}.
#' @param ... additional \code{lgb.Dataset} parameters.
#'            NOTE: As of v3.3.0, use of \code{...} is deprecated. Add parameters to \code{params} directly.
James Lamb's avatar
James Lamb committed
884
#'
Guolin Ke's avatar
Guolin Ke committed
885
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
886
#'
Guolin Ke's avatar
Guolin Ke committed
887
#' @examples
888
#' \donttest{
889
890
891
892
893
894
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
#'
#' # parameters can be changed between the training data and validation set,
#' # for example to account for training data in a text file with a header row
#' # and validation data in a text file without it
#' train_file <- tempfile(pattern = "train_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = train_file
#'   , sep = ","
#'   , col.names = TRUE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' valid_file <- tempfile(pattern = "valid_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = valid_file
#'   , sep = ","
#'   , col.names = FALSE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' dtrain <- lgb.Dataset(
#'   data = train_file
#'   , params = list(has_header = TRUE)
#' )
#' dtrain$construct()
#'
#' dvalid <- lgb.Dataset(
#'   data = valid_file
#'   , params = list(has_header = FALSE)
#' )
#' dvalid$construct()
930
#' }
931
#' @importFrom utils modifyList
Guolin Ke's avatar
Guolin Ke committed
932
#' @export
933
934
935
936
937
938
939
940
941
lgb.Dataset.create.valid <- function(dataset,
                                     data,
                                     info = list(),
                                     label = NULL,
                                     weight = NULL,
                                     group = NULL,
                                     init_score = NULL,
                                     params = list(),
                                     ...) {

  # the training data has to be an lgb.Dataset, since its create_valid() method does the work
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # passing parameters through '...' still works, but is announced as deprecated
  dots <- list(...)
  if (length(dots) > 0L) {
    dots_warning <- paste0(
      "lgb.Dataset.create.valid: Found the following passed through '...': "
      , paste(names(dots), collapse = ", ")
      , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
      , "Add these to 'params' instead. See ?lgb.Dataset.create.valid for documentation on how to call this function."
    )
    warning(dots_warning)
  }

  # entries from '...' are merged into 'params' before being handed to create_valid()
  valid_dataset <- dataset$create_valid(
    data = data
    , info = info
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
    , params = utils::modifyList(params, dots)
  )

  return(invisible(valid_dataset))

}
Guolin Ke's avatar
Guolin Ke committed
971

972
973
974
#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
Guolin Ke's avatar
Guolin Ke committed
975
#' @param dataset Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
976
#'
Guolin Ke's avatar
Guolin Ke committed
977
#' @examples
978
#' \donttest{
979
980
981
982
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
983
#' }
984
#' @return constructed dataset
Guolin Ke's avatar
Guolin Ke committed
985
986
#' @export
lgb.Dataset.construct <- function(dataset) {

  # only lgb.Dataset objects have the construct() method used below
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }

  # delegate to the object's own method and hand its result back invisibly
  constructed <- dataset$construct()
  return(invisible(constructed))

}

996
997
#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
Guolin Ke's avatar
Guolin Ke committed
998
#' @param x Object of class \code{lgb.Dataset}
999
#' @param ... other parameters (ignored)
James Lamb's avatar
James Lamb committed
1000
#'
Guolin Ke's avatar
Guolin Ke committed
1001
#' @return a vector of numbers of rows and of columns
James Lamb's avatar
James Lamb committed
1002
#'
Guolin Ke's avatar
Guolin Ke committed
1003
1004
1005
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
James Lamb's avatar
James Lamb committed
1006
#'
Guolin Ke's avatar
Guolin Ke committed
1007
#' @examples
1008
#' \donttest{
1009
1010
1011
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
1012
#'
1013
1014
1015
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
1016
#' }
Guolin Ke's avatar
Guolin Ke committed
1017
1018
1019
#' @rdname dim
#' @export
dim.lgb.Dataset <- function(x, ...) {

  # nothing passed through '...' is used; warn so callers can clean up their calls
  ignored_args <- list(...)
  if (length(ignored_args) > 0L) {
    ignored_warning <- paste0(
      "dim.lgb.Dataset: Found the following passed through '...': "
      , paste(names(ignored_args), collapse = ", ")
      , ". These are ignored. In future releases of lightgbm, this warning will become an error. "
      , "See ?dim.lgb.Dataset for documentation on how to call this function."
    )
    warning(ignored_warning)
  }

  is_dataset <- lgb.is.Dataset(x = x)
  if (!is_dataset) {
    stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # the dimensions come from the object's own dim() method
  dataset_dim <- x$dim()
  return(dataset_dim)

}

1039
1040
1041
#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}, thus setting of
#'              row names would have no effect and returned row names would be NULL.
Guolin Ke's avatar
Guolin Ke committed
1042
1043
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one is ignored
1044
#'              and the second one is column names
Guolin Ke's avatar
Guolin Ke committed
1045
1046
1047
1048
1049
1050
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
1051
#' \donttest{
1052
1053
1054
1055
1056
1057
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
1058
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
1059
#' print(dtrain, verbose = TRUE)
1060
#' }
Guolin Ke's avatar
Guolin Ke committed
1061
#' @rdname dimnames.lgb.Dataset
1062
#' @return A list with the dimension names of the dataset
Guolin Ke's avatar
Guolin Ke committed
1063
1064
#' @export
dimnames.lgb.Dataset <- function(x) {

  is_dataset <- lgb.is.Dataset(x = x)
  if (!is_dataset) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # first element (row names) is always NULL, second element holds the column names
  column_names <- x$get_colnames()
  return(list(NULL, column_names))

}

#' @rdname dimnames.lgb.Dataset
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # 'value' has to be a plain list holding exactly (row names, column names)
  is_pair <- identical(class(value), "list") && length(value) == 2L
  if (!is_pair) {
    stop("invalid ", sQuote("value"), " given: must be a list of two elements")
  }

  # row names cannot be set, so the first element has to be NULL
  if (!is.null(value[[1L]])) {
    stop("lgb.Dataset does not have rownames")
  }

  new_colnames <- value[[2L]]

  # NULL column names are passed straight through, without any length check
  if (is.null(new_colnames)) {
    x$set_colnames(colnames = NULL)
    return(x)
  }

  # one name per column is required
  n_names <- length(new_colnames)
  if (ncol(x) != n_names) {
    stop(
      "can't assign "
      , sQuote(n_names)
      , " colnames to an lgb.Dataset with "
      , sQuote(ncol(x))
      , " columns"
    )
  }

  x$set_colnames(colnames = new_colnames)
  return(x)

}

1113
1114
1115
#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#'              original \code{lgb.Dataset} object
Nikita Titov's avatar
Nikita Titov committed
1116
#' @param dataset Object of class \code{lgb.Dataset}
1117
#' @param idxset an integer vector of indices of rows needed
Guolin Ke's avatar
Guolin Ke committed
1118
1119
#' @param ... other parameters (currently not used)
#' @return constructed sub dataset
James Lamb's avatar
James Lamb committed
1120
#'
Guolin Ke's avatar
Guolin Ke committed
1121
#' @examples
1122
#' \donttest{
1123
1124
1125
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
1126
#'
1127
#' dsub <- lightgbm::slice(dtrain, seq_len(42L))
1128
#' lgb.Dataset.construct(dsub)
1129
#' labels <- lightgbm::get_field(dsub, "label")
1130
#' }
Guolin Ke's avatar
Guolin Ke committed
1131
#' @export
1132
1133
1134
slice <- function(dataset, ...) {
  # S3 generic: the method is selected by the class of 'dataset'
  UseMethod("slice")
}
Guolin Ke's avatar
Guolin Ke committed
1135
1136
1137
1138

#' @rdname slice
#' @export
slice.lgb.Dataset <- function(dataset, idxset, ...) {

  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # the object's own slice() method does the work; '...' is forwarded unchanged
  subset_dataset <- dataset$slice(idxset = idxset, ...)
  return(invisible(subset_dataset))

}

1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
#' @name get_field
#' @title Get one attribute of a \code{lgb.Dataset}
#' @description Get one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to get. One of the following.
#' \itemize{
#'     \item \code{label}: vector of labels to use as the target variable;
#'     \item \code{weight}: numeric vector of sample weights;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @return requested attribute
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
#' }
#' @export
get_field <- function(dataset, field_name) {
  # S3 generic: the method is selected by the class of 'dataset'
  UseMethod("get_field")
}

#' @rdname get_field
#' @export
get_field.lgb.Dataset <- function(dataset, field_name) {

  # only lgb.Dataset objects have the get_field() method used below
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("get_field.lgb.Dataset(): input dataset should be an lgb.Dataset object")
  }

  field_value <- dataset$get_field(field_name = field_name)
  return(field_value)

}

#' @name set_field
#' @title Set one attribute of a \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to set. One of the following.
#' \itemize{
#'     \item \code{label}: vector of labels to use as the target variable;
#'     \item \code{weight}: numeric vector of sample weights;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @param data The data for the field. See examples.
#' @return The \code{lgb.Dataset} you passed in.
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
#' }
#' @export
set_field <- function(dataset, field_name, data) {
  # S3 generic: the method is selected by the class of 'dataset'
  UseMethod("set_field")
}

#' @rdname set_field
#' @export
set_field.lgb.Dataset <- function(dataset, field_name, data) {

  # only lgb.Dataset objects have the set_field() method used below
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("set_field.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # whatever the object's set_field() method returns is handed back invisibly
  updated <- dataset$set_field(field_name = field_name, data = data)
  return(invisible(updated))
}

1243
1244
1245
1246
#' @name lgb.Dataset.set.categorical
#' @title Set categorical feature of \code{lgb.Dataset}
#' @description Set the categorical features of an \code{lgb.Dataset} object. Use this function
#'              to tell LightGBM which features should be treated as categorical.
1247
#' @param dataset object of class \code{lgb.Dataset}
1248
1249
1250
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
1251
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1252
#'
1253
#' @examples
1254
#' \donttest{
1255
1256
1257
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1258
1259
1260
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
1261
#' lgb.Dataset.set.categorical(dtrain, 1L:2L)
1262
#' }
1263
1264
1265
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # delegate to the object's own method and hand its result back invisibly
  updated <- dataset$set_categorical_feature(categorical_feature = categorical_feature)
  return(invisible(updated))

}

1275
1276
1277
#' @name lgb.Dataset.set.reference
#' @title Set reference of \code{lgb.Dataset}
#' @description If you want to use validation data, you should set reference to training data
Guolin Ke's avatar
Guolin Ke committed
1278
1279
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
1280
#'
1281
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1282
#'
Guolin Ke's avatar
Guolin Ke committed
1283
#' @examples
1284
#' \donttest{
1285
#' # create training Dataset
1286
1287
1288
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1289
1290
#'
#' # create a validation Dataset, using dtrain as a reference
1291
1292
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
1293
#' dtest <- lgb.Dataset(test$data, label = test$label)
1294
#' lgb.Dataset.set.reference(dtest, dtrain)
1295
#' }
Guolin Ke's avatar
Guolin Ke committed
1296
1297
1298
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # only 'dataset' is checked here; 'reference' is passed on to the method as given
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")
  }

  updated <- dataset$set_reference(reference = reference)
  return(invisible(updated))
}

1307
1308
1309
1310
#' @name lgb.Dataset.save
#' @title Save \code{lgb.Dataset} to a binary file
#' @description Please note that \code{init_score} is not saved in binary file.
#'              If you need it, please set it again after loading Dataset.
Guolin Ke's avatar
Guolin Ke committed
1311
1312
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname object filename of output file
James Lamb's avatar
James Lamb committed
1313
#'
1314
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1315
#'
Guolin Ke's avatar
Guolin Ke committed
1316
#' @examples
1317
#' \donttest{
1318
1319
1320
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1321
#' lgb.Dataset.save(dtrain, tempfile(fileext = ".bin"))
1322
#' }
Guolin Ke's avatar
Guolin Ke committed
1323
1324
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # error messages are prefixed with this function's own name, so that a
  # failure can be traced back to the call that caused it
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  # only character paths pass this check (connections are rejected by it),
  # so the message describes exactly that
  if (!is.character(fname)) {
    stop("lgb.Dataset.save: fname should be a character string with the path of the output file")
  }

  # the object's own save_binary() method does the work
  return(invisible(dataset$save_binary(fname = fname)))
}