lgb.Dataset.R 35.2 KB
Newer Older
1
2
3
4
5
6
7
8
# [description] Names accepted as "info" fields of an lgb.Dataset.
#               Exposed as a function instead of a constant so the value is resolved
#               lazily (it then does not matter in which order R sources the package
#               files during installation).
# [return] A character vector of field names.
.INFO_KEYS <- function() {
  info_keys <- c("label", "weight", "init_score", "group")
  return(info_keys)
}

James Lamb's avatar
James Lamb committed
9
#' @importFrom methods is
James Lamb's avatar
James Lamb committed
10
#' @importFrom R6 R6Class
11
#' @importFrom utils modifyList
James Lamb's avatar
James Lamb committed
12
13
Dataset <- R6::R6Class(

14
  classname = "lgb.Dataset",
15
  cloneable = FALSE,
Guolin Ke's avatar
Guolin Ke committed
16
  public = list(
James Lamb's avatar
James Lamb committed
17

18
    # [description] Pass the stored handle to LGBM_DatasetFree_R and then forget it
    #               on the R side, leaving the object without a handle.
    # [return] NULL, invisibly.
    finalize = function() {
      .Call(LGBM_DatasetFree_R, private$handle)
      private$handle <- NULL
      return(invisible(NULL))
    },
James Lamb's avatar
James Lamb committed
27

28
    # [description] Record the constructor arguments on the object. No handle is
    #               created here; that is left to construct().
    #               Arguments passed through '...' are routed by name: names found in
    #               .INFO_KEYS() are stored in 'info', all other names in 'params'.
    #               A matrix is converted to storage mode "double", and 'used_indices'
    #               is stored sorted in increasing order.
    # [return] NULL, invisibly. Raises an error if 'reference' is neither NULL nor an
    #          lgb.Dataset, or if 'predictor' is neither NULL nor an lgb.Predictor.
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          info = list(),
                          ...) {

      # fail fast on invalid collaborators, before doing any other work
      reference_ok <- is.null(reference) || lgb.is.Dataset(reference)
      if (!reference_ok) {
        stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset"))
      }
      predictor_ok <- is.null(predictor) || lgb.is.Predictor(predictor)
      if (!predictor_ok) {
        stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
      }

      # split the extra arguments between info fields and parameters
      extra_args <- list(...)
      for (arg_name in names(extra_args)) {
        if (arg_name %in% .INFO_KEYS()) {
          info[[arg_name]] <- extra_args[[arg_name]]
        } else {
          params[[arg_name]] <- extra_args[[arg_name]]
        }
      }

      # matrices are always stored as "double"
      if (is.matrix(data) && storage.mode(data) != "double") {
        storage.mode(data) <- "double"
      }

      # remember everything needed to build the Dataset later
      private$raw_data <- data
      private$params <- params
      private$reference <- reference
      private$colnames <- colnames
      private$categorical_feature <- categorical_feature
      private$predictor <- predictor
      private$free_raw_data <- free_raw_data
      private$used_indices <- sort(used_indices, decreasing = FALSE)
      private$info <- info
      private$version <- 0L

      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
93

94
95
96
    # [description] Create a new Dataset for 'data' that uses this Dataset as its
    #               reference and copies its params, column names, categorical
    #               features, predictor and free_raw_data setting.
    # [return] The new Dataset, invisibly.
    create_valid = function(data,
                            info = list(),
                            ...) {

      return(
        invisible(
          Dataset$new(
            data = data
            , params = private$params
            , reference = self
            , colnames = private$colnames
            , categorical_feature = private$categorical_feature
            , predictor = private$predictor
            , free_raw_data = private$free_raw_data
            , used_indices = NULL
            , info = info
            , ...
          )
        )
      )

    },
James Lamb's avatar
James Lamb committed
115

116
    # [description] Create the handle behind this object if it does not exist yet.
    #               Resolves feature names and categorical features, creates the handle
    #               from a file path, a matrix, a dgCMatrix or a subset of the reference
    #               Dataset, and then pushes feature names, initial scores and the
    #               stored info fields to it.
    # [return] The Dataset itself, invisibly. Raises an error if a categorical feature
    #          is unknown or out of range, if the raw data type is not supported, if no
    #          handle could be created, or if no label is available.
    construct = function() {

      # Nothing to do when a handle already exists
      if (!lgb.is.null.handle(x = private$handle)) {
        return(invisible(self))
      }

      data_is_tabular <- is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")

      # Fall back to the column names of the raw data when none were supplied
      if (is.null(private$colnames) && data_is_tabular) {
        cnames <- colnames(private$raw_data)
        if (!is.null(cnames)) {
          private$colnames <- as.character(cnames)
        }
      }

      # Translate categorical features into 0-based indices
      if (!is.null(private$categorical_feature)) {

        if (is.character(private$categorical_feature)) {

          # Names are looked up in the known column names
          cate_indices <- as.list(match(private$categorical_feature, private$colnames) - 1L)

          # Report every name that could not be found, separated by commas
          is_unknown <- is.na(cate_indices)
          if (any(is_unknown)) {
            stop(
              "lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
              , paste0(sQuote(private$categorical_feature[is_unknown]), collapse = ", ")
            )
          }

        } else {

          # Number of features to validate the indices against. The raw data is the
          # most reliable source; column names are used when there is no tabular raw
          # data (e.g. a subset). When neither is available (e.g. a file path without
          # column names) the number is unknown and the check is skipped instead of
          # comparing against 0.
          num_features <- NA_integer_
          if (data_is_tabular) {
            num_features <- ncol(private$raw_data)
          } else if (!is.null(private$colnames)) {
            num_features <- length(private$colnames)
          }

          if (!is.na(num_features) && max(private$categorical_feature) > num_features) {
            stop(
              "lgb.self.get.handle: supplied a too large value in categorical_feature: "
              , max(private$categorical_feature)
              , " but only "
              , num_features
              , " features"
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1L)

        }

        private$params$categorical_feature <- cate_indices

      }

      # Generate parameter str
      params_str <- lgb.params2str(params = private$params)

      # Get handle of reference dataset (constructing it if needed)
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }

      if (is.null(private$used_indices)) {

        # Not subsetting: build from the raw data
        if (is.character(private$raw_data)) {

          # Raw data is a character value, handed to the file-based creator
          handle <- .Call(
            LGBM_DatasetCreateFromFile_R
            , private$raw_data
            , params_str
            , ref_handle
          )

        } else if (is.matrix(private$raw_data)) {

          handle <- .Call(
            LGBM_DatasetCreateFromMat_R
            , private$raw_data
            , nrow(private$raw_data)
            , ncol(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (methods::is(private$raw_data, "dgCMatrix")) {

          if (length(private$raw_data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }
          handle <- .Call(
            LGBM_DatasetCreateFromCSC_R
            , private$raw_data@p
            , private$raw_data@i
            , private$raw_data@x
            , length(private$raw_data@p)
            , length(private$raw_data@x)
            , nrow(private$raw_data)
            , params_str
            , ref_handle
          )

        } else {

          # Unknown data type
          stop(
            "lgb.Dataset.construct: does not support constructing from "
            , sQuote(class(private$raw_data))
          )

        }

      } else {

        # Subsetting: rows are taken from the reference Dataset
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        handle <- .Call(
          LGBM_DatasetGetSubset_R
          , ref_handle
          , c(private$used_indices) # Adding c() fixes issue in R v3.5
          , length(private$used_indices)
          , params_str
        )

      }
      if (lgb.is.null.handle(x = handle)) {
        stop("lgb.Dataset.construct: cannot create Dataset handle")
      }

      # Setup class and private type
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Set feature names
      if (!is.null(private$colnames)) {
        self$set_colnames(colnames = private$colnames)
      }

      # Load init score if requested (only when not subsetting)
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        init_score <- private$predictor$predict(
          data = private$raw_data
          , rawscore = TRUE
          , reshape = TRUE
        )

        # Flattened as-is, without transposing (the original note states the
        # scores are already column-major)
        private$info$init_score <- as.vector(init_score)

      }

      # Should we free raw data?
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Push every stored info field to the handle
      for (i in seq_along(private$info)) {
        p <- private$info[i]
        self$setinfo(name = names(p), info = p[[1L]])
      }

      # A label is mandatory
      if (is.null(self$getinfo(name = "label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
306

307
    # [description] Report the dimensions of the Dataset. With a handle, the row and
    #               feature counts are requested through LGBM_DatasetGetNumData_R and
    #               LGBM_DatasetGetNumFeature_R; without one, the dimensions of the raw
    #               matrix / dgCMatrix are used.
    # [return] c(number of rows, number of columns). Raises an error when there is
    #          neither a handle nor matrix-like raw data.
    dim = function() {

      if (lgb.is.null.handle(x = private$handle)) {

        # No handle yet: only matrix-like raw data can answer
        # NOTE: dgCMatrix requires the Matrix package
        raw_is_tabular <- is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")
        if (raw_is_tabular) {
          return(dim(private$raw_data))
        }

        # Trying to work with unknown dimensions is not possible
        stop(
          "dim: cannot get dimensions before dataset has been constructed, "
          , "please call lgb.Dataset.construct explicitly"
        )

      }

      # The return values of .Call are ignored: the results are read back from the
      # variables passed as last argument, so they must be two distinct objects
      n_rows <- 0L
      n_cols <- 0L
      .Call(LGBM_DatasetGetNumData_R, private$handle, n_rows)
      .Call(LGBM_DatasetGetNumFeature_R, private$handle, n_cols)

      return(c(n_rows, n_cols))

    },
James Lamb's avatar
James Lamb committed
348

349
    # [description] Return the feature names of the Dataset. With a handle, the names
    #               are fetched through LGBM_DatasetGetFeatureNames_R and also cached
    #               in private$colnames; without one, the column names of the raw
    #               matrix / dgCMatrix are returned.
    # [return] The feature names. Raises an error when there is neither a handle nor
    #          matrix-like raw data.
    get_colnames = function() {

      # Constructed: the handle is the source of truth
      if (!lgb.is.null.handle(x = private$handle)) {
        private$colnames <- .Call(
          LGBM_DatasetGetFeatureNames_R
          , private$handle
        )
        return(private$colnames)
      }

      # Not constructed: fall back to the raw data when it has columns
      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        return(colnames(private$raw_data))
      }

      # The message names this method (it used to be a copy of the one in dim())
      stop(
        "get_colnames: cannot get column names before dataset has been constructed, please call "
        , "lgb.Dataset.construct explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
376

377
    # [description] Store new feature names and, when a handle exists, send them to
    #               it as a single tab-separated string. NULL or empty input is
    #               ignored.
    # [return] The Dataset itself, invisibly.
    set_colnames = function(colnames) {

      # as.character(NULL) is character(0), so NULL and empty input share one exit
      feature_names <- as.character(colnames)
      if (length(feature_names) == 0L) {
        return(invisible(self))
      }

      private$colnames <- feature_names

      has_handle <- !lgb.is.null.handle(x = private$handle)
      if (has_handle) {
        # Merge names with tab separation
        merged_name <- paste0(as.list(private$colnames), collapse = "\t")
        .Call(LGBM_DatasetSetFeatureNames_R, private$handle, merged_name)
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
408

409
    # [description] Return one info field of the Dataset. A value already stored in
    #               private$info is returned directly; otherwise the field is read
    #               from the handle and stored in private$info for later calls.
    # [return] The stored value of the field, or NULL when the handle reports a field
    #          size of 0. Raises an error if 'name' is not a single string from
    #          .INFO_KEYS(), or if the field is not stored and there is no handle.
    getinfo = function(name) {

      name_is_valid <- is.character(name) && length(name) == 1L && name %in% .INFO_KEYS()
      if (!name_is_valid) {
        stop("getinfo: name must one of the following: ", paste0(sQuote(.INFO_KEYS()), collapse = ", "))
      }

      # Already known on the R side
      if (!is.null(private$info[[name]])) {
        return(private$info[[name]])
      }

      if (lgb.is.null.handle(x = private$handle)) {
        stop("Cannot perform getinfo before constructing Dataset.")
      }

      # The field size is read back from the variable passed as last argument
      field_size <- 0L
      .Call(LGBM_DatasetGetFieldSize_R, private$handle, name, field_size)

      if (field_size > 0L) {

        # Buffer of the right type for the native call: "group" is integer,
        # every other field is numeric
        if (name == "group") {
          field_values <- integer(field_size)
        } else {
          field_values <- numeric(field_size)
        }
        .Call(LGBM_DatasetGetField_R, private$handle, name, field_values)

        private$info[[name]] <- field_values

      }

      return(private$info[[name]])

    },
James Lamb's avatar
James Lamb committed
459

460
    # [description] Store one info field on the R side and, when a handle exists and
    #               the value is not empty, send it to the handle and increment
    #               private$version. "group" is converted with as.integer(), every
    #               other field with as.numeric().
    # [return] The Dataset itself, invisibly. Raises an error if 'name' is not a
    #          single string from .INFO_KEYS().
    setinfo = function(name, info) {

      name_is_valid <- is.character(name) && length(name) == 1L && name %in% .INFO_KEYS()
      if (!name_is_valid) {
        stop("setinfo: name must one of the following: ", paste0(sQuote(.INFO_KEYS()), collapse = ", "))
      }

      # Coerce to the type used for this field
      if (name == "group") {
        info <- as.integer(info)
      } else {
        info <- as.numeric(info)
      }

      # Store information privately
      private$info[[name]] <- info

      send_to_handle <- !lgb.is.null.handle(x = private$handle) &&
        !is.null(info) &&
        length(info) > 0L
      if (send_to_handle) {
        .Call(LGBM_DatasetSetField_R, private$handle, name, info, length(info))
        private$version <- private$version + 1L
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
499

500
    # [description] Create a new Dataset holding no raw data that refers to this
    #               Dataset and to the rows in 'idxset' (sorted in increasing order).
    #               Params, column names, categorical features, predictor and the
    #               free_raw_data setting are copied from this Dataset.
    # [return] The new Dataset.
    slice = function(idxset, ...) {

      subset_dataset <- Dataset$new(
        data = NULL
        , params = private$params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = sort(idxset, decreasing = FALSE)
        , info = NULL
        , ...
      )
      return(subset_dataset)

    },
James Lamb's avatar
James Lamb committed
520

521
522
523
    # [description] Update Dataset parameters. If it has not been constructed yet,
    #               this operation just happens on the R side (updating private$params).
    #               If it has been constructed, the new parameters are first checked
    #               through LGBM_DatasetUpdateParamChecking_R. When the check passes they
    #               are recorded in private$params; when it fails and raw data is still
    #               available, they are recorded and the handle is released; when it
    #               fails and raw data is gone, the error is re-raised.
    # [return] The Dataset itself, invisibly.
    update_params = function(params) {

      if (length(params) == 0L) {
        return(invisible(self))
      }

      merged_params <- utils::modifyList(private$params, params)

      # Not constructed yet: nothing to check
      if (lgb.is.null.handle(x = private$handle)) {
        private$params <- merged_params
        return(invisible(self))
      }

      tryCatch({
        .Call(
          LGBM_DatasetUpdateParamChecking_R
          , lgb.params2str(params = private$params)
          , lgb.params2str(params = params)
        )
        # The check passed: keep the R side in sync, otherwise get_params()
        # would keep returning the old values
        private$params <- merged_params
      }, error = function(e) {
        # If updating failed but raw data is not available, raise an error because
        # achieving what the user asked for is not possible
        if (is.null(private$raw_data)) {
          stop(e)
        }

        # If updating failed but raw data is available, modify the params
        # on the R side and re-set ("deconstruct") the Dataset
        private$params <- merged_params
        self$finalize()
      })

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
553

554
555
556
557
558
559
560
561
562
563
564
    # [description] Return the stored parameters whose names appear in the values
    #               listed by .DATASET_PARAMETERS().
    # [return] A list (empty when no stored parameter matches).
    get_params = function() {
      known_names <- unname(unlist(.DATASET_PARAMETERS()))
      selected <- list()
      for (param_name in intersect(names(private$params), known_names)) {
        selected[[param_name]] <- private$params[[param_name]]
      }
      return(selected)
    },

565
    # [description] Replace the categorical features and release the handle. Does
    #               nothing when the new value is identical to the stored one.
    # [return] The Dataset itself, invisibly. Raises an error when the value changes
    #          but the raw data is no longer available.
    set_categorical_feature = function(categorical_feature) {

      same_as_current <- identical(private$categorical_feature, categorical_feature)
      if (same_as_current) {
        return(invisible(self))
      }

      # A change is only possible while the raw data is still stored
      raw_data_missing <- is.null(private$raw_data)
      if (raw_data_missing) {
        stop("set_categorical_feature: cannot set categorical feature after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      private$categorical_feature <- categorical_feature

      # Drop the handle before returning
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
587

588
    # [description] Change the reference Dataset. 'reference' is validated before it
    #               is used, so an invalid argument fails with the intended message and
    #               without having modified this Dataset. For a non-NULL reference, the
    #               categorical features, column names and predictor of this Dataset are
    #               first aligned with it. If the reference actually changes, it is
    #               stored and the handle is released. NULL is accepted (as the NULL
    #               check of the previous implementation implied) and clears the
    #               reference.
    # [return] The Dataset itself, invisibly. Raises an error if 'reference' is neither
    #          NULL nor an lgb.Dataset, or if the reference changes but the raw data is
    #          no longer available.
    set_reference = function(reference) {

      # Validate first: everything below dereferences 'reference'
      if (!is.null(reference) && !lgb.is.Dataset(reference)) {
        stop("set_reference: Can only use lgb.Dataset as a reference")
      }

      # Set known references
      if (!is.null(reference)) {
        self$set_categorical_feature(categorical_feature = reference$.__enclos_env__$private$categorical_feature)
        self$set_colnames(colnames = reference$get_colnames())
        private$set_predictor(predictor = reference$.__enclos_env__$private$predictor)
      }

      # Check for identical references
      if (identical(private$reference, reference)) {
        return(invisible(self))
      }

      # Check for empty data
      if (is.null(private$raw_data)) {
        stop("set_reference: cannot set reference after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # Store reference
      private$reference <- reference

      # Finalize and return self
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
627

628
    # [description] Construct the Dataset if necessary, then pass its handle and
    #               'fname' to LGBM_DatasetSaveBinary_R.
    # [return] The Dataset itself, invisibly.
    save_binary = function(fname) {

      self$construct()
      dataset_handle <- private$handle
      .Call(LGBM_DatasetSaveBinary_R, dataset_handle, fname)

      return(invisible(self))
    }
James Lamb's avatar
James Lamb committed
640

Guolin Ke's avatar
Guolin Ke committed
641
642
  ),
  private = list(
643
644
645
646
647
    handle = NULL,
    raw_data = NULL,
    params = list(),
    reference = NULL,
    colnames = NULL,
648
    categorical_feature = NULL,
649
650
651
652
    predictor = NULL,
    free_raw_data = TRUE,
    used_indices = NULL,
    info = NULL,
653
    version = 0L,
James Lamb's avatar
James Lamb committed
654

655
656
    # [description] Return the handle of the Dataset, constructing the Dataset first
    #               when no handle exists yet.
    # [return] The value of private$handle.
    get_handle = function() {

      needs_construction <- lgb.is.null.handle(x = private$handle)
      if (needs_construction) {
        self$construct()
      }

      return(private$handle)

    },
James Lamb's avatar
James Lamb committed
665

666
    # [description] Replace the predictor and release the handle. Does nothing when
    #               the new predictor is identical to the stored one.
    # [return] The Dataset itself, invisibly. Raises an error when the predictor
    #          changes but the raw data is no longer available, or when 'predictor' is
    #          neither NULL nor an lgb.Predictor.
    set_predictor = function(predictor) {

      same_as_current <- identical(private$predictor, predictor)
      if (same_as_current) {
        return(invisible(self))
      }

      # A change is only possible while the raw data is still stored
      raw_data_missing <- is.null(private$raw_data)
      if (raw_data_missing) {
        stop("set_predictor: cannot set predictor after free raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # NULL is allowed; anything else must be an lgb.Predictor
      if (!is.null(predictor) && !lgb.is.Predictor(predictor)) {
        stop("set_predictor: Can only use lgb.Predictor as predictor")
      }

      private$predictor <- predictor

      # Drop the handle before returning
      self$finalize()
      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
697

Guolin Ke's avatar
Guolin Ke committed
698
699
700
  )
)

701
702
703
#' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#'              or local file (that was created previously by saving an \code{lgb.Dataset}).
704
705
706
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{lgb.Dataset} file
707
708
709
710
711
712
713
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values.
#' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning
#'                  continuous features into histograms. If you want to apply the same bin boundaries from an existing
#'                  dataset to new \code{data}, pass that existing Dataset to this argument.
Guolin Ke's avatar
Guolin Ke committed
714
#' @param colnames names of columns
715
716
717
718
719
720
721
722
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param free_raw_data LightGBM constructs its data format, called a "Dataset", from tabular data.
#'                      By default, that Dataset object on the R side does not keep a copy of the raw data.
#'                      This reduces LightGBM's memory consumption, but it means that the Dataset object
#'                      cannot be changed after it has been constructed. If you'd prefer to be able to
#'                      change the Dataset object after construction, set \code{free_raw_data = FALSE}.
Nikita Titov's avatar
Nikita Titov committed
723
#' @param info a list of information of the \code{lgb.Dataset} object
Guolin Ke's avatar
Guolin Ke committed
724
#' @param ... other information to pass to \code{info} or parameters to pass to \code{params}
James Lamb's avatar
James Lamb committed
725
#'
Guolin Ke's avatar
Guolin Ke committed
726
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
727
#'
Guolin Ke's avatar
Guolin Ke committed
728
#' @examples
729
#' \donttest{
730
731
732
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
733
734
735
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
736
#' lgb.Dataset.construct(dtrain)
737
#' }
Guolin Ke's avatar
Guolin Ke committed
738
739
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        info = list(),
                        ...) {

  # Instantiate the underlying R6 object. This wrapper does not expose
  # `predictor` or `used_indices`, so both are always passed as NULL.
  # Anything supplied through `...` is forwarded to the constructor as-is.
  new_dataset <- Dataset$new(
    data = data
    , params = params
    , reference = reference
    , colnames = colnames
    , categorical_feature = categorical_feature
    , predictor = NULL
    , free_raw_data = free_raw_data
    , used_indices = NULL
    , info = info
    , ...
  )

  # Return invisibly so the object is not auto-printed at the console
  return(invisible(new_dataset))

}

766
767
768
#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
Guolin Ke's avatar
Guolin Ke committed
769
#' @param dataset \code{lgb.Dataset} object, training data
770
771
772
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{Dataset} file
Nikita Titov's avatar
Nikita Titov committed
773
#' @param info a list of information of the \code{lgb.Dataset} object
Guolin Ke's avatar
Guolin Ke committed
774
#' @param ... other information to pass to \code{info}.
James Lamb's avatar
James Lamb committed
775
#'
Guolin Ke's avatar
Guolin Ke committed
776
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
777
#'
Guolin Ke's avatar
Guolin Ke committed
778
#' @examples
779
#' \donttest{
780
781
782
783
784
785
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
786
#' }
Guolin Ke's avatar
Guolin Ke committed
787
#' @export
788
lgb.Dataset.create.valid <- function(dataset, data, info = list(), ...) {

  # Validation data can only be derived from an existing lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # Let the training Dataset build the validation Dataset, forwarding any
  # extra arguments untouched, and hand the result back invisibly
  valid_dataset <- dataset$create_valid(data = data, info = info, ...)
  return(invisible(valid_dataset))

}
Guolin Ke's avatar
Guolin Ke committed
799

800
801
802
#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
Guolin Ke's avatar
Guolin Ke committed
803
#' @param dataset Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
804
#'
Guolin Ke's avatar
Guolin Ke committed
805
#' @examples
806
#' \donttest{
807
808
809
810
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
811
#' }
812
#' @return constructed dataset
Guolin Ke's avatar
Guolin Ke committed
813
814
#' @export
lgb.Dataset.construct <- function(dataset) {

  # Refuse anything that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }

  # Trigger construction through the object's own method and
  # return whatever it gives back, invisibly
  constructed <- dataset$construct()
  return(invisible(constructed))

}

826
827
#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
Guolin Ke's avatar
Guolin Ke committed
828
#' @param x Object of class \code{lgb.Dataset}
829
#' @param ... other parameters (ignored)
James Lamb's avatar
James Lamb committed
830
#'
Guolin Ke's avatar
Guolin Ke committed
831
#' @return a vector of numbers of rows and of columns
James Lamb's avatar
James Lamb committed
832
#'
Guolin Ke's avatar
Guolin Ke committed
833
834
835
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
James Lamb's avatar
James Lamb committed
836
#'
Guolin Ke's avatar
Guolin Ke committed
837
#' @examples
838
#' \donttest{
839
840
841
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
842
#'
843
844
845
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
846
#' }
Guolin Ke's avatar
Guolin Ke committed
847
848
849
#' @rdname dim
#' @export
dim.lgb.Dataset <- function(x, ...) {

  # Anything passed through '...' is not used; warn and list the names supplied
  extra_args <- list(...)
  if (length(extra_args) > 0L) {
    extra_names <- paste(names(extra_args), collapse = ", ")
    warning_text <- paste0(
      "dim.lgb.Dataset: Found the following passed through '...': "
      , extra_names
      , ". These are ignored. In future releases of lightgbm, this warning will become an error. "
      , "See ?dim.lgb.Dataset for documentation on how to call this function."
    )
    warning(warning_text)
  }

  # Only lgb.Dataset objects are supported
  is_dataset <- lgb.is.Dataset(x = x)
  if (!is_dataset) {
    stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # The Dataset object reports its own dimensions
  return(x$dim())

}

870
871
872
#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}, thus setting of
#'              row names would have no effect and returned row names would be NULL.
Guolin Ke's avatar
Guolin Ke committed
873
874
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one must be \code{NULL} (row names are not supported)
875
#'              and the second one is column names
Guolin Ke's avatar
Guolin Ke committed
876
877
878
879
880
881
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
882
#' \donttest{
883
884
885
886
887
888
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
889
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
890
#' print(dtrain, verbose = TRUE)
891
#' }
Guolin Ke's avatar
Guolin Ke committed
892
#' @rdname dimnames.lgb.Dataset
893
#' @return A list with the dimension names of the dataset
Guolin Ke's avatar
Guolin Ke committed
894
895
#' @export
dimnames.lgb.Dataset <- function(x) {

  # Only lgb.Dataset objects are supported
  is_dataset <- lgb.is.Dataset(x = x)
  if (!is_dataset) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # Row names are always reported as NULL; the second slot carries
  # the column names obtained from the Dataset
  column_names <- x$get_colnames()
  return(list(NULL, column_names))

}

#' @rdname dimnames.lgb.Dataset
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # `value` has to be a plain list holding exactly two entries
  is_two_element_list <- identical(class(value), "list") && length(value) == 2L
  if (!is_two_element_list) {
    stop("invalid ", sQuote("value"), " given: must be a list of two elements")
  }

  # First entry is the row names slot, which must stay empty
  if (!is.null(value[[1L]])) {
    stop("lgb.Dataset does not have rownames")
  }

  # Second entry is the column names. NULL is passed straight through to
  # set_colnames(); anything else must match the number of columns first.
  new_colnames <- value[[2L]]
  if (!is.null(new_colnames)) {
    n_new <- length(new_colnames)
    if (ncol(x) != n_new) {
      stop(
        "can't assign "
        , sQuote(n_new)
        , " colnames to an lgb.Dataset with "
        , sQuote(ncol(x))
        , " columns"
      )
    }
  }

  # Apply the column names and return the object, as replacement functions must
  x$set_colnames(colnames = new_colnames)
  return(x)

}

945
946
947
#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#'              original \code{lgb.Dataset} object
Nikita Titov's avatar
Nikita Titov committed
948
#' @param dataset Object of class \code{lgb.Dataset}
949
#' @param idxset an integer vector of indices of rows needed
Guolin Ke's avatar
Guolin Ke committed
950
951
#' @param ... other parameters (currently not used)
#' @return constructed sub dataset
James Lamb's avatar
James Lamb committed
952
#'
Guolin Ke's avatar
Guolin Ke committed
953
#' @examples
954
#' \donttest{
955
956
957
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
958
#'
959
#' dsub <- lightgbm::slice(dtrain, seq_len(42L))
960
#' lgb.Dataset.construct(dsub)
961
#' labels <- lightgbm::getinfo(dsub, "label")
962
#' }
Guolin Ke's avatar
Guolin Ke committed
963
#' @export
964
965
966
slice <- function(dataset, ...) {
  # S3 generic: dispatch to the "slice" method for the class of `dataset`
  UseMethod("slice")
}
Guolin Ke's avatar
Guolin Ke committed
967
968
969
970

#' @rdname slice
#' @export
slice.lgb.Dataset <- function(dataset, idxset, ...) {

  # Only lgb.Dataset objects can be sliced here
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the object's slice() method, forwarding the row indices and
  # any extra arguments; the result is returned invisibly
  subset_dataset <- dataset$slice(idxset = idxset, ...)
  return(invisible(subset_dataset))

}

981
982
983
#' @name getinfo
#' @title Get information of an \code{lgb.Dataset} object
#' @description Get one attribute of a \code{lgb.Dataset}
Guolin Ke's avatar
Guolin Ke committed
984
985
#' @param dataset Object of class \code{lgb.Dataset}
#' @param name the name of the information field to get (see details)
986
#' @param ... other parameters (ignored)
Guolin Ke's avatar
Guolin Ke committed
987
#' @return info data
James Lamb's avatar
James Lamb committed
988
#'
Guolin Ke's avatar
Guolin Ke committed
989
990
#' @details
#' The \code{name} field can be one of the following:
James Lamb's avatar
James Lamb committed
991
#'
Guolin Ke's avatar
Guolin Ke committed
992
993
994
#' \itemize{
#'     \item \code{label}: the labels lightgbm learns from;
#'     \item \code{weight}: to do a weight rescale ;
995
996
997
998
999
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
Nikita Titov's avatar
Nikita Titov committed
1000
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
Guolin Ke's avatar
Guolin Ke committed
1001
#' }
James Lamb's avatar
James Lamb committed
1002
#'
Guolin Ke's avatar
Guolin Ke committed
1003
#' @examples
1004
#' \donttest{
1005
1006
1007
1008
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
James Lamb's avatar
James Lamb committed
1009
#'
1010
1011
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
James Lamb's avatar
James Lamb committed
1012
#'
1013
1014
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
1015
#' }
Guolin Ke's avatar
Guolin Ke committed
1016
#' @export
1017
1018
1019
getinfo <- function(dataset, ...) {
  # S3 generic: dispatch to the "getinfo" method for the class of `dataset`
  UseMethod("getinfo")
}
Guolin Ke's avatar
Guolin Ke committed
1020
1021
1022
1023

#' @rdname getinfo
#' @export
getinfo.lgb.Dataset <- function(dataset, name, ...) {

  # Anything passed through '...' is not used; warn and list the names supplied
  extra_args <- list(...)
  if (length(extra_args) > 0L) {
    extra_names <- paste(names(extra_args), collapse = ", ")
    warning_text <- paste0(
      "getinfo.lgb.Dataset: Found the following passed through '...': "
      , extra_names
      , ". These are ignored. In future releases of lightgbm, this warning will become an error. "
      , "See ?getinfo.lgb.Dataset for documentation on how to call this function."
    )
    warning(warning_text)
  }

  # Only lgb.Dataset objects are supported
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("getinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Look the requested field up through the object's own getinfo() method
  return(dataset$getinfo(name = name))

}

1044
1045
1046
#' @name setinfo
#' @title Set information of an \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
Nikita Titov's avatar
Nikita Titov committed
1047
#' @param dataset Object of class \code{lgb.Dataset}
Guolin Ke's avatar
Guolin Ke committed
1048
1049
#' @param name the name of the field to set
#' @param info the specific field of information to set
1050
#' @param ... other parameters (ignored)
1051
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1052
#'
Guolin Ke's avatar
Guolin Ke committed
1053
1054
#' @details
#' The \code{name} field can be one of the following:
James Lamb's avatar
James Lamb committed
1055
#'
Guolin Ke's avatar
Guolin Ke committed
1056
#' \itemize{
1057
1058
1059
1060
1061
#'     \item{\code{label}: vector of labels to use as the target variable}
#'     \item{\code{weight}: to do a weight rescale}
#'     \item{\code{init_score}: initial score is the base prediction lightgbm will boost from}
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
1062
1063
1064
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
Guolin Ke's avatar
Guolin Ke committed
1065
#' }
James Lamb's avatar
James Lamb committed
1066
#'
Guolin Ke's avatar
Guolin Ke committed
1067
#' @examples
1068
#' \donttest{
1069
1070
1071
1072
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
James Lamb's avatar
James Lamb committed
1073
#'
1074
1075
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
James Lamb's avatar
James Lamb committed
1076
#'
1077
1078
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
1079
#' }
Guolin Ke's avatar
Guolin Ke committed
1080
#' @export
1081
1082
1083
setinfo <- function(dataset, ...) {
  # S3 generic: dispatch to the "setinfo" method for the class of `dataset`
  UseMethod("setinfo")
}
Guolin Ke's avatar
Guolin Ke committed
1084
1085
1086
1087

#' @rdname setinfo
#' @export
setinfo.lgb.Dataset <- function(dataset, name, info, ...) {

  # Anything passed through '...' is not used; warn and list the names supplied
  extra_args <- list(...)
  if (length(extra_args) > 0L) {
    extra_names <- paste(names(extra_args), collapse = ", ")
    warning_text <- paste0(
      "setinfo.lgb.Dataset: Found the following passed through '...': "
      , extra_names
      , ". These are ignored. In future releases of lightgbm, this warning will become an error. "
      , "See ?setinfo.lgb.Dataset for documentation on how to call this function."
    )
    warning(warning_text)
  }

  # Only lgb.Dataset objects are supported
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("setinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Store the field through the object's own setinfo() method and
  # return its result invisibly
  updated <- dataset$setinfo(name = name, info = info)
  return(invisible(updated))

}

1106
1107
1108
1109
#' @name lgb.Dataset.set.categorical
#' @title Set categorical feature of \code{lgb.Dataset}
#' @description Set the categorical features of an \code{lgb.Dataset} object. Use this function
#'              to tell LightGBM which features should be treated as categorical.
1110
#' @param dataset object of class \code{lgb.Dataset}
1111
1112
1113
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
1114
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1115
#'
1116
#' @examples
1117
#' \donttest{
1118
1119
1120
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1121
1122
1123
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
1124
#' lgb.Dataset.set.categorical(dtrain, 1L:2L)
1125
#' }
1126
1127
1128
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  # Only lgb.Dataset objects are supported
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # Hand the categorical feature specification to the object's own method
  # and return its result invisibly
  updated <- dataset$set_categorical_feature(categorical_feature = categorical_feature)
  return(invisible(updated))

}

1138
1139
1140
#' @name lgb.Dataset.set.reference
#' @title Set reference of \code{lgb.Dataset}
#' @description If you want to use validation data, you should set reference to training data
Guolin Ke's avatar
Guolin Ke committed
1141
1142
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
1143
#'
1144
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1145
#'
Guolin Ke's avatar
Guolin Ke committed
1146
#' @examples
1147
#' \donttest{
1148
1149
1150
1151
1152
1153
1154
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset(test$data, label = test$label)
#' lgb.Dataset.set.reference(dtest, dtrain)
1155
#' }
Guolin Ke's avatar
Guolin Ke committed
1156
1157
1158
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # Only lgb.Dataset objects are supported as the target
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")
  }

  # `reference` itself is not checked here; it is passed on to the
  # object's set_reference() method, whose result is returned invisibly
  updated <- dataset$set_reference(reference = reference)
  return(invisible(updated))

}

1167
1168
1169
1170
#' @name lgb.Dataset.save
#' @title Save \code{lgb.Dataset} to a binary file
#' @description Please note that \code{init_score} is not saved in binary file.
#'              If you need it, please set it again after loading Dataset.
Guolin Ke's avatar
Guolin Ke committed
1171
1172
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname filename of the output file
James Lamb's avatar
James Lamb committed
1173
#'
1174
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1175
#'
Guolin Ke's avatar
Guolin Ke committed
1176
#' @examples
1177
#' \donttest{
1178
1179
1180
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1181
#' lgb.Dataset.save(dtrain, tempfile(fileext = ".bin"))
1182
#' }
Guolin Ke's avatar
Guolin Ke committed
1183
1184
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # Only an lgb.Dataset can be written out
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  # NOTE(review): the message below mentions file connections, but this check
  # only lets character values through - confirm which of the two is intended
  if (!is.character(fname)) {
    stop("lgb.Dataset.save: fname should be a character or a file connection")
  }

  # Delegate the actual write to the Dataset's save_binary() method and
  # return its result invisibly
  return(invisible(dataset$save_binary(fname = fname)))

}