lgb.Dataset.R 34.2 KB
Newer Older
James Lamb's avatar
James Lamb committed
1
#' @importFrom methods is
James Lamb's avatar
James Lamb committed
2
3
4
#' @importFrom R6 R6Class
Dataset <- R6::R6Class(

5
  classname = "lgb.Dataset",
6
  cloneable = FALSE,
Guolin Ke's avatar
Guolin Ke committed
7
  public = list(
James Lamb's avatar
James Lamb committed
8

9
    # Finalize will free up the handles
Guolin Ke's avatar
Guolin Ke committed
10
    finalize = function() {
      # Hand the stored handle to the native free routine, then clear the
      # R-side reference to it. Always returns NULL invisibly.
      .Call(LGBM_DatasetFree_R, private$handle)
      private$handle <- NULL
      invisible(NULL)
    },
James Lamb's avatar
James Lamb committed
18

19
    # Initialize will create a starter dataset
Guolin Ke's avatar
Guolin Ke committed
20
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          info = list(),
                          ...) {

      # Records the inputs on the object; no native call is made here.
      # Named arguments passed through `...` are sorted into `info` (when the
      # name is a known info field) or `params` (anything else).

      # Reject wrongly-typed reference / predictor before doing any work
      if (!is.null(reference) && !lgb.is.Dataset(reference)) {
        stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset"))
      }
      if (!is.null(predictor) && !lgb.is.Predictor(predictor)) {
        stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
      }

      # Route every named extra argument to its destination list
      info_fields <- c("label", "weight", "init_score", "group")
      extra_args <- list(...)
      for (arg_name in names(extra_args)) {
        arg_value <- extra_args[[arg_name]]
        if (arg_name %in% info_fields) {
          info[[arg_name]] <- arg_value
        } else {
          params[[arg_name]] <- arg_value
        }
      }

      # Dense matrices are always stored with "double" storage mode
      if (is.matrix(data) && storage.mode(data) != "double") {
        storage.mode(data) <- "double"
      }

      # Keep everything for a later construct() call
      private$raw_data <- data
      private$params <- params
      private$reference <- reference
      private$colnames <- colnames
      private$categorical_feature <- categorical_feature
      private$predictor <- predictor
      private$free_raw_data <- free_raw_data
      # Row indices are kept in ascending order
      private$used_indices <- sort(used_indices, decreasing = FALSE)
      private$info <- info
      private$version <- 0L

      invisible(NULL)
    },
James Lamb's avatar
James Lamb committed
87

88
89
90
    create_valid = function(data,
                            info = list(),
                            ...) {

      # Create a new Dataset for `data` that copies this Dataset's params,
      # column names, categorical features, predictor and free_raw_data
      # setting, and uses this Dataset as its reference. `info` and `...`
      # are forwarded unchanged. The new Dataset is returned invisibly.
      valid_set <- Dataset$new(
        data = data
        , params = private$params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = NULL
        , info = info
        , ...
      )
      invisible(valid_set)
    },
James Lamb's avatar
James Lamb committed
109

110
    # Dataset constructor
Guolin Ke's avatar
Guolin Ke committed
111
    construct = function() {

      # Build the native Dataset from the stored inputs and keep its handle.
      # Does nothing when a handle already exists. Otherwise it resolves
      # column names and categorical feature indices, creates the native
      # object (from a file path, a dense matrix, a dgCMatrix, or as a row
      # subset of the reference Dataset), then applies feature names,
      # predictor-derived initial scores and the stored info fields.
      # Stops if no label is available afterwards. Returns self invisibly.

      # Already constructed
      if (!lgb.is.null.handle(x = private$handle)) {
        return(invisible(self))
      }

      # Only in-memory inputs (dense matrix / dgCMatrix) carry column names
      # and a column count that can be read from the R side
      raw_has_dims <- is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")
      cnames <- NULL
      if (raw_has_dims) {
        cnames <- colnames(private$raw_data)
      }

      # Names given explicitly take precedence over names found on the data
      if (is.null(private$colnames) && !is.null(cnames)) {
        private$colnames <- as.character(cnames)
      }

      # Convert categorical_feature to a list of 0-based column indices
      if (!is.null(private$categorical_feature)) {

        if (is.character(private$categorical_feature)) {

          # Names are looked up in the known column names
          cate_indices <- as.list(match(private$categorical_feature, private$colnames) - 1L)

          # A name that was not found shows up as NA
          if (sum(is.na(cate_indices)) > 0L) {
            stop(
              "lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
              , sQuote(private$categorical_feature[is.na(cate_indices)])
            )
          }

        } else {

          # Number of features known on the R side: explicit column names
          # first, otherwise the column count of in-memory data. It stays
          # NULL for file input or a subset without names; the range check
          # is then skipped instead of comparing against zero features.
          num_features <- NULL
          if (!is.null(private$colnames)) {
            num_features <- length(private$colnames)
          } else if (raw_has_dims) {
            num_features <- ncol(private$raw_data)
          }

          if (!is.null(num_features) && max(private$categorical_feature) > num_features) {
            stop(
              "lgb.self.get.handle: supplied a too large value in categorical_feature: "
              , max(private$categorical_feature)
              , " but only "
              , num_features
              , " features"
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1L)

        }

        # Passed to the native side as an ordinary parameter
        private$params$categorical_feature <- cate_indices

      }

      # Serialize parameters for the native calls
      params_str <- lgb.params2str(params = private$params)

      # Handle of the reference Dataset, if any (fetched through its
      # private get_handle method)
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }

      if (is.null(private$used_indices)) {

        # Not a subset: build from the raw data
        if (is.character(private$raw_data)) {

          # Character input is treated as a file path
          handle <- .Call(
            LGBM_DatasetCreateFromFile_R
            , private$raw_data
            , params_str
            , ref_handle
          )

        } else if (is.matrix(private$raw_data)) {

          # Dense matrix
          handle <- .Call(
            LGBM_DatasetCreateFromMat_R
            , private$raw_data
            , nrow(private$raw_data)
            , ncol(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (methods::is(private$raw_data, "dgCMatrix")) {

          # Column-compressed sparse matrix; the column-pointer slot must
          # not be longer than the largest R integer
          if (length(private$raw_data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }
          handle <- .Call(
            LGBM_DatasetCreateFromCSC_R
            , private$raw_data@p
            , private$raw_data@i
            , private$raw_data@x
            , length(private$raw_data@p)
            , length(private$raw_data@x)
            , nrow(private$raw_data)
            , params_str
            , ref_handle
          )

        } else {

          # Unknown data type
          stop(
            "lgb.Dataset.construct: does not support constructing from "
            , sQuote(class(private$raw_data))
          )

        }

      } else {

        # Row subset: rows are taken from the reference Dataset
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        handle <- .Call(
          LGBM_DatasetGetSubset_R
          , ref_handle
          , c(private$used_indices) # Adding c() fixes issue in R v3.5
          , length(private$used_indices)
          , params_str
        )

      }
      if (lgb.is.null.handle(x = handle)) {
        stop("lgb.Dataset.construct: cannot create Dataset handle")
      }

      # Tag the handle and keep it
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Push feature names now that a handle exists
      if (!is.null(private$colnames)) {
        self$set_colnames(colnames = private$colnames)
      }

      # Initial scores from the predictor (not done for subsets)
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        init_score <- private$predictor$predict(
          data = private$raw_data
          , rawscore = TRUE
          , reshape = TRUE
        )

        # No transpose needed: as.vector() flattens in column-major order
        init_score <- as.vector(init_score)
        private$info$init_score <- init_score

      }

      # Drop the raw data if requested
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Apply every stored info field through setinfo()
      for (i in seq_along(private$info)) {
        p <- private$info[i]
        self$setinfo(name = names(p), info = p[[1L]])
      }

      # A label is required
      if (is.null(self$getinfo(name = "label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
310

311
    # Dimension function
Guolin Ke's avatar
Guolin Ke committed
312
    dim = function() {

      # Returns c(rows, columns). Uses the native Dataset when a handle
      # exists, otherwise the in-memory raw data; stops when neither is
      # available.

      if (!lgb.is.null.handle(x = private$handle)) {
        # The return values of the calls are ignored; the counts are
        # presumably written into these variables by the native routines.
        n_rows <- 0L
        n_cols <- 0L
        .Call(LGBM_DatasetGetNumData_R, private$handle, n_rows)
        .Call(LGBM_DatasetGetNumFeature_R, private$handle, n_cols)
        return(c(n_rows, n_cols))
      }

      # Dense matrix or dgCMatrix (the latter requires the Matrix package)
      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        return(dim(private$raw_data))
      }

      # Dimensions are unknown for any other input before construction
      stop(
        "dim: cannot get dimensions before dataset has been constructed, "
        , "please call lgb.Dataset.construct explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
352

353
    # Get column names
Guolin Ke's avatar
Guolin Ke committed
354
    get_colnames = function() {

      # Returns the column (feature) names. With a handle, they are read
      # from the native Dataset and cached in private$colnames. Without
      # one, the names of in-memory raw data (dense matrix or dgCMatrix)
      # are returned. Stops when neither source is available.

      if (!lgb.is.null.handle(x = private$handle)) {
        private$colnames <- .Call(
          LGBM_DatasetGetFeatureNames_R
          , private$handle
        )
        return(private$colnames)

      } else if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {

        return(colnames(private$raw_data))

      } else {

        # Column names are unknown for any other input before construction
        stop(
          "get_colnames: cannot get column names before dataset has been constructed, please call "
          , "lgb.Dataset.construct explicitly"
        )

      }

    },
James Lamb's avatar
James Lamb committed
380

381
    # Set column names
Guolin Ke's avatar
Guolin Ke committed
382
    set_colnames = function(colnames) {

      # Stores the given names (coerced to character) and, when a handle
      # exists, also sends them to the native Dataset. NULL or empty input
      # leaves everything untouched. Returns self invisibly.

      # as.character(NULL) is character(0), so this covers both cases
      feature_names <- as.character(colnames)
      if (length(feature_names) == 0L) {
        return(invisible(self))
      }

      private$colnames <- feature_names

      if (!lgb.is.null.handle(x = private$handle)) {
        # Names are handed over as one tab-separated string
        joined_names <- paste(private$colnames, collapse = "\t")
        .Call(LGBM_DatasetSetFeatureNames_R, private$handle, joined_names)
      }

      invisible(self)

    },
James Lamb's avatar
James Lamb committed
412

413
    # Get information
Guolin Ke's avatar
Guolin Ke committed
414
    getinfo = function(name) {

      # Returns the info field `name` ("label", "weight", "init_score" or
      # "group"). A value already stored on the R side is returned as is;
      # otherwise the field is fetched through the handle and cached.
      # Returns NULL when the native field size stays at 0.

      known_fields <- c("label", "weight", "init_score", "group")

      # `name` must be a single string naming a known field
      name_is_valid <- is.character(name) && length(name) == 1L && name %in% known_fields
      if (!name_is_valid) {
        stop("getinfo: name must one of the following: ", paste0(sQuote(known_fields), collapse = ", "))
      }

      # Use the R-side copy when there is one
      cached <- private$info[[name]]
      if (!is.null(cached)) {
        return(cached)
      }

      if (lgb.is.null.handle(x = private$handle)) {
        stop("Cannot perform getinfo before constructing Dataset.")
      }

      # The return value of the call is ignored; the size is presumably
      # written into field_size by the native routine
      field_size <- 0L
      .Call(LGBM_DatasetGetFieldSize_R, private$handle, name, field_size)

      if (field_size > 0L) {

        # "group" uses an integer buffer, every other field a double one
        buffer <- if (name == "group") {
          integer(field_size)
        } else {
          numeric(field_size)
        }

        .Call(LGBM_DatasetGetField_R, private$handle, name, buffer)
        private$info[[name]] <- buffer

      }

      return(private$info[[name]])

    },
James Lamb's avatar
James Lamb committed
466

467
    # Set information
Guolin Ke's avatar
Guolin Ke committed
468
    setinfo = function(name, info) {

      # Stores the info field `name` on the R side and, when a handle exists
      # and the value is non-empty, also sets it on the native Dataset and
      # increments the version counter. Returns self invisibly.

      known_fields <- c("label", "weight", "init_score", "group")

      # `name` must be a single string naming a known field
      name_is_valid <- is.character(name) && length(name) == 1L && name %in% known_fields
      if (!name_is_valid) {
        stop("setinfo: name must one of the following: ", paste0(sQuote(known_fields), collapse = ", "))
      }

      # "group" is coerced to integer, every other field to numeric
      if (name == "group") {
        info <- as.integer(info)
      } else {
        info <- as.numeric(info)
      }

      private$info[[name]] <- info

      # Only non-empty values are sent to a constructed Dataset
      send_to_native <- !lgb.is.null.handle(x = private$handle) &&
        !is.null(info) &&
        length(info) > 0L

      if (send_to_native) {
        .Call(
          LGBM_DatasetSetField_R
          , private$handle
          , name
          , info
          , length(info)
        )
        private$version <- private$version + 1L
      }

      invisible(self)

    },
James Lamb's avatar
James Lamb committed
509

510
    # Slice dataset
Guolin Ke's avatar
Guolin Ke committed
511
    slice = function(idxset, ...) {

      # Create a Dataset describing the rows `idxset` of this Dataset. The
      # new object has no raw data of its own: it names this Dataset as its
      # reference and records the sorted indices. Settings are copied from
      # this Dataset and `...` is forwarded to the constructor.
      subset_ds <- Dataset$new(
        data = NULL
        , params = private$params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = sort(idxset, decreasing = FALSE)
        , info = NULL
        , ...
      )
      return(subset_ds)

    },
James Lamb's avatar
James Lamb committed
530

531
532
533
    # [description] Update Dataset parameters. If it has not been constructed yet,
    #               this operation just happens on the R side (updating private$params).
    #               If it has been constructed, parameters will be updated on the C++ side.
534
    update_params = function(params) {

      # Merge `params` into this Dataset's parameters. Returns self
      # invisibly.
      #   * empty `params`: nothing happens
      #   * not constructed yet: the merge happens on the R side only
      #   * constructed: the old and new parameter strings are passed to the
      #     native checking routine. If it raises no error the merged
      #     parameters are stored. If it raises an error, that error is
      #     re-raised when the raw data is gone; otherwise the merged
      #     parameters are stored and the Dataset is finalized.

      if (length(params) == 0L) {
        return(invisible(self))
      }
      if (lgb.is.null.handle(x = private$handle)) {
        private$params <- modifyList(private$params, params)
      } else {
        tryCatch({
          .Call(
            LGBM_DatasetUpdateParamChecking_R
            , lgb.params2str(params = private$params)
            , lgb.params2str(params = params)
          )
          # The check raised no error: record the new values on the R side
          # as well, otherwise get_params() would keep reporting the old ones
          private$params <- modifyList(private$params, params)
        }, error = function(e) {
          # If updating failed but raw data is not available, raise an error because
          # achieving what the user asked for is not possible
          if (is.null(private$raw_data)) {
            stop(e)
          }

          # If updating failed but raw data is available, modify the params
          # on the R side and re-set ("deconstruct") the Dataset
          private$params <- modifyList(private$params, params)
          self$finalize()
        })
      }
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
563

564
565
566
567
568
569
570
571
572
573
574
    get_params = function() {
      # Returns the subset of the stored parameters whose names appear in
      # the values returned by .DATASET_PARAMETERS(), as a list.
      recognised <- unname(unlist(.DATASET_PARAMETERS()))
      selected <- list()
      for (key in intersect(names(private$params), recognised)) {
        selected[[key]] <- private$params[[key]]
      }
      selected
    },

575
    # Set categorical feature parameter
576
    set_categorical_feature = function(categorical_feature) {

      # Replace the stored categorical_feature and finalize the Dataset.
      # Does nothing when the value is unchanged; stops when the raw data
      # has already been freed. Returns self invisibly.

      unchanged <- identical(private$categorical_feature, categorical_feature)
      if (unchanged) {
        return(invisible(self))
      }

      # A change is only accepted while the raw data is still held
      if (is.null(private$raw_data)) {
        stop("set_categorical_feature: cannot set categorical feature after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      private$categorical_feature <- categorical_feature

      # Release the current handle
      self$finalize()
      invisible(self)

    },
James Lamb's avatar
James Lamb committed
597

598
    # Set reference
Guolin Ke's avatar
Guolin Ke committed
599
    set_reference = function(reference) {

      # Make `reference` the reference Dataset of this Dataset. The
      # categorical features, column names and predictor of `reference` are
      # copied onto this Dataset first. If the reference actually changes,
      # it is stored and the Dataset is finalized; that requires the raw
      # data to still be held. `reference` may be NULL, in which case
      # nothing is copied and the stored reference is cleared.
      # Returns self invisibly.

      # Validate before touching `reference`, so a wrong argument gets this
      # message instead of an error from reading its fields
      if (!is.null(reference) && !lgb.is.Dataset(reference)) {
        stop("set_reference: Can only use lgb.Dataset as a reference")
      }

      # Copy settings from the reference (nothing to copy from NULL)
      if (!is.null(reference)) {
        self$set_categorical_feature(categorical_feature = reference$.__enclos_env__$private$categorical_feature)
        self$set_colnames(colnames = reference$get_colnames())
        private$set_predictor(predictor = reference$.__enclos_env__$private$predictor)
      }

      # Same reference as before: nothing else to do
      if (identical(private$reference, reference)) {
        return(invisible(self))
      }

      # A change is only accepted while the raw data is still held
      if (is.null(private$raw_data)) {

        stop("set_reference: cannot set reference after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")

      }

      # Store reference
      private$reference <- reference

      # Release the current handle and return self
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
637

638
    # Save binary model
Guolin Ke's avatar
Guolin Ke committed
639
    save_binary = function(fname) {

      # Construct the Dataset if necessary, then pass its handle and
      # `fname` to the native save routine. Returns self invisibly.
      self$construct()
      .Call(LGBM_DatasetSaveBinary_R, private$handle, fname)
      invisible(self)
    }
James Lamb's avatar
James Lamb committed
650

Guolin Ke's avatar
Guolin Ke committed
651
652
  ),
  private = list(
653
654
655
656
657
    handle = NULL,
    raw_data = NULL,
    params = list(),
    reference = NULL,
    colnames = NULL,
658
    categorical_feature = NULL,
659
660
661
662
    predictor = NULL,
    free_raw_data = TRUE,
    used_indices = NULL,
    info = NULL,
663
    version = 0L,
James Lamb's avatar
James Lamb committed
664

665
666
    # Get handle
    get_handle = function() {

      # Returns the stored handle, constructing the Dataset first when no
      # handle exists yet.
      handle_missing <- lgb.is.null.handle(x = private$handle)
      if (handle_missing) {
        self$construct()
      }
      private$handle

    },
James Lamb's avatar
James Lamb committed
675

676
    # Set predictor
Guolin Ke's avatar
Guolin Ke committed
677
    set_predictor = function(predictor) {

      # Replace the stored predictor and finalize the Dataset. Does nothing
      # when the predictor is unchanged; stops when the raw data has been
      # freed or when a non-NULL `predictor` is not an lgb.Predictor.
      # Returns self invisibly.

      unchanged <- identical(private$predictor, predictor)
      if (unchanged) {
        return(invisible(self))
      }

      # A change is only accepted while the raw data is still held
      if (is.null(private$raw_data)) {
        stop("set_predictor: cannot set predictor after free raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # NULL is accepted; anything else must be a predictor object
      if (!is.null(predictor) && !lgb.is.Predictor(predictor)) {
        stop("set_predictor: Can only use lgb.Predictor as predictor")
      }

      private$predictor <- predictor

      # Release the current handle
      self$finalize()
      invisible(self)

    }
James Lamb's avatar
James Lamb committed
707

Guolin Ke's avatar
Guolin Ke committed
708
709
710
  )
)

711
712
713
#' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#'              or local file (that was created previously by saving an \code{lgb.Dataset}).
Guolin Ke's avatar
Guolin Ke committed
714
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
715
716
717
718
719
720
721
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values.
#' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning
#'                  continuous features into histograms. If you want to apply the same bin boundaries from an existing
#'                  dataset to new \code{data}, pass that existing Dataset to this argument.
Guolin Ke's avatar
Guolin Ke committed
722
#' @param colnames names of columns
723
724
725
726
727
728
729
730
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param free_raw_data LightGBM constructs its data format, called a "Dataset", from tabular data.
#'                      By default, that Dataset object on the R side does not keep a copy of the raw data.
#'                      This reduces LightGBM's memory consumption, but it means that the Dataset object
#'                      cannot be changed after it has been constructed. If you'd prefer to be able to
#'                      change the Dataset object after construction, set \code{free_raw_data = FALSE}.
Nikita Titov's avatar
Nikita Titov committed
731
#' @param info a list of information of the \code{lgb.Dataset} object
Guolin Ke's avatar
Guolin Ke committed
732
#' @param ... other information to pass to \code{info} or parameters to pass to \code{params}
James Lamb's avatar
James Lamb committed
733
#'
Guolin Ke's avatar
Guolin Ke committed
734
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
735
#'
Guolin Ke's avatar
Guolin Ke committed
736
#' @examples
737
#' \donttest{
738
739
740
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
741
742
743
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
744
#' lgb.Dataset.construct(dtrain)
745
#' }
Guolin Ke's avatar
Guolin Ke committed
746
747
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        info = list(),
                        ...) {

  # Build the underlying R6 object. This entry point exposes neither a
  # predictor nor a row subset, so both are passed as NULL.
  new_dataset <- Dataset$new(
    data = data
    , params = params
    , reference = reference
    , colnames = colnames
    , categorical_feature = categorical_feature
    , predictor = NULL
    , free_raw_data = free_raw_data
    , used_indices = NULL
    , info = info
    , ...
  )

  # Hand the object back invisibly so a bare call does not auto-print it
  return(invisible(new_dataset))

}

774
775
776
#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
Guolin Ke's avatar
Guolin Ke committed
777
778
#' @param dataset \code{lgb.Dataset} object, training data
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
Nikita Titov's avatar
Nikita Titov committed
779
#' @param info a list of information of the \code{lgb.Dataset} object
Guolin Ke's avatar
Guolin Ke committed
780
#' @param ... other information to pass to \code{info}.
James Lamb's avatar
James Lamb committed
781
#'
Guolin Ke's avatar
Guolin Ke committed
782
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
783
#'
Guolin Ke's avatar
Guolin Ke committed
784
#' @examples
785
#' \donttest{
786
787
788
789
790
791
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
792
#' }
Guolin Ke's avatar
Guolin Ke committed
793
#' @export
794
lgb.Dataset.create.valid <- function(dataset, data, info = list(), ...) {

  # Validation data can only be derived from an existing lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # Delegate to the training Dataset's own create_valid() method,
  # forwarding any extra arguments untouched
  valid_set <- dataset$create_valid(data = data, info = info, ...)
  return(invisible(valid_set))

}
Guolin Ke's avatar
Guolin Ke committed
805

806
807
808
#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
Guolin Ke's avatar
Guolin Ke committed
809
#' @param dataset Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
810
#'
Guolin Ke's avatar
Guolin Ke committed
811
#' @examples
812
#' \donttest{
813
814
815
816
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
817
#' }
818
#' @return constructed dataset
Guolin Ke's avatar
Guolin Ke committed
819
820
#' @export
lgb.Dataset.construct <- function(dataset) {

  # Refuse anything that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }

  # Trigger construction through the Dataset's construct() method
  # and return its result invisibly
  constructed <- dataset$construct()
  return(invisible(constructed))

}

832
833
#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
Guolin Ke's avatar
Guolin Ke committed
834
835
#' @param x Object of class \code{lgb.Dataset}
#' @param ... other parameters
James Lamb's avatar
James Lamb committed
836
#'
Guolin Ke's avatar
Guolin Ke committed
837
#' @return a vector of numbers of rows and of columns
James Lamb's avatar
James Lamb committed
838
#'
Guolin Ke's avatar
Guolin Ke committed
839
840
841
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
James Lamb's avatar
James Lamb committed
842
#'
Guolin Ke's avatar
Guolin Ke committed
843
#' @examples
844
#' \donttest{
845
846
847
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
848
#'
849
850
851
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
852
#' }
Guolin Ke's avatar
Guolin Ke committed
853
854
855
#' @rdname dim
#' @export
dim.lgb.Dataset <- function(x, ...) {

  # Guard against being called directly on something that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = x)
  if (!is_dataset) {
    stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # The Dataset object reports its own dimensions; arguments in `...` are unused
  return(x$dim())

}

866
867
868
#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}, thus setting of
#'              row names would have no effect and returned row names would be NULL.
Guolin Ke's avatar
Guolin Ke committed
869
870
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one is ignored
871
#'              and the second one is column names
Guolin Ke's avatar
Guolin Ke committed
872
873
874
875
876
877
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
878
#' \donttest{
879
880
881
882
883
884
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
885
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
886
#' print(dtrain, verbose = TRUE)
887
#' }
Guolin Ke's avatar
Guolin Ke committed
888
#' @rdname dimnames.lgb.Dataset
889
#' @return A list with the dimension names of the dataset
Guolin Ke's avatar
Guolin Ke committed
890
891
#' @export
dimnames.lgb.Dataset <- function(x) {

  # Guard against being called directly on something that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = x)
  if (!is_dataset) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # The row-name slot is always NULL; only column names are reported
  column_names <- x$get_colnames()
  return(list(NULL, column_names))

}

#' @rdname dimnames.lgb.Dataset
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # `value` has to be a plain two-element list: list(<row names>, <column names>)
  is_plain_list <- identical(class(value), "list")
  if (!(is_plain_list && length(value) == 2L)) {
    stop("invalid ", sQuote("value"), " given: must be a list of two elements")
  }

  row_names <- value[[1L]]
  col_names <- value[[2L]]

  # Row names are rejected outright
  if (!is.null(row_names)) {
    stop("lgb.Dataset does not have rownames")
  }

  # When column names are supplied there must be exactly one per column.
  # NULL column names skip this check and are handed to set_colnames() as-is.
  if (!is.null(col_names) && ncol(x) != length(col_names)) {
    stop(
      "can't assign "
      , sQuote(length(col_names))
      , " colnames to an lgb.Dataset with "
      , sQuote(ncol(x))
      , " columns"
    )
  }

  # Apply the names and return the dataset, as a replacement function must
  x$set_colnames(colnames = col_names)
  return(x)

}

941
942
943
#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#'              original \code{lgb.Dataset} object
Nikita Titov's avatar
Nikita Titov committed
944
#' @param dataset Object of class \code{lgb.Dataset}
945
#' @param idxset an integer vector of indices of rows needed
Guolin Ke's avatar
Guolin Ke committed
946
947
#' @param ... other parameters (currently not used)
#' @return constructed sub dataset
James Lamb's avatar
James Lamb committed
948
#'
Guolin Ke's avatar
Guolin Ke committed
949
#' @examples
950
#' \donttest{
951
952
953
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
954
#'
955
#' dsub <- lightgbm::slice(dtrain, seq_len(42L))
956
#' lgb.Dataset.construct(dsub)
957
#' labels <- lightgbm::getinfo(dsub, "label")
958
#' }
Guolin Ke's avatar
Guolin Ke committed
959
#' @export
960
961
962
slice <- function(dataset, ...) {
  # S3 generic: pick the method from the class of `dataset`
  UseMethod("slice", dataset)
}
Guolin Ke's avatar
Guolin Ke committed
963
964
965
966

#' @rdname slice
#' @export
slice.lgb.Dataset <- function(dataset, idxset, ...) {

  # Guard against being called directly on something that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's slice() method and return the result invisibly
  sub_dataset <- dataset$slice(idxset = idxset, ...)
  return(invisible(sub_dataset))

}

978
979
980
#' @name getinfo
#' @title Get information of an \code{lgb.Dataset} object
#' @description Get one attribute of a \code{lgb.Dataset}
Guolin Ke's avatar
Guolin Ke committed
981
982
983
984
#' @param dataset Object of class \code{lgb.Dataset}
#' @param name the name of the information field to get (see details)
#' @param ... other parameters
#' @return info data
James Lamb's avatar
James Lamb committed
985
#'
Guolin Ke's avatar
Guolin Ke committed
986
987
#' @details
#' The \code{name} field can be one of the following:
James Lamb's avatar
James Lamb committed
988
#'
Guolin Ke's avatar
Guolin Ke committed
989
990
991
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
992
993
994
995
996
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
Nikita Titov's avatar
Nikita Titov committed
997
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
Guolin Ke's avatar
Guolin Ke committed
998
#' }
James Lamb's avatar
James Lamb committed
999
#'
Guolin Ke's avatar
Guolin Ke committed
1000
#' @examples
1001
#' \donttest{
1002
1003
1004
1005
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
James Lamb's avatar
James Lamb committed
1006
#'
1007
1008
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
James Lamb's avatar
James Lamb committed
1009
#'
1010
1011
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
1012
#' }
Guolin Ke's avatar
Guolin Ke committed
1013
#' @export
1014
1015
1016
getinfo <- function(dataset, ...) {
  # S3 generic: pick the method from the class of `dataset`
  UseMethod("getinfo", dataset)
}
Guolin Ke's avatar
Guolin Ke committed
1017
1018
1019
1020

#' @rdname getinfo
#' @export
getinfo.lgb.Dataset <- function(dataset, name, ...) {

  # Guard against being called directly on something that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("getinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Look the field up through the Dataset's getinfo() method;
  # arguments in `...` are accepted but not forwarded
  return(dataset$getinfo(name = name))

}

1031
1032
1033
#' @name setinfo
#' @title Set information of an \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
Nikita Titov's avatar
Nikita Titov committed
1034
#' @param dataset Object of class \code{lgb.Dataset}
Guolin Ke's avatar
Guolin Ke committed
1035
1036
1037
#' @param name the name of the field to set
#' @param info the specific field of information to set
#' @param ... other parameters
1038
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1039
#'
Guolin Ke's avatar
Guolin Ke committed
1040
1041
#' @details
#' The \code{name} field can be one of the following:
James Lamb's avatar
James Lamb committed
1042
#'
Guolin Ke's avatar
Guolin Ke committed
1043
#' \itemize{
1044
1045
1046
1047
1048
#'     \item{\code{label}: vector of labels to use as the target variable}
#'     \item{\code{weight}: to do a weight rescale}
#'     \item{\code{init_score}: initial score is the base prediction lightgbm will boost from}
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
1049
1050
1051
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
Guolin Ke's avatar
Guolin Ke committed
1052
#' }
James Lamb's avatar
James Lamb committed
1053
#'
Guolin Ke's avatar
Guolin Ke committed
1054
#' @examples
1055
#' \donttest{
1056
1057
1058
1059
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
James Lamb's avatar
James Lamb committed
1060
#'
1061
1062
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
James Lamb's avatar
James Lamb committed
1063
#'
1064
1065
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
1066
#' }
Guolin Ke's avatar
Guolin Ke committed
1067
#' @export
1068
1069
1070
setinfo <- function(dataset, ...) {
  # S3 generic: pick the method from the class of `dataset`
  UseMethod("setinfo", dataset)
}
Guolin Ke's avatar
Guolin Ke committed
1071
1072
1073
1074

#' @rdname setinfo
#' @export
setinfo.lgb.Dataset <- function(dataset, name, info, ...) {

  # Guard against being called directly on something that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("setinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Store the field through the Dataset's setinfo() method and return its
  # result invisibly; arguments in `...` are accepted but not forwarded
  updated <- dataset$setinfo(name = name, info = info)
  return(invisible(updated))

}

1084
1085
1086
1087
#' @name lgb.Dataset.set.categorical
#' @title Set categorical feature of \code{lgb.Dataset}
#' @description Set the categorical features of an \code{lgb.Dataset} object. Use this function
#'              to tell LightGBM which features should be treated as categorical.
1088
#' @param dataset object of class \code{lgb.Dataset}
1089
1090
1091
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
1092
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1093
#'
1094
#' @examples
1095
#' \donttest{
1096
1097
1098
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1099
1100
1101
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
1102
#' lgb.Dataset.set.categorical(dtrain, 1L:2L)
1103
#' }
1104
1105
1106
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  # Refuse anything that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's set_categorical_feature() method
  # and return its result invisibly
  updated <- dataset$set_categorical_feature(categorical_feature = categorical_feature)
  return(invisible(updated))

}

1117
1118
1119
#' @name lgb.Dataset.set.reference
#' @title Set reference of \code{lgb.Dataset}
#' @description If you want to use validation data, you should set reference to training data
Guolin Ke's avatar
Guolin Ke committed
1120
1121
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
1122
#'
1123
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1124
#'
Guolin Ke's avatar
Guolin Ke committed
1125
#' @examples
1126
#' \donttest{
1127
1128
1129
1130
1131
1132
1133
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset(test$data, label = test$label)
#' lgb.Dataset.set.reference(dtest, dtrain)
1134
#' }
Guolin Ke's avatar
Guolin Ke committed
1135
1136
1137
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # Refuse anything that is not an lgb.Dataset; `reference` itself is
  # passed on to set_reference() without being checked here
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's set_reference() method and return its result invisibly
  updated <- dataset$set_reference(reference = reference)
  return(invisible(updated))

}

1148
1149
1150
1151
#' @name lgb.Dataset.save
#' @title Save \code{lgb.Dataset} to a binary file
#' @description Please note that \code{init_score} is not saved in binary file.
#'              If you need it, please set it again after loading Dataset.
Guolin Ke's avatar
Guolin Ke committed
1152
1153
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname filename of the output file
James Lamb's avatar
James Lamb committed
1154
#'
1155
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1156
#'
Guolin Ke's avatar
Guolin Ke committed
1157
#' @examples
1158
#' \donttest{
1159
1160
1161
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1162
#' lgb.Dataset.save(dtrain, tempfile(fileext = ".bin"))
1163
#' }
Guolin Ke's avatar
Guolin Ke committed
1164
1165
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # Only an lgb.Dataset can be written out
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  # The destination must be given as a character filename. Anything else,
  # including connection objects, fails is.character() and is rejected here.
  if (!is.character(fname)) {
    stop("lgb.Dataset.save: fname should be a character filename")
  }

  # Delegate the write to the Dataset's save_binary() method
  # and return its result invisibly
  return(invisible(dataset$save_binary(fname = fname)))

}