lgb.Dataset.R 38 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#' @name lgb_shared_dataset_params
#' @title Shared Dataset parameter docs
#' @description Parameter docs for fields used in \code{lgb.Dataset} construction
#' @param label vector of labels to use as the target variable
#' @param weight numeric vector of sample weights
#' @param init_score initial score is the base prediction lightgbm will boost from
#' @param group used for learning-to-rank tasks. An integer vector describing how to
#'              group rows together as ordered results from the same set of candidate results
#'              to be ranked. For example, if you have a 100-document dataset with
#'              \code{group = c(10, 20, 40, 10, 10, 10)}, that means that you have 6 groups,
#'              where the first 10 records are in the first group, records 11-30 are in the
#'              second group, etc.
#' @keywords internal
NULL

16
17
18
19
20
21
22
23
# [description] List of valid keys for "info" arguments in lgb.Dataset.
#               Wrapped in a function to take advantage of lazy evaluation
#               (so it doesn't matter what order R sources files during installation).
# [return] A character vector of names.
.INFO_KEYS <- function() {
  # Names accepted as Dataset metadata ("info") fields, in a fixed order
  info_keys <- c("label", "weight", "init_score", "group")
  return(info_keys)
}

James Lamb's avatar
James Lamb committed
24
#' @importFrom methods is
James Lamb's avatar
James Lamb committed
25
#' @importFrom R6 R6Class
26
#' @importFrom utils modifyList
James Lamb's avatar
James Lamb committed
27
28
Dataset <- R6::R6Class(

29
  classname = "lgb.Dataset",
30
  cloneable = FALSE,
Guolin Ke's avatar
Guolin Ke committed
31
  public = list(
James Lamb's avatar
James Lamb committed
32

33
    # Finalize will free up the handles
Guolin Ke's avatar
Guolin Ke committed
34
    finalize = function() {
      # Hand the stored handle to LGBM_DatasetFree_R, then clear the R-side
      # copy. Other methods in this class call this to reset ("deconstruct")
      # the Dataset. Returns NULL invisibly.
      .Call(LGBM_DatasetFree_R, private$handle)
      private$handle <- NULL
      invisible(NULL)
    },
James Lamb's avatar
James Lamb committed
42

43
    # Initialize will create a starter dataset
Guolin Ke's avatar
Guolin Ke committed
44
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          label = NULL,
                          weight = NULL,
                          group = NULL,
                          init_score = NULL) {

      # Records everything needed to build the Dataset later. No handle is
      # created here; that happens in construct(). Returns NULL invisibly.

      # Reject arguments of the wrong class before doing any other work
      reference_ok <- is.null(reference) || lgb.is.Dataset(reference)
      if (!reference_ok) {
        stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset"))
      }
      predictor_ok <- is.null(predictor) || lgb.is.Predictor(predictor)
      if (!predictor_ok) {
        stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
      }

      # Keep only the metadata fields that were actually supplied
      supplied <- list(
        label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )
      info <- list()
      for (key in names(supplied)) {
        if (!is.null(supplied[[key]])) {
          info[[key]] <- supplied[[key]]
        }
      }

      # Dense matrices are stored as "double" regardless of how they arrived
      if (is.matrix(data) && storage.mode(data) != "double") {
        storage.mode(data) <- "double"
      }

      # Stash the inputs on the private environment
      private$raw_data <- data
      private$params <- params
      private$reference <- reference
      private$colnames <- colnames
      private$categorical_feature <- categorical_feature
      private$predictor <- predictor
      private$free_raw_data <- free_raw_data
      private$used_indices <- sort(used_indices, decreasing = FALSE)
      private$info <- info
      private$version <- 0L

      invisible(NULL)
    },
James Lamb's avatar
James Lamb committed
103

104
    create_valid = function(data,
                            label = NULL,
                            weight = NULL,
                            group = NULL,
                            init_score = NULL,
                            params = list()) {

      # Builds a new Dataset that uses this one as its reference and inherits
      # its column names, categorical features, predictor and free_raw_data
      # setting. Returns the new Dataset invisibly.

      # Entries in `params` win over this Dataset's existing parameters
      merged_params <- modifyList(private$params, params)

      valid_set <- Dataset$new(
        data = data
        , params = merged_params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = NULL
        , label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )

      return(invisible(valid_set))
    },
James Lamb's avatar
James Lamb committed
133

134
    # Dataset constructor
Guolin Ke's avatar
Guolin Ke committed
135
    construct = function() {

      # Builds the underlying Dataset handle from the stored inputs, then pushes
      # column names and metadata fields onto it. Does nothing if a handle
      # already exists. Stops if no handle could be created or if no label is
      # available afterwards. Returns self invisibly.

      # Already constructed: nothing to do
      if (!lgb.is.null.handle(x = private$handle)) {
        return(invisible(self))
      }

      # In-memory matrices (dense or dgCMatrix) expose column names and a
      # column count; other inputs (file path, NULL) do not
      raw_has_dims <- is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")

      # Fall back to the raw data's own column names when none were supplied
      cnames <- NULL
      if (raw_has_dims) {
        cnames <- colnames(private$raw_data)
      }
      if (is.null(private$colnames) && !is.null(cnames)) {
        private$colnames <- as.character(cnames)
      }

      # Translate categorical features into 0-based indices
      if (!is.null(private$categorical_feature)) {

        if (is.character(private$categorical_feature)) {

          # Names are resolved against the known column names;
          # a name that matches nothing comes back as NA
          matched <- match(private$categorical_feature, private$colnames)
          if (anyNA(matched)) {
            stop(
              "lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
              , paste0(sQuote(private$categorical_feature[is.na(matched)]), collapse = ", ")
            )
          }
          cate_indices <- as.list(matched - 1L)

        } else {

          # Bound-check against the real column count of the raw data.
          # Checking against length(private$colnames) wrongly rejected every
          # index when the data simply had no column names. The check is
          # skipped for file-based data and for subsets, where the column
          # count is not known on the R side.
          # NOTE(review): presumably the native library validates the indices
          # in those cases - confirm.
          if (raw_has_dims && is.null(private$used_indices)) {
            n_features <- ncol(private$raw_data)
            if (max(private$categorical_feature) > n_features) {
              stop(
                "lgb.self.get.handle: supplied a too large value in categorical_feature: "
                , max(private$categorical_feature)
                , " but only "
                , n_features
                , " features"
              )
            }
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1L)

        }

        # Store indices for categorical features
        private$params$categorical_feature <- cate_indices

      }

      # Serialize the parameters for the native calls below
      params_str <- lgb.params2str(params = private$params)

      # Handle of the reference Dataset, if any (constructs it when needed)
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }

      if (is.null(private$used_indices)) {

        # Not subsetting: construct from the raw data
        if (is.null(private$raw_data)) {
          stop(paste0(
            "Attempting to create a Dataset without any raw data. "
            , "This can happen if you have called Dataset$finalize() or if this Dataset was saved with saveRDS(). "
            , "To avoid this error in the future, use lgb.Dataset.save() or "
            , "Dataset$save_binary() to save lightgbm Datasets."
          ))
        }

        if (is.character(private$raw_data)) {

          # Raw data is a path to a data file
          handle <- .Call(
            LGBM_DatasetCreateFromFile_R
            , path.expand(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (is.matrix(private$raw_data)) {

          # Raw data is a dense matrix
          handle <- .Call(
            LGBM_DatasetCreateFromMat_R
            , private$raw_data
            , nrow(private$raw_data)
            , ncol(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (methods::is(private$raw_data, "dgCMatrix")) {

          # Raw data is a dgCMatrix (sparse matrix, column compressed)
          if (length(private$raw_data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }
          handle <- .Call(
            LGBM_DatasetCreateFromCSC_R
            , private$raw_data@p
            , private$raw_data@i
            , private$raw_data@x
            , length(private$raw_data@p)
            , length(private$raw_data@x)
            , nrow(private$raw_data)
            , params_str
            , ref_handle
          )

        } else {

          # Unknown data type
          stop(
            "lgb.Dataset.construct: does not support constructing from "
            , paste0(sQuote(class(private$raw_data)), collapse = ", ")
          )

        }

      } else {

        # Subsetting: rows are taken from the reference Dataset
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        handle <- .Call(
          LGBM_DatasetGetSubset_R
          , ref_handle
          , c(private$used_indices) # Adding c() fixes issue in R v3.5
          , length(private$used_indices)
          , params_str
        )

      }

      if (lgb.is.null.handle(x = handle)) {
        stop("lgb.Dataset.construct: cannot create Dataset handle")
      }

      # Tag the handle with its class and keep it
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Set feature names
      if (!is.null(private$colnames)) {
        self$set_colnames(colnames = private$colnames)
      }

      # Use the predictor's raw scores as initial scores (not done for subsets)
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        init_score <- private$predictor$predict(
          data = private$raw_data
          , rawscore = TRUE
        )

        # No transpose needed: as.vector() flattens in column-major order
        init_score <- as.vector(init_score)
        private$info$init_score <- init_score

      }

      # Drop the raw data if requested
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Push every stored metadata field onto the new handle
      if (length(private$info) > 0L) {

        for (i in seq_along(private$info)) {

          p <- private$info[i]
          self$set_field(
            field_name = names(p)
            , data = p[[1L]]
          )

        }

      }

      # A Dataset without a label is not usable
      if (is.null(self$get_field(field_name = "label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
335

336
    # Dimension function
Guolin Ke's avatar
Guolin Ke committed
337
    dim = function() {

      # Returns the dimensions of the Dataset: from the handle when one
      # exists, otherwise from an in-memory raw matrix. Stops when neither
      # is available.

      if (lgb.is.null.handle(x = private$handle)) {

        # Not constructed yet: only dense matrices and dgCMatrix objects
        # (the latter requires the Matrix package) can report dimensions
        raw_is_matrix <- is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")
        if (raw_is_matrix) {
          return(dim(private$raw_data))
        }

        # Trying to work with unknown dimensions is not possible
        stop(
          "dim: cannot get dimensions before dataset has been constructed, "
          , "please call lgb.Dataset.construct explicitly"
        )

      }

      # The values returned by .Call() are discarded, so these two integers
      # are presumably filled in place by the native routines
      n_rows <- 0L
      n_cols <- 0L
      .Call(LGBM_DatasetGetNumData_R, private$handle, n_rows)
      .Call(LGBM_DatasetGetNumFeature_R, private$handle, n_cols)

      return(c(n_rows, n_cols))

    },
James Lamb's avatar
James Lamb committed
377

378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
    # Get number of bins for feature
    get_feature_num_bin = function(feature) {
      # Returns the number of bins for one feature. `feature` is shifted by
      # one before being handed to the native routine (1-based in, 0-based out).
      # Requires a constructed Dataset.
      handle_missing <- lgb.is.null.handle(x = private$handle)
      if (handle_missing) {
        stop("Cannot get number of bins in feature before constructing Dataset.")
      }

      # The value returned by .Call() is discarded, so this integer is
      # presumably filled in place by the native routine
      bin_count <- integer(1L)
      .Call(LGBM_DatasetGetFeatureNumBin_R, private$handle, feature - 1L, bin_count)
      return(bin_count)
    },

393
    # Get column names
Guolin Ke's avatar
Guolin Ke committed
394
    get_colnames = function() {

      # Returns the feature names: from the handle when one exists (also
      # refreshing the cached copy), otherwise from an in-memory raw matrix.
      # Stops when neither is available.

      if (lgb.is.null.handle(x = private$handle)) {

        # Not constructed yet: dense matrices and dgCMatrix objects carry
        # their own column names
        raw_is_matrix <- is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")
        if (raw_is_matrix) {
          return(colnames(private$raw_data))
        }

        # Trying to work with unknown formats is not possible
        stop(
          "Dataset$get_colnames(): cannot get column names before dataset has been constructed, please call "
          , "lgb.Dataset.construct() explicitly"
        )

      }

      private$colnames <- .Call(LGBM_DatasetGetFeatureNames_R, private$handle)
      return(private$colnames)

    },
James Lamb's avatar
James Lamb committed
420

421
    # Set column names
Guolin Ke's avatar
Guolin Ke committed
422
    set_colnames = function(colnames) {

      # Stores new feature names and, when a handle exists, forwards them to
      # it as one tab-separated string. NULL or empty input is a no-op.
      # Returns self invisibly.

      # as.character(NULL) is character(0), so one length check covers both
      # the NULL and the empty case
      new_names <- as.character(colnames)
      if (length(new_names) == 0L) {
        return(invisible(self))
      }

      private$colnames <- new_names

      handle_exists <- !lgb.is.null.handle(x = private$handle)
      if (handle_exists) {
        joined_names <- paste0(private$colnames, collapse = "\t")
        .Call(LGBM_DatasetSetFeatureNames_R, private$handle, joined_names)
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
452

453
    get_field = function(field_name) {

      # Returns one metadata field. A value already held on the R side is
      # returned as is; otherwise it is read from the handle and cached.
      # Returns NULL when the handle reports a size of zero for the field.

      # field_name must be a single string naming a known field
      name_ok <- is.character(field_name) && length(field_name) == 1L && field_name %in% .INFO_KEYS()
      if (!name_ok) {
        stop(
          "Dataset$get_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      # Cached on the R side: no native call needed
      if (!is.null(private$info[[field_name]])) {
        return(private$info[[field_name]])
      }

      if (lgb.is.null.handle(x = private$handle)) {
        stop("Cannot perform Dataset$get_field() before constructing Dataset.")
      }

      # Ask for the field's size first. The value returned by .Call() is
      # discarded, so the integer is presumably filled in place.
      field_size <- 0L
      .Call(LGBM_DatasetGetFieldSize_R, private$handle, field_name, field_size)

      if (field_size > 0L) {

        # "group" is read into an integer buffer, every other field into doubles
        buffer <- if (field_name == "group") {
          integer(field_size)
        } else {
          numeric(field_size)
        }

        .Call(LGBM_DatasetGetField_R, private$handle, field_name, buffer)
        private$info[[field_name]] <- buffer

      }

      return(private$info[[field_name]])

    },
James Lamb's avatar
James Lamb committed
504

505
    set_field = function(field_name, data) {

      # Stores one metadata field on the R side and, when a handle exists and
      # the data is non-empty, forwards it to the handle and bumps the version
      # counter. Returns self invisibly.

      # field_name must be a single string naming a known field
      name_ok <- is.character(field_name) && length(field_name) == 1L && field_name %in% .INFO_KEYS()
      if (!name_ok) {
        stop(
          "Dataset$set_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      # "group" is stored as integer, every other field as numeric
      values <- if (field_name == "group") {
        as.integer(data)
      } else {
        as.numeric(data)
      }

      # Store information privately
      private$info[[field_name]] <- values

      should_push <- !lgb.is.null.handle(x = private$handle) && !is.null(values) && length(values) > 0L
      if (should_push) {
        .Call(
          LGBM_DatasetSetField_R
          , private$handle
          , field_name
          , values
          , length(values)
        )
        private$version <- private$version + 1L
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
546

547
    slice = function(idxset) {

      # Returns a new, not-yet-constructed Dataset describing the rows in
      # `idxset` of this one. It carries no raw data; it uses this Dataset
      # as its reference and copies its settings.
      subset_ds <- Dataset$new(
        data = NULL
        , params = private$params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = sort(idxset, decreasing = FALSE)
      )
      return(subset_ds)

    },
James Lamb's avatar
James Lamb committed
563

564
565
566
    # [description] Update Dataset parameters. If it has not been constructed yet,
    #               this operation just happens on the R side (updating private$params).
    #               If it has been constructed, parameters will be updated on the C++ side.
567
    update_params = function(params) {

      # Merges `params` into the Dataset's parameters. An empty `params` is
      # a no-op. Returns self invisibly.
      if (length(params) == 0L) {
        return(invisible(self))
      }
      merged <- utils::modifyList(private$params, params)

      # Not constructed yet: the change only needs to be recorded on the R side
      if (lgb.is.null.handle(x = private$handle)) {
        private$params <- merged
        return(invisible(self))
      }

      # Constructed: let the native library check the old and new parameter
      # strings before accepting the new parameters
      tryCatch({
        .Call(
          LGBM_DatasetUpdateParamChecking_R
          , lgb.params2str(params = private$params)
          , lgb.params2str(params = merged)
        )
        private$params <- merged
      }, error = function(e) {
        # Without raw data the Dataset cannot be rebuilt, so what the user
        # asked for is not achievable: re-raise the error
        if (is.null(private$raw_data)) {
          stop(e)
        }

        # With raw data available, record the new params on the R side and
        # re-set ("deconstruct") the Dataset
        private$params <- merged
        self$finalize()
      })

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
598

599
600
601
602
603
    # [description] Get only Dataset-specific parameters. This is primarily used by
    #               Booster to update its parameters based on the characteristics of
    #               a Dataset. It should not be used by other methods in this class,
    #               since "verbose" is not a Dataset parameter and needs to be passed
    #               through to avoid globally re-setting verbosity.
604
605
606
607
608
609
610
611
612
613
614
    get_params = function() {
      # Returns the subset of this Dataset's parameters whose names appear
      # in .DATASET_PARAMETERS(), as a list keyed by parameter name.
      known_names <- unname(unlist(.DATASET_PARAMETERS()))
      selected <- list()
      for (key in intersect(names(private$params), known_names)) {
        selected[[key]] <- private$params[[key]]
      }
      return(selected)
    },

615
    # Set categorical feature parameter
616
    set_categorical_feature = function(categorical_feature) {

      # Replaces the categorical feature specification and resets the
      # Dataset via finalize(). Returns self invisibly.

      # Same value as before: nothing to change
      unchanged <- identical(private$categorical_feature, categorical_feature)
      if (unchanged) {
        return(invisible(self))
      }

      # The change is refused once the raw data has been freed.
      # NOTE(review): the newline and indentation embedded in this message
      # are kept exactly as before; consider cleaning them up separately.
      if (is.null(private$raw_data)) {
        stop(
          "set_categorical_feature: cannot set categorical feature after freeing raw data,\n          please set "
          , sQuote("free_raw_data = FALSE")
          , " when you construct lgb.Dataset"
        )
      }

      private$categorical_feature <- categorical_feature

      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
637

Guolin Ke's avatar
Guolin Ke committed
638
    set_reference = function(reference) {

      # Points this Dataset at a new reference Dataset, copying the
      # reference's categorical features, column names and predictor, then
      # resets this Dataset via finalize(). Returns self invisibly.

      # Same reference as before: nothing to change
      if (identical(private$reference, reference)) {
        return(invisible(self))
      }

      # Changing the reference removes the Dataset object on the C++ side,
      # so raw_data must still be available to reconstruct it.
      # NOTE(review): the newline and indentation embedded in this message
      # are kept exactly as before; consider cleaning them up separately.
      if (is.null(private$raw_data)) {
        stop(
          "set_reference: cannot set reference after freeing raw data,\n          please set "
          , sQuote("free_raw_data = FALSE")
          , " when you construct lgb.Dataset"
        )
      }

      if (!lgb.is.Dataset(reference)) {
        stop("set_reference: Can only use lgb.Dataset as a reference")
      }

      # Copy the reference's settings onto this Dataset
      ref_private <- reference$.__enclos_env__$private
      self$set_categorical_feature(categorical_feature = ref_private$categorical_feature)
      self$set_colnames(colnames = reference$get_colnames())
      private$set_predictor(predictor = ref_private$predictor)

      private$reference <- reference

      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
669

670
    # Save binary model
Guolin Ke's avatar
Guolin Ke committed
671
    save_binary = function(fname) {

      # Constructs the Dataset if needed, then writes it to `fname` (after
      # path expansion) through LGBM_DatasetSaveBinary_R. Returns self
      # invisibly.
      self$construct()
      target_path <- path.expand(fname)
      .Call(LGBM_DatasetSaveBinary_R, private$handle, target_path)
      return(invisible(self))
    }
James Lamb's avatar
James Lamb committed
682

Guolin Ke's avatar
Guolin Ke committed
683
684
  ),
  private = list(
685
686
687
688
689
    handle = NULL,
    raw_data = NULL,
    params = list(),
    reference = NULL,
    colnames = NULL,
690
    categorical_feature = NULL,
691
692
693
694
    predictor = NULL,
    free_raw_data = TRUE,
    used_indices = NULL,
    info = NULL,
695
    version = 0L,
James Lamb's avatar
James Lamb committed
696

697
    get_handle = function() {

      # Returns the Dataset handle, constructing the Dataset first when no
      # handle exists yet.
      needs_construction <- lgb.is.null.handle(x = private$handle)
      if (needs_construction) {
        self$construct()
      }
      return(private$handle)

    },
James Lamb's avatar
James Lamb committed
706

Guolin Ke's avatar
Guolin Ke committed
707
    set_predictor = function(predictor) {

      # Replaces the stored predictor and resets the Dataset via finalize().
      # NULL is accepted and clears the predictor. Returns self invisibly.

      # Same predictor as before: nothing to change
      if (identical(private$predictor, predictor)) {
        return(invisible(self))
      }

      # The change is refused once the raw data has been freed.
      # NOTE(review): the newline and indentation embedded in this message
      # are kept exactly as before; consider cleaning them up separately.
      if (is.null(private$raw_data)) {
        stop(
          "set_predictor: cannot set predictor after free raw data,\n          please set "
          , sQuote("free_raw_data = FALSE")
          , " when you construct lgb.Dataset"
        )
      }

      # Anything other than NULL must be an lgb.Predictor
      wrong_type <- !is.null(predictor) && !lgb.is.Predictor(predictor)
      if (wrong_type) {
        stop("set_predictor: Can only use lgb.Predictor as predictor")
      }

      private$predictor <- predictor

      self$finalize()
      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
737

Guolin Ke's avatar
Guolin Ke committed
738
739
740
  )
)

#' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#'              or local file (that was created previously by saving an \code{lgb.Dataset}).
#' @inheritParams lgb_shared_dataset_params
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{lgb.Dataset} file
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values.
#' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning
#'                  continuous features into histograms. If you want to apply the same bin boundaries from an existing
#'                  dataset to new \code{data}, pass that existing Dataset to this argument.
#' @param colnames names of columns
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param free_raw_data LightGBM constructs its data format, called a "Dataset", from tabular data.
#'                      By default, that Dataset object on the R side does not keep a copy of the raw data.
#'                      This reduces LightGBM's memory consumption, but it means that the Dataset object
#'                      cannot be changed after it has been constructed. If you'd prefer to be able to
#'                      change the Dataset object after construction, set \code{free_raw_data = FALSE}.
#'
#' @return constructed dataset
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
#' lgb.Dataset.construct(dtrain)
#' }
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        label = NULL,
                        weight = NULL,
                        group = NULL,
                        init_score = NULL) {

  # Public constructor: a thin wrapper around Dataset$new().
  # Every user-facing argument is forwarded unchanged; `predictor` and
  # `used_indices` are not exposed here and are always passed as NULL.
  # The new object is returned invisibly.
  return(
    invisible(Dataset$new(
      data = data
      , params = params
      , reference = reference
      , colnames = colnames
      , categorical_feature = categorical_feature
      , predictor = NULL
      , free_raw_data = free_raw_data
      , used_indices = NULL
      , label = label
      , weight = weight
      , group = group
      , init_score = init_score
    ))
  )

}

#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
#' @inheritParams lgb_shared_dataset_params
#' @param dataset \code{lgb.Dataset} object, training data
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{Dataset} file
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values. If this is an empty list (the default), the validation Dataset
#'               will have the same parameters as the Dataset passed to argument \code{dataset}.
#'
#' @return constructed dataset
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#'
#' # parameters can be changed between the training data and validation set,
#' # for example to account for training data in a text file with a header row
#' # and validation data in a text file without it
#' train_file <- tempfile(pattern = "train_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = train_file
#'   , sep = ","
#'   , col.names = TRUE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' valid_file <- tempfile(pattern = "valid_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = valid_file
#'   , sep = ","
#'   , col.names = FALSE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' dtrain <- lgb.Dataset(
#'   data = train_file
#'   , params = list(has_header = TRUE)
#' )
#' dtrain$construct()
#'
#' dvalid <- lgb.Dataset(
#'   data = valid_file
#'   , params = list(has_header = FALSE)
#' )
#' dvalid$construct()
#' }
#' @export
lgb.Dataset.create.valid <- function(dataset,
                                     data,
                                     label = NULL,
                                     weight = NULL,
                                     group = NULL,
                                     init_score = NULL,
                                     params = list()) {

  # Functional wrapper around the `create_valid()` method of `dataset`.
  # Fails early if `dataset` is not an lgb.Dataset.
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # Create validation dataset; all arguments are forwarded unchanged and
  # the result is returned invisibly
  return(invisible(
    dataset$create_valid(
      data = data
      , label = label
      , weight = weight
      , group = group
      , init_score = init_score
      , params = params
    )
  ))

}

#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
#' @param dataset Object of class \code{lgb.Dataset}
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' }
#' @return constructed dataset
#' @export
lgb.Dataset.construct <- function(dataset) {

  # Functional wrapper around the `construct()` method of `dataset`.
  # Fails early if `dataset` is not an lgb.Dataset.
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }

  # Whatever construct() returns is passed back invisibly
  return(invisible(dataset$construct()))

}

#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
#' @param x Object of class \code{lgb.Dataset}
#'
#' @return a vector of numbers of rows and of columns
#'
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
#' }
#' @rdname dim
#' @export
dim.lgb.Dataset <- function(x) {

  # S3 method for dim(); delegates to the `dim()` method of the object.
  # The class check guards against direct calls with a non-Dataset.
  if (!lgb.is.Dataset(x = x)) {
    stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  return(x$dim())

}

#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}. Returned row names
#'              are always \code{NULL}, and trying to set non-\code{NULL} row names raises an error.
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one (row names) must be \code{NULL}
#'              and the second one is column names
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
#' print(dtrain, verbose = TRUE)
#' }
#' @rdname dimnames.lgb.Dataset
#' @return A list with the dimension names of the dataset
#' @export
dimnames.lgb.Dataset <- function(x) {

  # S3 method for dimnames(). The class check guards against direct calls
  # with a non-Dataset.
  if (!lgb.is.Dataset(x = x)) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # Return dimension names: the row-names slot is always NULL, the
  # column-names slot comes from the object's get_colnames() method
  return(list(NULL, x$get_colnames()))

}

#' @rdname dimnames.lgb.Dataset
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # S3 replacement method for `dimnames(x) <- value`.
  # `value` must be a plain two-element list: list(NULL, <column names>).
  # Returns `x` after calling its set_colnames() method.

  # Check if invalid element list
  if (!identical(class(value), "list") || length(value) != 2L) {
    stop("invalid ", sQuote("value"), " given: must be a list of two elements")
  }

  # Check for unknown row names
  if (!is.null(value[[1L]])) {
    stop("lgb.Dataset does not have rownames")
  }

  # NULL column names are forwarded as-is, skipping the length check below
  if (is.null(value[[2L]])) {

    x$set_colnames(colnames = NULL)
    return(x)

  }

  # Check for unmatching column size
  if (ncol(x) != length(value[[2L]])) {
    stop(
      "can't assign "
      , sQuote(length(value[[2L]]))
      , " colnames to an lgb.Dataset with "
      , sQuote(ncol(x))
      , " columns"
    )
  }

  # Set column names properly, and return
  x$set_colnames(colnames = value[[2L]])
  return(x)

}

#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#'              original \code{lgb.Dataset} object
#' @param dataset Object of class \code{lgb.Dataset}
#' @param idxset an integer vector of indices of rows needed
#' @return constructed sub dataset
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' dsub <- lightgbm::slice(dtrain, seq_len(42L))
#' lgb.Dataset.construct(dsub)
#' labels <- lightgbm::get_field(dsub, "label")
#' }
#' @export
slice <- function(dataset, idxset) {
  # S3 generic: dispatches on the class of `dataset`
  UseMethod("slice")
}

#' @rdname slice
#' @export
slice.lgb.Dataset <- function(dataset, idxset) {

  # S3 method for slice(); delegates to the `slice()` method of `dataset`.
  # The class check guards against direct calls with a non-Dataset.
  if (!lgb.is.Dataset(x = dataset)) {
    stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # The result is returned invisibly
  return(invisible(dataset$slice(idxset = idxset)))

}

#' @name get_field
#' @title Get one attribute of a \code{lgb.Dataset}
#' @description Get one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to get. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @return requested attribute
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
#' }
#' @export
get_field <- function(dataset, field_name) {
  # S3 generic: the method is chosen from the class of `dataset`
  UseMethod("get_field", dataset)
}

#' @rdname get_field
#' @export
get_field.lgb.Dataset <- function(dataset, field_name) {

  # Happy path first: hand the request to the object's own get_field() method
  if (lgb.is.Dataset(x = dataset)) {
    return(dataset$get_field(field_name = field_name))
  }

  # Only reached when the method is called directly on a non-Dataset
  stop("get_field.lgb.Dataset(): input dataset should be an lgb.Dataset object")

}

#' @name set_field
#' @title Set one attribute of a \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to set. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @param data The data for the field. See examples.
#' @return The \code{lgb.Dataset} you passed in.
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
#' }
#' @export
set_field <- function(dataset, field_name, data) {
  # S3 generic: the method is chosen from the class of `dataset`
  UseMethod("set_field", dataset)
}

#' @rdname set_field
#' @export
set_field.lgb.Dataset <- function(dataset, field_name, data) {

  # S3 method for set_field(); delegates to the `set_field()` method of
  # `dataset`. The class check guards against direct calls with a non-Dataset.
  if (!lgb.is.Dataset(x = dataset)) {
    stop("set_field.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # The result is returned invisibly
  return(invisible(dataset$set_field(field_name = field_name, data = data)))

}

#' @name lgb.Dataset.set.categorical
#' @title Set categorical feature of \code{lgb.Dataset}
#' @description Set the categorical features of an \code{lgb.Dataset} object. Use this function
#'              to tell LightGBM which features should be treated as categorical.
#' @param dataset object of class \code{lgb.Dataset}
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @return the dataset you passed in
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
#' lgb.Dataset.set.categorical(dtrain, 1L:2L)
#' }
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  # Functional wrapper around the `set_categorical_feature()` method of
  # `dataset`. Fails early if `dataset` is not an lgb.Dataset.
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # The result is returned invisibly
  return(invisible(dataset$set_categorical_feature(categorical_feature = categorical_feature)))

}

#' @name lgb.Dataset.set.reference
#' @title Set reference of \code{lgb.Dataset}
#' @description If you want to use validation data, you should set reference to training data
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
#'
#' @return the dataset you passed in
#'
#' @examples
#' \donttest{
#' # create training Dataset
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' # create a validation Dataset, using dtrain as a reference
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset(test$data, label = test$label)
#' lgb.Dataset.set.reference(dtest, dtrain)
#' }
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # Functional wrapper around the `set_reference()` method of `dataset`.
  # Only `dataset` is validated here; `reference` is passed through as given.
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")
  }

  # The result is returned invisibly
  return(invisible(dataset$set_reference(reference = reference)))

}

#' @name lgb.Dataset.save
#' @title Save \code{lgb.Dataset} to a binary file
#' @description Please note that \code{init_score} is not saved in binary file.
#'              If you need it, please set it again after loading Dataset.
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname character string with the path of the output file
#'
#' @return the dataset you passed in
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.save(dtrain, tempfile(fileext = ".bin"))
#' }
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # Functional wrapper around the `save_binary()` method of `dataset`.
  # Error messages are prefixed with this function's own name so that
  # failures can be traced to the right call.
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  # NOTE: only character values pass this check; the wording of the
  # message is kept for compatibility with existing callers
  if (!is.character(fname)) {
    stop("lgb.Dataset.save: fname should be a character or a file connection")
  }

  # The result is returned invisibly
  return(invisible(dataset$save_binary(fname = fname)))
}