lgb.Dataset.R 39.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
#' @name lgb_shared_dataset_params
#' @title Shared Dataset parameter docs
#' @description Parameter docs for fields used in \code{lgb.Dataset} construction
#' @param label vector of labels to use as the target variable
#' @param weight numeric vector of sample weights
#' @param init_score initial score is the base prediction lightgbm will boost from
#' @param group used for learning-to-rank tasks. An integer vector describing how to
#'              group rows together as ordered results from the same set of candidate results
#'              to be ranked. For example, if you have a 100-document dataset with
#'              \code{group = c(10, 20, 40, 10, 10, 10)}, that means that you have 6 groups,
#'              where the first 10 records are in the first group, records 11-30 are in the
#'              second group, etc.
#' @details This page contains shared documentation for dataset-related parameters used throughout the package.
#' @keywords internal
NULL

17
18
19
20
21
22
23
24
# [description] List of valid keys for "info" arguments in lgb.Dataset.
#               Wrapped in a function to take advantage of lazy evaluation
#               (so it doesn't matter what order R sources files during installation).
# [return] A character vector of names.
.INFO_KEYS <- function() {
  # Field names accepted by Dataset$get_field() and Dataset$set_field()
  info_keys <- c("label", "weight", "init_score", "group")
  return(info_keys)
}

James Lamb's avatar
James Lamb committed
25
#' @importFrom methods is
James Lamb's avatar
James Lamb committed
26
#' @importFrom R6 R6Class
27
#' @importFrom utils modifyList
James Lamb's avatar
James Lamb committed
28
29
Dataset <- R6::R6Class(

30
  classname = "lgb.Dataset",
31
  cloneable = FALSE,
Guolin Ke's avatar
Guolin Ke committed
32
  public = list(
James Lamb's avatar
James Lamb committed
33

34
    # Initialize will create a starter dataset
Guolin Ke's avatar
Guolin Ke committed
35
    # [description] Record everything needed to build the Dataset later. No handle is
    #               created here; that only happens in construct().
    # [return] NULL, invisibly.
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          label = NULL,
                          weight = NULL,
                          group = NULL,
                          init_score = NULL) {

      # Reject arguments of the wrong class before doing any other work
      reference_ok <- is.null(reference) || .is_Dataset(reference)
      if (!reference_ok) {
        stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset", q = FALSE))
      }
      predictor_ok <- is.null(predictor) || .is_Predictor(predictor)
      if (!predictor_ok) {
        stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor", q = FALSE))
      }

      # Keep only the info fields that were actually supplied, in a fixed order
      supplied <- list(
        label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )
      info <- list()
      for (key in names(supplied)) {
        if (!is.null(supplied[[key]])) {
          info[[key]] <- supplied[[key]]
        }
      }

      # Matrices are stored as "double" (e.g. integer matrices are converted)
      if (is.matrix(data) && storage.mode(data) != "double") {
        storage.mode(data) <- "double"
      }

      # Inputs describing the data itself
      private$raw_data <- data
      private$colnames <- colnames
      private$categorical_feature <- categorical_feature
      private$used_indices <- sort(used_indices, decreasing = FALSE)
      private$info <- info

      # Inputs describing how the Dataset should be built
      private$params <- params
      private$reference <- reference
      private$predictor <- predictor
      private$free_raw_data <- free_raw_data
      private$version <- 0L

      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
94

95
    # [description] Create a new Dataset that uses this Dataset as its reference and
    #               inherits its column names, categorical features, predictor and
    #               free_raw_data setting.
    # [return] The new Dataset, invisibly.
    create_valid = function(data,
                            label = NULL,
                            weight = NULL,
                            group = NULL,
                            init_score = NULL,
                            params = list()) {

      # values passed to this call win over the parameters already stored on this Dataset
      merged_params <- modifyList(private$params, params)

      valid_set <- Dataset$new(
        data = data
        , params = merged_params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = NULL
        , label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )

      return(invisible(valid_set))

    },
James Lamb's avatar
James Lamb committed
124

125
    # Dataset constructor
Guolin Ke's avatar
Guolin Ke committed
126
    # [description] Create the Dataset handle if it does not exist yet: from a file path,
    #               a matrix or a dgCMatrix when no used_indices were given, otherwise as a
    #               subset of the reference Dataset. Afterwards feature names and the stored
    #               info fields (label, weight, ...) are set on the handle.
    # [return] self, invisibly. Raises an error if no label is available after construction.
    construct = function() {

      # Already constructed, nothing to do
      if (!.is_null_handle(x = private$handle)) {
        return(invisible(self))
      }

      # Get feature names from the raw data, if it is of a type that can carry them
      cnames <- NULL
      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        cnames <- colnames(private$raw_data)
      }

      # Explicitly supplied column names take precedence over those found on the raw data
      if (is.null(private$colnames) && !is.null(cnames)) {
        private$colnames <- as.character(cnames)
      }

      # Get categorical feature index
      if (!is.null(private$categorical_feature)) {

        if (is.character(private$categorical_feature)) {

          # Names are translated to 0-based positions; unknown names become NA
          cate_indices <- as.list(match(private$categorical_feature, private$colnames) - 1L)

          if (sum(is.na(cate_indices)) > 0L) {
            # toString() keeps the message readable when several names are unknown
            stop(
              "lgb.Dataset.construct: supplied an unknown feature in categorical_feature: "
              , toString(sQuote(private$categorical_feature[is.na(cate_indices)], q = FALSE))
            )
          }

        } else {

          # The upper bound can only be checked for in-memory data that is not being subset
          data_is_not_filename <- !is.character(private$raw_data)
          if (
            data_is_not_filename
            && !is.null(private$raw_data)
            && is.null(private$used_indices)
            && max(private$categorical_feature) > ncol(private$raw_data)
          ) {
            stop(
              "lgb.Dataset.construct: supplied a too large value in categorical_feature: "
              , max(private$categorical_feature)
              , " but only "
              , ncol(private$raw_data)
              , " features"
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1L)

        }

        # Store indices for categorical features
        private$params$categorical_feature <- cate_indices

      }

      # Generate parameter str
      params_str <- .params2str(params = private$params)

      # Get handle of reference dataset (get_handle() constructs the reference if needed)
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }

      # not subsetting, constructing from raw data
      if (is.null(private$used_indices)) {

        if (is.null(private$raw_data)) {
          stop(paste0(
            "Attempting to create a Dataset without any raw data. "
            , "This can happen if the Dataset's finalizer was called or if this Dataset was saved with saveRDS(). "
            , "To avoid this error in the future, use lgb.Dataset.save() or "
            , "Dataset$save_binary() to save lightgbm Datasets."
          ))
        }

        if (is.character(private$raw_data)) {

          # Raw data is a path to a data file
          handle <- .Call(
            LGBM_DatasetCreateFromFile_R
            , path.expand(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (is.matrix(private$raw_data)) {

          # Raw data is a dense matrix
          handle <- .Call(
            LGBM_DatasetCreateFromMat_R
            , private$raw_data
            , nrow(private$raw_data)
            , ncol(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (methods::is(private$raw_data, "dgCMatrix")) {

          # Raw data is a dgCMatrix (sparse matrix column compressed)
          if (length(private$raw_data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }
          handle <- .Call(
            LGBM_DatasetCreateFromCSC_R
            , private$raw_data@p
            , private$raw_data@i
            , private$raw_data@x
            , length(private$raw_data@p)
            , length(private$raw_data@x)
            , nrow(private$raw_data)
            , params_str
            , ref_handle
          )

        } else {

          # Unknown data type; class() may return several names, so join them
          stop(
            "lgb.Dataset.construct: does not support constructing from "
            , toString(sQuote(class(private$raw_data), q = FALSE))
          )

        }

      } else {

        # A subset can only be taken from a reference Dataset
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        # Construct subset
        handle <- .Call(
          LGBM_DatasetGetSubset_R
          , ref_handle
          , c(private$used_indices)
          , length(private$used_indices)
          , params_str
        )

      }
      if (.is_null_handle(x = handle)) {
        stop("lgb.Dataset.construct: cannot create Dataset handle")
      }

      # Setup class and private type
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Set feature names
      if (!is.null(private$colnames)) {
        self$set_colnames(colnames = private$colnames)
      }

      # Ensure that private$colnames matches the feature names on the C++ side. This line is necessary
      # in cases like constructing from a file or from a matrix with no column names.
      private$colnames <- .Call(
        LGBM_DatasetGetFeatureNames_R
        , private$handle
      )

      # Load init score if requested
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        # Raw scores from the predictor become the initial scores
        init_score <- private$predictor$predict(
          data = private$raw_data
          , rawscore = TRUE
        )

        # No need to transpose, because it is column-major
        init_score <- as.vector(init_score)
        private$info$init_score <- init_score

      }

      # Should we free raw data?
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Push every stored info field to the handle (no iterations when info is empty)
      for (info_name in names(private$info)) {
        self$set_field(
          field_name = info_name
          , data = private$info[[info_name]]
        )
      }

      # A label must be available at this point
      if (is.null(self$get_field(field_name = "label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
339

340
    # Dimension function
Guolin Ke's avatar
Guolin Ke committed
341
    # [description] Dimensions of the Dataset. Uses the handle when the Dataset has been
    #               constructed, otherwise falls back to the raw data when it is a matrix
    #               or a dgCMatrix.
    # [return] Vector of length 2: number of rows, number of columns.
    dim = function() {

      # Not constructed yet: only in-memory matrix-like raw data can answer
      if (.is_null_handle(x = private$handle)) {

        raw_has_dims <- is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")
        if (raw_has_dims) {
          # NOTE: dgCMatrix requires the Matrix package
          return(dim(private$raw_data))
        }

        stop(
          "dim: cannot get dimensions before dataset has been constructed, "
          , "please call lgb.Dataset.construct explicitly"
        )

      }

      # NOTE(review): the results are not assigned from .Call(), so the C routines
      # presumably overwrite these two values in place - confirm against the C sources
      n_rows <- 0L
      n_cols <- 0L
      .Call(
        LGBM_DatasetGetNumData_R
        , private$handle
        , n_rows
      )
      .Call(
        LGBM_DatasetGetNumFeature_R
        , private$handle
        , n_cols
      )

      return(c(n_rows, n_cols))

    },
James Lamb's avatar
James Lamb committed
381

382
383
    # Get number of bins for feature
    # [description] Number of bins of one feature of a constructed Dataset.
    #               'feature' is either a feature name or a 1-based feature position.
    get_feature_num_bin = function(feature) {

      if (.is_null_handle(x = private$handle)) {
        stop("Cannot get number of bins in feature before constructing Dataset.")
      }

      # Translate a feature name into its position(s) among the column names
      feature_idx <- feature
      if (is.character(feature)) {
        feature_idx <- which(private$colnames == feature)
        if (length(feature_idx) == 0L) {
          stop(sprintf("feature '%s' not found", feature))
        }
      }

      # The position is passed 0-based.
      # NOTE(review): bin_count is presumably filled in place by the C routine - confirm
      bin_count <- integer(1L)
      .Call(
        LGBM_DatasetGetFeatureNumBin_R
        , private$handle
        , feature_idx - 1L
        , bin_count
      )

      return(bin_count)
    },

404
    # Get column names
Guolin Ke's avatar
Guolin Ke committed
405
    # [description] Column (feature) names of the Dataset. For a constructed Dataset the
    #               names are read from the handle and cached in private$colnames.
    get_colnames = function() {

      # Not constructed yet: only matrix-like raw data can provide names
      if (.is_null_handle(x = private$handle)) {

        raw_has_names <- is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")
        if (raw_has_names) {
          return(colnames(private$raw_data))
        }

        stop(
          "Dataset$get_colnames(): cannot get column names before dataset has been constructed, please call "
          , "lgb.Dataset.construct() explicitly"
        )

      }

      # Refresh the cached names from the handle before returning them
      private$colnames <- .Call(
        LGBM_DatasetGetFeatureNames_R
        , private$handle
      )

      return(private$colnames)

    },
James Lamb's avatar
James Lamb committed
431

432
    # Set column names
Guolin Ke's avatar
Guolin Ke committed
433
    # [description] Set the column (feature) names. NULL or empty input leaves the Dataset
    #               untouched. Names are also written to the handle when one exists.
    # [return] self, invisibly.
    set_colnames = function(colnames) {

      # as.character(NULL) has length 0, so this covers both NULL and empty input
      new_names <- as.character(colnames)
      if (length(new_names) == 0L) {
        return(invisible(self))
      }

      private$colnames <- new_names

      # Only a constructed Dataset has a handle to update
      if (!.is_null_handle(x = private$handle)) {

        # The names are handed over as one tab-separated string
        tab_separated <- paste(new_names, collapse = "\t")
        .Call(
          LGBM_DatasetSetFeatureNames_R
          , private$handle
          , tab_separated
        )

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
463

464
    # [description] Get one info field ("label", "weight", "init_score" or "group").
    #               A value already stored on the R side is returned directly; otherwise
    #               it is read from the handle and cached.
    # [return] The field's values, or NULL if the field has no values.
    get_field = function(field_name) {

      # field_name has to be a single, known key
      field_is_valid <- is.character(field_name) &&
        length(field_name) == 1L &&
        field_name %in% .INFO_KEYS()
      if (!field_is_valid) {
        stop(
          "Dataset$get_field(): field_name must be one of the following: "
          , toString(sQuote(.INFO_KEYS(), q = FALSE))
        )
      }

      # Value already known on the R side
      cached <- private$info[[field_name]]
      if (!is.null(cached)) {
        return(cached)
      }

      if (.is_null_handle(x = private$handle)) {
        stop("Cannot perform Dataset$get_field() before constructing Dataset.")
      }

      # NOTE(review): field_size is presumably overwritten in place by the C routine - confirm
      field_size <- 0L
      .Call(
        LGBM_DatasetGetFieldSize_R
        , private$handle
        , field_name
        , field_size
      )

      if (field_size > 0L) {

        # "group" is integer-valued, every other field is numeric
        buffer <- if (field_name == "group") {
          integer(field_size)
        } else {
          numeric(field_size)
        }

        .Call(
          LGBM_DatasetGetField_R
          , private$handle
          , field_name
          , buffer
        )

        private$info[[field_name]] <- buffer

      }

      return(private$info[[field_name]])

    },
James Lamb's avatar
James Lamb committed
514

515
    # [description] Set one info field ("label", "weight", "init_score" or "group").
    #               The value is always stored on the R side; it is also written to the
    #               handle when the Dataset is constructed and the value is non-empty.
    # [return] self, invisibly.
    set_field = function(field_name, data) {

      # field_name has to be a single, known key
      field_is_valid <- is.character(field_name) &&
        length(field_name) == 1L &&
        field_name %in% .INFO_KEYS()
      if (!field_is_valid) {
        stop(
          "Dataset$set_field(): field_name must be one of the following: "
          , toString(sQuote(.INFO_KEYS(), q = FALSE))
        )
      }

      # "group" is stored as integer, every other field as numeric
      coerced <- if (field_name == "group") {
        as.integer(data)
      } else {
        as.numeric(data)
      }

      private$info[[field_name]] <- coerced

      if (!.is_null_handle(x = private$handle) && length(coerced) > 0L) {

        .Call(
          LGBM_DatasetSetField_R
          , private$handle
          , field_name
          , coerced
          , length(coerced)
        )

        # every write to the handle bumps the version counter
        private$version <- private$version + 1L

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
556

557
    # [description] Create a new, not yet constructed Dataset that refers to this Dataset
    #               and is restricted to the rows given in idxset (stored in ascending order).
    # [return] The new Dataset.
    slice = function(idxset) {

      # No raw data is passed on: the subset is defined by reference + used_indices
      sliced <- Dataset$new(
        data = NULL
        , params = private$params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = sort(idxset, decreasing = FALSE)
      )

      return(sliced)

    },
James Lamb's avatar
James Lamb committed
573

574
575
576
    # [description] Update Dataset parameters. If it has not been constructed yet,
    #               this operation just happens on the R side (updating private$params).
    #               If it has been constructed, parameters will be updated on the C++ side.
577
    update_params = function(params) {

      # Nothing to merge
      if (length(params) == 0L) {
        return(invisible(self))
      }

      merged_params <- utils::modifyList(private$params, params)

      # Not constructed yet: only the R-side copy has to change
      if (.is_null_handle(x = private$handle)) {
        private$params <- merged_params
        return(invisible(self))
      }

      tryCatch({
        # Raises an error when the change is not accepted for a constructed Dataset
        .Call(
          LGBM_DatasetUpdateParamChecking_R
          , .params2str(params = private$params)
          , .params2str(params = merged_params)
        )
        private$params <- merged_params
      }, error = function(e) {
        # Without raw data the Dataset cannot be rebuilt, so what the user asked
        # for is not achievable and the error is passed on
        if (is.null(private$raw_data)) {
          stop(e)
        }

        # With raw data available: take the new params on the R side and
        # re-set ("deconstruct") the Dataset by dropping its handle
        private$params <- merged_params
        private$finalize()
      })

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
608

609
610
611
612
613
    # [description] Get only Dataset-specific parameters. This is primarily used by
    #               Booster to update its parameters based on the characteristics of
    #               a Dataset. It should not be used by other methods in this class,
    #               since "verbose" is not a Dataset parameter and needs to be passed
    #               through to avoid globally re-setting verbosity.
614
615
616
617
618
619
620
621
622
623
624
    get_params = function() {
      dataset_params <- unname(unlist(.DATASET_PARAMETERS()))
      supplied_keys <- names(private$params)

      # Copy over only those stored parameters whose name is a Dataset parameter
      kept <- list()
      for (key in supplied_keys[supplied_keys %in% dataset_params]) {
        kept[[key]] <- private$params[[key]]
      }
      return(kept)
    },

625
    # Set categorical feature parameter
626
    # [description] Replace the categorical features of this Dataset. Because this drops
    #               the existing handle, it requires that the raw data is still available.
    # [return] self, invisibly.
    set_categorical_feature = function(categorical_feature) {

      # Same value as before: nothing to change
      if (identical(private$categorical_feature, categorical_feature)) {
        return(invisible(self))
      }

      # The handle is dropped below, and without raw data it could not be re-created
      if (is.null(private$raw_data)) {
        stop(
          "set_categorical_feature: cannot set categorical feature after freeing raw data, "
          , "please set "
          , sQuote("free_raw_data = FALSE", q = FALSE)
          , " when you construct lgb.Dataset"
        )
      }

      # Overwrite categorical features
      private$categorical_feature <- categorical_feature

      # Drop the handle so the new setting is used the next time the Dataset is constructed
      private$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
647

Guolin Ke's avatar
Guolin Ke committed
648
    # [description] Use another Dataset as the reference of this one. Categorical features,
    #               column names and predictor are taken over from the reference, and the
    #               existing handle is dropped.
    # [return] self, invisibly.
    set_reference = function(reference) {

      # setting reference to this same Dataset object doesn't require any changes
      if (identical(private$reference, reference)) {
        return(invisible(self))
      }

      # changing the reference removes the Dataset object on the C++ side, so it should only
      # be done if you still have the raw_data available, so that the new Dataset can be reconstructed
      if (is.null(private$raw_data)) {
        stop(
          "set_reference: cannot set reference after freeing raw data, "
          , "please set "
          , sQuote("free_raw_data = FALSE", q = FALSE)
          , " when you construct lgb.Dataset"
        )
      }

      if (!.is_Dataset(reference)) {
        stop("set_reference: Can only use lgb.Dataset as a reference")
      }

      # Take over the settings of the reference
      self$set_categorical_feature(categorical_feature = reference$.__enclos_env__$private$categorical_feature)
      self$set_colnames(colnames = reference$get_colnames())
      private$set_predictor(predictor = reference$.__enclos_env__$private$predictor)

      # Store reference
      private$reference <- reference

      # Drop the handle so the new reference is used the next time the Dataset is constructed
      private$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
679

680
    # Save binary model
Guolin Ke's avatar
Guolin Ke committed
681
    # [description] Construct the Dataset if necessary and save it to the file 'fname'.
    # [return] self, invisibly.
    save_binary = function(fname) {

      # construct() returns immediately when a handle already exists
      self$construct()

      target_path <- path.expand(fname)
      .Call(
        LGBM_DatasetSaveBinary_R
        , private$handle
        , target_path
      )

      return(invisible(self))
    }
James Lamb's avatar
James Lamb committed
692

Guolin Ke's avatar
Guolin Ke committed
693
694
  ),
  private = list(
695
696
697
698
699
    handle = NULL,
    raw_data = NULL,
    params = list(),
    reference = NULL,
    colnames = NULL,
700
    categorical_feature = NULL,
701
702
703
704
    predictor = NULL,
    free_raw_data = TRUE,
    used_indices = NULL,
    info = NULL,
705
    version = 0L,
James Lamb's avatar
James Lamb committed
706

707
708
709
710
711
712
713
714
715
716
    # finalize() will free up the handles
    finalize = function() {
      # Hand the handle to the C-level free routine, then forget it on the R side
      # so that the Dataset counts as "not constructed" again
      .Call(LGBM_DatasetFree_R, private$handle)
      private$handle <- NULL
      return(invisible(NULL))
    },

717
    # [description] Return the Dataset handle, constructing the Dataset first
    #               when no handle exists yet.
    get_handle = function() {

      handle_missing <- .is_null_handle(x = private$handle)
      if (handle_missing) {
        self$construct()
      }

      return(private$handle)

    },
James Lamb's avatar
James Lamb committed
726

Guolin Ke's avatar
Guolin Ke committed
727
    # [description] Replace the predictor of this Dataset. Because this drops the existing
    #               handle, it requires that the raw data is still available.
    # [return] self, invisibly.
    set_predictor = function(predictor) {

      # Same predictor as before: nothing to change
      if (identical(private$predictor, predictor)) {
        return(invisible(self))
      }

      # The handle is dropped below, and without raw data it could not be re-created
      if (is.null(private$raw_data)) {
        stop(
          "set_predictor: cannot set predictor after freeing raw data, "
          , "please set "
          , sQuote("free_raw_data = FALSE", q = FALSE)
          , " when you construct lgb.Dataset"
        )
      }

      # NULL is allowed (removes the predictor); anything else must be a predictor object
      if (!is.null(predictor) && !.is_Predictor(predictor)) {
        stop("set_predictor: Can only use lgb.Predictor as predictor")
      }

      # Store predictor
      private$predictor <- predictor

      # Drop the handle so the new predictor is used the next time the Dataset is constructed
      private$finalize()
      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
757

Guolin Ke's avatar
Guolin Ke committed
758
759
760
  )
)

761
#' @title Construct \code{lgb.Dataset} object
762
763
764
765
766
767
768
#' @description LightGBM does not train on raw data.
#'              It discretizes continuous features into histogram bins, tries to
#'              combine categorical features, and automatically handles missing and
#'              infinite values.
#'
#'              The \code{Dataset} class handles that preprocessing, and holds that
#'              alternative representation of the input data.
769
#' @inheritParams lgb_shared_dataset_params
770
771
772
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{lgb.Dataset} file
773
774
775
776
777
778
779
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values.
#' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning
#'                  continuous features into histograms. If you want to apply the same bin boundaries from an existing
#'                  dataset to new \code{data}, pass that existing Dataset to this argument.
Guolin Ke's avatar
Guolin Ke committed
780
#' @param colnames names of columns
781
782
783
784
785
786
787
788
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param free_raw_data LightGBM constructs its data format, called a "Dataset", from tabular data.
#'                      By default, that Dataset object on the R side does not keep a copy of the raw data.
#'                      This reduces LightGBM's memory consumption, but it means that the Dataset object
#'                      cannot be changed after it has been constructed. If you'd prefer to be able to
#'                      change the Dataset object after construction, set \code{free_raw_data = FALSE}.
James Lamb's avatar
James Lamb committed
789
#'
Guolin Ke's avatar
Guolin Ke committed
790
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
791
#'
Guolin Ke's avatar
Guolin Ke committed
792
#' @examples
793
#' \donttest{
794
795
#' \dontshow{setLGBMthreads(2L)}
#' \dontshow{data.table::setDTthreads(1L)}
796
797
798
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
799
800
801
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
802
#' lgb.Dataset.construct(dtrain)
803
#' }
Guolin Ke's avatar
Guolin Ke committed
804
805
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        label = NULL,
                        weight = NULL,
                        group = NULL,
                        init_score = NULL) {

  # Every user-facing argument is forwarded unchanged to the R6 constructor.
  # 'predictor' and 'used_indices' are not exposed here and are always passed as NULL.
  new_dataset <- Dataset$new(
    data = data
    , params = params
    , reference = reference
    , colnames = colnames
    , categorical_feature = categorical_feature
    , predictor = NULL
    , free_raw_data = free_raw_data
    , used_indices = NULL
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
  )

  return(invisible(new_dataset))

}

835
836
837
#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
838
#' @inheritParams lgb_shared_dataset_params
Guolin Ke's avatar
Guolin Ke committed
839
#' @param dataset \code{lgb.Dataset} object, training data
840
841
842
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{Dataset} file
843
844
845
846
847
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values. If this is an empty list (the default), the validation Dataset
#'               will have the same parameters as the Dataset passed to argument \code{dataset}.
James Lamb's avatar
James Lamb committed
848
#'
Guolin Ke's avatar
Guolin Ke committed
849
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
850
#'
Guolin Ke's avatar
Guolin Ke committed
851
#' @examples
852
#' \donttest{
853
854
#' \dontshow{setLGBMthreads(2L)}
#' \dontshow{data.table::setDTthreads(1L)}
855
856
857
858
859
860
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
#'
#' # parameters can be changed between the training data and validation set,
#' # for example to account for training data in a text file with a header row
#' # and validation data in a text file without it
#' train_file <- tempfile(pattern = "train_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = train_file
#'   , sep = ","
#'   , col.names = TRUE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' valid_file <- tempfile(pattern = "valid_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = valid_file
#'   , sep = ","
#'   , col.names = FALSE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' dtrain <- lgb.Dataset(
#'   data = train_file
#'   , params = list(has_header = TRUE)
#' )
#' dtrain$construct()
#'
#' dvalid <- lgb.Dataset(
#'   data = valid_file
#'   , params = list(has_header = FALSE)
#' )
#' dvalid$construct()
896
#' }
Guolin Ke's avatar
Guolin Ke committed
897
#' @export
898
899
900
901
902
903
lgb.Dataset.create.valid <- function(dataset,
                                     data,
                                     label = NULL,
                                     weight = NULL,
                                     group = NULL,
                                     init_score = NULL,
                                     params = list()) {

  # The training-side object has to be an lgb.Dataset
  is_dataset <- .is_Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # Delegate to the create_valid() method of the training Dataset
  valid_dataset <- dataset$create_valid(
    data = data
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
    , params = params
  )

  return(invisible(valid_dataset))

}
Guolin Ke's avatar
Guolin Ke committed
923

924
925
926
#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
Guolin Ke's avatar
Guolin Ke committed
927
#' @param dataset Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
928
#'
Guolin Ke's avatar
Guolin Ke committed
929
#' @examples
930
#' \donttest{
931
932
#' \dontshow{setLGBMthreads(2L)}
#' \dontshow{data.table::setDTthreads(1L)}
933
934
935
936
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
937
#' }
938
#' @return constructed dataset
Guolin Ke's avatar
Guolin Ke committed
939
940
#' @export
lgb.Dataset.construct <- function(dataset) {

  # Reject anything that is not an lgb.Dataset
  is_dataset <- .is_Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }

  # Trigger construction and pass on whatever construct() returns, invisibly
  constructed <- dataset$construct()
  return(invisible(constructed))

}

950
951
#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
Guolin Ke's avatar
Guolin Ke committed
952
#' @param x Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
953
#'
Guolin Ke's avatar
Guolin Ke committed
954
#' @return a vector of numbers of rows and of columns
James Lamb's avatar
James Lamb committed
955
#'
Guolin Ke's avatar
Guolin Ke committed
956
957
958
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
James Lamb's avatar
James Lamb committed
959
#'
Guolin Ke's avatar
Guolin Ke committed
960
#' @examples
961
#' \donttest{
962
963
#' \dontshow{setLGBMthreads(2L)}
#' \dontshow{data.table::setDTthreads(1L)}
964
965
966
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
967
#'
968
969
970
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
971
#' }
Guolin Ke's avatar
Guolin Ke committed
972
973
#' @rdname dim
#' @export
974
dim.lgb.Dataset <- function(x) {

  # Reject anything that is not an lgb.Dataset
  not_a_dataset <- !.is_Dataset(x = x)
  if (not_a_dataset) {
    stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # The Dataset's own dim() method provides the result
  return(x$dim())

}

984
985
986
#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}. Returned row names
#'              are always \code{NULL}, and trying to set non-\code{NULL} row names raises an error.
Guolin Ke's avatar
Guolin Ke committed
987
988
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one (row names) must be \code{NULL},
#'              and the second one is column names
Guolin Ke's avatar
Guolin Ke committed
990
991
992
993
994
995
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
996
#' \donttest{
997
998
#' \dontshow{setLGBMthreads(2L)}
#' \dontshow{data.table::setDTthreads(1L)}
999
1000
1001
1002
1003
1004
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
1005
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
1006
#' print(dtrain, verbose = TRUE)
1007
#' }
Guolin Ke's avatar
Guolin Ke committed
1008
#' @rdname dimnames.lgb.Dataset
1009
#' @return A list with the dimension names of the dataset
Guolin Ke's avatar
Guolin Ke committed
1010
1011
#' @export
dimnames.lgb.Dataset <- function(x) {

  # Reject anything that is not an lgb.Dataset
  is_dataset <- .is_Dataset(x = x)
  if (!is_dataset) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # Row names are always reported as NULL; only column names come from the Dataset
  column_names <- x$get_colnames()
  return(list(NULL, column_names))

}

#' @rdname dimnames.lgb.Dataset
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # 'value' has to be a plain list holding exactly (row names, column names)
  is_two_element_list <- identical(class(value), "list") && length(value) == 2L
  if (!is_two_element_list) {
    stop("invalid ", sQuote("value", q = FALSE), " given: must be a list of two elements")
  }

  # Row names cannot be set: the first element has to be NULL
  new_rownames <- value[[1L]]
  if (!is.null(new_rownames)) {
    stop("lgb.Dataset does not have rownames")
  }

  # NULL column names skip the length check; anything else must match the number of columns
  new_colnames <- value[[2L]]
  if (!is.null(new_colnames)) {
    n_expected <- ncol(x)
    n_given <- length(new_colnames)
    if (n_expected != n_given) {
      stop(
        "can't assign "
        , sQuote(n_given, q = FALSE)
        , " colnames to an lgb.Dataset with "
        , sQuote(n_expected, q = FALSE)
        , " columns"
      )
    }
  }

  # Store the column names (possibly NULL) on the Dataset and return it
  x$set_colnames(colnames = new_colnames)
  return(x)

}

1060
1061
1062
#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#'              original \code{lgb.Dataset} object
James Lamb's avatar
James Lamb committed
1063
1064
1065
#'
#'              \emph{Renamed from} \code{slice()} \emph{in 4.4.0}
#'
Nikita Titov's avatar
Nikita Titov committed
1066
#' @param dataset Object of class \code{lgb.Dataset}
1067
#' @param idxset an integer vector of indices of rows needed
Guolin Ke's avatar
Guolin Ke committed
1068
#' @return constructed sub dataset
James Lamb's avatar
James Lamb committed
1069
#'
Guolin Ke's avatar
Guolin Ke committed
1070
#' @examples
1071
#' \donttest{
1072
1073
#' \dontshow{setLGBMthreads(2L)}
#' \dontshow{data.table::setDTthreads(1L)}
1074
1075
1076
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
1077
#'
1078
#' dsub <- lgb.slice.Dataset(dtrain, seq_len(42L))
1079
#' lgb.Dataset.construct(dsub)
1080
#' labels <- lightgbm::get_field(dsub, "label")
1081
#' }
Guolin Ke's avatar
Guolin Ke committed
1082
#' @export
1083
lgb.slice.Dataset <- function(dataset, idxset) {

  # Reject anything that is not an lgb.Dataset
  is_dataset <- .is_Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.slice.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's slice() method and return its result invisibly
  subset_dataset <- dataset$slice(idxset = idxset)
  return(invisible(subset_dataset))

}

1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
#' @name get_field
#' @title Get one attribute of a \code{lgb.Dataset}
#' @description Get one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to get. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: numeric vector of sample weights ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @return requested attribute
#'
#' @examples
#' \donttest{
1112
1113
#' \dontshow{setLGBMthreads(2L)}
#' \dontshow{data.table::setDTthreads(1L)}
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
#' }
#' @export
get_field <- function(dataset, field_name) {
  # S3 generic: the method is chosen from the class of 'dataset'
  UseMethod("get_field", dataset)
}

#' @rdname get_field
#' @export
get_field.lgb.Dataset <- function(dataset, field_name) {

  # Reject anything that is not an lgb.Dataset
  not_a_dataset <- !.is_Dataset(x = dataset)
  if (not_a_dataset) {
    stop("get_field.lgb.Dataset(): input dataset should be an lgb.Dataset object")
  }

  # The lookup itself is done by the Dataset's get_field() method
  return(dataset$get_field(field_name = field_name))

}

#' @name set_field
#' @title Set one attribute of a \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to set. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: numeric vector of sample weights ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @param data The data for the field. See examples.
#' @return The \code{lgb.Dataset} you passed in.
#'
#' @examples
#' \donttest{
1163
1164
#' \dontshow{setLGBMthreads(2L)}
#' \dontshow{data.table::setDTthreads(1L)}
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
#' }
#' @export
set_field <- function(dataset, field_name, data) {
  # S3 generic: the method is chosen from the class of 'dataset'
  UseMethod("set_field", dataset)
}

#' @rdname set_field
#' @export
set_field.lgb.Dataset <- function(dataset, field_name, data) {

  # Reject anything that is not an lgb.Dataset
  is_dataset <- .is_Dataset(x = dataset)
  if (!is_dataset) {
    stop("set_field.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's set_field() method and return its result invisibly
  updated <- dataset$set_field(field_name = field_name, data = data)
  return(invisible(updated))

}

1192
1193
1194
1195
#' @name lgb.Dataset.set.categorical
#' @title Set categorical feature of \code{lgb.Dataset}
#' @description Set the categorical features of an \code{lgb.Dataset} object. Use this function
#'              to tell LightGBM which features should be treated as categorical.
1196
#' @param dataset object of class \code{lgb.Dataset}
1197
1198
1199
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
1200
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1201
#'
1202
#' @examples
1203
#' \donttest{
1204
1205
#' \dontshow{setLGBMthreads(2L)}
#' \dontshow{data.table::setDTthreads(1L)}
1206
1207
1208
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1209
1210
1211
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
1212
#' lgb.Dataset.set.categorical(dtrain, 1L:2L)
1213
#' }
1214
1215
1216
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  # Reject anything that is not an lgb.Dataset
  is_dataset <- .is_Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's set_categorical_feature() method and return its result invisibly
  updated <- dataset$set_categorical_feature(categorical_feature = categorical_feature)
  return(invisible(updated))

}

1226
1227
1228
#' @name lgb.Dataset.set.reference
#' @title Set reference of \code{lgb.Dataset}
#' @description If you want to use validation data, you should set reference to training data
Guolin Ke's avatar
Guolin Ke committed
1229
1230
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
1231
#'
1232
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1233
#'
Guolin Ke's avatar
Guolin Ke committed
1234
#' @examples
1235
#' \donttest{
1236
1237
#' \dontshow{setLGBMthreads(2L)}
#' \dontshow{data.table::setDTthreads(1L)}
1238
#' # create training Dataset
1239
1240
1241
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1242
1243
#'
#' # create a validation Dataset, using dtrain as a reference
1244
1245
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
1246
#' dtest <- lgb.Dataset(test$data, label = test$label)
1247
#' lgb.Dataset.set.reference(dtest, dtrain)
1248
#' }
Guolin Ke's avatar
Guolin Ke committed
1249
1250
1251
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # Reject anything that is not an lgb.Dataset
  is_dataset <- .is_Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's set_reference() method and return its result invisibly
  updated <- dataset$set_reference(reference = reference)
  return(invisible(updated))

}

1260
1261
1262
1263
#' @name lgb.Dataset.save
#' @title Save \code{lgb.Dataset} to a binary file
#' @description Please note that \code{init_score} is not saved in binary file.
#'              If you need it, please set it again after loading Dataset.
Guolin Ke's avatar
Guolin Ke committed
1264
1265
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname path of the output file (a character string)
James Lamb's avatar
James Lamb committed
1266
#'
1267
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1268
#'
Guolin Ke's avatar
Guolin Ke committed
1269
#' @examples
1270
#' \donttest{
1271
1272
#' \dontshow{setLGBMthreads(2L)}
#' \dontshow{data.table::setDTthreads(1L)}
1273
1274
1275
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1276
#' lgb.Dataset.save(dtrain, tempfile(fileext = ".bin"))
1277
#' }
Guolin Ke's avatar
Guolin Ke committed
1278
1279
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # Reject anything that is not an lgb.Dataset
  is_dataset <- .is_Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  # Only character input gets past this check
  fname_is_character <- is.character(fname)
  if (!fname_is_character) {
    stop("lgb.Dataset.save: fname should be a character or a file connection")
  }

  # Delegate to the Dataset's save_binary() method and return its result invisibly
  saved <- dataset$save_binary(fname = fname)
  return(invisible(saved))
}