lgb.Dataset.R 38 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#' @name lgb_shared_dataset_params
#' @title Shared Dataset parameter docs
#' @description Parameter docs for fields used in \code{lgb.Dataset} construction
#' @param label vector of labels to use as the target variable
#' @param weight numeric vector of sample weights
#' @param init_score initial score is the base prediction lightgbm will boost from
#' @param group used for learning-to-rank tasks. An integer vector describing how to
#'              group rows together as ordered results from the same set of candidate results
#'              to be ranked. For example, if you have a 100-document dataset with
#'              \code{group = c(10, 20, 40, 10, 10, 10)}, that means that you have 6 groups,
#'              where the first 10 records are in the first group, records 11-30 are in the
#'              second group, etc.
#' @keywords internal
NULL

16
17
18
19
20
21
22
23
# [description] Names accepted as "info" fields of an lgb.Dataset
#               (the field_name values allowed by Dataset$get_field() and Dataset$set_field()).
#               Exposed as a function instead of a constant so the value is only evaluated
#               when called, which makes it independent of the order in which R sources
#               the package's files during installation.
# [return] A character vector with the valid field names.
.INFO_KEYS <- function() {
  valid_keys <- c("label", "weight", "init_score", "group")
  return(valid_keys)
}

James Lamb's avatar
James Lamb committed
24
#' @importFrom methods is
James Lamb's avatar
James Lamb committed
25
#' @importFrom R6 R6Class
26
#' @importFrom utils modifyList
James Lamb's avatar
James Lamb committed
27
28
Dataset <- R6::R6Class(

29
  classname = "lgb.Dataset",
30
  cloneable = FALSE,
Guolin Ke's avatar
Guolin Ke committed
31
  public = list(
James Lamb's avatar
James Lamb committed
32

33
    # [description] Free the Dataset handle through LGBM_DatasetFree_R and forget it.
    #               Besides acting as the R6 finalizer, other methods of this class call it
    #               to "deconstruct" the Dataset so that it gets re-created on next use.
    # [return] NULL, invisibly.
    finalize = function() {
      .Call(
        LGBM_DatasetFree_R
        , private$handle
      )
      private$handle <- NULL
      return(invisible(NULL))
    },
James Lamb's avatar
James Lamb committed
42

43
    # [description] Store everything needed to build the Dataset later. Nothing is created
    #               on the C++ side here; that happens in construct().
    # [param] data raw data; stored as-is, except that a matrix is converted to storage
    #              mode "double"
    # [param] params list of parameters
    # [param] reference NULL or an lgb.Dataset
    # [param] colnames NULL or feature names
    # [param] categorical_feature NULL, or names / indices of categorical features
    # [param] predictor NULL or an lgb.Predictor
    # [param] free_raw_data whether construct() should drop the raw data afterwards
    # [param] used_indices NULL or row indices (stored sorted in increasing order)
    # [param] label, weight, group, init_score optional info fields; only those that are
    #         not NULL are remembered
    # [return] NULL, invisibly.
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          label = NULL,
                          weight = NULL,
                          group = NULL,
                          init_score = NULL) {

      # validate inputs early to avoid unnecessary computation
      if (!(is.null(reference) || lgb.is.Dataset(reference))) {
        stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset"))
      }
      if (!(is.null(predictor) || lgb.is.Predictor(predictor))) {
        stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
      }

      # Only keep the info fields that were actually supplied
      info <- list()
      if (!is.null(label)) {
        info[["label"]] <- label
      }
      if (!is.null(weight)) {
        info[["weight"]] <- weight
      }
      if (!is.null(group)) {
        info[["group"]] <- group
      }
      if (!is.null(init_score)) {
        info[["init_score"]] <- init_score
      }

      # Dense matrices are always stored as "double" (e.g. integer matrices are converted)
      if (is.matrix(data)) {
        if (storage.mode(data) != "double") {
          storage.mode(data) <- "double"
        }
      }

      # Setup private attributes
      private$raw_data <- data
      private$params <- params
      private$reference <- reference
      private$colnames <- colnames

      private$categorical_feature <- categorical_feature
      private$predictor <- predictor
      private$free_raw_data <- free_raw_data
      private$used_indices <- sort(used_indices, decreasing = FALSE)
      private$info <- info
      private$version <- 0L

      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
103

104
    # [description] Create a new Dataset that uses this Dataset as its reference and
    #               inherits its column names, categorical features, predictor and
    #               free_raw_data setting.
    # [param] data raw data for the new Dataset
    # [param] label, weight, group, init_score info fields for the new Dataset
    # [param] params parameters for the new Dataset; they take precedence over the
    #                parameters already stored on this Dataset
    # [return] the new Dataset, invisibly.
    create_valid = function(data,
                            label = NULL,
                            weight = NULL,
                            group = NULL,
                            init_score = NULL,
                            params = list()) {

      # the Dataset's existing parameters should be overwritten by any passed in to this call
      params <- utils::modifyList(private$params, params)

      # Create new dataset
      ret <- Dataset$new(
        data = data
        , params = params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = NULL
        , label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )

      return(invisible(ret))

    },
James Lamb's avatar
James Lamb committed
133

134
    # [description] Create the Dataset on the C++ side, unless it already exists, from what
    #               this object was initialized with: a file path, a matrix, a dgCMatrix,
    #               or (when used_indices is set) a subset of the reference Dataset.
    #               Afterwards the column names, the initial scores produced by the
    #               predictor (if any) and all stored info fields are set on the new handle.
    #               Raises an error if the raw data is missing or of an unsupported type,
    #               if a categorical feature cannot be resolved, if no handle could be
    #               created, or if no label is available once construction is done.
    # [return] self, invisibly.
    construct = function() {

      # Nothing to do if the Dataset has already been constructed
      if (!lgb.is.null.handle(x = private$handle)) {
        return(invisible(self))
      }

      raw_data_has_dims <- is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")

      # Get feature names
      cnames <- NULL
      if (raw_data_has_dims) {
        cnames <- colnames(private$raw_data)
      }

      # set feature names if they do not exist
      if (is.null(private$colnames) && !is.null(cnames)) {
        private$colnames <- as.character(cnames)
      }

      # Get categorical feature index
      if (!is.null(private$categorical_feature)) {

        if (is.character(private$categorical_feature)) {

          # Features given by name: look them up in the known column names
          cate_indices <- as.list(match(private$categorical_feature, private$colnames) - 1L)

          # Names that do not match any column come back as NA
          unknown_features <- is.na(cate_indices)
          if (sum(unknown_features) > 0L) {
            stop(
              "lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
              , paste0(sQuote(private$categorical_feature[unknown_features]), collapse = ", ")
            )
          }

        } else {

          # Features given by index: validate against the number of features.
          # Prefer the actual width of the raw data, since a matrix without column names
          # has no names to count. Fall back to the known column names otherwise.
          # If neither is available the number of features cannot be determined here
          # and the check is skipped.
          num_features <- NULL
          if (raw_data_has_dims) {
            num_features <- ncol(private$raw_data)
          } else if (!is.null(private$colnames)) {
            num_features <- length(private$colnames)
          }

          if (!is.null(num_features) && max(private$categorical_feature) > num_features) {
            stop(
              "lgb.self.get.handle: supplied a too large value in categorical_feature: "
              , max(private$categorical_feature)
              , " but only "
              , num_features
              , " features"
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1L)

        }

        # Store indices for categorical features
        private$params$categorical_feature <- cate_indices

      }

      # Generate parameter str
      params_str <- lgb.params2str(params = private$params)

      # Get handle of reference dataset (get_handle() constructs the reference if needed)
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }

      # not subsetting, constructing from raw data
      if (is.null(private$used_indices)) {

        if (is.null(private$raw_data)) {
          stop(paste0(
            "Attempting to create a Dataset without any raw data. "
            , "This can happen if you have called Dataset$finalize() or if this Dataset was saved with saveRDS(). "
            , "To avoid this error in the future, use lgb.Dataset.save() or "
            , "Dataset$save_binary() to save lightgbm Datasets."
          ))
        }

        if (is.character(private$raw_data)) {

          # Are we using a data file?
          handle <- .Call(
            LGBM_DatasetCreateFromFile_R
            , path.expand(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (is.matrix(private$raw_data)) {

          # Are we using a matrix?
          handle <- .Call(
            LGBM_DatasetCreateFromMat_R
            , private$raw_data
            , nrow(private$raw_data)
            , ncol(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (methods::is(private$raw_data, "dgCMatrix")) {

          # Are we using a dgCMatrix (sparse matrix column compressed)
          if (length(private$raw_data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }
          handle <- .Call(
            LGBM_DatasetCreateFromCSC_R
            , private$raw_data@p
            , private$raw_data@i
            , private$raw_data@x
            , length(private$raw_data@p)
            , length(private$raw_data@x)
            , nrow(private$raw_data)
            , params_str
            , ref_handle
          )

        } else {

          # Unknown data type
          stop(
            "lgb.Dataset.construct: does not support constructing from "
            , sQuote(class(private$raw_data))
          )

        }

      } else {

        # A subset can only be taken from a reference Dataset
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        # Construct subset
        handle <- .Call(
          LGBM_DatasetGetSubset_R
          , ref_handle
          , c(private$used_indices) # Adding c() fixes issue in R v3.5
          , length(private$used_indices)
          , params_str
        )

      }
      if (lgb.is.null.handle(x = handle)) {
        stop("lgb.Dataset.construct: cannot create Dataset handle")
      }

      # Setup class and private type
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Set feature names
      if (!is.null(private$colnames)) {
        self$set_colnames(colnames = private$colnames)
      }

      # Load init score if requested (not done for subsets)
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        # Setup initial scores
        init_score <- private$predictor$predict(
          data = private$raw_data
          , rawscore = TRUE
          , reshape = TRUE
        )

        # No need to transpose, since the result is col_major
        init_score <- as.vector(init_score)
        private$info$init_score <- init_score

      }

      # Should we free raw data?
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Push every stored info field to the freshly created handle
      for (i in seq_along(private$info)) {

        p <- private$info[i]
        self$set_field(
          field_name = names(p)
          , data = p[[1L]]
        )

      }

      # Get label information existence
      if (is.null(self$get_field(field_name = "label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
336

337
    # [description] Dimensions of the Dataset. For a constructed Dataset they are obtained
    #               through LGBM_DatasetGetNumData_R / LGBM_DatasetGetNumFeature_R; otherwise
    #               they are taken from the raw data if that is a matrix or a dgCMatrix.
    #               Raises an error when neither is possible.
    # [return] vector of the form c(number of rows, number of columns)
    dim = function() {

      # Check for handle
      if (!lgb.is.null.handle(x = private$handle)) {

        num_row <- 0L
        num_col <- 0L

        # The return values of the calls are not used: the native routines are expected
        # to write their results into num_row / num_col directly
        .Call(
          LGBM_DatasetGetNumData_R
          , private$handle
          , num_row
        )
        .Call(
          LGBM_DatasetGetNumFeature_R
          , private$handle
          , num_col
        )
        return(
          c(num_row, num_col)
        )

      } else if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {

        # Not constructed yet, but the raw data knows its own dimensions
        # NOTE: requires Matrix package
        return(dim(private$raw_data))

      } else {

        # Trying to work with unknown dimensions is not possible
        stop(
          "dim: cannot get dimensions before dataset has been constructed, "
          , "please call lgb.Dataset.construct explicitly"
        )

      }

    },
James Lamb's avatar
James Lamb committed
378

379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
    # [description] Get the number of bins of one feature. Only possible once the Dataset
    #               has been constructed; raises an error otherwise.
    # [param] feature index of the feature; 1 is subtracted before it is handed to the
    #                 native routine
    # [return] the value obtained through LGBM_DatasetGetFeatureNumBin_R
    get_feature_num_bin = function(feature) {
      dataset_not_constructed <- lgb.is.null.handle(x = private$handle)
      if (dataset_not_constructed) {
        stop("Cannot get number of bins in feature before constructing Dataset.")
      }
      shifted_feature <- feature - 1L
      bin_count <- integer(1L)
      .Call(
        LGBM_DatasetGetFeatureNumBin_R
        , private$handle
        , shifted_feature
        , bin_count
      )
      return(bin_count)
    },

394
    # [description] Get the feature names. For a constructed Dataset they are read through
    #               LGBM_DatasetGetFeatureNames_R (and cached in private$colnames); otherwise
    #               they are taken from the raw data if that is a matrix or a dgCMatrix.
    #               Raises an error when neither is possible.
    # [return] the feature names (NULL if unconstructed raw data has no column names)
    get_colnames = function() {

      # Check for handle
      if (!lgb.is.null.handle(x = private$handle)) {
        private$colnames <- .Call(
          LGBM_DatasetGetFeatureNames_R
          , private$handle
        )
        return(private$colnames)

      } else if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {

        # Not constructed yet, but the raw data carries its own column names
        return(colnames(private$raw_data))

      } else {

        # Trying to work with unknown formats is not possible
        stop(
          "Dataset$get_colnames(): cannot get column names before dataset has been constructed, please call "
          , "lgb.Dataset.construct() explicitly"
        )

      }

    },
James Lamb's avatar
James Lamb committed
421

422
    # [description] Set the feature names. The names are always remembered on the R side;
    #               if the Dataset has been constructed they are also passed on through
    #               LGBM_DatasetSetFeatureNames_R. NULL or empty input changes nothing.
    # [param] colnames new feature names (coerced to character)
    # [return] self, invisibly.
    set_colnames = function(colnames) {

      # Check column names non-existence
      if (is.null(colnames)) {
        return(invisible(self))
      }

      # Check empty column names
      colnames <- as.character(colnames)
      if (length(colnames) == 0L) {
        return(invisible(self))
      }

      # Write column names
      private$colnames <- colnames
      if (!lgb.is.null.handle(x = private$handle)) {

        # The native routine receives all names as one tab-separated string
        merged_name <- paste0(private$colnames, collapse = "\t")
        .Call(
          LGBM_DatasetSetFeatureNames_R
          , private$handle
          , merged_name
        )

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
453

454
    # [description] Get one of the info fields listed by .INFO_KEYS(). A value already
    #               known on the R side is returned directly; otherwise it is fetched from
    #               the constructed Dataset and cached in private$info.
    #               Raises an error for an invalid field_name, or if the value is not known
    #               on the R side and the Dataset has not been constructed yet.
    # [param] field_name a single string, one of .INFO_KEYS()
    # [return] the field's value; NULL if the constructed Dataset holds no data for it
    get_field = function(field_name) {

      # Check if attribute key is in the known attribute list
      if (!is.character(field_name) || length(field_name) != 1L || !field_name %in% .INFO_KEYS()) {
        stop(
          "Dataset$get_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      # Only ask the C++ side if the value is not known on the R side yet
      if (is.null(private$info[[field_name]])) {

        if (lgb.is.null.handle(x = private$handle)) {
          stop("Cannot perform Dataset$get_field() before constructing Dataset.")
        }

        # Get field size of info; the return value of the call is not used, the native
        # routine is expected to write the size into info_len directly
        info_len <- 0L
        .Call(
          LGBM_DatasetGetFieldSize_R
          , private$handle
          , field_name
          , info_len
        )

        if (info_len > 0L) {

          # Buffer for the native routine to fill: "group" is integer, everything else numeric
          ret <- if (field_name == "group") {
            integer(info_len)
          } else {
            numeric(info_len)
          }

          .Call(
            LGBM_DatasetGetField_R
            , private$handle
            , field_name
            , ret
          )

          private$info[[field_name]] <- ret

        }
      }

      return(private$info[[field_name]])

    },
James Lamb's avatar
James Lamb committed
505

506
    # [description] Set one of the info fields listed by .INFO_KEYS(). The value is always
    #               remembered on the R side; if the Dataset has been constructed and the
    #               value is non-empty it is also passed on through LGBM_DatasetSetField_R,
    #               and private$version is incremented.
    #               Raises an error for an invalid field_name.
    # [param] field_name a single string, one of .INFO_KEYS()
    # [param] data new value; coerced to integer for "group" and to numeric otherwise
    # [return] self, invisibly.
    set_field = function(field_name, data) {

      # Check if attribute key is in the known attribute list
      if (!is.character(field_name) || length(field_name) != 1L || !field_name %in% .INFO_KEYS()) {
        stop(
          "Dataset$set_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      # Check for type of information
      data <- if (field_name == "group") {
        as.integer(data)
      } else {
        as.numeric(data)
      }

      # Store information privately
      private$info[[field_name]] <- data

      # After the coercion above data is never NULL, so only its length has to be checked
      if (!lgb.is.null.handle(x = private$handle) && length(data) > 0L) {

        .Call(
          LGBM_DatasetSetField_R
          , private$handle
          , field_name
          , data
          , length(data)
        )

        private$version <- private$version + 1L

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
547

548
    # [description] Create a new (not yet constructed) Dataset describing a subset of the
    #               rows of this Dataset. The new Dataset has no raw data of its own; it
    #               uses this Dataset as its reference and inherits its parameters, column
    #               names, categorical features, predictor and free_raw_data setting.
    # [param] idxset row indices to keep (passed on sorted in increasing order)
    # [return] the new Dataset
    slice = function(idxset) {

      return(
        Dataset$new(
          data = NULL
          , params = private$params
          , reference = self
          , colnames = private$colnames
          , categorical_feature = private$categorical_feature
          , predictor = private$predictor
          , free_raw_data = private$free_raw_data
          , used_indices = sort(idxset, decreasing = FALSE)
        )
      )

    },
James Lamb's avatar
James Lamb committed
564

565
566
567
    # [description] Update Dataset parameters. If it has not been constructed yet,
    #               this operation just happens on the R side (updating private$params).
    #               If it has been constructed, parameters will be updated on the C++ side.
    #               If that update is rejected and the raw data is still available, the
    #               new parameters are stored and the Dataset is "deconstructed" instead;
    #               without raw data the error is re-raised.
    # [param] params list of parameters to add or overwrite; an empty list is a no-op
    # [return] self, invisibly.
    update_params = function(params) {
      if (length(params) == 0L) {
        return(invisible(self))
      }
      new_params <- utils::modifyList(private$params, params)
      if (lgb.is.null.handle(x = private$handle)) {
        private$params <- new_params
      } else {
        tryCatch({
          .Call(
            LGBM_DatasetUpdateParamChecking_R
            , lgb.params2str(params = private$params)
            , lgb.params2str(params = new_params)
          )
          private$params <- new_params
        }, error = function(e) {
          # If updating failed but raw data is not available, raise an error because
          # achieving what the user asked for is not possible
          if (is.null(private$raw_data)) {
            stop(e)
          }

          # If updating failed but raw data is available, modify the params
          # on the R side and re-set ("deconstruct") the Dataset
          private$params <- new_params
          self$finalize()
        })
      }
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
599

600
601
602
603
604
    # [description] Get only Dataset-specific parameters. This is primarily used by
    #               Booster to update its parameters based on the characteristics of
    #               a Dataset. It should not be used by other methods in this class,
    #               since "verbose" is not a Dataset parameter and needs to be passed
    #               through to avoid globally re-setting verbosity.
605
606
607
608
609
610
611
612
613
614
615
    get_params = function() {
      # names of all parameters that .DATASET_PARAMETERS() knows about
      known_dataset_params <- unname(unlist(.DATASET_PARAMETERS()))

      # keep only those stored parameters whose name is in that set
      kept_keys <- intersect(names(private$params), known_dataset_params)
      filtered_params <- list()
      for (key in kept_keys) {
        filtered_params[[key]] <- private$params[[key]]
      }
      return(filtered_params)
    },

616
    # [description] Change which features are treated as categorical. Since this
    #               "deconstructs" the Dataset (see the call to finalize()), it is only
    #               allowed while the raw data is still available. Passing the value that
    #               is already stored changes nothing.
    # [param] categorical_feature new categorical features
    # [return] self, invisibly.
    set_categorical_feature = function(categorical_feature) {

      # Check for identical input
      if (identical(private$categorical_feature, categorical_feature)) {
        return(invisible(self))
      }

      # Check for empty data
      if (is.null(private$raw_data)) {
        stop("set_categorical_feature: cannot set categorical feature after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # Overwrite categorical features
      private$categorical_feature <- categorical_feature

      # Finalize and return self
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
638

Guolin Ke's avatar
Guolin Ke committed
639
    # [description] Use another Dataset as reference for this one. The reference's
    #               categorical features, column names and predictor are taken over, and
    #               this Dataset is "deconstructed" (see the call to finalize()).
    #               Raises an error if the raw data is no longer available or if
    #               reference is not an lgb.Dataset.
    # [param] reference an lgb.Dataset
    # [return] self, invisibly.
    set_reference = function(reference) {

      # setting reference to this same Dataset object doesn't require any changes
      if (identical(private$reference, reference)) {
        return(invisible(self))
      }

      # changing the reference removes the Dataset object on the C++ side, so it should only
      # be done if you still have the raw_data available, so that the new Dataset can be reconstructed
      if (is.null(private$raw_data)) {
        stop("set_reference: cannot set reference after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      if (!lgb.is.Dataset(reference)) {
        stop("set_reference: Can only use lgb.Dataset as a reference")
      }

      # Set known references
      self$set_categorical_feature(categorical_feature = reference$.__enclos_env__$private$categorical_feature)
      self$set_colnames(colnames = reference$get_colnames())
      private$set_predictor(predictor = reference$.__enclos_env__$private$predictor)

      # Store reference
      private$reference <- reference

      # Finalize and return self
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
670

671
    # [description] Save the Dataset to a file through LGBM_DatasetSaveBinary_R.
    #               The Dataset is constructed first if necessary.
    # [param] fname path of the file to write (passed through path.expand())
    # [return] self, invisibly.
    save_binary = function(fname) {

      # Make sure there is a handle to save from
      self$construct()
      .Call(
        LGBM_DatasetSaveBinary_R
        , private$handle
        , path.expand(fname)
      )
      return(invisible(self))
    }
James Lamb's avatar
James Lamb committed
683

Guolin Ke's avatar
Guolin Ke committed
684
685
  ),
  private = list(
686
687
688
689
690
    handle = NULL,
    raw_data = NULL,
    params = list(),
    reference = NULL,
    colnames = NULL,
691
    categorical_feature = NULL,
692
693
694
695
    predictor = NULL,
    free_raw_data = TRUE,
    used_indices = NULL,
    info = NULL,
696
    version = 0L,
James Lamb's avatar
James Lamb committed
697

698
    # [description] Get the Dataset handle, constructing the Dataset first if that has
    #               not happened yet.
    # [return] the handle stored in private$handle
    get_handle = function() {

      # Get handle and construct if needed
      if (lgb.is.null.handle(x = private$handle)) {
        self$construct()
      }
      return(private$handle)

    },
James Lamb's avatar
James Lamb committed
707

Guolin Ke's avatar
Guolin Ke committed
708
    # [description] Change the predictor of this Dataset. Since this "deconstructs" the
    #               Dataset (see the call to finalize()), it is only allowed while the raw
    #               data is still available. Passing the predictor that is already stored
    #               changes nothing.
    #               Raises an error if the raw data is no longer available or if
    #               predictor is neither NULL nor an lgb.Predictor.
    # [param] predictor NULL or an lgb.Predictor
    # [return] self, invisibly.
    set_predictor = function(predictor) {

      if (identical(private$predictor, predictor)) {
        return(invisible(self))
      }

      # Check for empty data
      if (is.null(private$raw_data)) {
        stop("set_predictor: cannot set predictor after free raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # NULL is allowed (removes the predictor); anything else must be an lgb.Predictor
      if (!is.null(predictor)) {

        if (!lgb.is.Predictor(predictor)) {
          stop("set_predictor: Can only use lgb.Predictor as predictor")
        }

      }

      # Store predictor
      private$predictor <- predictor

      # Finalize and return self
      self$finalize()
      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
738

Guolin Ke's avatar
Guolin Ke committed
739
740
741
  )
)

742
743
744
#' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#'              or local file (that was created previously by saving an \code{lgb.Dataset}).
745
#' @inheritParams lgb_shared_dataset_params
746
747
748
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{lgb.Dataset} file
749
750
751
752
753
754
755
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values.
#' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning
#'                  continuous features into histograms. If you want to apply the same bin boundaries from an existing
#'                  dataset to new \code{data}, pass that existing Dataset to this argument.
Guolin Ke's avatar
Guolin Ke committed
756
#' @param colnames names of columns
757
758
759
760
761
762
763
764
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param free_raw_data LightGBM constructs its data format, called a "Dataset", from tabular data.
#'                      By default, that Dataset object on the R side does not keep a copy of the raw data.
#'                      This reduces LightGBM's memory consumption, but it means that the Dataset object
#'                      cannot be changed after it has been constructed. If you'd prefer to be able to
#'                      change the Dataset object after construction, set \code{free_raw_data = FALSE}.
James Lamb's avatar
James Lamb committed
765
#'
Guolin Ke's avatar
Guolin Ke committed
766
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
767
#'
Guolin Ke's avatar
Guolin Ke committed
768
#' @examples
769
#' \donttest{
770
771
772
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
773
774
775
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
776
#' lgb.Dataset.construct(dtrain)
777
#' }
Guolin Ke's avatar
Guolin Ke committed
778
779
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        label = NULL,
                        weight = NULL,
                        group = NULL,
                        init_score = NULL) {

  # Thin wrapper around the R6 constructor. A user-created Dataset never
  # starts with a predictor or a row subset, so both are fixed to NULL here.
  new_dataset <- Dataset$new(
    data = data
    , params = params
    , reference = reference
    , colnames = colnames
    , categorical_feature = categorical_feature
    , predictor = NULL
    , free_raw_data = free_raw_data
    , used_indices = NULL
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
  )

  # Returned invisibly so the object is not auto-printed at the console
  return(invisible(new_dataset))

}

809
810
811
#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
812
#' @inheritParams lgb_shared_dataset_params
Guolin Ke's avatar
Guolin Ke committed
813
#' @param dataset \code{lgb.Dataset} object, training data
814
815
816
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{Dataset} file
817
818
819
820
821
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values. If this is an empty list (the default), the validation Dataset
#'               will have the same parameters as the Dataset passed to argument \code{dataset}.
James Lamb's avatar
James Lamb committed
822
#'
Guolin Ke's avatar
Guolin Ke committed
823
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
824
#'
Guolin Ke's avatar
Guolin Ke committed
825
#' @examples
826
#' \donttest{
827
828
829
830
831
832
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
#'
#' # parameters can be changed between the training data and validation set,
#' # for example to account for training data in a text file with a header row
#' # and validation data in a text file without it
#' train_file <- tempfile(pattern = "train_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = train_file
#'   , sep = ","
#'   , col.names = TRUE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' valid_file <- tempfile(pattern = "valid_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = valid_file
#'   , sep = ","
#'   , col.names = FALSE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' dtrain <- lgb.Dataset(
#'   data = train_file
#'   , params = list(has_header = TRUE)
#' )
#' dtrain$construct()
#'
#' dvalid <- lgb.Dataset(
#'   data = valid_file
#'   , params = list(has_header = FALSE)
#' )
#' dvalid$construct()
868
#' }
Guolin Ke's avatar
Guolin Ke committed
869
#' @export
870
871
872
873
874
875
lgb.Dataset.create.valid <- function(dataset,
                                     data,
                                     label = NULL,
                                     weight = NULL,
                                     group = NULL,
                                     init_score = NULL,
                                     params = list()) {

  # The training Dataset does the actual work, so it has to be a real one
  dataset_is_valid <- lgb.is.Dataset(x = dataset)
  if (!dataset_is_valid) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # Delegate to the training Dataset's create_valid() method
  valid_dataset <- dataset$create_valid(
    data = data
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
    , params = params
  )

  return(invisible(valid_dataset))

}
Guolin Ke's avatar
Guolin Ke committed
895

896
897
898
#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
Guolin Ke's avatar
Guolin Ke committed
899
#' @param dataset Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
900
#'
Guolin Ke's avatar
Guolin Ke committed
901
#' @examples
902
#' \donttest{
903
904
905
906
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
907
#' }
908
#' @return constructed dataset
Guolin Ke's avatar
Guolin Ke committed
909
910
#' @export
lgb.Dataset.construct <- function(dataset) {

  # Functional wrapper around the construct() method of an lgb.Dataset
  if (lgb.is.Dataset(x = dataset)) {
    constructed <- dataset$construct()
    return(invisible(constructed))
  }

  stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")

}

920
921
#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
Guolin Ke's avatar
Guolin Ke committed
922
#' @param x Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
923
#'
Guolin Ke's avatar
Guolin Ke committed
924
#' @return a vector of numbers of rows and of columns
James Lamb's avatar
James Lamb committed
925
#'
Guolin Ke's avatar
Guolin Ke committed
926
927
928
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
James Lamb's avatar
James Lamb committed
929
#'
Guolin Ke's avatar
Guolin Ke committed
930
#' @examples
931
#' \donttest{
932
933
934
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
935
#'
936
937
938
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
939
#' }
Guolin Ke's avatar
Guolin Ke committed
940
941
#' @rdname dim
#' @export
942
dim.lgb.Dataset <- function(x) {

  # S3 method for dim(): forwards to the Dataset's own dim() method
  if (lgb.is.Dataset(x = x)) {
    return(x$dim())
  }

  stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")

}

952
953
954
#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}, thus attempting to set
#'              row names raises an error and returned row names are always NULL.
Guolin Ke's avatar
Guolin Ke committed
955
956
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one must be \code{NULL}
#'              (row names are not supported) and the second one is column names
Guolin Ke's avatar
Guolin Ke committed
958
959
960
961
962
963
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
964
#' \donttest{
965
966
967
968
969
970
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
971
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
972
#' print(dtrain, verbose = TRUE)
973
#' }
Guolin Ke's avatar
Guolin Ke committed
974
#' @rdname dimnames.lgb.Dataset
975
#' @return A list with the dimension names of the dataset
Guolin Ke's avatar
Guolin Ke committed
976
977
#' @export
dimnames.lgb.Dataset <- function(x) {

  if (!lgb.is.Dataset(x = x)) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # The row-name slot is always NULL; only column names are reported
  column_names <- x$get_colnames()
  return(list(NULL, column_names))

}

#' @rdname dimnames.lgb.Dataset
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # 'value' has to be a plain two-element list: (row names, column names)
  is_two_element_list <- identical(class(value), "list") && length(value) == 2L
  if (!is_two_element_list) {
    stop("invalid ", sQuote("value"), " given: must be a list of two elements")
  }

  # Row names are rejected outright
  if (!is.null(value[[1L]])) {
    stop("lgb.Dataset does not have rownames")
  }

  new_colnames <- value[[2L]]

  # NULL column names are forwarded without any length check
  if (is.null(new_colnames)) {
    x$set_colnames(colnames = NULL)
    return(x)
  }

  # The number of names must match the number of columns
  if (ncol(x) != length(new_colnames)) {
    stop(
      "can't assign "
      , sQuote(length(new_colnames))
      , " colnames to an lgb.Dataset with "
      , sQuote(ncol(x))
      , " columns"
    )
  }

  # Set the column names and hand the Dataset back, as replacement
  # functions are expected to do
  x$set_colnames(colnames = new_colnames)
  return(x)

}

1026
1027
1028
#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#'              original \code{lgb.Dataset} object
Nikita Titov's avatar
Nikita Titov committed
1029
#' @param dataset Object of class \code{lgb.Dataset}
1030
#' @param idxset an integer vector of indices of rows needed
Guolin Ke's avatar
Guolin Ke committed
1031
#' @return constructed sub dataset
James Lamb's avatar
James Lamb committed
1032
#'
Guolin Ke's avatar
Guolin Ke committed
1033
#' @examples
1034
#' \donttest{
1035
1036
1037
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
1038
#'
1039
#' dsub <- lightgbm::slice(dtrain, seq_len(42L))
1040
#' lgb.Dataset.construct(dsub)
1041
#' labels <- lightgbm::get_field(dsub, "label")
1042
#' }
Guolin Ke's avatar
Guolin Ke committed
1043
#' @export
1044
slice <- function(dataset, idxset) {
  # S3 generic: the method is chosen from the class of 'dataset'
  UseMethod("slice", dataset)
}
Guolin Ke's avatar
Guolin Ke committed
1047
1048
1049

#' @rdname slice
#' @export
1050
slice.lgb.Dataset <- function(dataset, idxset) {

  # Forward to the Dataset's slice() method after checking the input type
  if (lgb.is.Dataset(x = dataset)) {
    subset_dataset <- dataset$slice(idxset = idxset)
    return(invisible(subset_dataset))
  }

  stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object")

}

1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
#' @name get_field
#' @title Get one attribute of a \code{lgb.Dataset}
#' @description Get one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to get. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @return requested attribute
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
#' }
#' @export
get_field <- function(dataset, field_name) {
  # S3 generic: the method is chosen from the class of 'dataset'
  UseMethod("get_field", dataset)
}

#' @rdname get_field
#' @export
get_field.lgb.Dataset <- function(dataset, field_name) {

  # Only real lgb.Dataset objects expose the get_field() method used below
  dataset_is_valid <- lgb.is.Dataset(x = dataset)
  if (!dataset_is_valid) {
    stop("get_field.lgb.Dataset(): input dataset should be an lgb.Dataset object")
  }

  # The result is returned visibly
  field_value <- dataset$get_field(field_name = field_name)
  return(field_value)

}

#' @name set_field
#' @title Set one attribute of a \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to set. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @param data The data for the field. See examples.
#' @return The \code{lgb.Dataset} you passed in.
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
#' }
#' @export
set_field <- function(dataset, field_name, data) {
  # S3 generic: the method is chosen from the class of 'dataset'
  UseMethod("set_field", dataset)
}

#' @rdname set_field
#' @export
set_field.lgb.Dataset <- function(dataset, field_name, data) {

  # Forward to the Dataset's set_field() method after checking the input type
  if (lgb.is.Dataset(x = dataset)) {
    result <- dataset$set_field(field_name = field_name, data = data)
    return(invisible(result))
  }

  stop("set_field.lgb.Dataset: input dataset should be an lgb.Dataset object")

}

1155
1156
1157
1158
#' @name lgb.Dataset.set.categorical
#' @title Set categorical feature of \code{lgb.Dataset}
#' @description Set the categorical features of an \code{lgb.Dataset} object. Use this function
#'              to tell LightGBM which features should be treated as categorical.
1159
#' @param dataset object of class \code{lgb.Dataset}
1160
1161
1162
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
1163
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1164
#'
1165
#' @examples
1166
#' \donttest{
1167
1168
1169
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1170
1171
1172
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
1173
#' lgb.Dataset.set.categorical(dtrain, 1L:2L)
1174
#' }
1175
1176
1177
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  dataset_is_valid <- lgb.is.Dataset(x = dataset)
  if (!dataset_is_valid) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset method; its result is passed back invisibly
  result <- dataset$set_categorical_feature(
    categorical_feature = categorical_feature
  )
  return(invisible(result))

}

1187
1188
1189
#' @name lgb.Dataset.set.reference
#' @title Set reference of \code{lgb.Dataset}
#' @description If you want to use validation data, you should set reference to training data
Guolin Ke's avatar
Guolin Ke committed
1190
1191
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
1192
#'
1193
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1194
#'
Guolin Ke's avatar
Guolin Ke committed
1195
#' @examples
1196
#' \donttest{
1197
#' # create training Dataset
1198
1199
1200
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1201
1202
#'
#' # create a validation Dataset, using dtrain as a reference
1203
1204
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
1205
#' dtest <- lgb.Dataset(test$data, label = test$label)
1206
#' lgb.Dataset.set.reference(dtest, dtrain)
1207
#' }
Guolin Ke's avatar
Guolin Ke committed
1208
1209
1210
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # Forward to the Dataset's set_reference() method after checking 'dataset'
  if (lgb.is.Dataset(x = dataset)) {
    result <- dataset$set_reference(reference = reference)
    return(invisible(result))
  }

  stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")

}

1219
1220
1221
1222
#' @name lgb.Dataset.save
#' @title Save \code{lgb.Dataset} to a binary file
#' @description Please note that \code{init_score} is not saved in binary file.
#'              If you need it, please set it again after loading Dataset.
Guolin Ke's avatar
Guolin Ke committed
1223
1224
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname object filename of output file
James Lamb's avatar
James Lamb committed
1225
#'
1226
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1227
#'
Guolin Ke's avatar
Guolin Ke committed
1228
#' @examples
1229
#' \donttest{
1230
1231
1232
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1233
#' lgb.Dataset.save(dtrain, tempfile(fileext = ".bin"))
1234
#' }
Guolin Ke's avatar
Guolin Ke committed
1235
1236
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # Validate inputs up front. The messages carry this function's own name
  # so a failure can be traced to the right call.
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  # NOTE: only character values pass this check, even though the message
  # also mentions file connections
  if (!is.character(fname)) {
    stop("lgb.Dataset.save: fname should be a character or a file connection")
  }

  # save_binary() constructs the Dataset, writes it to path.expand(fname)
  # and returns the Dataset itself, which is passed back invisibly
  return(invisible(dataset$save_binary(fname = fname)))

}