lgb.Dataset.R 39.3 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#' @name lgb_shared_dataset_params
#' @title Shared Dataset parameter docs
#' @description Parameter docs for fields used in \code{lgb.Dataset} construction
#' @param label vector of labels to use as the target variable
#' @param weight numeric vector of sample weights
#' @param init_score initial score is the base prediction lightgbm will boost from
#' @param group used for learning-to-rank tasks. An integer vector describing how to
#'              group rows together as ordered results from the same set of candidate results
#'              to be ranked. For example, if you have a 100-document dataset with
#'              \code{group = c(10, 20, 40, 10, 10, 10)}, that means that you have 6 groups,
#'              where the first 10 records are in the first group, records 11-30 are in the
#'              second group, etc.
#' @keywords internal
NULL

16
17
18
19
20
21
22
23
# [description] List of valid keys for "info" arguments in lgb.Dataset.
#               Wrapped in a function to take advantage of lazy evaluation
#               (so it doesn't matter what order R sources files during installation).
# [return] A character vector of names.
.INFO_KEYS <- function() {
  # Names checked by Dataset$get_field() and Dataset$set_field(); kept inside a
  # function so the value is only computed when it is first needed.
  info_keys <- c("label", "weight", "init_score", "group")
  return(info_keys)
}

James Lamb's avatar
James Lamb committed
24
#' @importFrom methods is
James Lamb's avatar
James Lamb committed
25
#' @importFrom R6 R6Class
26
#' @importFrom utils modifyList
James Lamb's avatar
James Lamb committed
27
28
Dataset <- R6::R6Class(

29
  classname = "lgb.Dataset",
30
  cloneable = FALSE,
Guolin Ke's avatar
Guolin Ke committed
31
  public = list(
James Lamb's avatar
James Lamb committed
32

33
    # Finalize will free up the handles
Guolin Ke's avatar
Guolin Ke committed
34
    finalize = function() {
      # Hand the current handle to LGBM_DatasetFree_R, then clear the R-side
      # reference so this Dataset is treated as "not constructed" afterwards.
      .Call(LGBM_DatasetFree_R, private$handle)
      private$handle <- NULL
      return(invisible(NULL))
    },
James Lamb's avatar
James Lamb committed
42

43
    # Initialize will create a starter dataset
Guolin Ke's avatar
Guolin Ke committed
44
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          label = NULL,
                          weight = NULL,
                          group = NULL,
                          init_score = NULL) {

      # Records everything needed to build the Dataset later; nothing is
      # created on the C++ side here (that happens in construct()).

      # Reject bad 'reference' / 'predictor' values before doing any other work
      if (!is.null(reference) && !lgb.is.Dataset(reference)) {
        stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset"))
      }
      if (!is.null(predictor) && !lgb.is.Predictor(predictor)) {
        stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
      }

      # Keep only the fields that were actually supplied, in a fixed order
      # (label, weight, group, init_score)
      supplied_fields <- list(
        label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )
      info <- list()
      for (field_key in names(supplied_fields)) {
        field_value <- supplied_fields[[field_key]]
        if (!is.null(field_value)) {
          info[[field_key]] <- field_value
        }
      }

      # Dense matrices are stored as "double" regardless of their input type
      if (is.matrix(data) && storage.mode(data) != "double") {
        storage.mode(data) <- "double"
      }

      # Store state on the private environment
      private$raw_data <- data
      private$params <- params
      private$reference <- reference
      private$colnames <- colnames
      private$categorical_feature <- categorical_feature
      private$predictor <- predictor
      private$free_raw_data <- free_raw_data
      private$used_indices <- sort(used_indices, decreasing = FALSE)
      private$info <- info
      private$version <- 0L

      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
103

104
    create_valid = function(data,
                            label = NULL,
                            weight = NULL,
                            group = NULL,
                            init_score = NULL,
                            params = list()) {

      # Build a new Dataset from 'data' that uses this Dataset as its reference.

      # Start from this Dataset's parameters; anything passed in 'params' wins
      merged_params <- modifyList(self$get_params(), params)

      # Column names, categorical features, predictor and the free_raw_data
      # setting are copied from this Dataset
      valid_set <- Dataset$new(
        data = data
        , params = merged_params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = NULL
        , label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )

      return(invisible(valid_set))

    },
James Lamb's avatar
James Lamb committed
133

134
    # Dataset constructor
Guolin Ke's avatar
Guolin Ke committed
135
    construct = function() {

      # Build the Dataset handle from the stored raw data (file path, matrix or
      # dgCMatrix) or, when used_indices is set, as a subset of the reference
      # Dataset. Returns self invisibly; does nothing if a handle already exists.

      # Already constructed: nothing to do
      if (!lgb.is.null.handle(x = private$handle)) {
        return(invisible(self))
      }

      # Column names carried by in-memory data (stays NULL for other inputs)
      raw_is_in_memory <- (
        is.matrix(private$raw_data)
        || methods::is(private$raw_data, "dgCMatrix")
      )
      cnames <- NULL
      if (raw_is_in_memory) {
        cnames <- colnames(private$raw_data)
      }

      # Explicitly supplied column names take precedence over those of the data
      if (is.null(private$colnames) && !is.null(cnames)) {
        private$colnames <- as.character(cnames)
      }

      # Translate categorical_feature into 0-based indices
      if (!is.null(private$categorical_feature)) {

        if (is.character(private$categorical_feature)) {

          # Names are resolved against the known column names
          matched <- match(private$categorical_feature, private$colnames)
          is_unknown <- is.na(matched)

          if (any(is_unknown)) {
            # Collapse so several unknown names are readable in one message
            stop(
              "lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
              , paste0(sQuote(private$categorical_feature[is_unknown]), collapse = ", ")
            )
          }

          cate_indices <- as.list(matched - 1L)

        } else {

          # Number of features to validate against. Prefer the width of the
          # in-memory data (it may have no column names); otherwise fall back to
          # the known column names. If neither is available (e.g. file input or
          # a subset without names) the count is unknown here and the check is
          # skipped.
          num_features <- NULL
          if (raw_is_in_memory) {
            num_features <- ncol(private$raw_data)
          } else if (!is.null(private$colnames)) {
            num_features <- length(private$colnames)
          }

          if (!is.null(num_features) && max(private$categorical_feature) > num_features) {
            stop(
              "lgb.self.get.handle: supplied a too large value in categorical_feature: "
              , max(private$categorical_feature)
              , " but only "
              , num_features
              , " features"
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1L)

        }

        # Store indices for categorical features
        private$params$categorical_feature <- cate_indices

      }

      # Generate parameter str
      params_str <- lgb.params2str(params = private$params)

      # Get handle of reference dataset (constructing it first if necessary)
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }

      if (is.null(private$used_indices)) {

        # Not subsetting: construct from raw data
        if (is.null(private$raw_data)) {
          stop(paste0(
            "Attempting to create a Dataset without any raw data. "
            , "This can happen if you have called Dataset$finalize() or if this Dataset was saved with saveRDS(). "
            , "To avoid this error in the future, use lgb.Dataset.save() or "
            , "Dataset$save_binary() to save lightgbm Datasets."
          ))
        }

        if (is.character(private$raw_data)) {

          # Path to a data file
          handle <- .Call(
            LGBM_DatasetCreateFromFile_R
            , path.expand(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (is.matrix(private$raw_data)) {

          # Dense matrix
          handle <- .Call(
            LGBM_DatasetCreateFromMat_R
            , private$raw_data
            , nrow(private$raw_data)
            , ncol(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (methods::is(private$raw_data, "dgCMatrix")) {

          # Sparse column-compressed matrix
          if (length(private$raw_data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }
          handle <- .Call(
            LGBM_DatasetCreateFromCSC_R
            , private$raw_data@p
            , private$raw_data@i
            , private$raw_data@x
            , length(private$raw_data@p)
            , length(private$raw_data@x)
            , nrow(private$raw_data)
            , params_str
            , ref_handle
          )

        } else {

          # Unknown data type
          stop(
            "lgb.Dataset.construct: does not support constructing from "
            , sQuote(class(private$raw_data))
          )

        }

      } else {

        # Subsetting requires a reference Dataset to take rows from
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        # Construct subset
        handle <- .Call(
          LGBM_DatasetGetSubset_R
          , ref_handle
          , c(private$used_indices) # Adding c() fixes issue in R v3.5
          , length(private$used_indices)
          , params_str
        )

      }
      if (lgb.is.null.handle(x = handle)) {
        stop("lgb.Dataset.construct: cannot create Dataset handle")
      }

      # Setup class and private type
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Set feature names
      if (!is.null(private$colnames)) {
        self$set_colnames(colnames = private$colnames)
      }

      # Use the predictor's raw scores as init_score (not done for subsets)
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        init_score <- private$predictor$predict(
          data = private$raw_data
          , rawscore = TRUE
          , reshape = TRUE
        )

        # Flatten to a plain vector (no transpose is applied)
        init_score <- as.vector(init_score)
        private$info$init_score <- init_score

      }

      # Should we free raw data?
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Push every stored field (label, weight, ...) through set_field()
      if (length(private$info) > 0L) {

        for (i in seq_along(private$info)) {

          # Single-element sub-list, so the field name travels with its value
          p <- private$info[i]
          self$set_field(
            field_name = names(p)
            , data = p[[1L]]
          )

        }

      }

      # A label is mandatory
      if (is.null(self$get_field(field_name = "label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
336

337
    # Dimension function
Guolin Ke's avatar
Guolin Ke committed
338
    dim = function() {

      # Returns c(number of rows, number of columns).

      # Constructed Dataset: query the handle
      if (!lgb.is.null.handle(x = private$handle)) {

        # The return values of the calls are not used; num_row / num_col are
        # passed in and then returned, so the native routines are expected to
        # fill them in
        num_row <- 0L
        num_col <- 0L
        .Call(
          LGBM_DatasetGetNumData_R
          , private$handle
          , num_row
        )
        .Call(
          LGBM_DatasetGetNumFeature_R
          , private$handle
          , num_col
        )
        return(c(num_row, num_col))

      }

      # Not constructed, but the raw data is a matrix / dgCMatrix
      # NOTE: dgCMatrix requires the Matrix package
      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        return(dim(private$raw_data))
      }

      # Anything else has no known dimensions until construction
      stop(
        "dim: cannot get dimensions before dataset has been constructed, "
        , "please call lgb.Dataset.construct explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
378

379
    # Get column names
Guolin Ke's avatar
Guolin Ke committed
380
    get_colnames = function() {

      # Constructed Dataset: read the names through the handle and cache them
      if (!lgb.is.null.handle(x = private$handle)) {
        private$colnames <- .Call(
          LGBM_DatasetGetFeatureNames_R
          , private$handle
        )
        return(private$colnames)
      }

      # Not constructed, but the raw data is a matrix / dgCMatrix
      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        return(colnames(private$raw_data))
      }

      # Any other input has no known column names before construction
      stop(
        "Dataset$get_colnames(): cannot get column names before dataset has been constructed, please call "
        , "lgb.Dataset.construct() explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
406

407
    # Set column names
Guolin Ke's avatar
Guolin Ke committed
408
    set_colnames = function(colnames) {

      # NULL means there is nothing to set
      if (is.null(colnames)) {
        return(invisible(self))
      }

      # An empty set of names is ignored as well
      new_names <- as.character(colnames)
      if (length(new_names) == 0L) {
        return(invisible(self))
      }

      # Remember the names on the R side
      private$colnames <- new_names

      # Without a handle there is nothing further to update
      if (lgb.is.null.handle(x = private$handle)) {
        return(invisible(self))
      }

      # Names are passed to the native call as one tab-separated string
      merged_name <- paste0(private$colnames, collapse = "\t")
      .Call(
        LGBM_DatasetSetFeatureNames_R
        , private$handle
        , merged_name
      )

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
438

439
    get_field = function(field_name) {

      # field_name must be a single string naming one of the known fields
      is_valid_name <- (
        is.character(field_name)
        && length(field_name) == 1L
        && field_name %in% .INFO_KEYS()
      )
      if (!is_valid_name) {
        stop(
          "Dataset$get_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      # Serve from the R-side cache when the field is already known
      cached_value <- private$info[[field_name]]
      if (!is.null(cached_value)) {
        return(cached_value)
      }

      # Otherwise the value has to come from the constructed Dataset
      if (lgb.is.null.handle(x = private$handle)) {
        stop("Cannot perform Dataset$get_field() before constructing Dataset.")
      }

      # Ask for the field's length first; the call's return value is not used,
      # so info_len is expected to be filled in by the native routine
      info_len <- 0L
      .Call(
        LGBM_DatasetGetFieldSize_R
        , private$handle
        , field_name
        , info_len
      )

      if (info_len > 0L) {

        # Buffer of the right type: integer for 'group', numeric otherwise
        if (field_name == "group") {
          ret <- integer(info_len)
        } else {
          ret <- numeric(info_len)
        }

        .Call(
          LGBM_DatasetGetField_R
          , private$handle
          , field_name
          , ret
        )

        # Cache for subsequent calls
        private$info[[field_name]] <- ret

      }

      # Still NULL here if the field turned out to be empty
      return(private$info[[field_name]])

    },
James Lamb's avatar
James Lamb committed
491

492
    set_field = function(field_name, data) {

      # field_name must be a single string naming one of the known fields
      is_valid_name <- (
        is.character(field_name)
        && length(field_name) == 1L
        && field_name %in% .INFO_KEYS()
      )
      if (!is_valid_name) {
        stop(
          "Dataset$set_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      # 'group' is stored as integer, every other field as numeric
      if (field_name == "group") {
        data <- as.integer(data)
      } else {
        data <- as.numeric(data)
      }

      # Always remember the value on the R side
      private$info[[field_name]] <- data

      # Forward to the constructed Dataset only when there is something to send
      is_constructed <- !lgb.is.null.handle(x = private$handle)
      if (is_constructed && !is.null(data) && length(data) > 0L) {

        .Call(
          LGBM_DatasetSetField_R
          , private$handle
          , field_name
          , data
          , length(data)
        )

        # Bump the version counter after each field pushed to the handle
        private$version <- private$version + 1L

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
533

534
    # Slice dataset
Guolin Ke's avatar
Guolin Ke committed
535
    slice = function(idxset, ...) {

      # Creates a new, not-yet-constructed Dataset that refers to the rows
      # 'idxset' of this Dataset.

      dot_args <- list(...)

      # Anything passed through '...' triggers a deprecation-style warning
      if (length(dot_args) > 0L) {
        warning(paste0(
          "Dataset$slice(): Found the following passed through '...': "
          , paste(names(dot_args), collapse = ", ")
          , ". These are ignored and should be removed. "
          , "To change the parameters of a Dataset produced by Dataset$slice(), use Dataset$set_params(). "
          , "To modify attributes like 'init_score', use Dataset$set_field(). "
          , "In future releases of lightgbm, this warning will become an error."
        ))
      }

      # Pull out Dataset fields passed through '...'
      # (indexing a list with a missing name yields NULL)
      label <- dot_args[["label"]]
      weight <- dot_args[["weight"]]
      group <- dot_args[["group"]]
      init_score <- dot_args[["init_score"]]

      # Drop those fields so that only parameters remain in dot_args
      for (info_key in .INFO_KEYS()) {
        dot_args[[info_key]] <- NULL
      }

      # Parameters of the subset: this Dataset's parameters, overridden by '...'
      subset_params <- utils::modifyList(self$get_params(), dot_args)

      # The subset carries no raw data; it is defined by 'reference' + indices
      subset <- Dataset$new(
        data = NULL
        , params = subset_params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = sort(idxset, decreasing = FALSE)
        , group = group
        , init_score = init_score
        , label = label
        , weight = weight
      )
      return(subset)

    },
James Lamb's avatar
James Lamb committed
582

583
584
585
    # [description] Update Dataset parameters. If it has not been constructed yet,
    #               this operation just happens on the R side (updating private$params).
    #               If it has been constructed, parameters will be updated on the C++ side.
586
    update_params = function(params) {

      # Merge 'params' into this Dataset's parameters and return self invisibly.
      #   * not constructed: only the R-side parameter list changes
      #   * constructed:     the change is first validated by
      #                      LGBM_DatasetUpdateParamChecking_R; if that raises
      #                      and raw data is still available, the Dataset is
      #                      reset so it is rebuilt with the new parameters on
      #                      the next construct(); without raw data the error
      #                      is re-raised

      # Nothing to update
      if (length(params) == 0L) {
        return(invisible(self))
      }

      if (lgb.is.null.handle(x = private$handle)) {
        private$params <- utils::modifyList(private$params, params)
        return(invisible(self))
      }

      tryCatch({
        .Call(
          LGBM_DatasetUpdateParamChecking_R
          , lgb.params2str(params = private$params)
          , lgb.params2str(params = params)
        )

        # The check passed: record the new values on the R side too, otherwise
        # the requested update would be silently lost (e.g. in get_params())
        private$params <- utils::modifyList(private$params, params)
      }, error = function(e) {
        # If updating failed but raw data is not available, raise an error because
        # achieving what the user asked for is not possible
        if (is.null(private$raw_data)) {
          stop(e)
        }

        # If updating failed but raw data is available, modify the params
        # on the R side and re-set ("deconstruct") the Dataset
        private$params <- utils::modifyList(private$params, params)
        self$finalize()
      })

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
615

616
617
618
619
620
621
622
623
624
625
626
    get_params = function() {
      # Returns the subset of private$params whose names are listed by
      # .DATASET_PARAMETERS() (defined elsewhere in the package).
      dataset_params <- unname(unlist(.DATASET_PARAMETERS()))
      stored_names <- names(private$params)
      ret <- list()
      for (param_key in stored_names[stored_names %in% dataset_params]) {
        ret[[param_key]] <- private$params[[param_key]]
      }
      return(ret)
    },

627
    # Set categorical feature parameter
628
    set_categorical_feature = function(categorical_feature) {

      # Same value as before: nothing to change
      is_unchanged <- identical(private$categorical_feature, categorical_feature)
      if (is_unchanged) {
        return(invisible(self))
      }

      # The change drops the handle (see finalize() below), so raw data must
      # still be around
      if (is.null(private$raw_data)) {
        stop("set_categorical_feature: cannot set categorical feature after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # Record the new value and reset the handle
      private$categorical_feature <- categorical_feature
      self$finalize()

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
649

650
    # Set reference
Guolin Ke's avatar
Guolin Ke committed
651
    set_reference = function(reference) {

      # Re-setting the same reference is a no-op
      if (identical(private$reference, reference)) {
        return(invisible(self))
      }

      # Changing the reference resets this Dataset's handle (see finalize()
      # below), so the raw data must still be available
      if (is.null(private$raw_data)) {
        stop("set_reference: cannot set reference after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      if (!lgb.is.Dataset(reference)) {
        stop("set_reference: Can only use lgb.Dataset as a reference")
      }

      # Align categorical features, column names and predictor with the reference
      reference_private <- reference$.__enclos_env__$private
      self$set_categorical_feature(categorical_feature = reference_private$categorical_feature)
      self$set_colnames(colnames = reference$get_colnames())
      private$set_predictor(predictor = reference_private$predictor)

      # Remember the reference, then reset the handle
      private$reference <- reference
      self$finalize()

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
682

683
    # Save binary model
Guolin Ke's avatar
Guolin Ke committed
684
    save_binary = function(fname) {

      # Make sure a handle exists before saving
      self$construct()

      # Write to 'fname' (a leading "~" is expanded first)
      .Call(LGBM_DatasetSaveBinary_R, private$handle, path.expand(fname))

      return(invisible(self))
    }
James Lamb's avatar
James Lamb committed
695

Guolin Ke's avatar
Guolin Ke committed
696
697
  ),
  private = list(
698
699
700
701
702
    handle = NULL,
    raw_data = NULL,
    params = list(),
    reference = NULL,
    colnames = NULL,
703
    categorical_feature = NULL,
704
705
706
707
    predictor = NULL,
    free_raw_data = TRUE,
    used_indices = NULL,
    info = NULL,
708
    version = 0L,
James Lamb's avatar
James Lamb committed
709

710
711
    # Get handle
    get_handle = function() {

      # Construct the Dataset on first use, then return its handle
      needs_construction <- lgb.is.null.handle(x = private$handle)
      if (needs_construction) {
        self$construct()
      }

      return(private$handle)

    },
James Lamb's avatar
James Lamb committed
720

721
    # Set predictor
Guolin Ke's avatar
Guolin Ke committed
722
    set_predictor = function(predictor) {

      # Same predictor as before: nothing to change
      if (identical(private$predictor, predictor)) {
        return(invisible(self))
      }

      # The change resets the handle (see finalize() below), so raw data must
      # still be around
      if (is.null(private$raw_data)) {
        stop("set_predictor: cannot set predictor after free raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # NULL is allowed (no predictor); anything else must be an lgb.Predictor
      if (!is.null(predictor) && !lgb.is.Predictor(predictor)) {
        stop("set_predictor: Can only use lgb.Predictor as predictor")
      }

      # Record the predictor and reset the handle
      private$predictor <- predictor
      self$finalize()

      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
752

Guolin Ke's avatar
Guolin Ke committed
753
754
755
  )
)

756
757
758
#' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#'              or local file (that was created previously by saving an \code{lgb.Dataset}).
759
#' @inheritParams lgb_shared_dataset_params
760
761
762
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{lgb.Dataset} file
763
764
765
766
767
768
769
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values.
#' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning
#'                  continuous features into histograms. If you want to apply the same bin boundaries from an existing
#'                  dataset to new \code{data}, pass that existing Dataset to this argument.
Guolin Ke's avatar
Guolin Ke committed
770
#' @param colnames names of columns
771
772
773
774
775
776
777
778
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param free_raw_data LightGBM constructs its data format, called a "Dataset", from tabular data.
#'                      By default, that Dataset object on the R side does not keep a copy of the raw data.
#'                      This reduces LightGBM's memory consumption, but it means that the Dataset object
#'                      cannot be changed after it has been constructed. If you'd prefer to be able to
#'                      change the Dataset object after construction, set \code{free_raw_data = FALSE}.
779
#' @param ... other parameters passed to \code{params}
James Lamb's avatar
James Lamb committed
780
#'
Guolin Ke's avatar
Guolin Ke committed
781
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
782
#'
Guolin Ke's avatar
Guolin Ke committed
783
#' @examples
784
#' \donttest{
785
786
787
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
788
789
790
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
791
#' lgb.Dataset.construct(dtrain)
792
#' }
Guolin Ke's avatar
Guolin Ke committed
793
794
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        label = NULL,
                        weight = NULL,
                        group = NULL,
                        init_score = NULL,
                        ...) {

  # Anything passed through '...' is merged into 'params'
  # (entries from '...' replace same-named entries of 'params')
  dots <- list(...)
  params <- modifyList(params, dots)

  # Parameters given through '...' are still used, but the caller is warned
  if (length(dots) > 0L) {
    dots_warning <- paste0(
      "lgb.Dataset: Found the following passed through '...': "
      , paste(names(dots), collapse = ", ")
      , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
      , "Add these to 'params' instead. See ?lgb.Dataset for documentation on how to call this function."
    )
    warning(dots_warning)
  }

  # Build the Dataset object; 'predictor' and 'used_indices' are always NULL here
  new_dataset <- Dataset$new(
    data = data
    , params = params
    , reference = reference
    , colnames = colnames
    , categorical_feature = categorical_feature
    , predictor = NULL
    , free_raw_data = free_raw_data
    , used_indices = NULL
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
  )

  return(invisible(new_dataset))

}

838
839
840
#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
841
#' @inheritParams lgb_shared_dataset_params
Guolin Ke's avatar
Guolin Ke committed
842
#' @param dataset \code{lgb.Dataset} object, training data
843
844
845
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{Dataset} file
846
847
848
849
850
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values. If this is an empty list (the default), the validation Dataset
#'               will have the same parameters as the Dataset passed to argument \code{dataset}.
James Lamb's avatar
James Lamb committed
851
#'
Guolin Ke's avatar
Guolin Ke committed
852
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
853
#'
Guolin Ke's avatar
Guolin Ke committed
854
#' @examples
855
#' \donttest{
856
857
858
859
860
861
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
#'
#' # parameters can be changed between the training data and validation set,
#' # for example to account for training data in a text file with a header row
#' # and validation data in a text file without it
#' train_file <- tempfile(pattern = "train_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = train_file
#'   , sep = ","
#'   , col.names = TRUE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' valid_file <- tempfile(pattern = "valid_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = valid_file
#'   , sep = ","
#'   , col.names = FALSE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' dtrain <- lgb.Dataset(
#'   data = train_file
#'   , params = list(has_header = TRUE)
#' )
#' dtrain$construct()
#'
#' dvalid <- lgb.Dataset(
#'   data = valid_file
#'   , params = list(has_header = FALSE)
#' )
#' dvalid$construct()
897
#' }
Guolin Ke's avatar
Guolin Ke committed
898
#' @export
899
900
901
902
903
904
lgb.Dataset.create.valid <- function(dataset,
                                     data,
                                     label = NULL,
                                     weight = NULL,
                                     group = NULL,
                                     init_score = NULL,
                                     params = list()) {

  # 'dataset' must be an lgb.Dataset
  dataset_is_valid <- lgb.is.Dataset(x = dataset)
  if (!dataset_is_valid) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # Delegate to the create_valid() method of 'dataset'
  valid_dataset <- dataset$create_valid(
    data = data
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
    , params = params
  )

  return(invisible(valid_dataset))

}
Guolin Ke's avatar
Guolin Ke committed
924

925
926
927
#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
Guolin Ke's avatar
Guolin Ke committed
928
#' @param dataset Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
929
#'
Guolin Ke's avatar
Guolin Ke committed
930
#' @examples
931
#' \donttest{
932
933
934
935
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
936
#' }
937
#' @return constructed dataset
Guolin Ke's avatar
Guolin Ke committed
938
939
#' @export
lgb.Dataset.construct <- function(dataset) {

  # Reject anything that is not an lgb.Dataset
  dataset_is_valid <- lgb.is.Dataset(x = dataset)
  if (!dataset_is_valid) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }

  # Delegate to the construct() method and hand its result back invisibly
  constructed <- dataset$construct()
  return(invisible(constructed))

}

949
950
#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
Guolin Ke's avatar
Guolin Ke committed
951
#' @param x Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
952
#'
Guolin Ke's avatar
Guolin Ke committed
953
#' @return a vector of numbers of rows and of columns
James Lamb's avatar
James Lamb committed
954
#'
Guolin Ke's avatar
Guolin Ke committed
955
956
957
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
James Lamb's avatar
James Lamb committed
958
#'
Guolin Ke's avatar
Guolin Ke committed
959
#' @examples
960
#' \donttest{
961
962
963
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
964
#'
965
966
967
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
968
#' }
Guolin Ke's avatar
Guolin Ke committed
969
970
#' @rdname dim
#' @export
971
dim.lgb.Dataset <- function(x) {

  # Reject anything that is not an lgb.Dataset
  x_is_dataset <- lgb.is.Dataset(x = x)
  if (!x_is_dataset) {
    stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # The dimensions come from the dim() method of the Dataset object
  dataset_dims <- x$dim()
  return(dataset_dims)

}

981
982
983
#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}, thus setting
#'              row names raises an error and returned row names are always \code{NULL}.
Guolin Ke's avatar
Guolin Ke committed
984
985
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one must be \code{NULL}
986
#'              and the second one is column names
Guolin Ke's avatar
Guolin Ke committed
987
988
989
990
991
992
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
993
#' \donttest{
994
995
996
997
998
999
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
1000
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
1001
#' print(dtrain, verbose = TRUE)
1002
#' }
Guolin Ke's avatar
Guolin Ke committed
1003
#' @rdname dimnames.lgb.Dataset
1004
#' @return A list with the dimension names of the dataset
Guolin Ke's avatar
Guolin Ke committed
1005
1006
#' @export
dimnames.lgb.Dataset <- function(x) {

  # Reject anything that is not an lgb.Dataset
  x_is_dataset <- lgb.is.Dataset(x = x)
  if (!x_is_dataset) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # Row names are always NULL; column names come from the Dataset object
  column_names <- x$get_colnames()
  return(list(NULL, column_names))

}

#' @rdname dimnames.lgb.Dataset
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # 'value' has to be a plain list holding exactly two elements
  is_plain_list <- identical(class(value), "list")
  if (!is_plain_list || length(value) != 2L) {
    stop("invalid ", sQuote("value"), " given: must be a list of two elements")
  }

  new_rownames <- value[[1L]]
  new_colnames <- value[[2L]]

  # Row names are not supported, so the first element must be NULL
  if (!is.null(new_rownames)) {
    stop("lgb.Dataset does not have rownames")
  }

  # NULL column names are forwarded without a length check
  if (is.null(new_colnames)) {
    x$set_colnames(colnames = NULL)
    return(x)
  }

  # The number of new names has to match the number of columns
  n_new <- length(new_colnames)
  n_cols <- ncol(x)
  if (n_cols != n_new) {
    stop(
      "can't assign "
      , sQuote(n_new)
      , " colnames to an lgb.Dataset with "
      , sQuote(n_cols)
      , " columns"
    )
  }

  # Store the new column names and return the Dataset
  x$set_colnames(colnames = new_colnames)
  return(x)

}

1055
1056
1057
#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#'              original \code{lgb.Dataset} object
Nikita Titov's avatar
Nikita Titov committed
1058
#' @param dataset Object of class \code{lgb.Dataset}
1059
#' @param idxset an integer vector of indices of rows needed
Guolin Ke's avatar
Guolin Ke committed
1060
1061
#' @param ... other parameters (currently not used)
#' @return constructed sub dataset
James Lamb's avatar
James Lamb committed
1062
#'
Guolin Ke's avatar
Guolin Ke committed
1063
#' @examples
1064
#' \donttest{
1065
1066
1067
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
1068
#'
1069
#' dsub <- lightgbm::slice(dtrain, seq_len(42L))
1070
#' lgb.Dataset.construct(dsub)
1071
#' labels <- lightgbm::get_field(dsub, "label")
1072
#' }
Guolin Ke's avatar
Guolin Ke committed
1073
#' @export
1074
1075
1076
slice <- function(dataset, ...) {
  # S3 generic: dispatch to the "slice" method for the class of 'dataset'
  UseMethod("slice")
}
Guolin Ke's avatar
Guolin Ke committed
1077
1078
1079
1080

#' @rdname slice
#' @export
slice.lgb.Dataset <- function(dataset, idxset, ...) {

  # Reject anything that is not an lgb.Dataset
  dataset_is_valid <- lgb.is.Dataset(x = dataset)
  if (!dataset_is_valid) {
    stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the slice() method, forwarding any extra arguments
  subset_dataset <- dataset$slice(idxset = idxset, ...)
  return(invisible(subset_dataset))

}

1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
#' @name get_field
#' @title Get one attribute of a \code{lgb.Dataset}
#' @description Get one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to get. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @return requested attribute
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
#' }
#' @export
get_field <- function(dataset, field_name) {
  # S3 generic: dispatch to the "get_field" method for the class of 'dataset'
  UseMethod("get_field")
}

#' @rdname get_field
#' @export
get_field.lgb.Dataset <- function(dataset, field_name) {

  # Reject anything that is not an lgb.Dataset
  dataset_is_valid <- lgb.is.Dataset(x = dataset)
  if (!dataset_is_valid) {
    stop("get_field.lgb.Dataset(): input dataset should be an lgb.Dataset object")
  }

  # Delegate the lookup to the get_field() method of the Dataset object
  field_value <- dataset$get_field(field_name = field_name)
  return(field_value)

}

#' @name set_field
#' @title Set one attribute of a \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to set. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @param data The data for the field. See examples.
#' @return The \code{lgb.Dataset} you passed in.
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
#' }
#' @export
set_field <- function(dataset, field_name, data) {
  # S3 generic: dispatch to the "set_field" method for the class of 'dataset'
  UseMethod("set_field")
}

#' @rdname set_field
#' @export
set_field.lgb.Dataset <- function(dataset, field_name, data) {

  # Reject anything that is not an lgb.Dataset
  dataset_is_valid <- lgb.is.Dataset(x = dataset)
  if (!dataset_is_valid) {
    stop("set_field.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the set_field() method and return its result invisibly
  updated <- dataset$set_field(field_name = field_name, data = data)
  return(invisible(updated))

}

1185
1186
1187
1188
#' @name lgb.Dataset.set.categorical
#' @title Set categorical feature of \code{lgb.Dataset}
#' @description Set the categorical features of an \code{lgb.Dataset} object. Use this function
#'              to tell LightGBM which features should be treated as categorical.
1189
#' @param dataset object of class \code{lgb.Dataset}
1190
1191
1192
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
1193
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1194
#'
1195
#' @examples
1196
#' \donttest{
1197
1198
1199
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1200
1201
1202
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
1203
#' lgb.Dataset.set.categorical(dtrain, 1L:2L)
1204
#' }
1205
1206
1207
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  # Reject anything that is not an lgb.Dataset
  dataset_is_valid <- lgb.is.Dataset(x = dataset)
  if (!dataset_is_valid) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the set_categorical_feature() method of the Dataset object
  updated <- dataset$set_categorical_feature(categorical_feature = categorical_feature)
  return(invisible(updated))

}

1217
1218
1219
#' @name lgb.Dataset.set.reference
#' @title Set reference of \code{lgb.Dataset}
#' @description If you want to use validation data, you should set reference to training data
Guolin Ke's avatar
Guolin Ke committed
1220
1221
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
1222
#'
1223
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1224
#'
Guolin Ke's avatar
Guolin Ke committed
1225
#' @examples
1226
#' \donttest{
1227
#' # create training Dataset
1228
1229
1230
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1231
1232
#'
#' # create a validation Dataset, using dtrain as a reference
1233
1234
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
1235
#' dtest <- lgb.Dataset(test$data, label = test$label)
1236
#' lgb.Dataset.set.reference(dtest, dtrain)
1237
#' }
Guolin Ke's avatar
Guolin Ke committed
1238
1239
1240
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # Reject anything that is not an lgb.Dataset
  dataset_is_valid <- lgb.is.Dataset(x = dataset)
  if (!dataset_is_valid) {
    stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the set_reference() method of the Dataset object
  updated <- dataset$set_reference(reference = reference)
  return(invisible(updated))

}

1249
1250
1251
1252
#' @name lgb.Dataset.save
#' @title Save \code{lgb.Dataset} to a binary file
#' @description Please note that \code{init_score} is not saved in binary file.
#'              If you need it, please set it again after loading Dataset.
Guolin Ke's avatar
Guolin Ke committed
1253
1254
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname object filename of output file
James Lamb's avatar
James Lamb committed
1255
#'
1256
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1257
#'
Guolin Ke's avatar
Guolin Ke committed
1258
#' @examples
1259
#' \donttest{
1260
1261
1262
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1263
#' lgb.Dataset.save(dtrain, tempfile(fileext = ".bin"))
1264
#' }
Guolin Ke's avatar
Guolin Ke committed
1265
1266
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # 'dataset' must be an lgb.Dataset
  # (error messages name this function, not "lgb.Dataset.set")
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  # Only character values pass this check.
  # NOTE(review): the message also mentions a file connection, but a connection
  # object is not a character and is rejected here - confirm the intended wording.
  if (!is.character(fname)) {
    stop("lgb.Dataset.save: fname should be a character or a file connection")
  }

  # Delegate to the save_binary() method and return its result invisibly
  return(invisible(dataset$save_binary(fname = fname)))

}