lgb.Dataset.R 47 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#' @name lgb_shared_dataset_params
#' @title Shared Dataset parameter docs
#' @description Parameter docs for fields used in \code{lgb.Dataset} construction
#' @param label vector of labels to use as the target variable
#' @param weight numeric vector of sample weights
#' @param init_score initial score is the base prediction lightgbm will boost from
#' @param group used for learning-to-rank tasks. An integer vector describing how to
#'              group rows together as ordered results from the same set of candidate results
#'              to be ranked. For example, if you have a 100-document dataset with
#'              \code{group = c(10, 20, 40, 10, 10, 10)}, that means that you have 6 groups,
#'              where the first 10 records are in the first group, records 11-30 are in the
#'              second group, etc.
#' @param info a list of information of the \code{lgb.Dataset} object. NOTE: use of \code{info}
#'             is deprecated as of v3.3.0. Use keyword arguments (e.g. \code{init_score = init_score})
#'             directly.
#' @keywords internal
NULL

19
20
21
22
23
24
25
26
# [description] List of valid keys for "info" arguments in lgb.Dataset.
#               Wrapped in a function to take advantage of lazy evaluation
#               (so it doesn't matter what order R sources files during installation).
# [return] A character vector of names.
.INFO_KEYS <- function() {
  # Field names accepted by Dataset$get_field() / Dataset$set_field()
  valid_fields <- c("label", "weight", "init_score", "group")
  return(valid_fields)
}

#' @importFrom methods is
#' @importFrom R6 R6Class
#' @importFrom utils modifyList
Dataset <- R6::R6Class(

32
  classname = "lgb.Dataset",
33
  cloneable = FALSE,
Guolin Ke's avatar
Guolin Ke committed
34
  public = list(
James Lamb's avatar
James Lamb committed
35

36
    # Finalize will free up the handles
Guolin Ke's avatar
Guolin Ke committed
37
    finalize = function() {
      # Pass the stored handle to the native LGBM_DatasetFree_R routine, then
      # forget it on the R side. Always returns NULL invisibly.
      .Call(LGBM_DatasetFree_R, private$handle)
      private$handle <- NULL
      return(invisible(NULL))
    },
James Lamb's avatar
James Lamb committed
45

46
    # Initialize will create a starter dataset
Guolin Ke's avatar
Guolin Ke committed
47
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          info = list(),
                          label = NULL,
                          weight = NULL,
                          group = NULL,
                          init_score = NULL) {

      # [description] Record everything needed to build the Dataset later.
      #               Nothing is created on the native side here; that happens
      #               in construct().
      # [params] data: raw data; a matrix is coerced to storage mode "double".
      #          reference / predictor: NULL, or objects accepted by
      #          lgb.is.Dataset() / lgb.is.Predictor() respectively.
      #          info: deprecated list of fields; a non-empty value warns.
      #          label, weight, group, init_score: stored in 'info' when non-NULL,
      #          overriding entries of the same name.
      # [return] NULL, invisibly.

      # validate inputs early to avoid unnecessary computation
      if (!(is.null(reference) || lgb.is.Dataset(reference))) {
        stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset"))
      }
      if (!(is.null(predictor) || lgb.is.Predictor(predictor))) {
        stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
      }

      # Treat 'info = NULL' like an empty list. Assigning a length-one value into
      # NULL with `[[<-` can produce an atomic vector instead of a list, and later
      # look-ups such as info[["weight"]] on that vector fail with
      # "subscript out of bounds".
      if (is.null(info)) {
        info <- list()
      }

      if (length(info) > 0L) {
        warning(paste0(
          "lgb.Dataset: found fields passed through 'info'. "
          , "As of v3.3.0, this behavior is deprecated, and support for it will be removed in a future release. "
          , "To suppress this warning, use keyword arguments 'label', 'weight', 'group', or 'init_score' directly"
        ))
      }

      # Keyword arguments take precedence over same-named entries in 'info'.
      # The order below is the order in which new entries are appended.
      keyword_fields <- list(
        label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )
      for (field_name in names(keyword_fields)) {
        field_value <- keyword_fields[[field_name]]
        if (!is.null(field_value)) {
          info[[field_name]] <- field_value
        }
      }

      # Matrices are stored with storage mode "double"
      if (is.matrix(data) && storage.mode(data) != "double") {
        storage.mode(data) <- "double"
      }

      # Setup private attributes
      private$raw_data <- data
      private$params <- params
      private$reference <- reference
      private$colnames <- colnames
      private$categorical_feature <- categorical_feature
      private$predictor <- predictor
      private$free_raw_data <- free_raw_data
      private$used_indices <- sort(used_indices, decreasing = FALSE)
      private$info <- info
      private$version <- 0L

      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
114

115
116
    create_valid = function(data,
                            info = list(),
                            label = NULL,
                            weight = NULL,
                            group = NULL,
                            init_score = NULL,
                            params = list(),
                            ...) {

      # [description] Create a new Dataset that uses this Dataset as its reference
      #               and inherits its column names, categorical features,
      #               predictor and free_raw_data setting.
      # [return] The new Dataset, invisibly.

      extra_params <- list(...)
      if (length(extra_params) > 0L) {
        warning(paste0(
          "Dataset$create_valid(): Found the following passed through '...': "
          , paste(names(extra_params), collapse = ", ")
          , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
          , "Add these to 'params' instead. "
          , "See ?lgb.Dataset.create.valid for documentation on how to call this function."
        ))
      }

      # Precedence, lowest to highest:
      # this Dataset's own parameters < values from '...' < values from 'params'
      call_params <- utils::modifyList(extra_params, params)
      merged_params <- utils::modifyList(self$get_params(), call_params)

      valid_set <- Dataset$new(
        data = data
        , params = merged_params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = NULL
        , info = info
        , label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )

      return(invisible(valid_set))

    },
James Lamb's avatar
James Lamb committed
161

162
    # Dataset constructor
Guolin Ke's avatar
Guolin Ke committed
163
    construct = function() {

      # [description] Build the native Dataset for this object if it does not
      #               exist yet: resolve feature names and categorical features,
      #               create the handle (from a file path, a matrix, a dgCMatrix,
      #               or as a subset of the reference), then push feature names
      #               and stored fields to it.
      # [return] self, invisibly. Errors if no handle could be created or if no
      #          label is available afterwards.

      # Already constructed: nothing to do
      if (!lgb.is.null.handle(x = private$handle)) {
        return(invisible(self))
      }

      raw_is_matrix_like <- is.matrix(private$raw_data) ||
        methods::is(private$raw_data, "dgCMatrix")

      # Fall back to the raw data's own column names when none were supplied
      cnames <- NULL
      if (raw_is_matrix_like) {
        cnames <- colnames(private$raw_data)
      }
      if (is.null(private$colnames) && !is.null(cnames)) {
        private$colnames <- as.character(cnames)
      }

      # Translate categorical_feature into 0-based indices
      if (!is.null(private$categorical_feature)) {

        if (is.character(private$categorical_feature)) {

          # Names are resolved against the known feature names
          matched <- match(private$categorical_feature, private$colnames)
          unknown <- is.na(matched)
          if (any(unknown)) {
            # separate the names so several unknown features stay readable
            stop(
              "lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
              , paste0(sQuote(private$categorical_feature[unknown]), collapse = ", ")
            )
          }
          cate_indices <- as.list(matched - 1L)

        } else {

          # Number of features to validate against. Prefer the real column
          # count of in-memory data; fall back to the known feature names.
          # When neither is available (e.g. data is a file path without
          # supplied names) the count is unknown, so the check is skipped
          # instead of comparing against zero.
          n_features <- NULL
          if (raw_is_matrix_like) {
            n_features <- ncol(private$raw_data)
          } else if (!is.null(private$colnames)) {
            n_features <- length(private$colnames)
          }

          if (!is.null(n_features) && max(private$categorical_feature) > n_features) {
            stop(
              "lgb.self.get.handle: supplied a too large value in categorical_feature: "
              , max(private$categorical_feature)
              , " but only "
              , n_features
              , " features"
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1L)

        }

        private$params$categorical_feature <- cate_indices

      }

      # Serialize parameters for the native calls
      params_str <- lgb.params2str(params = private$params)

      # Handle of the reference Dataset (obtained through its private get_handle())
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }

      if (is.null(private$used_indices)) {

        # not subsetting, constructing from raw data
        if (is.null(private$raw_data)) {
          stop(paste0(
            "Attempting to create a Dataset without any raw data. "
            , "This can happen if you have called Dataset$finalize() or if this Dataset was saved with saveRDS(). "
            , "To avoid this error in the future, use lgb.Dataset.save() or "
            , "Dataset$save_binary() to save lightgbm Datasets."
          ))
        }

        if (is.character(private$raw_data)) {

          # character raw data is handed to the from-file routine
          handle <- .Call(
            LGBM_DatasetCreateFromFile_R
            , private$raw_data
            , params_str
            , ref_handle
          )

        } else if (is.matrix(private$raw_data)) {

          # dense matrix
          handle <- .Call(
            LGBM_DatasetCreateFromMat_R
            , private$raw_data
            , nrow(private$raw_data)
            , ncol(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (methods::is(private$raw_data, "dgCMatrix")) {

          # sparse column-compressed matrix
          if (length(private$raw_data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }
          handle <- .Call(
            LGBM_DatasetCreateFromCSC_R
            , private$raw_data@p
            , private$raw_data@i
            , private$raw_data@x
            , length(private$raw_data@p)
            , length(private$raw_data@x)
            , nrow(private$raw_data)
            , params_str
            , ref_handle
          )

        } else {

          # Unknown data type
          stop(
            "lgb.Dataset.construct: does not support constructing from "
            , sQuote(class(private$raw_data))
          )

        }

      } else {

        # subsetting: rows are taken from the reference Dataset
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        handle <- .Call(
          LGBM_DatasetGetSubset_R
          , ref_handle
          , c(private$used_indices) # Adding c() fixes issue in R v3.5
          , length(private$used_indices)
          , params_str
        )

      }

      if (lgb.is.null.handle(x = handle)) {
        stop("lgb.Dataset.construct: cannot create Dataset handle")
      }

      # Tag the handle and keep it
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Set feature names
      if (!is.null(private$colnames)) {
        self$set_colnames(colnames = private$colnames)
      }

      # With a predictor (and no subsetting), its raw-score predictions on the
      # raw data become the init_score field
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        init_score <- private$predictor$predict(
          data = private$raw_data
          , rawscore = TRUE
          , reshape = TRUE
        )

        # flattened as returned; the original author notes that no transpose
        # is needed because the result is column-major
        private$info$init_score <- as.vector(init_score)

      }

      # Drop the R-side copy of the data when requested
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Push every stored field to the native Dataset, in stored order
      for (i in seq_along(private$info)) {
        p <- private$info[i]
        self$set_field(
          field_name = names(p)
          , data = p[[1L]]
        )
      }

      # A Dataset without a label is rejected
      if (is.null(self$get_field(field_name = "label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
364

365
    # Dimension function
Guolin Ke's avatar
Guolin Ke committed
366
    dim = function() {

      # [description] Dimensions of the Dataset. They come from the native handle
      #               when the Dataset is constructed, otherwise from the raw data
      #               if that is a matrix or dgCMatrix.
      # [return] c(number of rows, number of columns); errors when neither source
      #          is available.

      if (lgb.is.null.handle(x = private$handle)) {

        # Not constructed yet: only in-memory matrix-like data can answer
        # NOTE: dgCMatrix support requires the Matrix package
        if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
          return(dim(private$raw_data))
        }

        # Trying to work with unknown dimensions is not possible
        stop(
          "dim: cannot get dimensions before dataset has been constructed, "
          , "please call lgb.Dataset.construct explicitly"
        )

      }

      # The results of the two .Call()s are not assigned; what is returned is
      # whatever these placeholders hold afterwards, so the native routines
      # presumably fill them in place (confirm against the C sources).
      # Keep them as two separate assignments.
      n_rows <- 0L
      n_cols <- 0L
      .Call(
        LGBM_DatasetGetNumData_R
        , private$handle
        , n_rows
      )
      .Call(
        LGBM_DatasetGetNumFeature_R
        , private$handle
        , n_cols
      )

      return(c(n_rows, n_cols))

    },
James Lamb's avatar
James Lamb committed
406

407
    # Get column names
Guolin Ke's avatar
Guolin Ke committed
408
    get_colnames = function() {

      # [description] Feature names of the Dataset. For a constructed Dataset they
      #               are read from the native handle and cached in
      #               private$colnames; otherwise they come from the raw data if
      #               it is a matrix or dgCMatrix.
      # [return] The feature names; errors when neither source is available.

      has_handle <- !lgb.is.null.handle(x = private$handle)

      if (has_handle) {
        private$colnames <- .Call(
          LGBM_DatasetGetFeatureNames_R
          , private$handle
        )
        return(private$colnames)
      }

      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        return(colnames(private$raw_data))
      }

      # Trying to work with unknown formats is not possible
      stop(
        "Dataset$get_colnames(): cannot get column names before dataset has been constructed, please call "
        , "lgb.Dataset.construct() explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
434

435
    # Set column names
Guolin Ke's avatar
Guolin Ke committed
436
    set_colnames = function(colnames) {

      # [description] Store feature names and, when the Dataset is already
      #               constructed, send them to the native handle as a single
      #               tab-separated string. NULL or empty input is a no-op.
      # [return] self, invisibly.

      # as.character(NULL) is character(0), so one length check covers both
      # the NULL and the empty case
      feature_names <- as.character(colnames)
      if (length(feature_names) == 0L) {
        return(invisible(self))
      }

      private$colnames <- feature_names

      if (!lgb.is.null.handle(x = private$handle)) {
        .Call(
          LGBM_DatasetSetFeatureNames_R
          , private$handle
          , paste(private$colnames, collapse = "\t")
        )
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
466

Guolin Ke's avatar
Guolin Ke committed
467
    getinfo = function(name) {
      # Deprecated alias: warns, then delegates to get_field()
      deprecation_msg <- paste0(
        "Dataset$getinfo() is deprecated and will be removed in a future release. "
        , "Use Dataset$get_field() instead."
      )
      warning(deprecation_msg)
      return(self$get_field(field_name = name))
    },

    get_field = function(field_name) {

      # [description] Value of one Dataset field. A value already held in
      #               private$info is returned directly; otherwise it is read
      #               from the native handle and cached in private$info.
      # [params] field_name: a single string, one of .INFO_KEYS().
      # [return] The field's values, or NULL when the native side reports a
      #          size of zero.

      is_known_field <- is.character(field_name) &&
        length(field_name) == 1L &&
        field_name %in% .INFO_KEYS()
      if (!is_known_field) {
        stop(
          "Dataset$get_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      # Cached on the R side already
      cached <- private$info[[field_name]]
      if (!is.null(cached)) {
        return(cached)
      }

      if (lgb.is.null.handle(x = private$handle)) {
        stop("Cannot perform Dataset$get_field() before constructing Dataset.")
      }

      # The .Call() result is not assigned; n_values is read right afterwards,
      # so the native routine presumably writes the size into it in place
      # (confirm against the C sources).
      n_values <- 0L
      .Call(
        LGBM_DatasetGetFieldSize_R
        , private$handle
        , field_name
        , n_values
      )

      if (n_values > 0L) {

        # "group" uses an integer buffer, every other field a numeric one
        buffer <- if (field_name == "group") {
          integer(n_values)
        } else {
          numeric(n_values)
        }

        .Call(
          LGBM_DatasetGetField_R
          , private$handle
          , field_name
          , buffer
        )

        private$info[[field_name]] <- buffer

      }

      return(private$info[[field_name]])

    },
James Lamb's avatar
James Lamb committed
531

Guolin Ke's avatar
Guolin Ke committed
532
    setinfo = function(name, info) {
      # Deprecated alias: warns, then delegates to set_field()
      deprecation_msg <- paste0(
        "Dataset$setinfo() is deprecated and will be removed in a future release. "
        , "Use Dataset$set_field() instead."
      )
      warning(deprecation_msg)
      return(self$set_field(field_name = name, data = info))
    },

    set_field = function(field_name, data) {

      # [description] Store one Dataset field on the R side and, when the Dataset
      #               is constructed and the value is non-empty, send it to the
      #               native handle and bump private$version.
      # [params] field_name: a single string, one of .INFO_KEYS().
      #          data: values for the field; coerced to integer for "group" and
      #          to numeric for every other field.
      # [return] self, invisibly.

      is_known_field <- is.character(field_name) &&
        length(field_name) == 1L &&
        field_name %in% .INFO_KEYS()
      if (!is_known_field) {
        stop(
          "Dataset$set_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      values <- if (field_name == "group") {
        as.integer(data)
      } else {
        as.numeric(data)
      }

      # Always remembered on the R side, even before construction
      private$info[[field_name]] <- values

      has_handle <- !lgb.is.null.handle(x = private$handle)
      if (has_handle && !is.null(values) && length(values) > 0L) {

        .Call(
          LGBM_DatasetSetField_R
          , private$handle
          , field_name
          , values
          , length(values)
        )

        private$version <- private$version + 1L

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
586

587
    # Slice dataset
Guolin Ke's avatar
Guolin Ke committed
588
    slice = function(idxset, ...) {

      # [description] Create a new Dataset made of the rows 'idxset' of this one.
      #               The result references this Dataset and carries no raw data
      #               of its own.
      # [params] idxset: row indices; they are sorted in increasing order.
      #          ...: deprecated; any use triggers a warning.
      # [return] The new Dataset.
      #
      # NOTE(review): the warning text says values in '...' are ignored, but the
      # code below still forwards them (fields as fields, everything else as
      # parameter overrides). Behavior is left as found.

      extra_args <- list(...)

      if (length(extra_args) > 0L) {
        warning(paste0(
          "Dataset$slice(): Found the following passed through '...': "
          , paste(names(extra_args), collapse = ", ")
          , ". These are ignored and should be removed. "
          , "To change the parameters of a Dataset produced by Dataset$slice(), use Dataset$set_params(). "
          , "To modify attributes like 'init_score', use Dataset$set_field(). "
          , "In future releases of lightgbm, this warning will become an error."
        ))
      }

      # Pull the Dataset fields out of '...'.
      # list[["non-existent-key"]] is NULL, which is exactly the "not supplied" value
      label <- extra_args[["label"]]
      weight <- extra_args[["weight"]]
      group <- extra_args[["group"]]
      init_score <- extra_args[["init_score"]]

      # What is left after dropping the field entries are parameter overrides
      param_overrides <- extra_args
      for (field_key in .INFO_KEYS()) {
        param_overrides[[field_key]] <- NULL
      }

      subset_params <- utils::modifyList(self$get_params(), param_overrides)
      sorted_rows <- sort(idxset, decreasing = FALSE)

      return(
        Dataset$new(
          data = NULL
          , params = subset_params
          , reference = self
          , colnames = private$colnames
          , categorical_feature = private$categorical_feature
          , predictor = private$predictor
          , free_raw_data = private$free_raw_data
          , used_indices = sorted_rows
          , info = NULL
          , group = group
          , init_score = init_score
          , label = label
          , weight = weight
        )
      )

    },
James Lamb's avatar
James Lamb committed
636

637
638
639
    # [description] Update Dataset parameters. If it has not been constructed yet,
    #               this operation just happens on the R side (updating private$params).
    #               If it has been constructed, parameters will be updated on the C++ side.
640
    update_params = function(params) {

      # [description] Merge 'params' into this Dataset's parameters.
      #               - Not constructed yet: only private$params is updated.
      #               - Constructed: the change is first validated by the native
      #                 LGBM_DatasetUpdateParamChecking_R routine. If the check
      #                 passes, private$params is updated. If it fails and raw
      #                 data is still available, private$params is updated and the
      #                 Dataset is finalized so that it can be constructed again.
      #                 If it fails and raw data is gone, the error is re-raised.
      # [params] params: list of parameters to merge in; empty input is a no-op.
      # [return] self, invisibly.

      if (length(params) == 0L) {
        return(invisible(self))
      }

      if (lgb.is.null.handle(x = private$handle)) {
        private$params <- utils::modifyList(private$params, params)
        return(invisible(self))
      }

      check_passed <- tryCatch({
        .Call(
          LGBM_DatasetUpdateParamChecking_R
          , lgb.params2str(params = private$params)
          , lgb.params2str(params = params)
        )
        TRUE
      }, error = function(e) {
        # If updating failed but raw data is not available, raise an error because
        # achieving what the user asked for is not possible
        if (is.null(private$raw_data)) {
          stop(e)
        }

        # If updating failed but raw data is available, modify the params
        # on the R side and re-set ("deconstruct") the Dataset
        private$params <- utils::modifyList(private$params, params)
        self$finalize()
        FALSE
      })

      # The check accepted the change: record it on the R side too, so that
      # get_params() (and Datasets derived from it) see the new values instead
      # of the stale ones.
      if (isTRUE(check_passed)) {
        private$params <- utils::modifyList(private$params, params)
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
669

670
671
672
673
674
675
676
677
678
679
680
    get_params = function() {
      # [description] The subset of this Dataset's stored parameters whose names
      #               appear among the values of .DATASET_PARAMETERS().
      # [return] A named list; empty when nothing matches.
      known_names <- unname(unlist(.DATASET_PARAMETERS()))
      kept_names <- intersect(names(private$params), known_names)
      dataset_only <- list()
      for (param_name in kept_names) {
        dataset_only[[param_name]] <- private$params[[param_name]]
      }
      return(dataset_only)
    },

681
    # Set categorical feature parameter
682
    set_categorical_feature = function(categorical_feature) {

      # [description] Replace the categorical feature specification. Unless the
      #               new value is identical to the stored one, this needs the
      #               raw data to still be present, and it finalizes the Dataset.
      # [return] self, invisibly.

      unchanged <- identical(private$categorical_feature, categorical_feature)
      if (unchanged) {
        return(invisible(self))
      }

      # Without raw data the change cannot be applied
      if (is.null(private$raw_data)) {
        stop("set_categorical_feature: cannot set categorical feature after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      private$categorical_feature <- categorical_feature

      # Drop the existing native handle
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
703

704
    # Set reference
Guolin Ke's avatar
Guolin Ke committed
705
    set_reference = function(reference) {

      # [description] Make another Dataset the reference of this one, copying its
      #               categorical features, column names and predictor. Unless
      #               the reference is identical to the stored one, this needs
      #               the raw data to still be present, and it finalizes the
      #               Dataset.
      # [return] self, invisibly.

      # same reference as before: nothing to change
      if (identical(private$reference, reference)) {
        return(invisible(self))
      }

      # changing the reference removes the Dataset object on the C++ side, so it should only
      # be done if you still have the raw_data available, so that the new Dataset can be reconstructed
      if (is.null(private$raw_data)) {
        stop("set_reference: cannot set reference after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      if (!lgb.is.Dataset(reference)) {
        stop("set_reference: Can only use lgb.Dataset as a reference")
      }

      # Align this Dataset with the reference, in this order
      reference_private <- reference$.__enclos_env__$private
      self$set_categorical_feature(categorical_feature = reference_private$categorical_feature)
      self$set_colnames(colnames = reference$get_colnames())
      private$set_predictor(predictor = reference_private$predictor)

      private$reference <- reference

      # Drop the existing native handle
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
736

737
    # Save the Dataset to a binary file
Guolin Ke's avatar
Guolin Ke committed
738
    save_binary = function(fname) {

      # [description] Construct this Dataset (via self$construct()) and hand its handle
      #               together with 'fname' to LGBM_DatasetSaveBinary_R.
      # [param] fname passed through unchanged to LGBM_DatasetSaveBinary_R
      #               (presumably the output file path -- confirm against the C API)
      # [return] self, invisibly

      # Store binary data
      self$construct()
      .Call(
        LGBM_DatasetSaveBinary_R
        , private$handle
        , fname
      )
      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
749

Guolin Ke's avatar
Guolin Ke committed
750
751
  ),
  private = list(
752
753
754
755
756
    # handle returned by get_handle(); passed to the C++ side through .Call()
    handle = NULL,
    # setters refuse to modify the Dataset once this is NULL
    raw_data = NULL,
    params = list(),
    # stored by set_reference()
    reference = NULL,
    colnames = NULL,
    # overwritten by set_categorical_feature()
    categorical_feature = NULL,
    # stored by set_predictor(); either NULL or an lgb.Predictor
    predictor = NULL,
    free_raw_data = TRUE,
    used_indices = NULL,
    info = NULL,
    # NOTE(review): purpose of 'version' is not visible in this part of the file
    version = 0L,
James Lamb's avatar
James Lamb committed
763

764
765
    # Get handle
    get_handle = function() {

      # [description] Return private$handle, calling self$construct() first
      #               when that handle is still a null handle.
      # [return] private$handle

      # Get handle and construct if needed
      if (lgb.is.null.handle(x = private$handle)) {
        self$construct()
      }
      return(private$handle)

    },
James Lamb's avatar
James Lamb committed
774

775
    # Set predictor
Guolin Ke's avatar
Guolin Ke committed
776
    set_predictor = function(predictor) {

      # [description] Store a predictor on this Dataset and finalize it.
      # [param] predictor NULL or an lgb.Predictor object
      # [return] self, invisibly
      # [raises] an error if raw data is no longer held, or if 'predictor' is
      #          neither NULL nor an lgb.Predictor

      # Nothing to do when the same predictor is already stored
      if (identical(private$predictor, predictor)) {
        return(invisible(self))
      }

      # Check for empty data
      if (is.null(private$raw_data)) {
        stop("set_predictor: cannot set predictor after free raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # Check for empty predictor
      if (!is.null(predictor)) {

        # Predictor is unknown
        if (!lgb.is.Predictor(predictor)) {
          stop("set_predictor: Can only use lgb.Predictor as predictor")
        }

      }

      # Store predictor
      private$predictor <- predictor

      # Finalize and return self
      self$finalize()
      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
806

Guolin Ke's avatar
Guolin Ke committed
807
808
809
  )
)

810
811
812
#' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#'              or local file (that was created previously by saving an \code{lgb.Dataset}).
813
#' @inheritParams lgb_shared_dataset_params
814
815
816
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{lgb.Dataset} file
817
818
819
820
821
822
823
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values.
#' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning
#'                  continuous features into histograms. If you want to apply the same bin boundaries from an existing
#'                  dataset to new \code{data}, pass that existing Dataset to this argument.
Guolin Ke's avatar
Guolin Ke committed
824
#' @param colnames names of columns
825
826
827
828
829
830
831
832
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param free_raw_data LightGBM constructs its data format, called a "Dataset", from tabular data.
#'                      By default, that Dataset object on the R side does not keep a copy of the raw data.
#'                      This reduces LightGBM's memory consumption, but it means that the Dataset object
#'                      cannot be changed after it has been constructed. If you'd prefer to be able to
#'                      change the Dataset object after construction, set \code{free_raw_data = FALSE}.
833
#' @param ... other parameters passed to \code{params}
James Lamb's avatar
James Lamb committed
834
#'
Guolin Ke's avatar
Guolin Ke committed
835
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
836
#'
Guolin Ke's avatar
Guolin Ke committed
837
#' @examples
838
#' \donttest{
839
840
841
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
842
843
844
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
845
#' lgb.Dataset.construct(dtrain)
846
#' }
Guolin Ke's avatar
Guolin Ke committed
847
848
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        info = list(),
                        label = NULL,
                        weight = NULL,
                        group = NULL,
                        init_score = NULL,
                        ...) {

  # Anything passed through '...' is merged into 'params'
  # (values from '...' win over same-named entries in 'params')
  additional_params <- list(...)
  params <- modifyList(params, additional_params)

  # Passing parameters through '...' still works, but is deprecated
  if (length(additional_params) > 0L) {
    warning(paste0(
      "lgb.Dataset: Found the following passed through '...': "
      , paste(names(additional_params), collapse = ", ")
      , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
      , "Add these to 'params' instead. See ?lgb.Dataset for documentation on how to call this function."
    ))
  }

  # Create new dataset
  return(
    invisible(Dataset$new(
      data = data
      , params = params
      , reference = reference
      , colnames = colnames
      , categorical_feature = categorical_feature
      , predictor = NULL
      , free_raw_data = free_raw_data
      , used_indices = NULL
      , info = info
      , label = label
      , weight = weight
      , group = group
      , init_score = init_score
    ))
  )

}

894
895
896
#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
897
#' @inheritParams lgb_shared_dataset_params
Guolin Ke's avatar
Guolin Ke committed
898
#' @param dataset \code{lgb.Dataset} object, training data
899
900
901
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{Dataset} file
902
903
904
905
906
907
908
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values. If this is an empty list (the default), the validation Dataset
#'               will have the same parameters as the Dataset passed to argument \code{dataset}.
#' @param ... additional \code{lgb.Dataset} parameters.
#'            NOTE: As of v3.3.0, use of \code{...} is deprecated. Add parameters to \code{params} directly.
James Lamb's avatar
James Lamb committed
909
#'
Guolin Ke's avatar
Guolin Ke committed
910
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
911
#'
Guolin Ke's avatar
Guolin Ke committed
912
#' @examples
913
#' \donttest{
914
915
916
917
918
919
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
#'
#' # parameters can be changed between the training data and validation set,
#' # for example to account for training data in a text file with a header row
#' # and validation data in a text file without it
#' train_file <- tempfile(pattern = "train_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = train_file
#'   , sep = ","
#'   , col.names = TRUE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' valid_file <- tempfile(pattern = "valid_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = valid_file
#'   , sep = ","
#'   , col.names = FALSE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' dtrain <- lgb.Dataset(
#'   data = train_file
#'   , params = list(has_header = TRUE)
#' )
#' dtrain$construct()
#'
#' dvalid <- lgb.Dataset(
#'   data = valid_file
#'   , params = list(has_header = FALSE)
#' )
#' dvalid$construct()
955
#' }
956
#' @importFrom utils modifyList
Guolin Ke's avatar
Guolin Ke committed
957
#' @export
958
959
960
961
962
963
964
965
966
lgb.Dataset.create.valid <- function(dataset,
                                     data,
                                     info = list(),
                                     label = NULL,
                                     weight = NULL,
                                     group = NULL,
                                     init_score = NULL,
                                     params = list(),
                                     ...) {

  # The training Dataset does the actual work, so it must be an lgb.Dataset
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # Passing parameters through '...' still works, but is deprecated
  additional_params <- list(...)
  if (length(additional_params) > 0L) {
    warning(paste0(
      "lgb.Dataset.create.valid: Found the following passed through '...': "
      , paste(names(additional_params), collapse = ", ")
      , ". These will be used, but in future releases of lightgbm, this warning will become an error. "
      , "Add these to 'params' instead. See ?lgb.Dataset.create.valid for documentation on how to call this function."
    ))
  }

  # Create validation dataset
  # (values from '...' win over same-named entries in 'params')
  return(invisible(
    dataset$create_valid(
      data = data
      , info = info
      , label = label
      , weight = weight
      , group = group
      , init_score = init_score
      , params = utils::modifyList(params, additional_params)
    )
  ))

}
Guolin Ke's avatar
Guolin Ke committed
996

997
998
999
#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
Guolin Ke's avatar
Guolin Ke committed
1000
#' @param dataset Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
1001
#'
Guolin Ke's avatar
Guolin Ke committed
1002
#' @examples
1003
#' \donttest{
1004
1005
1006
1007
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
1008
#' }
1009
#' @return constructed dataset
Guolin Ke's avatar
Guolin Ke committed
1010
1011
#' @export
lgb.Dataset.construct <- function(dataset) {

  # Only lgb.Dataset objects expose the construct() method
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }

  # Delegate to the R6 method; its result is returned invisibly
  return(invisible(dataset$construct()))

}

1021
1022
#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
Guolin Ke's avatar
Guolin Ke committed
1023
#' @param x Object of class \code{lgb.Dataset}
1024
#' @param ... other parameters (ignored)
James Lamb's avatar
James Lamb committed
1025
#'
Guolin Ke's avatar
Guolin Ke committed
1026
#' @return a vector of numbers of rows and of columns
James Lamb's avatar
James Lamb committed
1027
#'
Guolin Ke's avatar
Guolin Ke committed
1028
1029
1030
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
James Lamb's avatar
James Lamb committed
1031
#'
Guolin Ke's avatar
Guolin Ke committed
1032
#' @examples
1033
#' \donttest{
1034
1035
1036
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
1037
#'
1038
1039
1040
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
1041
#' }
Guolin Ke's avatar
Guolin Ke committed
1042
1043
1044
#' @rdname dim
#' @export
dim.lgb.Dataset <- function(x, ...) {

  # Extra arguments are accepted for now but ignored, with a deprecation warning
  additional_args <- list(...)
  if (length(additional_args) > 0L) {
    warning(paste0(
      "dim.lgb.Dataset: Found the following passed through '...': "
      , paste(names(additional_args), collapse = ", ")
      , ". These are ignored. In future releases of lightgbm, this warning will become an error. "
      , "See ?dim.lgb.Dataset for documentation on how to call this function."
    ))
  }

  if (!lgb.is.Dataset(x = x)) {
    stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # Delegate to the R6 method
  return(x$dim())

}

1064
1065
1066
#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}, thus setting of
#'              row names would have no effect and returned row names would be NULL.
Guolin Ke's avatar
Guolin Ke committed
1067
1068
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one must be \code{NULL} (row names
#'              are not supported) and the second one is column names
Guolin Ke's avatar
Guolin Ke committed
1070
1071
1072
1073
1074
1075
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
1076
#' \donttest{
1077
1078
1079
1080
1081
1082
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
1083
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
1084
#' print(dtrain, verbose = TRUE)
1085
#' }
Guolin Ke's avatar
Guolin Ke committed
1086
#' @rdname dimnames.lgb.Dataset
1087
#' @return A list with the dimension names of the dataset
Guolin Ke's avatar
Guolin Ke committed
1088
1089
#' @export
dimnames.lgb.Dataset <- function(x) {

  if (!lgb.is.Dataset(x = x)) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # Return dimension names
  # (the row-names slot is always NULL; only column names are reported)
  return(list(NULL, x$get_colnames()))

}

#' @rdname dimnames.lgb.Dataset
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # Check if invalid element list
  # ('value' must be a plain list of exactly two elements: row names, column names)
  if (!identical(class(value), "list") || length(value) != 2L) {
    stop("invalid ", sQuote("value"), " given: must be a list of two elements")
  }

  # Check for unknown row names
  if (!is.null(value[[1L]])) {
    stop("lgb.Dataset does not have rownames")
  }

  # NULL column names are forwarded as-is, skipping the length check below
  if (is.null(value[[2L]])) {

    x$set_colnames(colnames = NULL)
    return(x)

  }

  # Check for unmatching column size
  if (ncol(x) != length(value[[2L]])) {
    stop(
      "can't assign "
      , sQuote(length(value[[2L]]))
      , " colnames to an lgb.Dataset with "
      , sQuote(ncol(x))
      , " columns"
    )
  }

  # Set column names properly, and return
  x$set_colnames(colnames = value[[2L]])
  return(x)

}

1138
1139
1140
#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#'              original \code{lgb.Dataset} object
Nikita Titov's avatar
Nikita Titov committed
1141
#' @param dataset Object of class \code{lgb.Dataset}
1142
#' @param idxset an integer vector of indices of rows needed
Guolin Ke's avatar
Guolin Ke committed
1143
1144
#' @param ... other parameters (currently not used)
#' @return constructed sub dataset
James Lamb's avatar
James Lamb committed
1145
#'
Guolin Ke's avatar
Guolin Ke committed
1146
#' @examples
1147
#' \donttest{
1148
1149
1150
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
1151
#'
1152
#' dsub <- lightgbm::slice(dtrain, seq_len(42L))
1153
#' lgb.Dataset.construct(dsub)
1154
#' labels <- lightgbm::get_field(dsub, "label")
1155
#' }
Guolin Ke's avatar
Guolin Ke committed
1156
#' @export
1157
1158
1159
slice <- function(dataset, ...) {
  # S3 generic: dispatches on the class of 'dataset'
  UseMethod(generic = "slice")
}
Guolin Ke's avatar
Guolin Ke committed
1160
1161
1162
1163

#' @rdname slice
#' @export
slice.lgb.Dataset <- function(dataset, idxset, ...) {

  if (!lgb.is.Dataset(x = dataset)) {
    stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the R6 method, forwarding any extra arguments
  return(invisible(dataset$slice(idxset = idxset, ...)))

}

1173
1174
1175
#' @name getinfo
#' @title Get information of an \code{lgb.Dataset} object
#' @description Get one attribute of a \code{lgb.Dataset}
Guolin Ke's avatar
Guolin Ke committed
1176
1177
#' @param dataset Object of class \code{lgb.Dataset}
#' @param name the name of the information field to get (see details)
1178
#' @param ... other parameters (ignored)
Guolin Ke's avatar
Guolin Ke committed
1179
#' @return info data
James Lamb's avatar
James Lamb committed
1180
#'
Guolin Ke's avatar
Guolin Ke committed
1181
1182
#' @details
#' The \code{name} field can be one of the following:
James Lamb's avatar
James Lamb committed
1183
#'
Guolin Ke's avatar
Guolin Ke committed
1184
1185
1186
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
1187
1188
1189
1190
1191
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
Nikita Titov's avatar
Nikita Titov committed
1192
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
Guolin Ke's avatar
Guolin Ke committed
1193
#' }
James Lamb's avatar
James Lamb committed
1194
#'
Guolin Ke's avatar
Guolin Ke committed
1195
#' @examples
1196
#' \donttest{
1197
1198
1199
1200
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
James Lamb's avatar
James Lamb committed
1201
#'
1202
1203
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
James Lamb's avatar
James Lamb committed
1204
#'
1205
1206
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
1207
#' }
Guolin Ke's avatar
Guolin Ke committed
1208
#' @export
1209
1210
1211
getinfo <- function(dataset, ...) {
  # S3 generic: dispatches on the class of 'dataset'
  UseMethod(generic = "getinfo")
}
Guolin Ke's avatar
Guolin Ke committed
1212
1213
1214
1215

#' @rdname getinfo
#' @export
getinfo.lgb.Dataset <- function(dataset, name, ...) {

  # Deprecated entry point: always warns, then forwards to the get_field() method
  warning("Calling getinfo() on a lgb.Dataset is deprecated. Use get_field() instead.")

  # Extra arguments are accepted for now but ignored, with a deprecation warning
  additional_args <- list(...)
  if (length(additional_args) > 0L) {
    warning(paste0(
      "getinfo.lgb.Dataset: Found the following passed through '...': "
      , paste(names(additional_args), collapse = ", ")
      , ". These are ignored. In future releases of lightgbm, this warning will become an error. "
      , "See ?getinfo.lgb.Dataset for documentation on how to call this function."
    ))
  }

  if (!lgb.is.Dataset(x = dataset)) {
    stop("getinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  return(dataset$get_field(field_name = name))

}

1237
1238
1239
#' @name setinfo
#' @title Set information of an \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
Nikita Titov's avatar
Nikita Titov committed
1240
#' @param dataset Object of class \code{lgb.Dataset}
Guolin Ke's avatar
Guolin Ke committed
1241
1242
#' @param name the name of the field to set
#' @param info the specific field of information to set
1243
#' @param ... other parameters (ignored)
1244
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1245
#'
Guolin Ke's avatar
Guolin Ke committed
1246
1247
#' @details
#' The \code{name} field can be one of the following:
James Lamb's avatar
James Lamb committed
1248
#'
Guolin Ke's avatar
Guolin Ke committed
1249
#' \itemize{
1250
1251
1252
1253
1254
#'     \item{\code{label}: vector of labels to use as the target variable}
#'     \item{\code{weight}: to do a weight rescale}
#'     \item{\code{init_score}: initial score is the base prediction lightgbm will boost from}
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
1255
1256
1257
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
Guolin Ke's avatar
Guolin Ke committed
1258
#' }
James Lamb's avatar
James Lamb committed
1259
#'
Guolin Ke's avatar
Guolin Ke committed
1260
#' @examples
1261
#' \donttest{
1262
1263
1264
1265
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
James Lamb's avatar
James Lamb committed
1266
#'
1267
1268
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
James Lamb's avatar
James Lamb committed
1269
#'
1270
1271
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
1272
#' }
Guolin Ke's avatar
Guolin Ke committed
1273
#' @export
1274
1275
1276
setinfo <- function(dataset, ...) {
  # S3 generic: dispatches on the class of 'dataset'
  UseMethod(generic = "setinfo")
}
Guolin Ke's avatar
Guolin Ke committed
1277
1278
1279
1280

#' @rdname setinfo
#' @export
setinfo.lgb.Dataset <- function(dataset, name, info, ...) {

  # Deprecated entry point: always warns, then forwards to the set_field() method
  warning("Calling setinfo() on a lgb.Dataset is deprecated. Use set_field() instead.")

  # Extra arguments are accepted for now but ignored, with a deprecation warning
  additional_args <- list(...)
  if (length(additional_args) > 0L) {
    warning(paste0(
      "setinfo.lgb.Dataset: Found the following passed through '...': "
      , paste(names(additional_args), collapse = ", ")
      , ". These are ignored. In future releases of lightgbm, this warning will become an error. "
      , "See ?setinfo.lgb.Dataset for documentation on how to call this function."
    ))
  }

  if (!lgb.is.Dataset(x = dataset)) {
    stop("setinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # 'info' is the new value for field 'name'
  return(invisible(dataset$set_field(field_name = name, data = info)))
}

#' @name get_field
#' @title Get one attribute of a \code{lgb.Dataset}
#' @description Get one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to get. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @return requested attribute
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
#' }
#' @export
get_field <- function(dataset, field_name) {
  # S3 generic: dispatches on the class of 'dataset'
  UseMethod(generic = "get_field")
}

#' @rdname get_field
#' @export
get_field.lgb.Dataset <- function(dataset, field_name) {

  # Happy path first: a genuine lgb.Dataset answers through its own get_field() method
  if (lgb.is.Dataset(x = dataset)) {
    return(dataset$get_field(field_name = field_name))
  }

  # Anything else is rejected
  stop("get_field.lgb.Dataset(): input dataset should be an lgb.Dataset object")

}

#' @name set_field
#' @title Set one attribute of a \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to set. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @param data The data for the field. See examples.
#' @return The \code{lgb.Dataset} you passed in.
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
#' }
#' @export
set_field <- function(dataset, field_name, data) {
  # S3 generic: dispatches on the class of 'dataset'
  UseMethod(generic = "set_field")
}

#' @rdname set_field
#' @export
set_field.lgb.Dataset <- function(dataset, field_name, data) {

  if (!lgb.is.Dataset(x = dataset)) {
    stop("set_field.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the R6 method; its result is returned invisibly
  return(invisible(dataset$set_field(field_name = field_name, data = data)))

}

#' @name lgb.Dataset.set.categorical
#' @title Set categorical feature of \code{lgb.Dataset}
#' @description Set the categorical features of an \code{lgb.Dataset} object. Use this function
#'              to tell LightGBM which features should be treated as categorical.
#' @param dataset object of class \code{lgb.Dataset}
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @return the dataset you passed in
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
#' lgb.Dataset.set.categorical(dtrain, 1L:2L)
#' }
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  # Fail early with a function-specific message if the input is not a Dataset
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the object's set_categorical_feature() method; the result is
  # returned invisibly so the call does not auto-print at the console.
  return(invisible(dataset$set_categorical_feature(categorical_feature = categorical_feature)))

}

#' @name lgb.Dataset.set.reference
#' @title Set reference of \code{lgb.Dataset}
#' @description If you want to use validation data, you should set reference to training data
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
#'
#' @return the dataset you passed in
#'
#' @examples
#' \donttest{
#' # create training Dataset
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' # create a validation Dataset, using dtrain as a reference
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset(test$data, label = test$label)
#' lgb.Dataset.set.reference(dtest, dtrain)
#' }
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # Only `dataset` is validated here; `reference` is passed through unchanged
  # to the object's set_reference() method.
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")
  }

  # Result is returned invisibly so the call does not auto-print at the console
  return(invisible(dataset$set_reference(reference = reference)))

}

#' @name lgb.Dataset.save
#' @title Save \code{lgb.Dataset} to a binary file
#' @description Please note that \code{init_score} is not saved in binary file.
#'              If you need it, please set it again after loading Dataset.
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname character string with the path of the output file
#'
#' @return the dataset you passed in
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.save(dtrain, tempfile(fileext = ".bin"))
#' }
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # Error messages are prefixed with this function's own name so that a
  # failure can be traced back to the call that raised it.
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  # NOTE: only character input passes this check, even though the message
  # also mentions file connections.
  if (!is.character(fname)) {
    stop("lgb.Dataset.save: fname should be a character or a file connection")
  }

  # Delegate to the object's save_binary() method; the result is returned
  # invisibly so the call does not auto-print at the console.
  return(invisible(dataset$save_binary(fname = fname)))

}