lgb.Dataset.R 37.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#' @name lgb_shared_dataset_params
#' @title Shared Dataset parameter docs
#' @description Parameter docs for fields used in \code{lgb.Dataset} construction
#' @param label vector of labels to use as the target variable
#' @param weight numeric vector of sample weights
#' @param init_score initial score is the base prediction lightgbm will boost from
#' @param group used for learning-to-rank tasks. An integer vector describing how to
#'              group rows together as ordered results from the same set of candidate results
#'              to be ranked. For example, if you have a 100-document dataset with
#'              \code{group = c(10, 20, 40, 10, 10, 10)}, that means that you have 6 groups,
#'              where the first 10 records are in the first group, records 11-30 are in the
#'              second group, etc.
#' @keywords internal
NULL

16
17
18
19
20
21
22
23
# [description] Names accepted as "info" fields by lgb.Dataset, i.e. the values
#               Dataset$get_field() and Dataset$set_field() allow for field_name.
#               Exposed as a function instead of a constant so the value is looked up
#               lazily (it then doesn't matter what order R sources files during
#               installation).
# [return] A character vector of field names.
.INFO_KEYS <- function() {
  info_keys <- c("label", "weight", "init_score", "group")
  return(info_keys)
}

James Lamb's avatar
James Lamb committed
24
#' @importFrom methods is
James Lamb's avatar
James Lamb committed
25
#' @importFrom R6 R6Class
26
#' @importFrom utils modifyList
James Lamb's avatar
James Lamb committed
27
28
Dataset <- R6::R6Class(

29
  classname = "lgb.Dataset",
30
  cloneable = FALSE,
Guolin Ke's avatar
Guolin Ke committed
31
  public = list(
James Lamb's avatar
James Lamb committed
32

33
    # Hand the handle to LGBM_DatasetFree_R and then forget it on the R side,
    # so the other methods see this Dataset as "not constructed" again
    finalize = function() {
      .Call(LGBM_DatasetFree_R, private$handle)
      private$handle <- NULL
      return(invisible(NULL))
    },
James Lamb's avatar
James Lamb committed
42

43
    # Record everything needed to build the Dataset later on. Nothing is created
    # on the C++ side here; that happens in construct().
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          label = NULL,
                          weight = NULL,
                          group = NULL,
                          init_score = NULL) {

      # fail fast on bad arguments, before doing any work on the data
      reference_ok <- is.null(reference) || lgb.is.Dataset(reference)
      if (!reference_ok) {
        stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset"))
      }
      predictor_ok <- is.null(predictor) || lgb.is.Predictor(predictor)
      if (!predictor_ok) {
        stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
      }

      # keep only the info fields that were actually supplied
      supplied <- list(
        label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )
      info <- list()
      for (key in names(supplied)) {
        if (!is.null(supplied[[key]])) {
          info[[key]] <- supplied[[key]]
        }
      }

      # dense matrices are always stored as "double"
      if (is.matrix(data) && storage.mode(data) != "double") {
        storage.mode(data) <- "double"
      }

      # remember the inputs; used_indices is kept in increasing order
      private$raw_data <- data
      private$params <- params
      private$reference <- reference
      private$colnames <- colnames
      private$categorical_feature <- categorical_feature
      private$predictor <- predictor
      private$free_raw_data <- free_raw_data
      private$used_indices <- sort(used_indices, decreasing = FALSE)
      private$info <- info
      private$version <- 0L

      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
103

104
    # Create a new Dataset for `data` that uses this Dataset as its reference and
    # inherits its column names, categorical features, predictor and
    # free_raw_data setting. Returns the new Dataset invisibly.
    create_valid = function(data,
                            label = NULL,
                            weight = NULL,
                            group = NULL,
                            init_score = NULL,
                            params = list()) {

      # start from this Dataset's parameters; anything passed in to this call wins
      merged_params <- modifyList(self$get_params(), params)

      valid_set <- Dataset$new(
        data = data
        , params = merged_params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = NULL
        , label = label
        , weight = weight
        , group = group
        , init_score = init_score
      )

      return(invisible(valid_set))

    },
James Lamb's avatar
James Lamb committed
133

134
    # [description] Create the Dataset on the C++ side, unless a handle already exists.
    #               The Dataset is built from the raw data (file path, matrix or dgCMatrix),
    #               or as a subset of the reference Dataset when used_indices is set.
    #               Afterwards the feature names, the predictor's init scores (if any)
    #               and the stored info fields are pushed to the new handle.
    # [return] self, invisibly. Raises an error for unknown / out-of-range categorical
    #          features, missing raw data, unsupported data types, a NULL handle, or
    #          when no label is available after construction.
    construct = function() {

      # Already constructed: nothing to do
      if (!lgb.is.null.handle(x = private$handle)) {
        return(invisible(self))
      }

      # Get feature names from the raw data, when it can carry them
      cnames <- NULL
      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        cnames <- colnames(private$raw_data)
      }

      # Only use them if no feature names were set explicitly
      if (is.null(private$colnames) && !is.null(cnames)) {
        private$colnames <- as.character(cnames)
      }

      # Translate categorical_feature into 0-based column indices
      if (!is.null(private$categorical_feature)) {

        if (is.character(private$categorical_feature)) {

          # Features given by name: look them up in the known column names
          matched <- match(private$categorical_feature, private$colnames)
          unknown <- is.na(matched)

          # Report every unknown name, separated so the message stays readable
          if (any(unknown)) {
            stop(
              "lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
              , paste0(sQuote(private$categorical_feature[unknown]), collapse = ", ")
            )
          }

          cate_indices <- as.list(matched - 1L)

        } else {

          # Features given by position: the largest one has to exist
          if (max(private$categorical_feature) > length(private$colnames)) {
            stop(
              "lgb.self.get.handle: supplied a too large value in categorical_feature: "
              , max(private$categorical_feature)
              , " but only "
              , length(private$colnames)
              , " features"
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1L)

        }

        # Store indices for categorical features
        private$params$categorical_feature <- cate_indices

      }

      # Generate parameter str
      params_str <- lgb.params2str(params = private$params)

      # Get handle of reference dataset (this constructs the reference if needed)
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }

      if (is.null(private$used_indices)) {

        # not subsetting, constructing from raw data
        if (is.null(private$raw_data)) {
          stop(paste0(
            "Attempting to create a Dataset without any raw data. "
            , "This can happen if you have called Dataset$finalize() or if this Dataset was saved with saveRDS(). "
            , "To avoid this error in the future, use lgb.Dataset.save() or "
            , "Dataset$save_binary() to save lightgbm Datasets."
          ))
        }

        if (is.character(private$raw_data)) {

          # raw data is a path to a file
          handle <- .Call(
            LGBM_DatasetCreateFromFile_R
            , path.expand(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (is.matrix(private$raw_data)) {

          # raw data is a dense matrix
          handle <- .Call(
            LGBM_DatasetCreateFromMat_R
            , private$raw_data
            , nrow(private$raw_data)
            , ncol(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (methods::is(private$raw_data, "dgCMatrix")) {

          # raw data is a dgCMatrix (sparse matrix, column compressed)
          if (length(private$raw_data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }
          handle <- .Call(
            LGBM_DatasetCreateFromCSC_R
            , private$raw_data@p
            , private$raw_data@i
            , private$raw_data@x
            , length(private$raw_data@p)
            , length(private$raw_data@x)
            , nrow(private$raw_data)
            , params_str
            , ref_handle
          )

        } else {

          # Unknown data type; an object can have several classes, so join them
          stop(
            "lgb.Dataset.construct: does not support constructing from "
            , paste0(sQuote(class(private$raw_data)), collapse = ", ")
          )

        }

      } else {

        # Subsetting is only possible relative to a reference Dataset
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        # Construct subset
        handle <- .Call(
          LGBM_DatasetGetSubset_R
          , ref_handle
          , c(private$used_indices) # Adding c() fixes issue in R v3.5
          , length(private$used_indices)
          , params_str
        )

      }

      if (lgb.is.null.handle(x = handle)) {
        stop("lgb.Dataset.construct: cannot create Dataset handle")
      }

      # Setup class and private type
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Set feature names
      if (!is.null(private$colnames)) {
        self$set_colnames(colnames = private$colnames)
      }

      # Load init score from the predictor (not done when constructing a subset)
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        init_score <- private$predictor$predict(
          data = private$raw_data
          , rawscore = TRUE
          , reshape = TRUE
        )

        # Flatten to a plain vector; no transpose is applied before flattening
        init_score <- as.vector(init_score)
        private$info$init_score <- init_score

      }

      # Should we free raw data?
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Push every stored info field to the newly created handle
      for (info_name in names(private$info)) {
        self$set_field(
          field_name = info_name
          , data = private$info[[info_name]]
        )
      }

      # A Dataset without a label is not usable
      if (is.null(self$get_field(field_name = "label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
336

337
    # Dimensions of the Dataset as c(number of rows, number of features).
    # Uses the C++ side when the Dataset is constructed, otherwise falls back to
    # the raw data if that is a matrix or dgCMatrix.
    dim = function() {

      # Constructed: ask the C++ side
      if (!lgb.is.null.handle(x = private$handle)) {

        # The .Call() results are not captured, so the C side presumably fills
        # these in place. Keep them as two separate objects.
        n_rows <- 0L
        n_cols <- 0L
        .Call(LGBM_DatasetGetNumData_R, private$handle, n_rows)
        .Call(LGBM_DatasetGetNumFeature_R, private$handle, n_cols)
        return(c(n_rows, n_cols))

      }

      # Not constructed, but the raw data knows its own dimensions
      # NOTE: dgCMatrix requires the Matrix package
      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        return(dim(private$raw_data))
      }

      # Trying to work with unknown dimensions is not possible
      stop(
        "dim: cannot get dimensions before dataset has been constructed, "
        , "please call lgb.Dataset.construct explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
378

379
    # Feature names of the Dataset. Read from the C++ side (and cached in
    # private$colnames) when constructed, otherwise taken from the raw data if
    # that is a matrix or dgCMatrix.
    get_colnames = function() {

      # Constructed: the C++ side is the source of truth
      if (!lgb.is.null.handle(x = private$handle)) {
        private$colnames <- .Call(LGBM_DatasetGetFeatureNames_R, private$handle)
        return(private$colnames)
      }

      # Not constructed: use the column names of the raw data, if it can have any
      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        return(colnames(private$raw_data))
      }

      # Trying to work with unknown formats is not possible
      stop(
        "Dataset$get_colnames(): cannot get column names before dataset has been constructed, please call "
        , "lgb.Dataset.construct() explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
406

407
    # Store new feature names and, if the Dataset is already constructed, send
    # them to the C++ side. NULL or empty input is silently ignored.
    # Returns self invisibly.
    set_colnames = function(colnames) {

      # No names given: leave everything untouched
      if (is.null(colnames)) {
        return(invisible(self))
      }

      # Same for an empty set of names
      new_names <- as.character(colnames)
      if (length(new_names) == 0L) {
        return(invisible(self))
      }

      private$colnames <- new_names

      # The C++ side takes all names as one tab-separated string
      if (!lgb.is.null.handle(x = private$handle)) {
        joined_names <- paste0(private$colnames, collapse = "\t")
        .Call(LGBM_DatasetSetFeatureNames_R, private$handle, joined_names)
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
438

439
    # Value of one info field (one of .INFO_KEYS()). A value cached on the R side
    # is returned as is; otherwise it is read from the constructed Dataset and
    # cached. Returns NULL when the field is empty on the C++ side.
    get_field = function(field_name) {

      # field_name has to be a single, known key
      field_ok <- is.character(field_name) &&
        length(field_name) == 1L &&
        field_name %in% .INFO_KEYS()
      if (!field_ok) {
        stop(
          "Dataset$get_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      # Serve the cached value when there is one
      if (!is.null(private$info[[field_name]])) {
        return(private$info[[field_name]])
      }

      # Nothing cached, so the value has to come from the C++ side
      if (lgb.is.null.handle(x = private$handle)) {
        stop("Cannot perform Dataset$get_field() before constructing Dataset.")
      }

      # Get field size of info
      info_len <- 0L
      .Call(LGBM_DatasetGetFieldSize_R, private$handle, field_name, info_len)

      # Only fetch (and cache) non-empty fields
      if (info_len > 0L) {

        # "group" is an integer field, every other field is numeric
        if (field_name == "group") {
          field_values <- integer(info_len)
        } else {
          field_values <- numeric(info_len)
        }

        .Call(LGBM_DatasetGetField_R, private$handle, field_name, field_values)
        private$info[[field_name]] <- field_values

      }

      return(private$info[[field_name]])

    },
James Lamb's avatar
James Lamb committed
491

492
    # Set one info field (one of .INFO_KEYS()). The value is always stored on the
    # R side; it is also sent to the C++ side when the Dataset is constructed and
    # the value is non-empty, in which case the version counter is bumped.
    # Returns self invisibly.
    set_field = function(field_name, data) {

      # field_name has to be a single, known key
      field_ok <- is.character(field_name) &&
        length(field_name) == 1L &&
        field_name %in% .INFO_KEYS()
      if (!field_ok) {
        stop(
          "Dataset$set_field(): field_name must one of the following: "
          , paste0(sQuote(.INFO_KEYS()), collapse = ", ")
        )
      }

      # "group" is an integer field, every other field is numeric
      if (field_name == "group") {
        data <- as.integer(data)
      } else {
        data <- as.numeric(data)
      }

      # Store information privately
      private$info[[field_name]] <- data

      # Nothing to send when there is no handle yet or no values
      nothing_to_send <- lgb.is.null.handle(x = private$handle) ||
        is.null(data) ||
        length(data) == 0L
      if (nothing_to_send) {
        return(invisible(self))
      }

      .Call(
        LGBM_DatasetSetField_R
        , private$handle
        , field_name
        , data
        , length(data)
      )
      private$version <- private$version + 1L

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
533

534
    # New (not yet constructed) Dataset holding the rows in idxset. It carries no
    # raw data of its own and uses this Dataset as its reference.
    slice = function(idxset) {

      # row indices are passed on in increasing order
      sorted_idx <- sort(idxset, decreasing = FALSE)

      subset_ds <- Dataset$new(
        data = NULL
        , params = self$get_params()
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = sorted_idx
      )
      return(subset_ds)

    },
James Lamb's avatar
James Lamb committed
552

553
554
555
    # [description] Update Dataset parameters. If it has not been constructed yet,
    #               this operation just happens on the R side (updating private$params).
    #               If it has been constructed, the change is first checked against the
    #               existing parameters by LGBM_DatasetUpdateParamChecking_R. If that check
    #               passes, private$params is updated. If it fails and the raw data is
    #               still available, private$params is updated and the Dataset is
    #               finalized ("deconstructed") so it gets rebuilt with the new values.
    #               If it fails and the raw data is gone, the error is re-raised.
    # [param] params A named list of parameters; an empty list is a no-op.
    # [return] self, invisibly.
    update_params = function(params) {

      if (length(params) == 0L) {
        return(invisible(self))
      }

      # the full parameter set this Dataset should have after the update
      new_params <- utils::modifyList(private$params, params)

      # not constructed yet: this is purely an R-side change
      if (lgb.is.null.handle(x = private$handle)) {
        private$params <- new_params
        return(invisible(self))
      }

      tryCatch({
        # compare the complete old and new parameter sets, so that values which
        # were not passed in to this call are not treated as reset to defaults
        .Call(
          LGBM_DatasetUpdateParamChecking_R
          , lgb.params2str(params = private$params)
          , lgb.params2str(params = new_params)
        )

        # the check passed: keep the R side in sync, otherwise get_params()
        # would keep reporting the old values
        private$params <- new_params
      }, error = function(e) {
        # If updating failed but raw data is not available, raise an error because
        # achieving what the user asked for is not possible
        if (is.null(private$raw_data)) {
          stop(e)
        }

        # If updating failed but raw data is available, modify the params
        # on the R side and re-set ("deconstruct") the Dataset
        private$params <- new_params
        self$finalize()
      })

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
585

586
587
588
589
590
591
592
593
594
595
596
    # The subset of this Dataset's stored parameters whose names are listed in
    # .DATASET_PARAMETERS(), as a named list.
    get_params = function() {
      known_names <- unname(unlist(.DATASET_PARAMETERS()))
      kept_names <- intersect(names(private$params), known_names)
      dataset_params <- list()
      for (param_name in kept_names) {
        dataset_params[[param_name]] <- private$params[[param_name]]
      }
      return(dataset_params)
    },

597
    # Replace the categorical features of this Dataset. Because that invalidates
    # the Dataset on the C++ side, it is finalized here and has to be constructed
    # again from the raw data, which therefore must still be available.
    # Returns self invisibly; a no-op when the value does not change.
    set_categorical_feature = function(categorical_feature) {

      # Check for identical input
      if (identical(private$categorical_feature, categorical_feature)) {
        return(invisible(self))
      }

      # Without raw data the Dataset could not be constructed again
      if (is.null(private$raw_data)) {
        # pieces are kept on separate lines so that no line break or indentation
        # ends up inside the message itself
        stop(
          "set_categorical_feature: cannot set categorical feature after freeing raw data, "
          , "please set "
          , sQuote("free_raw_data = FALSE")
          , " when you construct lgb.Dataset"
        )
      }

      # Overwrite categorical features
      private$categorical_feature <- categorical_feature

      # Finalize and return self
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
619

620
    # Use another lgb.Dataset as the reference of this one. The categorical
    # features, column names and predictor of the reference are copied over, and
    # this Dataset is finalized so it gets constructed again against the new reference.
    # Returns self invisibly; a no-op when the reference does not change.
    set_reference = function(reference) {

      # setting reference to this same Dataset object doesn't require any changes
      if (identical(private$reference, reference)) {
        return(invisible(self))
      }

      # changing the reference removes the Dataset object on the C++ side, so it should only
      # be done if you still have the raw_data available, so that the new Dataset can be reconstructed
      if (is.null(private$raw_data)) {
        # pieces are kept on separate lines so that no line break or indentation
        # ends up inside the message itself
        stop(
          "set_reference: cannot set reference after freeing raw data, "
          , "please set "
          , sQuote("free_raw_data = FALSE")
          , " when you construct lgb.Dataset"
        )
      }

      if (!lgb.is.Dataset(reference)) {
        stop("set_reference: Can only use lgb.Dataset as a reference")
      }

      # Take over the settings that have to agree with the reference
      self$set_categorical_feature(categorical_feature = reference$.__enclos_env__$private$categorical_feature)
      self$set_colnames(colnames = reference$get_colnames())
      private$set_predictor(predictor = reference$.__enclos_env__$private$predictor)

      # Store reference
      private$reference <- reference

      # Finalize and return self
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
652

653
    # Construct the Dataset if needed, then have LGBM_DatasetSaveBinary_R write it
    # to fname ("~" in the path is expanded). Returns self invisibly.
    save_binary = function(fname) {
      self$construct()
      .Call(LGBM_DatasetSaveBinary_R, private$handle, path.expand(fname))
      return(invisible(self))
    }
James Lamb's avatar
James Lamb committed
665

Guolin Ke's avatar
Guolin Ke committed
666
667
  ),
  private = list(
668
669
670
671
672
    # handle to the Dataset on the C++ side; NULL until construct() has run
    handle = NULL,
    # data the Dataset is built from; construct() resets it to NULL when
    # free_raw_data is TRUE
    raw_data = NULL,
    # named list of parameters
    params = list(),
    # another Dataset used as reference when constructing this one
    reference = NULL,
    # feature names
    colnames = NULL,
    # categorical features, as given by the user (names or 1-based indices)
    categorical_feature = NULL,
    # predictor used by construct() to compute initial scores
    predictor = NULL,
    free_raw_data = TRUE,
    # row indices (into the reference) when this Dataset is a subset
    used_indices = NULL,
    # info fields (see .INFO_KEYS()) cached on the R side
    info = NULL,
    # incremented by set_field() each time a field is sent to the C++ side
    version = 0L,
James Lamb's avatar
James Lamb committed
679

680
681
    # Handle of this Dataset on the C++ side; the Dataset is constructed first if
    # that has not happened yet
    get_handle = function() {

      not_constructed <- lgb.is.null.handle(x = private$handle)
      if (not_constructed) {
        self$construct()
      }
      return(private$handle)

    },
James Lamb's avatar
James Lamb committed
690

691
    # Replace the predictor of this Dataset (NULL removes it). Because that
    # invalidates the Dataset on the C++ side, it is finalized here and has to be
    # constructed again from the raw data, which therefore must still be available.
    # Returns self invisibly; a no-op when the predictor does not change.
    set_predictor = function(predictor) {

      if (identical(private$predictor, predictor)) {
        return(invisible(self))
      }

      # Without raw data the Dataset could not be constructed again
      if (is.null(private$raw_data)) {
        # pieces are kept on separate lines so that no line break or indentation
        # ends up inside the message itself
        stop(
          "set_predictor: cannot set predictor after freeing raw data, "
          , "please set "
          , sQuote("free_raw_data = FALSE")
          , " when you construct lgb.Dataset"
        )
      }

      # Anything other than NULL has to be an lgb.Predictor
      if (!is.null(predictor) && !lgb.is.Predictor(predictor)) {
        stop("set_predictor: Can only use lgb.Predictor as predictor")
      }

      # Store predictor
      private$predictor <- predictor

      # Finalize and return self
      self$finalize()
      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
722

Guolin Ke's avatar
Guolin Ke committed
723
724
725
  )
)

726
727
728
#' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#'              or local file (that was created previously by saving an \code{lgb.Dataset}).
729
#' @inheritParams lgb_shared_dataset_params
730
731
732
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{lgb.Dataset} file
733
734
735
736
737
738
739
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values.
#' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning
#'                  continuous features into histograms. If you want to apply the same bin boundaries from an existing
#'                  dataset to new \code{data}, pass that existing Dataset to this argument.
Guolin Ke's avatar
Guolin Ke committed
740
#' @param colnames names of columns
741
742
743
744
745
746
747
748
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param free_raw_data LightGBM constructs its data format, called a "Dataset", from tabular data.
#'                      By default, that Dataset object on the R side does not keep a copy of the raw data.
#'                      This reduces LightGBM's memory consumption, but it means that the Dataset object
#'                      cannot be changed after it has been constructed. If you'd prefer to be able to
#'                      change the Dataset object after construction, set \code{free_raw_data = FALSE}.
James Lamb's avatar
James Lamb committed
749
#'
Guolin Ke's avatar
Guolin Ke committed
750
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
751
#'
Guolin Ke's avatar
Guolin Ke committed
752
#' @examples
753
#' \donttest{
754
755
756
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
757
758
759
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
760
#' lgb.Dataset.construct(dtrain)
761
#' }
Guolin Ke's avatar
Guolin Ke committed
762
763
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        label = NULL,
                        weight = NULL,
                        group = NULL,
                        init_score = NULL) {

  # Forward every user-facing argument to the R6 constructor.
  # `predictor` and `used_indices` are not exposed by this function,
  # so they are always passed as NULL.
  new_dataset <- Dataset$new(
    data = data
    , params = params
    , reference = reference
    , colnames = colnames
    , categorical_feature = categorical_feature
    , predictor = NULL
    , free_raw_data = free_raw_data
    , used_indices = NULL
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
  )

  # Returned invisibly so that a bare call does not auto-print the object
  return(invisible(new_dataset))

}

793
794
795
#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
796
#' @inheritParams lgb_shared_dataset_params
Guolin Ke's avatar
Guolin Ke committed
797
#' @param dataset \code{lgb.Dataset} object, training data
798
799
800
#' @param data a \code{matrix} object, a \code{dgCMatrix} object,
#'             a character representing a path to a text file (CSV, TSV, or LibSVM),
#'             or a character representing a path to a binary \code{Dataset} file
801
802
803
804
805
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values. If this is an empty list (the default), the validation Dataset
#'               will have the same parameters as the Dataset passed to argument \code{dataset}.
James Lamb's avatar
James Lamb committed
806
#'
Guolin Ke's avatar
Guolin Ke committed
807
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
808
#'
Guolin Ke's avatar
Guolin Ke committed
809
#' @examples
810
#' \donttest{
811
812
813
814
815
816
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
#'
#' # parameters can be changed between the training data and validation set,
#' # for example to account for training data in a text file with a header row
#' # and validation data in a text file without it
#' train_file <- tempfile(pattern = "train_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = train_file
#'   , sep = ","
#'   , col.names = TRUE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' valid_file <- tempfile(pattern = "valid_", fileext = ".csv")
#' write.table(
#'   data.frame(y = rnorm(100L), x1 = rnorm(100L), x2 = rnorm(100L))
#'   , file = valid_file
#'   , sep = ","
#'   , col.names = FALSE
#'   , row.names = FALSE
#'   , quote = FALSE
#' )
#'
#' dtrain <- lgb.Dataset(
#'   data = train_file
#'   , params = list(has_header = TRUE)
#' )
#' dtrain$construct()
#'
#' dvalid <- lgb.Dataset(
#'   data = valid_file
#'   , params = list(has_header = FALSE)
#' )
#' dvalid$construct()
852
#' }
Guolin Ke's avatar
Guolin Ke committed
853
#' @export
854
855
856
857
858
859
lgb.Dataset.create.valid <- function(dataset,
                                     data,
                                     label = NULL,
                                     weight = NULL,
                                     group = NULL,
                                     init_score = NULL,
                                     params = list()) {

  # The training data must be an lgb.Dataset: its create_valid() method
  # is what actually builds the validation Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # Delegate to the training Dataset, passing all arguments through unchanged
  valid_dataset <- dataset$create_valid(
    data = data
    , label = label
    , weight = weight
    , group = group
    , init_score = init_score
    , params = params
  )

  return(invisible(valid_dataset))

}
Guolin Ke's avatar
Guolin Ke committed
879

880
881
882
#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
Guolin Ke's avatar
Guolin Ke committed
883
#' @param dataset Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
884
#'
Guolin Ke's avatar
Guolin Ke committed
885
#' @examples
886
#' \donttest{
887
888
889
890
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
891
#' }
892
#' @return constructed dataset
Guolin Ke's avatar
Guolin Ke committed
893
894
#' @export
lgb.Dataset.construct <- function(dataset) {

  # Reject anything that is not an lgb.Dataset before calling its method
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }

  # Hand back whatever the construct() method returns, invisibly
  constructed <- dataset$construct()
  return(invisible(constructed))

}

904
905
#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
Guolin Ke's avatar
Guolin Ke committed
906
#' @param x Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
907
#'
Guolin Ke's avatar
Guolin Ke committed
908
#' @return a vector of numbers of rows and of columns
James Lamb's avatar
James Lamb committed
909
#'
Guolin Ke's avatar
Guolin Ke committed
910
911
912
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
James Lamb's avatar
James Lamb committed
913
#'
Guolin Ke's avatar
Guolin Ke committed
914
#' @examples
915
#' \donttest{
916
917
918
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
919
#'
920
921
922
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
923
#' }
Guolin Ke's avatar
Guolin Ke committed
924
925
#' @rdname dim
#' @export
926
dim.lgb.Dataset <- function(x) {

  # Guard against direct calls with something that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = x)
  if (!is_dataset) {
    stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # The Dataset object reports its own dimensions
  dataset_dims <- x$dim()
  return(dataset_dims)

}

936
937
938
#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}, thus setting of
#'              row names would have no effect and returned row names would be NULL.
Guolin Ke's avatar
Guolin Ke committed
939
940
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one is ignored
941
#'              and the second one is column names
Guolin Ke's avatar
Guolin Ke committed
942
943
944
945
946
947
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
948
#' \donttest{
949
950
951
952
953
954
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
955
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
956
#' print(dtrain, verbose = TRUE)
957
#' }
Guolin Ke's avatar
Guolin Ke committed
958
#' @rdname dimnames.lgb.Dataset
959
#' @return A list with the dimension names of the dataset
Guolin Ke's avatar
Guolin Ke committed
960
961
#' @export
dimnames.lgb.Dataset <- function(x) {

  # Guard against direct calls with something that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = x)
  if (!is_dataset) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # First element (row names) is always NULL; second holds the column names
  column_names <- x$get_colnames()
  return(list(NULL, column_names))

}

#' @rdname dimnames.lgb.Dataset
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # `value` has to be a plain list holding exactly (row names, column names)
  is_two_element_list <- identical(class(value), "list") && length(value) == 2L
  if (!is_two_element_list) {
    stop("invalid ", sQuote("value"), " given: must be a list of two elements")
  }

  # Row names are not supported, so the first element must be NULL
  new_rownames <- value[[1L]]
  if (!is.null(new_rownames)) {
    stop("lgb.Dataset does not have rownames")
  }

  # A non-NULL set of column names must match the number of columns;
  # NULL skips this check and is passed straight to set_colnames()
  new_colnames <- value[[2L]]
  if (!is.null(new_colnames)) {
    n_supplied <- length(new_colnames)
    if (ncol(x) != n_supplied) {
      stop(
        "can't assign "
        , sQuote(n_supplied)
        , " colnames to an lgb.Dataset with "
        , sQuote(ncol(x))
        , " columns"
      )
    }
  }

  # Replacement functions must return the modified object
  x$set_colnames(colnames = new_colnames)
  return(x)

}

1010
1011
1012
#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#'              original \code{lgb.Dataset} object
Nikita Titov's avatar
Nikita Titov committed
1013
#' @param dataset Object of class \code{lgb.Dataset}
1014
#' @param idxset an integer vector of indices of rows needed
Guolin Ke's avatar
Guolin Ke committed
1015
#' @return constructed sub dataset
James Lamb's avatar
James Lamb committed
1016
#'
Guolin Ke's avatar
Guolin Ke committed
1017
#' @examples
1018
#' \donttest{
1019
1020
1021
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
1022
#'
1023
#' dsub <- lightgbm::slice(dtrain, seq_len(42L))
1024
#' lgb.Dataset.construct(dsub)
1025
#' labels <- lightgbm::get_field(dsub, "label")
1026
#' }
Guolin Ke's avatar
Guolin Ke committed
1027
#' @export
1028
slice <- function(dataset, idxset) {
  # S3 generic: the method is chosen by the class of `dataset`
  # (slice.lgb.Dataset for lgb.Dataset objects)
  UseMethod("slice")
}
Guolin Ke's avatar
Guolin Ke committed
1031
1032
1033

#' @rdname slice
#' @export
1034
slice.lgb.Dataset <- function(dataset, idxset) {

  # Guard against direct calls with something that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # The Dataset's own slice() method builds the subset; return it invisibly
  subset_dataset <- dataset$slice(idxset = idxset)
  return(invisible(subset_dataset))

}

1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
#' @name get_field
#' @title Get one attribute of a \code{lgb.Dataset}
#' @description Get one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to get. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @return requested attribute
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
#' }
#' @export
get_field <- function(dataset, field_name) {
  # S3 generic: the method is chosen by the class of `dataset`
  # (get_field.lgb.Dataset for lgb.Dataset objects)
  UseMethod("get_field")
}

#' @rdname get_field
#' @export
get_field.lgb.Dataset <- function(dataset, field_name) {

  # Guard against direct calls with something that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("get_field.lgb.Dataset(): input dataset should be an lgb.Dataset object")
  }

  # Look the field up through the Dataset's own get_field() method
  field_value <- dataset$get_field(field_name = field_name)
  return(field_value)

}

#' @name set_field
#' @title Set one attribute of a \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param field_name String with the name of the attribute to set. One of the following.
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#' @param data The data for the field. See examples.
#' @return The \code{lgb.Dataset} you passed in.
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::get_field(dtrain, "label")
#' lightgbm::set_field(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::get_field(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
#' }
#' @export
set_field <- function(dataset, field_name, data) {
  # S3 generic: the method is chosen by the class of `dataset`
  # (set_field.lgb.Dataset for lgb.Dataset objects)
  UseMethod("set_field")
}

#' @rdname set_field
#' @export
set_field.lgb.Dataset <- function(dataset, field_name, data) {

  # Guard against direct calls with something that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("set_field.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's own set_field() method; result returned invisibly
  updated <- dataset$set_field(field_name = field_name, data = data)
  return(invisible(updated))

}

1139
1140
1141
1142
#' @name lgb.Dataset.set.categorical
#' @title Set categorical feature of \code{lgb.Dataset}
#' @description Set the categorical features of an \code{lgb.Dataset} object. Use this function
#'              to tell LightGBM which features should be treated as categorical.
1143
#' @param dataset object of class \code{lgb.Dataset}
1144
1145
1146
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
1147
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1148
#'
1149
#' @examples
1150
#' \donttest{
1151
1152
1153
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1154
1155
1156
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
1157
#' lgb.Dataset.set.categorical(dtrain, 1L:2L)
1158
#' }
1159
1160
1161
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  # Reject anything that is not an lgb.Dataset before calling its method
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's set_categorical_feature() method; result returned invisibly
  updated <- dataset$set_categorical_feature(categorical_feature = categorical_feature)
  return(invisible(updated))

}

1171
1172
1173
#' @name lgb.Dataset.set.reference
#' @title Set reference of \code{lgb.Dataset}
#' @description If you want to use validation data, you should set reference to training data
Guolin Ke's avatar
Guolin Ke committed
1174
1175
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
1176
#'
1177
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1178
#'
Guolin Ke's avatar
Guolin Ke committed
1179
#' @examples
1180
#' \donttest{
1181
#' # create training Dataset
1182
1183
1184
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1185
1186
#'
#' # create a validation Dataset, using dtrain as a reference
1187
1188
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
1189
#' dtest <- lgb.Dataset(test$data, label = test$label)
1190
#' lgb.Dataset.set.reference(dtest, dtrain)
1191
#' }
Guolin Ke's avatar
Guolin Ke committed
1192
1193
1194
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # Reject anything that is not an lgb.Dataset before calling its method
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's set_reference() method; result returned invisibly
  updated <- dataset$set_reference(reference = reference)
  return(invisible(updated))

}

1203
1204
1205
1206
#' @name lgb.Dataset.save
#' @title Save \code{lgb.Dataset} to a binary file
#' @description Please note that \code{init_score} is not saved in binary file.
#'              If you need it, please set it again after loading Dataset.
Guolin Ke's avatar
Guolin Ke committed
1207
1208
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname character, path of the binary file to write
James Lamb's avatar
James Lamb committed
1209
#'
1210
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1211
#'
Guolin Ke's avatar
Guolin Ke committed
1212
#' @examples
1213
#' \donttest{
1214
1215
1216
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1217
#' lgb.Dataset.save(dtrain, tempfile(fileext = ".bin"))
1218
#' }
Guolin Ke's avatar
Guolin Ke committed
1219
1220
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # Only lgb.Dataset objects provide the save_binary() method used below.
  # The error prefix names this function, so users can tell which call failed.
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  # The output location must be supplied as a character value
  if (!is.character(fname)) {
    stop("lgb.Dataset.save: fname should be a character or a file connection")
  }

  # Write the binary file through the Dataset's own method; result returned invisibly
  return(invisible(dataset$save_binary(fname = fname)))

}