lgb.Dataset.R 34.4 KB
Newer Older
James Lamb's avatar
James Lamb committed
1
#' @importFrom methods is
James Lamb's avatar
James Lamb committed
2
3
4
#' @importFrom R6 R6Class
Dataset <- R6::R6Class(

5
  classname = "lgb.Dataset",
6
  cloneable = FALSE,
Guolin Ke's avatar
Guolin Ke committed
7
  public = list(
James Lamb's avatar
James Lamb committed
8

9
    # Release the Dataset handle, if one is currently held.
    # Calls LGBM_DatasetFree_R on private$handle and then resets it to NULL.
    # Does nothing when the handle is already null. Always returns NULL invisibly.
    finalize = function() {

      has_live_handle <- !lgb.is.null.handle(x = private$handle)

      if (has_live_handle) {
        # Free the native object first, then forget the reference to it
        .Call(LGBM_DatasetFree_R, private$handle)
        private$handle <- NULL
      }

      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
27

28
    # Store everything needed to build the Dataset later; no handle is created here.
    #   data                : raw data; a matrix is coerced to storage mode "double"
    #   params              : list of parameters
    #   reference           : NULL or an lgb.Dataset
    #   colnames            : column names, stored as given
    #   categorical_feature : categorical features, stored as given
    #   predictor           : NULL or an lgb.Predictor
    #   free_raw_data       : flag stored as given
    #   used_indices        : row indices, stored sorted in increasing order
    #   info                : list of info fields
    #   ...                 : named extras; names that are known info fields go into
    #                         `info`, every other name goes into `params`
    # Returns NULL invisibly.
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          info = list(),
                          ...) {

      # Fail fast on invalid inputs, before any other work is done
      reference_is_valid <- is.null(reference) || lgb.is.Dataset(reference)
      if (!reference_is_valid) {
          stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset"))
      }
      predictor_is_valid <- is.null(predictor) || lgb.is.Predictor(predictor)
      if (!predictor_is_valid) {
          stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
      }

      # Route each extra named argument to either `info` or `params`
      extra_args <- list(...)
      known_info_fields <- c("label", "weight", "init_score", "group")
      for (arg_name in names(extra_args)) {
        if (arg_name %in% known_info_fields) {
          info[[arg_name]] <- extra_args[[arg_name]]
        } else {
          params[[arg_name]] <- extra_args[[arg_name]]
        }
      }

      # Matrices must be stored as doubles
      if (is.matrix(data) && storage.mode(data) != "double") {
        storage.mode(data) <- "double"
      }

      # Record the state of this Dataset
      private$raw_data <- data
      private$params <- params
      private$info <- info
      private$reference <- reference
      private$predictor <- predictor
      private$colnames <- colnames
      private$categorical_feature <- categorical_feature
      private$free_raw_data <- free_raw_data
      private$used_indices <- sort(used_indices, decreasing = FALSE)
      private$version <- 0L

      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
96

97
98
99
    create_valid = function(data,
                            info = list(),
                            ...) {

      # Build a new Dataset from `data` that uses this Dataset as its reference and
      # inherits its params, column names, categorical features, predictor and
      # free_raw_data setting. `info` and `...` are forwarded to the constructor.
      # The new Dataset is returned invisibly.
      validation_set <- Dataset$new(
        data = data
        , params = private$params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = NULL
        , info = info
        , ...
      )

      return(invisible(validation_set))

    },
James Lamb's avatar
James Lamb committed
118

119
    # Dataset constructor: creates the handle for this Dataset (unless one already
    # exists) and then pushes column names and info fields to it.
    #
    # The handle is created from private$raw_data (a file name, a matrix or a
    # dgCMatrix), or, when private$used_indices is set, as a subset of the
    # reference Dataset.
    #
    # Returns self invisibly. Raises an error when
    #   * categorical_feature names an unknown feature or an out-of-range index,
    #   * the raw data has an unsupported type,
    #   * a subset is requested without a reference,
    #   * the handle could not be created, or
    #   * no label is available once construction has finished.
    construct = function() {

      # Nothing to do when a handle already exists
      if (!lgb.is.null.handle(x = private$handle)) {
        return(invisible(self))
      }

      raw_is_matrix_like <- is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")

      # Use the raw data's own column names when none were supplied explicitly
      cnames <- NULL
      if (raw_is_matrix_like) {
        cnames <- colnames(private$raw_data)
      }
      if (is.null(private$colnames) && !is.null(cnames)) {
        private$colnames <- as.character(cnames)
      }

      # Translate categorical features into 0-based column indices
      if (!is.null(private$categorical_feature)) {

        if (is.character(private$categorical_feature)) {

          # Given by name: look each name up in the column names
          matched_positions <- match(private$categorical_feature, private$colnames)
          unknown_features <- private$categorical_feature[is.na(matched_positions)]
          if (length(unknown_features) > 0L) {
            stop(
              "lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
              , paste0(sQuote(unknown_features), collapse = ", ")
            )
          }
          cate_indices <- as.list(matched_positions - 1L)

        } else {

          # Given by position: check the largest index against the number of
          # features. The width of the raw data is used when it is available,
          # because column names are optional; when neither is known (for example
          # data read from a file without explicit column names) the check is skipped.
          num_features <- NULL
          if (raw_is_matrix_like) {
            num_features <- ncol(private$raw_data)
          } else if (!is.null(private$colnames)) {
            num_features <- length(private$colnames)
          }

          if (!is.null(num_features) && max(private$categorical_feature) > num_features) {
            stop(
              "lgb.self.get.handle: supplied a too large value in categorical_feature: "
              , max(private$categorical_feature)
              , " but only "
              , num_features
              , " features"
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1L)

        }

        # Store indices for categorical features
        private$params$categorical_feature <- cate_indices

      }

      # Generate parameter str
      params_str <- lgb.params2str(params = private$params)

      # Get handle of reference dataset (constructing it first if necessary)
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }

      if (is.null(private$used_indices)) {

        # Not subsetting: build the handle from the raw data
        if (is.character(private$raw_data)) {

          # Raw data is a file name
          handle <- .Call(
            LGBM_DatasetCreateFromFile_R
            , private$raw_data
            , params_str
            , ref_handle
          )

        } else if (is.matrix(private$raw_data)) {

          # Raw data is a dense matrix
          handle <- .Call(
            LGBM_DatasetCreateFromMat_R
            , private$raw_data
            , nrow(private$raw_data)
            , ncol(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (methods::is(private$raw_data, "dgCMatrix")) {

          # Raw data is a dgCMatrix (sparse matrix, column compressed)
          if (length(private$raw_data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }
          handle <- .Call(
            LGBM_DatasetCreateFromCSC_R
            , private$raw_data@p
            , private$raw_data@i
            , private$raw_data@x
            , length(private$raw_data@p)
            , length(private$raw_data@x)
            , nrow(private$raw_data)
            , params_str
            , ref_handle
          )

        } else {

          # Unknown data type
          stop(
            "lgb.Dataset.construct: does not support constructing from "
            , sQuote(class(private$raw_data))
          )

        }

      } else {

        # Subsetting: a reference Dataset is required
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        # Construct subset
        handle <- .Call(
          LGBM_DatasetGetSubset_R
          , ref_handle
          , c(private$used_indices) # Adding c() fixes issue in R v3.5
          , length(private$used_indices)
          , params_str
        )

      }
      if (lgb.is.null.handle(x = handle)) {
        stop("lgb.Dataset.construct: cannot create Dataset handle")
      }

      # Setup class and private type
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Set feature names
      if (!is.null(private$colnames)) {
        self$set_colnames(colnames = private$colnames)
      }

      # Load init score if requested (only when not subsetting)
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        # Raw scores predicted on the raw data become the initial scores
        init_score <- private$predictor$predict(
          data = private$raw_data
          , rawscore = TRUE
          , reshape = TRUE
        )

        # No transpose needed: the result is flattened in column-major order
        init_score <- as.vector(init_score)
        private$info$init_score <- init_score

      }

      # Should we free raw data?
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Push every stored info field to the handle
      if (length(private$info) > 0L) {

        for (i in seq_along(private$info)) {

          # Single-bracket indexing keeps the name alongside the value
          p <- private$info[i]
          self$setinfo(name = names(p), info = p[[1L]])

        }

      }

      # A label is mandatory once the Dataset has been constructed
      if (is.null(self$getinfo(name = "label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
319

320
    # Dimensions of the Dataset as c(number of rows, number of columns).
    # Read from the handle when one exists, otherwise from raw data that is a
    # matrix or a dgCMatrix. Raises an error when neither source is available.
    dim = function() {

      # Constructed Dataset: query the handle
      if (!lgb.is.null.handle(x = private$handle)) {

        # Placeholders passed to the native routines
        # NOTE(review): presumably filled in place by the .Call()s below - confirm
        n_rows <- 0L
        n_cols <- 0L

        .Call(LGBM_DatasetGetNumData_R, private$handle, n_rows)
        .Call(LGBM_DatasetGetNumFeature_R, private$handle, n_cols)

        return(c(n_rows, n_cols))

      }

      # Not constructed yet: fall back to the raw data when it has dimensions
      # NOTE: dgCMatrix requires the Matrix package
      raw_has_dims <- is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")
      if (raw_has_dims) {
        return(dim(private$raw_data))
      }

      # Trying to work with unknown dimensions is not possible
      stop(
        "dim: cannot get dimensions before dataset has been constructed, "
        , "please call lgb.Dataset.construct explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
361

362
    # Get column names.
    # When a handle exists, the feature names are read from it, cached in
    # private$colnames and returned. Otherwise the column names of the raw data
    # are returned when it is a matrix or a dgCMatrix (this may be NULL).
    # Raises an error when neither source is available.
    get_colnames = function() {

      if (!lgb.is.null.handle(x = private$handle)) {

        # Constructed Dataset: the handle is the source of truth
        private$colnames <- .Call(
          LGBM_DatasetGetFeatureNames_R
          , private$handle
        )
        return(private$colnames)

      } else if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {

        # Not constructed yet: use the raw data's column names
        return(colnames(private$raw_data))

      } else {

        # Column names cannot be determined without a handle or matrix-like raw data
        stop(
          "get_colnames: cannot get column names before dataset has been constructed, please call "
          , "lgb.Dataset.construct explicitly"
        )

      }

    },
James Lamb's avatar
James Lamb committed
389

390
    # Set column names.
    # `colnames` is converted to character. NULL or empty input leaves the
    # Dataset untouched. Otherwise the names are stored in private$colnames and,
    # when a handle exists, also sent to it as one tab-separated string.
    # Returns self invisibly.
    set_colnames = function(colnames) {

      # as.character(NULL) is character(0), so this covers NULL and empty input
      new_names <- as.character(colnames)
      if (length(new_names) == 0L) {
        return(invisible(self))
      }

      private$colnames <- new_names

      handle_exists <- !lgb.is.null.handle(x = private$handle)
      if (handle_exists) {

        # The native routine receives all names merged with tab separation
        tab_separated_names <- paste0(as.list(private$colnames), collapse = "\t")
        .Call(LGBM_DatasetSetFeatureNames_R, private$handle, tab_separated_names)

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
421

422
    # Get an info field ("label", "weight", "init_score" or "group").
    # A value already stored in private$info is returned directly. Otherwise the
    # field is read from the handle, cached in private$info and returned; NULL is
    # returned when the handle reports a field size of zero.
    # Raises an error for an invalid `name`, or when the field is not cached and
    # the Dataset has not been constructed.
    getinfo = function(name) {

      INFONAMES <- c("label", "weight", "init_score", "group")

      # `name` must be a single string naming one of the known fields
      name_is_valid <- is.character(name) && length(name) == 1L && name %in% INFONAMES
      if (!name_is_valid) {
        stop("getinfo: name must one of the following: ", paste0(sQuote(INFONAMES), collapse = ", "))
      }

      # Serve from the R-side cache when possible
      cached_value <- private$info[[name]]
      if (!is.null(cached_value)) {
        return(cached_value)
      }

      if (lgb.is.null.handle(x = private$handle)) {
        stop("Cannot perform getinfo before constructing Dataset.")
      }

      # Ask the handle how long the field is
      # NOTE(review): presumably filled in place by the .Call() below - confirm
      info_len <- 0L
      .Call(LGBM_DatasetGetFieldSize_R, private$handle, name, info_len)

      if (info_len > 0L) {

        # Buffer of the right type and length: integer for "group", numeric otherwise
        field_values <- if (name == "group") {
          integer(info_len)
        } else {
          numeric(info_len)
        }

        .Call(LGBM_DatasetGetField_R, private$handle, name, field_values)

        private$info[[name]] <- field_values

      }

      return(private$info[[name]])

    },
James Lamb's avatar
James Lamb committed
475

476
    # Set an info field ("label", "weight", "init_score" or "group").
    # The value is coerced to integer for "group" and to numeric otherwise, and
    # stored in private$info. When a handle exists and the value is non-empty it
    # is also sent to the handle and private$version is incremented.
    # Returns self invisibly. Raises an error for an invalid `name`.
    setinfo = function(name, info) {

      INFONAMES <- c("label", "weight", "init_score", "group")

      # `name` must be a single string naming one of the known fields
      name_is_valid <- is.character(name) && length(name) == 1L && name %in% INFONAMES
      if (!name_is_valid) {
        stop("setinfo: name must one of the following: ", paste0(sQuote(INFONAMES), collapse = ", "))
      }

      # Coerce to the storage type of the field
      if (name == "group") {
        info <- as.integer(info)
      } else {
        info <- as.numeric(info)
      }

      # Keep the R-side copy
      private$info[[name]] <- info

      # Forward to the handle only when there is a handle and something to send
      should_send <- !lgb.is.null.handle(x = private$handle) && !is.null(info) && length(info) > 0L
      if (should_send) {

        .Call(LGBM_DatasetSetField_R, private$handle, name, info, length(info))

        private$version <- private$version + 1L

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
518

519
    # Slice dataset.
    # Returns a new Dataset without raw data that uses this Dataset as its
    # reference and `idxset` (sorted in increasing order) as its used indices.
    # Params, column names, categorical features, predictor and free_raw_data
    # are inherited; `...` is forwarded to the constructor.
    slice = function(idxset, ...) {

      sorted_indices <- sort(idxset, decreasing = FALSE)

      subset_dataset <- Dataset$new(
        data = NULL
        , params = private$params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = sorted_indices
        , info = NULL
        , ...
      )

      return(subset_dataset)

    },
James Lamb's avatar
James Lamb committed
539

540
541
542
    # [description] Update Dataset parameters.
    #               If the Dataset has not been constructed yet, the new values are
    #               simply merged into private$params.
    #               If it has been constructed, the new values are first checked
    #               against the current ones with LGBM_DatasetUpdateParamChecking_R:
    #                 * check passes: the new values are merged into private$params
    #                 * check fails and raw data is still available: the new values
    #                   are merged and the Dataset is finalized ("deconstructed")
    #                 * check fails and raw data is gone: the error is re-raised
    # [return] self, invisibly
    update_params = function(params) {
      if (length(params) == 0L) {
        return(invisible(self))
      }

      merged_params <- modifyList(private$params, params)

      if (lgb.is.null.handle(x = private$handle)) {
        private$params <- merged_params
      } else {
        tryCatch({
          .Call(
            LGBM_DatasetUpdateParamChecking_R
            , lgb.params2str(params = private$params)
            , lgb.params2str(params = params)
          )
          # The check passed: remember the new values, otherwise the update
          # would be lost on the R side
          private$params <- merged_params
        }, error = function(e) {
          # If updating failed but raw data is not available, raise an error because
          # achieving what the user asked for is not possible
          if (is.null(private$raw_data)) {
            stop(e)
          }

          # If updating failed but raw data is available, modify the params
          # on the R side and re-set ("deconstruct") the Dataset
          private$params <- merged_params
          self$finalize()
        })
      }
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
572

573
574
575
576
577
578
579
580
581
582
583
    get_params = function() {
      # Return the entries of private$params whose names appear among the values
      # returned by .DATASET_PARAMETERS(), as a named list.
      recognized_names <- unname(unlist(.DATASET_PARAMETERS()))
      selected <- list()
      for (key in names(private$params)) {
        if (!(key %in% recognized_names)) {
          next
        }
        selected[[key]] <- private$params[[key]]
      }
      return(selected)
    },

584
    # Set categorical feature parameter.
    # Nothing happens when the value is identical to the current one. Otherwise
    # the new value is stored and the Dataset is finalized, which requires the
    # raw data to still be available. Returns self invisibly.
    set_categorical_feature = function(categorical_feature) {

      unchanged <- identical(private$categorical_feature, categorical_feature)
      if (unchanged) {
        return(invisible(self))
      }

      # The change cannot be applied once the raw data has been freed
      raw_data_missing <- is.null(private$raw_data)
      if (raw_data_missing) {
        stop("set_categorical_feature: cannot set categorical feature after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      private$categorical_feature <- categorical_feature

      # Drop the current handle
      self$finalize()

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
606

607
    # Set reference.
    # `reference` must be an lgb.Dataset, or NULL to clear the current reference.
    # For a non-NULL reference, its categorical features, column names and
    # predictor are adopted first. If the reference then differs from the
    # current one, it is stored and this Dataset is finalized, which requires the
    # raw data to still be available. Returns self invisibly.
    set_reference = function(reference) {

      # Validate before touching `reference`, so that invalid input produces a
      # clear error instead of failing while its members are being accessed
      if (!is.null(reference) && !lgb.is.Dataset(reference)) {
        stop("set_reference: Can only use lgb.Dataset as a reference")
      }

      # Adopt the reference's settings; there is nothing to adopt from NULL
      if (!is.null(reference)) {
        self$set_categorical_feature(categorical_feature = reference$.__enclos_env__$private$categorical_feature)
        self$set_colnames(colnames = reference$get_colnames())
        private$set_predictor(predictor = reference$.__enclos_env__$private$predictor)
      }

      # Check for identical references
      if (identical(private$reference, reference)) {
        return(invisible(self))
      }

      # The reference cannot be changed once the raw data has been freed
      if (is.null(private$raw_data)) {

        stop("set_reference: cannot set reference after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")

      }

      # Store reference
      private$reference <- reference

      # Finalize and return self
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
646

647
    # Save the Dataset through LGBM_DatasetSaveBinary_R, using `fname` as target.
    # The Dataset is constructed first if necessary. Returns self invisibly.
    save_binary = function(fname) {

      # Make sure a handle exists before saving
      self$construct()

      .Call(LGBM_DatasetSaveBinary_R, private$handle, fname)

      return(invisible(self))
    }
James Lamb's avatar
James Lamb committed
659

Guolin Ke's avatar
Guolin Ke committed
660
661
  ),
  private = list(
662
663
664
665
666
    handle = NULL,
    raw_data = NULL,
    params = list(),
    reference = NULL,
    colnames = NULL,
667
    categorical_feature = NULL,
668
669
670
671
    predictor = NULL,
    free_raw_data = TRUE,
    used_indices = NULL,
    info = NULL,
672
    version = 0L,
James Lamb's avatar
James Lamb committed
673

674
675
    # Return the Dataset handle, constructing the Dataset first when no handle
    # exists yet.
    get_handle = function() {

      needs_construction <- lgb.is.null.handle(x = private$handle)
      if (needs_construction) {
        self$construct()
      }

      return(private$handle)

    },
James Lamb's avatar
James Lamb committed
684

685
    # Set predictor.
    # Nothing happens when `predictor` is identical to the current one. Otherwise
    # it must be NULL or an lgb.Predictor; it is stored and the Dataset is
    # finalized, which requires the raw data to still be available.
    # Returns self invisibly.
    set_predictor = function(predictor) {

      unchanged <- identical(private$predictor, predictor)
      if (unchanged) {
        return(invisible(self))
      }

      # The change cannot be applied once the raw data has been freed
      raw_data_missing <- is.null(private$raw_data)
      if (raw_data_missing) {
        stop("set_predictor: cannot set predictor after free raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # Anything other than NULL must be an lgb.Predictor
      if (!is.null(predictor) && !lgb.is.Predictor(predictor)) {
        stop("set_predictor: Can only use lgb.Predictor as predictor")
      }

      private$predictor <- predictor

      # Drop the current handle
      self$finalize()

      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
716

Guolin Ke's avatar
Guolin Ke committed
717
718
719
  )
)

720
721
722
#' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#'              or local file (that was created previously by saving an \code{lgb.Dataset}).
Guolin Ke's avatar
Guolin Ke committed
723
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
724
725
726
727
728
729
730
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values.
#' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning
#'                  continuous features into histograms. If you want to apply the same bin boundaries from an existing
#'                  dataset to new \code{data}, pass that existing Dataset to this argument.
Guolin Ke's avatar
Guolin Ke committed
731
#' @param colnames names of columns
732
733
734
735
736
737
738
739
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param free_raw_data LightGBM constructs its data format, called a "Dataset", from tabular data.
#'                      By default, that Dataset object on the R side does not keep a copy of the raw data.
#'                      This reduces LightGBM's memory consumption, but it means that the Dataset object
#'                      cannot be changed after it has been constructed. If you'd prefer to be able to
#'                      change the Dataset object after construction, set \code{free_raw_data = FALSE}.
Nikita Titov's avatar
Nikita Titov committed
740
#' @param info a list of information of the \code{lgb.Dataset} object
Guolin Ke's avatar
Guolin Ke committed
741
#' @param ... other information to pass to \code{info} or parameters to pass to \code{params}
James Lamb's avatar
James Lamb committed
742
#'
Guolin Ke's avatar
Guolin Ke committed
743
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
744
#'
Guolin Ke's avatar
Guolin Ke committed
745
#' @examples
746
#' \donttest{
747
748
749
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
750
751
752
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
753
#' lgb.Dataset.construct(dtrain)
754
#' }
Guolin Ke's avatar
Guolin Ke committed
755
756
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        info = list(),
                        ...) {

  # Create the underlying R6 Dataset object. `predictor` and `used_indices`
  # are not exposed by this constructor and are always passed as NULL.
  new_dataset <- Dataset$new(
    data = data
    , params = params
    , reference = reference
    , colnames = colnames
    , categorical_feature = categorical_feature
    , predictor = NULL
    , free_raw_data = free_raw_data
    , used_indices = NULL
    , info = info
    , ...
  )

  # Returned invisibly so the object is not auto-printed at the console
  return(invisible(new_dataset))

}

783
784
785
#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
Guolin Ke's avatar
Guolin Ke committed
786
787
#' @param dataset \code{lgb.Dataset} object, training data
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
Nikita Titov's avatar
Nikita Titov committed
788
#' @param info a list of information of the \code{lgb.Dataset} object
Guolin Ke's avatar
Guolin Ke committed
789
#' @param ... other information to pass to \code{info}.
James Lamb's avatar
James Lamb committed
790
#'
Guolin Ke's avatar
Guolin Ke committed
791
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
792
#'
Guolin Ke's avatar
Guolin Ke committed
793
#' @examples
794
#' \donttest{
795
796
797
798
799
800
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
801
#' }
Guolin Ke's avatar
Guolin Ke committed
802
#' @export
803
lgb.Dataset.create.valid <- function(dataset, data, info = list(), ...) {

  # `dataset` (the training data) must be an lgb.Dataset
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's create_valid() method, forwarding `...`
  valid_set <- dataset$create_valid(data = data, info = info, ...)
  return(invisible(valid_set))

}
Guolin Ke's avatar
Guolin Ke committed
814

815
816
817
#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
Guolin Ke's avatar
Guolin Ke committed
818
#' @param dataset Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
819
#'
Guolin Ke's avatar
Guolin Ke committed
820
#' @examples
821
#' \donttest{
822
823
824
825
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
826
#' }
827
#' @return constructed dataset
Guolin Ke's avatar
Guolin Ke committed
828
829
#' @export
lgb.Dataset.construct <- function(dataset) {

  # Only lgb.Dataset objects can be constructed
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's construct() method; result is returned invisibly
  constructed <- dataset$construct()
  return(invisible(constructed))

}

841
842
#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
Guolin Ke's avatar
Guolin Ke committed
843
844
#' @param x Object of class \code{lgb.Dataset}
#' @param ... other parameters
James Lamb's avatar
James Lamb committed
845
#'
Guolin Ke's avatar
Guolin Ke committed
846
#' @return a vector of numbers of rows and of columns
James Lamb's avatar
James Lamb committed
847
#'
Guolin Ke's avatar
Guolin Ke committed
848
849
850
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
James Lamb's avatar
James Lamb committed
851
#'
Guolin Ke's avatar
Guolin Ke committed
852
#' @examples
853
#' \donttest{
854
855
856
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
857
#'
858
859
860
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
861
#' }
Guolin Ke's avatar
Guolin Ke committed
862
863
864
#' @rdname dim
#' @export
dim.lgb.Dataset <- function(x, ...) {

  # Guard against being called directly on something that is not an lgb.Dataset
  if (!lgb.is.Dataset(x = x)) {
    stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # The Dataset's own dim() method supplies the result; `...` is not used
  return(x$dim())

}

875
876
877
#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}, thus setting of
#'              row names would have no effect and returned row names would be NULL.
Guolin Ke's avatar
Guolin Ke committed
878
879
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one is ignored
880
#'              and the second one is column names
Guolin Ke's avatar
Guolin Ke committed
881
882
883
884
885
886
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
887
#' \donttest{
888
889
890
891
892
893
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
894
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
895
#' print(dtrain, verbose = TRUE)
896
#' }
Guolin Ke's avatar
Guolin Ke committed
897
#' @rdname dimnames.lgb.Dataset
898
#' @return A list with the dimension names of the dataset
Guolin Ke's avatar
Guolin Ke committed
899
900
#' @export
dimnames.lgb.Dataset <- function(x) {

  # Guard against being called directly on something that is not an lgb.Dataset
  if (!lgb.is.Dataset(x = x)) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # Row names are always NULL; column names come from get_colnames()
  return(list(NULL, x$get_colnames()))

}

#' @rdname dimnames.lgb.Dataset
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # `value` must be a plain list of the form list(<row names>, <column names>)
  if (!identical(class(value), "list") || length(value) != 2L) {
    stop("invalid ", sQuote("value"), " given: must be a list of two elements")
  }

  # Row names are not supported, so the first element has to be NULL
  if (!is.null(value[[1L]])) {
    stop("lgb.Dataset does not have rownames")
  }

  new_colnames <- value[[2L]]

  # NULL column names are handed straight to set_colnames()
  if (is.null(new_colnames)) {
    x$set_colnames(colnames = NULL)
    return(x)
  }

  # The number of supplied names must match the number of columns
  num_columns <- ncol(x)
  if (num_columns != length(new_colnames)) {
    stop(
      "can't assign "
      , sQuote(length(new_colnames))
      , " colnames to an lgb.Dataset with "
      , sQuote(num_columns)
      , " columns"
    )
  }

  # Set column names and return the dataset, as replacement functions must
  x$set_colnames(colnames = new_colnames)
  return(x)

}

950
951
952
#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#'              original \code{lgb.Dataset} object
Nikita Titov's avatar
Nikita Titov committed
953
#' @param dataset Object of class \code{lgb.Dataset}
954
#' @param idxset an integer vector of indices of rows needed
Guolin Ke's avatar
Guolin Ke committed
955
956
#' @param ... other parameters (currently not used)
#' @return constructed sub dataset
James Lamb's avatar
James Lamb committed
957
#'
Guolin Ke's avatar
Guolin Ke committed
958
#' @examples
959
#' \donttest{
960
961
962
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
963
#'
964
#' dsub <- lightgbm::slice(dtrain, seq_len(42L))
965
#' lgb.Dataset.construct(dsub)
966
#' labels <- lightgbm::getinfo(dsub, "label")
967
#' }
Guolin Ke's avatar
Guolin Ke committed
968
#' @export
969
970
971
slice <- function(dataset, ...) {
  # S3 generic: dispatches to a slice.<class> method (e.g. slice.lgb.Dataset)
  UseMethod("slice")
}
Guolin Ke's avatar
Guolin Ke committed
972
973
974
975

#' @rdname slice
#' @export
slice.lgb.Dataset <- function(dataset, idxset, ...) {

  # Guard against being called directly on something that is not an lgb.Dataset
  if (!lgb.is.Dataset(x = dataset)) {
    stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's slice() method, forwarding `...`
  subset_dataset <- dataset$slice(idxset = idxset, ...)
  return(invisible(subset_dataset))

}

987
988
989
#' @name getinfo
#' @title Get information of an \code{lgb.Dataset} object
#' @description Get one attribute of a \code{lgb.Dataset}
Guolin Ke's avatar
Guolin Ke committed
990
991
992
993
#' @param dataset Object of class \code{lgb.Dataset}
#' @param name the name of the information field to get (see details)
#' @param ... other parameters
#' @return info data
James Lamb's avatar
James Lamb committed
994
#'
Guolin Ke's avatar
Guolin Ke committed
995
996
#' @details
#' The \code{name} field can be one of the following:
James Lamb's avatar
James Lamb committed
997
#'
Guolin Ke's avatar
Guolin Ke committed
998
999
1000
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
1001
1002
1003
1004
1005
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
Nikita Titov's avatar
Nikita Titov committed
1006
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
Guolin Ke's avatar
Guolin Ke committed
1007
#' }
James Lamb's avatar
James Lamb committed
1008
#'
Guolin Ke's avatar
Guolin Ke committed
1009
#' @examples
1010
#' \donttest{
1011
1012
1013
1014
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
James Lamb's avatar
James Lamb committed
1015
#'
1016
1017
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
James Lamb's avatar
James Lamb committed
1018
#'
1019
1020
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
1021
#' }
Guolin Ke's avatar
Guolin Ke committed
1022
#' @export
1023
1024
1025
getinfo <- function(dataset, ...) {
  # S3 generic: dispatches to a getinfo.<class> method (e.g. getinfo.lgb.Dataset)
  UseMethod("getinfo")
}
Guolin Ke's avatar
Guolin Ke committed
1026
1027
1028
1029

#' @rdname getinfo
#' @export
getinfo.lgb.Dataset <- function(dataset, name, ...) {

  # Guard against being called directly on something that is not an lgb.Dataset
  if (!lgb.is.Dataset(x = dataset)) {
    stop("getinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's getinfo() method; `...` is accepted but not forwarded
  return(dataset$getinfo(name = name))

}

1040
1041
1042
#' @name setinfo
#' @title Set information of an \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
Nikita Titov's avatar
Nikita Titov committed
1043
#' @param dataset Object of class \code{lgb.Dataset}
Guolin Ke's avatar
Guolin Ke committed
1044
1045
1046
#' @param name the name of the field to set
#' @param info the specific field of information to set
#' @param ... other parameters
1047
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1048
#'
Guolin Ke's avatar
Guolin Ke committed
1049
1050
#' @details
#' The \code{name} field can be one of the following:
James Lamb's avatar
James Lamb committed
1051
#'
Guolin Ke's avatar
Guolin Ke committed
1052
#' \itemize{
1053
1054
1055
1056
1057
#'     \item{\code{label}: vector of labels to use as the target variable}
#'     \item{\code{weight}: to do a weight rescale}
#'     \item{\code{init_score}: initial score is the base prediction lightgbm will boost from}
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
1058
1059
1060
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
Guolin Ke's avatar
Guolin Ke committed
1061
#' }
James Lamb's avatar
James Lamb committed
1062
#'
Guolin Ke's avatar
Guolin Ke committed
1063
#' @examples
1064
#' \donttest{
1065
1066
1067
1068
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
James Lamb's avatar
James Lamb committed
1069
#'
1070
1071
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
James Lamb's avatar
James Lamb committed
1072
#'
1073
1074
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
1075
#' }
Guolin Ke's avatar
Guolin Ke committed
1076
#' @export
1077
1078
1079
setinfo <- function(dataset, ...) {
  # S3 generic: dispatches to a setinfo.<class> method (e.g. setinfo.lgb.Dataset)
  UseMethod("setinfo")
}
Guolin Ke's avatar
Guolin Ke committed
1080
1081
1082
1083

#' @rdname setinfo
#' @export
setinfo.lgb.Dataset <- function(dataset, name, info, ...) {

  # Guard against being called directly on something that is not an lgb.Dataset
  if (!lgb.is.Dataset(x = dataset)) {
    stop("setinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's setinfo() method; `...` is accepted but not
  # forwarded. The result is returned invisibly.
  updated <- dataset$setinfo(name = name, info = info)
  return(invisible(updated))

}

1093
1094
1095
1096
#' @name lgb.Dataset.set.categorical
#' @title Set categorical feature of \code{lgb.Dataset}
#' @description Set the categorical features of an \code{lgb.Dataset} object. Use this function
#'              to tell LightGBM which features should be treated as categorical.
1097
#' @param dataset object of class \code{lgb.Dataset}
1098
1099
1100
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
1101
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1102
#'
1103
#' @examples
1104
#' \donttest{
1105
1106
1107
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1108
1109
1110
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
1111
#' lgb.Dataset.set.categorical(dtrain, 1L:2L)
1112
#' }
1113
1114
1115
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  # Only lgb.Dataset objects are accepted
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's set_categorical_feature() method; the result
  # is returned invisibly
  updated <- dataset$set_categorical_feature(categorical_feature = categorical_feature)
  return(invisible(updated))

}

1126
1127
1128
#' @name lgb.Dataset.set.reference
#' @title Set reference of \code{lgb.Dataset}
#' @description If you want to use validation data, you should set reference to training data
Guolin Ke's avatar
Guolin Ke committed
1129
1130
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
1131
#'
1132
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1133
#'
Guolin Ke's avatar
Guolin Ke committed
1134
#' @examples
1135
#' \donttest{
1136
1137
1138
1139
1140
1141
1142
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset(test$data, label = test$label)
#' lgb.Dataset.set.reference(dtest, dtrain)
1143
#' }
Guolin Ke's avatar
Guolin Ke committed
1144
1145
1146
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # Only `dataset` is checked here; `reference` is passed on as given
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's set_reference() method; the result is
  # returned invisibly
  updated <- dataset$set_reference(reference = reference)
  return(invisible(updated))

}

1157
1158
1159
1160
#' @name lgb.Dataset.save
#' @title Save \code{lgb.Dataset} to a binary file
#' @description Please note that \code{init_score} is not saved in binary file.
#'              If you need it, please set it again after loading Dataset.
Guolin Ke's avatar
Guolin Ke committed
1161
1162
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname object filename of output file
James Lamb's avatar
James Lamb committed
1163
#'
1164
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1165
#'
Guolin Ke's avatar
Guolin Ke committed
1166
#' @examples
1167
#' \donttest{
1168
1169
1170
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1171
#' lgb.Dataset.save(dtrain, tempfile(fileext = ".bin"))
1172
#' }
Guolin Ke's avatar
Guolin Ke committed
1173
1174
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # Only lgb.Dataset objects can be saved
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  # The output location must be supplied as a character value
  if (!is.character(fname)) {
    stop("lgb.Dataset.save: fname should be a character or a file connection")
  }

  # Delegate to the Dataset's save_binary() method; the result is
  # returned invisibly
  saved <- dataset$save_binary(fname = fname)
  return(invisible(saved))

}