lgb.Dataset.R 36 KB
Newer Older
James Lamb's avatar
James Lamb committed
1
#' @importFrom methods is
James Lamb's avatar
James Lamb committed
2
3
4
#' @importFrom R6 R6Class
Dataset <- R6::R6Class(

5
  classname = "lgb.Dataset",
6
  cloneable = FALSE,
Guolin Ke's avatar
Guolin Ke committed
7
  public = list(
James Lamb's avatar
James Lamb committed
8

9
    # Release the native Dataset handle, if this object currently holds one.
    # Always returns NULL invisibly.
    finalize = function() {

      # Nothing to release when no handle is held
      if (lgb.is.null.handle(x = private$handle)) {
        return(invisible(NULL))
      }

      # Free the handle through LGBM_DatasetFree_R, then drop the R-side
      # reference so later calls see the Dataset as not constructed
      call_state <- 0L
      .Call(
        LGBM_DatasetFree_R
        , private$handle
        , call_state
      )
      private$handle <- NULL

      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
29

30
    # Record the constructor arguments on the object. No native Dataset is
    # created here; that only happens in construct().
    #
    # Arguments passed through `...` whose name is one of "label", "weight",
    # "init_score" or "group" are stored in `info`; every other named argument
    # is stored in `params`.
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          info = list(),
                          ...) {

      # Reject a wrongly-typed reference or predictor before any other work
      reference_ok <- is.null(reference) ||
        lgb.check.r6.class(object = reference, name = "lgb.Dataset")
      if (!reference_ok) {
          stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset"))
      }
      predictor_ok <- is.null(predictor) ||
        lgb.check.r6.class(object = predictor, name = "lgb.Predictor")
      if (!predictor_ok) {
          stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
      }

      # Route each extra named argument to either `info` or `params`
      extra_args <- list(...)
      info_keys <- c("label", "weight", "init_score", "group")
      for (arg_name in names(extra_args)) {
        arg_value <- extra_args[[arg_name]]
        if (arg_name %in% info_keys) {
          info[[arg_name]] <- arg_value
        } else {
          params[[arg_name]] <- arg_value
        }
      }

      # Dense matrices are stored with storage mode "double"
      if (is.matrix(data) && storage.mode(data) != "double") {
        storage.mode(data) <- "double"
      }

      # Keep everything on the private side of the object
      private$raw_data <- data
      private$params <- params
      private$reference <- reference
      private$colnames <- colnames
      private$categorical_feature <- categorical_feature
      private$predictor <- predictor
      private$free_raw_data <- free_raw_data
      private$used_indices <- sort(used_indices, decreasing = FALSE)
      private$info <- info
      private$version <- 0L

      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
98

99
100
101
    # Create a new Dataset from `data` that uses this Dataset as its reference
    # and reuses its params, column names, categorical features, predictor and
    # free_raw_data setting. `info` and `...` are forwarded to the constructor.
    # The new Dataset is returned invisibly.
    create_valid = function(data,
                            info = list(),
                            ...) {

      valid_set <- Dataset$new(
        data = data
        , params = private$params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = NULL
        , info = info
        , ...
      )

      return(invisible(valid_set))

    },
James Lamb's avatar
James Lamb committed
120

121
    # Build the native Dataset behind this object and return `self` invisibly.
    #
    # Does nothing when a handle already exists. Otherwise the Dataset is
    # created from the stored raw data (a file path given as character, a dense
    # matrix, or a dgCMatrix) or, when `used_indices` is set, as a subset of the
    # reference Dataset. Afterwards feature names, predictor-based initial
    # scores and the stored info fields are applied.
    #
    # Stops when the raw data type is unsupported, when a subset is requested
    # without a reference, when no handle could be created, or when no label
    # is available after construction.
    construct = function() {

      # Already constructed: nothing to do
      if (!lgb.is.null.handle(x = private$handle)) {
        return(invisible(self))
      }

      # Get feature names from in-memory data
      cnames <- NULL
      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        cnames <- colnames(private$raw_data)
      }

      # Set feature names if they do not exist
      if (is.null(private$colnames) && !is.null(cnames)) {
        private$colnames <- as.character(cnames)
      }

      # Translate categorical features into 0-based indices
      if (!is.null(private$categorical_feature)) {

        if (is.character(private$categorical_feature)) {

          # Names are resolved against the known column names
          matched <- match(private$categorical_feature, private$colnames)
          unknown <- is.na(matched)
          if (any(unknown)) {
            stop(
              "lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
              , paste0(sQuote(private$categorical_feature[unknown]), collapse = ", ")
            )
          }
          cate_indices <- as.list(matched - 1L)

        } else {

          # Numeric indices must not exceed the number of known features
          if (max(private$categorical_feature) > length(private$colnames)) {
            stop(
              "lgb.self.get.handle: supplied a too large value in categorical_feature: "
              , max(private$categorical_feature)
              , " but only "
              , length(private$colnames)
              , " features"
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1L)

        }

        # Store indices for categorical features
        private$params$categorical_feature <- cate_indices

      }

      # Generate parameter str
      params_str <- lgb.params2str(params = private$params)

      # Get handle of reference dataset (this constructs the reference if needed)
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }
      handle <- lgb.null.handle()

      if (is.null(private$used_indices)) {

        # Not subsetting: create the Dataset from the raw data
        if (is.character(private$raw_data)) {

          # Raw data is a file path
          call_state <- 0L
          .Call(
            LGBM_DatasetCreateFromFile_R
            , lgb.c_str(x = private$raw_data)
            , params_str
            , ref_handle
            , handle
            , call_state
          )

        } else if (is.matrix(private$raw_data)) {

          # Raw data is a dense matrix
          call_state <- 0L
          .Call(
            LGBM_DatasetCreateFromMat_R
            , private$raw_data
            , nrow(private$raw_data)
            , ncol(private$raw_data)
            , params_str
            , ref_handle
            , handle
            , call_state
          )

        } else if (methods::is(private$raw_data, "dgCMatrix")) {

          # Raw data is a column-compressed sparse matrix
          if (length(private$raw_data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }
          call_state <- 0L
          .Call(
            LGBM_DatasetCreateFromCSC_R
            , private$raw_data@p
            , private$raw_data@i
            , private$raw_data@x
            , length(private$raw_data@p)
            , length(private$raw_data@x)
            , nrow(private$raw_data)
            , params_str
            , ref_handle
            , handle
            , call_state
          )

        } else {

          # Unknown data type
          stop(
            "lgb.Dataset.construct: does not support constructing from "
            , sQuote(class(private$raw_data))
          )

        }

      } else {

        # Subsetting requires a reference Dataset to take the rows from
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        # Construct subset
        call_state <- 0L
        .Call(
          LGBM_DatasetGetSubset_R
          , ref_handle
          , c(private$used_indices) # Adding c() fixes issue in R v3.5
          , length(private$used_indices)
          , params_str
          , handle
          , call_state
        )

      }

      # The handle is still null if the native call did not create a Dataset
      if (lgb.is.null.handle(x = handle)) {
        stop("lgb.Dataset.construct: cannot create Dataset handle")
      }

      # Setup class and private type
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Set feature names
      if (!is.null(private$colnames)) {
        self$set_colnames(colnames = private$colnames)
      }

      # Load init score if requested (only when not subsetting)
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        init_score <- private$predictor$predict(
          data = private$raw_data
          , rawscore = TRUE
          , reshape = TRUE
        )

        # Flatten the predictions into a plain vector before storing them
        init_score <- as.vector(init_score)
        private$info$init_score <- init_score

      }

      # Should we free raw data?
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Apply every stored info field to the constructed Dataset
      if (length(private$info) > 0L) {

        for (i in seq_along(private$info)) {

          p <- private$info[i]
          self$setinfo(name = names(p), info = p[[1L]])

        }

      }

      # A label is mandatory
      if (is.null(self$getinfo(name = "label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
334

335
    # Return the dimensions of the Dataset as c(rows, columns).
    #
    # Uses the native Dataset when it has been constructed, otherwise falls back
    # to the raw data if that is a dense matrix or a dgCMatrix; stops when
    # neither is available.
    dim = function() {

      if (lgb.is.null.handle(x = private$handle)) {

        # Not constructed yet: in-memory matrices still know their own shape
        # NOTE: the dgCMatrix case requires the Matrix package
        if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
          return(dim(private$raw_data))
        }

        # Trying to work with unknown dimensions is not possible
        stop(
          "dim: cannot get dimensions before dataset has been constructed, "
          , "please call lgb.Dataset.construct explicitly"
        )

      }

      # Query row and feature counts from the native Dataset; both variables
      # are passed to the native calls, which presumably fill them in
      n_rows <- 0L
      n_cols <- 0L

      call_state <- 0L
      .Call(
        LGBM_DatasetGetNumData_R
        , private$handle
        , n_rows
        , call_state
      )
      call_state <- 0L
      .Call(
        LGBM_DatasetGetNumFeature_R
        , private$handle
        , n_cols
        , call_state
      )

      return(
        c(n_rows, n_cols)
      )

    },
James Lamb's avatar
James Lamb committed
380

381
    # Return the feature (column) names of the Dataset.
    #
    # When the Dataset has been constructed, the names are read from the native
    # Dataset and cached in private$colnames. Otherwise the column names of the
    # raw data are returned if it is a dense matrix or a dgCMatrix; stops when
    # neither source is available.
    get_colnames = function() {

      if (!lgb.is.null.handle(x = private$handle)) {

        # Read the names into a raw buffer, starting with 1 MiB
        buf_len <- as.integer(1024L * 1024L)
        act_len <- 0L
        buf <- raw(buf_len)
        call_state <- 0L
        .Call(
          LGBM_DatasetGetFeatureNames_R
          , private$handle
          , buf_len
          , act_len
          , buf
          , call_state
        )

        # Retry with a buffer of the reported size if the first one was too small
        if (act_len > buf_len) {
          buf_len <- act_len
          buf <- raw(buf_len)
          call_state <- 0L
          .Call(
            LGBM_DatasetGetFeatureNames_R
            , private$handle
            , buf_len
            , act_len
            , buf
            , call_state
          )
        }

        # Names arrive as one tab-separated string
        cnames <- lgb.encode.char(arr = buf, len = act_len)
        private$colnames <- as.character(base::strsplit(cnames, "\t")[[1L]])
        return(private$colnames)

      } else if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {

        # Not constructed yet: use the names carried by the in-memory matrix
        return(colnames(private$raw_data))

      } else {

        # Column names are unknown until the Dataset has been constructed
        stop(
          "get_colnames: cannot get column names before dataset has been constructed, please call "
          , "lgb.Dataset.construct explicitly"
        )

      }

    },
James Lamb's avatar
James Lamb committed
433

434
    # Store new feature (column) names and, when the Dataset has already been
    # constructed, push them to the native Dataset as well.
    # NULL or empty input leaves everything unchanged. Returns `self` invisibly.
    set_colnames = function(colnames) {

      # Nothing to do without names
      if (is.null(colnames)) {
        return(invisible(self))
      }

      # Names are always kept as character; an empty set is ignored
      new_names <- as.character(colnames)
      if (length(new_names) == 0L) {
        return(invisible(self))
      }

      private$colnames <- new_names

      # Only a constructed Dataset has a native side to update
      if (!lgb.is.null.handle(x = private$handle)) {

        # The native call takes all names as one tab-separated string
        merged_name <- paste(private$colnames, collapse = "\t")
        call_state <- 0L
        .Call(
          LGBM_DatasetSetFeatureNames_R
          , private$handle
          , lgb.c_str(x = merged_name)
          , call_state
        )

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
467

468
    # Return one info field ("label", "weight", "init_score" or "group").
    #
    # A value already stored in private$info is returned directly. Otherwise the
    # field is read from the native Dataset (which must have been constructed),
    # cached in private$info and returned; NULL is returned when the native
    # Dataset reports the field as empty.
    getinfo = function(name) {

      # Only a single, known field name is accepted
      INFONAMES <- c("label", "weight", "init_score", "group")
      if (!is.character(name) || length(name) != 1L || !name %in% INFONAMES) {
        stop("getinfo: name must one of the following: ", paste0(sQuote(INFONAMES), collapse = ", "))
      }

      # Serve the value stored on the R side when there is one
      if (!is.null(private$info[[name]])) {
        return(private$info[[name]])
      }

      if (lgb.is.null.handle(x = private$handle)) {
        stop("Cannot perform getinfo before constructing Dataset.")
      }

      # Ask the native Dataset how many values the field holds
      field_len <- 0L
      call_state <- 0L
      .Call(
        LGBM_DatasetGetFieldSize_R
        , private$handle
        , lgb.c_str(x = name)
        , field_len
        , call_state
      )

      if (field_len > 0L) {

        # "group" is read as integer, every other field as numeric
        field <- if (name == "group") {
          integer(field_len)
        } else {
          numeric(field_len)
        }

        call_state <- 0L
        .Call(
          LGBM_DatasetGetField_R
          , private$handle
          , lgb.c_str(x = name)
          , field
          , call_state
        )

        # Cache the value for later calls
        private$info[[name]] <- field

      }

      return(private$info[[name]])

    },
James Lamb's avatar
James Lamb committed
525

526
    # Store one info field ("label", "weight", "init_score" or "group") on the
    # R side and, when the Dataset has been constructed and the value is not
    # empty, on the native Dataset too. Returns `self` invisibly.
    setinfo = function(name, info) {

      # Only a single, known field name is accepted
      INFONAMES <- c("label", "weight", "init_score", "group")
      if (!is.character(name) || length(name) != 1L || !name %in% INFONAMES) {
        stop("setinfo: name must one of the following: ", paste0(sQuote(INFONAMES), collapse = ", "))
      }

      # "group" is stored as integer, every other field as numeric
      coerced <- if (name == "group") {
        as.integer(info)
      } else {
        as.numeric(info)
      }

      # Store information privately
      private$info[[name]] <- coerced

      # Push non-empty values to the native Dataset once it exists
      has_handle <- !lgb.is.null.handle(x = private$handle)
      if (has_handle && length(coerced) > 0L) {

        call_state <- 0L
        .Call(
          LGBM_DatasetSetField_R
          , private$handle
          , lgb.c_str(x = name)
          , coerced
          , length(coerced)
          , call_state
        )

        # Count the change made to the constructed Dataset
        private$version <- private$version + 1L

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
570

571
    # Create a new Dataset that is the subset of this one given by `idxset`.
    # The new Dataset uses this Dataset as its reference, carries no raw data
    # and no info, and stores the indices sorted in increasing order.
    # `...` is forwarded to the constructor.
    slice = function(idxset, ...) {

      subset_ds <- Dataset$new(
        data = NULL
        , params = private$params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = sort(idxset, decreasing = FALSE)
        , info = NULL
        , ...
      )

      return(subset_ds)

    },
James Lamb's avatar
James Lamb committed
591

592
593
594
    # [description] Update Dataset parameters. If it has not been constructed yet,
    #               this operation just happens on the R side (updating private$params).
    #               If it has been constructed, the new parameters are checked against
    #               the current ones by the native library. When that check passes they
    #               are recorded in private$params; when it fails and raw data is still
    #               available, they are recorded and the Dataset is reset so it can be
    #               constructed again; when it fails and raw data is gone, the error
    #               is raised.
    # [return] `self`, invisibly
    update_params = function(params) {

      # Nothing to update
      if (length(params) == 0L) {
        return(invisible(self))
      }

      # Parameters as they should look once the update has been applied
      merged_params <- modifyList(private$params, params)

      if (lgb.is.null.handle(x = private$handle)) {

        # Not constructed yet: the change only needs to happen on the R side
        private$params <- merged_params

      } else {

        tryCatch({
          call_state <- 0L
          .Call(
            LGBM_DatasetUpdateParamChecking_R
            , lgb.params2str(params = private$params)
            , lgb.params2str(params = params)
            , call_state
          )

          # The check passed: keep the R-side parameters in sync so that
          # get_params() and any later re-construction see the new values
          private$params <- merged_params

        }, error = function(e) {
          # If updating failed but raw data is not available, raise an error because
          # achieving what the user asked for is not possible
          if (is.null(private$raw_data)) {
            stop(e)
          }

          # If updating failed but raw data is available, modify the params
          # on the R side and re-set ("deconstruct") the Dataset
          private$params <- merged_params
          self$finalize()
        })

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
626

627
628
629
630
631
632
633
634
635
636
637
    # Return the stored parameters whose names appear among the values of
    # .DATASET_PARAMETERS(), as a list
    get_params = function() {
      known_keys <- unname(unlist(.DATASET_PARAMETERS()))
      param_names <- names(private$params)
      kept <- list()
      for (key in param_names[param_names %in% known_keys]) {
        kept[[key]] <- private$params[[key]]
      }
      return(kept)
    },

638
    # Replace the categorical feature specification of this Dataset.
    #
    # Does nothing when the new value is identical to the current one.
    # Otherwise the raw data must still be available, because the Dataset is
    # reset (its handle is freed) and has to be constructed again.
    # Returns `self` invisibly.
    set_categorical_feature = function(categorical_feature) {

      # Nothing changes: keep the constructed Dataset as it is
      if (identical(private$categorical_feature, categorical_feature)) {
        return(invisible(self))
      }

      # The Dataset cannot be rebuilt once its raw data has been freed
      if (is.null(private$raw_data)) {
        stop(
          "set_categorical_feature: cannot set categorical feature after freeing raw data, please set "
          , sQuote("free_raw_data = FALSE")
          , " when you construct lgb.Dataset"
        )
      }

      # Overwrite categorical features
      private$categorical_feature <- categorical_feature

      # Free the handle so the next construct() picks up the new setting
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
660

661
    # Set the reference Dataset of this Dataset.
    #
    # `reference` must be NULL or an lgb.Dataset. For a non-NULL reference, its
    # categorical features, column names and predictor are copied onto this
    # Dataset first. If the reference itself differs from the current one, the
    # raw data must still be available, because the Dataset is reset (its
    # handle is freed) and has to be constructed again.
    # Returns `self` invisibly.
    set_reference = function(reference) {

      # Validate before touching the reference, so that a wrongly-typed value
      # fails with this message instead of an error from `$` or a method call
      if (!is.null(reference) && !lgb.check.r6.class(object = reference, name = "lgb.Dataset")) {
        stop("set_reference: Can only use lgb.Dataset as a reference")
      }

      # Take over the settings of the reference; a NULL reference has none
      if (!is.null(reference)) {
        self$set_categorical_feature(categorical_feature = reference$.__enclos_env__$private$categorical_feature)
        self$set_colnames(colnames = reference$get_colnames())
        private$set_predictor(predictor = reference$.__enclos_env__$private$predictor)
      }

      # Check for identical references
      if (identical(private$reference, reference)) {
        return(invisible(self))
      }

      # The Dataset cannot be rebuilt once its raw data has been freed
      if (is.null(private$raw_data)) {
        stop(
          "set_reference: cannot set reference after freeing raw data, please set "
          , sQuote("free_raw_data = FALSE")
          , " when you construct lgb.Dataset"
        )
      }

      # Store reference
      private$reference <- reference

      # Free the handle so the next construct() uses the new reference
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
700

701
    # Write the Dataset to the file `fname` through LGBM_DatasetSaveBinary_R.
    # The Dataset is constructed first if necessary. Returns `self` invisibly.
    save_binary = function(fname) {

      # Make sure a native Dataset exists before saving it
      self$construct()

      call_state <- 0L
      .Call(
        LGBM_DatasetSaveBinary_R
        , private$handle
        , lgb.c_str(x = fname)
        , call_state
      )

      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
715

Guolin Ke's avatar
Guolin Ke committed
716
717
  ),
  private = list(
718
719
720
721
722
    handle = NULL,
    raw_data = NULL,
    params = list(),
    reference = NULL,
    colnames = NULL,
723
    categorical_feature = NULL,
724
725
726
727
    predictor = NULL,
    free_raw_data = TRUE,
    used_indices = NULL,
    info = NULL,
728
    version = 0L,
James Lamb's avatar
James Lamb committed
729

730
731
    # Return the native Dataset handle, constructing the Dataset first when no
    # handle exists yet
    get_handle = function() {

      needs_construct <- lgb.is.null.handle(x = private$handle)
      if (needs_construct) {
        self$construct()
      }

      return(private$handle)

    },
James Lamb's avatar
James Lamb committed
740

741
    # Set predictor
Guolin Ke's avatar
Guolin Ke committed
742
    set_predictor = function(predictor) {

      # Nothing to do when the requested predictor is already the stored one
      predictor_unchanged <- identical(private$predictor, predictor)
      if (predictor_unchanged) {
        return(invisible(self))
      }

      # The predictor can only be changed while the raw data is still held
      if (is.null(private$raw_data)) {
        stop("set_predictor: cannot set predictor after free raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
      }

      # NULL is accepted as-is; any other value has to be an lgb.Predictor
      predictor_is_invalid <- !is.null(predictor) &&
        !lgb.check.r6.class(object = predictor, name = "lgb.Predictor")
      if (predictor_is_invalid) {
        stop("set_predictor: Can only use lgb.Predictor as predictor")
      }

      # Remember the new predictor, then run finalize(), which frees the
      # current handle (if there is one), and return the object itself
      private$predictor <- predictor
      self$finalize()
      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
772

Guolin Ke's avatar
Guolin Ke committed
773
774
775
  )
)

776
777
778
#' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#'              or local file (that was created previously by saving an \code{lgb.Dataset}).
Guolin Ke's avatar
Guolin Ke committed
779
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
780
781
782
783
784
785
786
#' @param params a list of parameters. See
#'               \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#dataset-parameters}{
#'               The "Dataset Parameters" section of the documentation} for a list of parameters
#'               and valid values.
#' @param reference reference dataset. When LightGBM creates a Dataset, it does some preprocessing like binning
#'                  continuous features into histograms. If you want to apply the same bin boundaries from an existing
#'                  dataset to new \code{data}, pass that existing Dataset to this argument.
Guolin Ke's avatar
Guolin Ke committed
787
#' @param colnames names of columns
788
789
790
791
792
793
794
795
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @param free_raw_data LightGBM constructs its data format, called a "Dataset", from tabular data.
#'                      By default, that Dataset object on the R side does not keep a copy of the raw data.
#'                      This reduces LightGBM's memory consumption, but it means that the Dataset object
#'                      cannot be changed after it has been constructed. If you'd prefer to be able to
#'                      change the Dataset object after construction, set \code{free_raw_data = FALSE}.
Nikita Titov's avatar
Nikita Titov committed
796
#' @param info a list of information of the \code{lgb.Dataset} object
Guolin Ke's avatar
Guolin Ke committed
797
#' @param ... other information to pass to \code{info} or parameters to pass to \code{params}
James Lamb's avatar
James Lamb committed
798
#'
Guolin Ke's avatar
Guolin Ke committed
799
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
800
#'
Guolin Ke's avatar
Guolin Ke committed
801
#' @examples
802
#' \donttest{
803
804
805
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
806
807
808
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
809
#' lgb.Dataset.construct(dtrain)
810
#' }
Guolin Ke's avatar
Guolin Ke committed
811
812
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        info = list(),
                        ...) {

  # Build the underlying R6 object. This entry point always passes NULL for
  # both `predictor` and `used_indices`; everything else is forwarded as given
  new_dataset <- Dataset$new(
    data = data
    , params = params
    , reference = reference
    , colnames = colnames
    , categorical_feature = categorical_feature
    , predictor = NULL
    , free_raw_data = free_raw_data
    , used_indices = NULL
    , info = info
    , ...
  )

  # The new object is returned invisibly
  return(invisible(new_dataset))

}

839
840
841
#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
Guolin Ke's avatar
Guolin Ke committed
842
843
#' @param dataset \code{lgb.Dataset} object, training data
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
Nikita Titov's avatar
Nikita Titov committed
844
#' @param info a list of information of the \code{lgb.Dataset} object
Guolin Ke's avatar
Guolin Ke committed
845
#' @param ... other information to pass to \code{info}.
James Lamb's avatar
James Lamb committed
846
#'
Guolin Ke's avatar
Guolin Ke committed
847
#' @return constructed dataset
James Lamb's avatar
James Lamb committed
848
#'
Guolin Ke's avatar
Guolin Ke committed
849
#' @examples
850
#' \donttest{
851
852
853
854
855
856
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
857
#' }
Guolin Ke's avatar
Guolin Ke committed
858
#' @export
859
lgb.Dataset.create.valid <- function(dataset, data, info = list(), ...) {

  # The training data must be an lgb.Dataset, since the validation set is
  # created through its create_valid() method
  dataset_is_valid <- lgb.is.Dataset(x = dataset)
  if (!dataset_is_valid) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # Delegate to the training Dataset and return the result invisibly
  valid_dataset <- dataset$create_valid(data = data, info = info, ...)
  return(invisible(valid_dataset))

}
Guolin Ke's avatar
Guolin Ke committed
870

871
872
873
#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
Guolin Ke's avatar
Guolin Ke committed
874
#' @param dataset Object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
875
#'
Guolin Ke's avatar
Guolin Ke committed
876
#' @examples
877
#' \donttest{
878
879
880
881
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
882
#' }
883
#' @return constructed dataset
Guolin Ke's avatar
Guolin Ke committed
884
885
#' @export
lgb.Dataset.construct <- function(dataset) {

  # Reject anything that is not an lgb.Dataset
  dataset_is_valid <- lgb.is.Dataset(x = dataset)
  if (!dataset_is_valid) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }

  # Delegate to the object's own construct() method; the result is
  # returned invisibly
  constructed <- dataset$construct()
  return(invisible(constructed))

}

897
898
#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
Guolin Ke's avatar
Guolin Ke committed
899
900
#' @param x Object of class \code{lgb.Dataset}
#' @param ... other parameters
James Lamb's avatar
James Lamb committed
901
#'
Guolin Ke's avatar
Guolin Ke committed
902
#' @return a vector of numbers of rows and of columns
James Lamb's avatar
James Lamb committed
903
#'
Guolin Ke's avatar
Guolin Ke committed
904
905
906
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
James Lamb's avatar
James Lamb committed
907
#'
Guolin Ke's avatar
Guolin Ke committed
908
#' @examples
909
#' \donttest{
910
911
912
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
913
#'
914
915
916
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
917
#' }
Guolin Ke's avatar
Guolin Ke committed
918
919
920
#' @rdname dim
#' @export
dim.lgb.Dataset <- function(x, ...) {

  # Guard against direct calls with something that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = x)
  if (!is_dataset) {
    stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # The dimensions come straight from the object's dim() method
  return(x$dim())

}

931
932
933
#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}, thus setting of
#'              row names would have no effect and returned row names would be NULL.
Guolin Ke's avatar
Guolin Ke committed
934
935
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one is ignored
936
#'              and the second one is column names
Guolin Ke's avatar
Guolin Ke committed
937
938
939
940
941
942
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
943
#' \donttest{
944
945
946
947
948
949
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
950
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
951
#' print(dtrain, verbose = TRUE)
952
#' }
Guolin Ke's avatar
Guolin Ke committed
953
#' @rdname dimnames.lgb.Dataset
954
#' @return A list with the dimension names of the dataset
Guolin Ke's avatar
Guolin Ke committed
955
956
#' @export
dimnames.lgb.Dataset <- function(x) {

  # Guard against direct calls with something that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = x)
  if (!is_dataset) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # Row names are always NULL; only the column names come from the Dataset
  column_names <- x$get_colnames()
  return(list(NULL, column_names))

}

#' @rdname dimnames.lgb.Dataset
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # `value` has to be a plain list holding exactly (row names, column names)
  is_two_element_list <- identical(class(value), "list") && length(value) == 2L
  if (!is_two_element_list) {
    stop("invalid ", sQuote("value"), " given: must be a list of two elements")
  }

  # Row names cannot be set on an lgb.Dataset
  row_names <- value[[1L]]
  if (!is.null(row_names)) {
    stop("lgb.Dataset does not have rownames")
  }

  # NULL column names are passed straight through, skipping the length check
  new_colnames <- value[[2L]]
  if (is.null(new_colnames)) {
    x$set_colnames(colnames = NULL)
    return(x)
  }

  # The number of names must match the number of columns of the Dataset
  n_new_colnames <- length(new_colnames)
  if (ncol(x) != n_new_colnames) {
    stop(
      "can't assign "
      , sQuote(n_new_colnames)
      , " colnames to an lgb.Dataset with "
      , sQuote(ncol(x))
      , " columns"
    )
  }

  # Store the names on the Dataset and return the object, as replacement
  # functions must
  x$set_colnames(colnames = new_colnames)
  return(x)

}

1006
1007
1008
#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#'              original \code{lgb.Dataset} object
Nikita Titov's avatar
Nikita Titov committed
1009
#' @param dataset Object of class \code{lgb.Dataset}
1010
#' @param idxset an integer vector of indices of rows needed
Guolin Ke's avatar
Guolin Ke committed
1011
1012
#' @param ... other parameters (currently not used)
#' @return constructed sub dataset
James Lamb's avatar
James Lamb committed
1013
#'
Guolin Ke's avatar
Guolin Ke committed
1014
#' @examples
1015
#' \donttest{
1016
1017
1018
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
James Lamb's avatar
James Lamb committed
1019
#'
1020
#' dsub <- lightgbm::slice(dtrain, seq_len(42L))
1021
#' lgb.Dataset.construct(dsub)
1022
#' labels <- lightgbm::getinfo(dsub, "label")
1023
#' }
Guolin Ke's avatar
Guolin Ke committed
1024
#' @export
1025
1026
1027
slice <- function(dataset, ...) {
  # S3 generic: the method is chosen from the class of `dataset`
  UseMethod("slice")
}
Guolin Ke's avatar
Guolin Ke committed
1028
1029
1030
1031

#' @rdname slice
#' @export
slice.lgb.Dataset <- function(dataset, idxset, ...) {

  # Guard against direct calls with something that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the object's slice() method and return the result invisibly
  sliced_dataset <- dataset$slice(idxset = idxset, ...)
  return(invisible(sliced_dataset))

}

1043
1044
1045
#' @name getinfo
#' @title Get information of an \code{lgb.Dataset} object
#' @description Get one attribute of a \code{lgb.Dataset}
Guolin Ke's avatar
Guolin Ke committed
1046
1047
1048
1049
#' @param dataset Object of class \code{lgb.Dataset}
#' @param name the name of the information field to get (see details)
#' @param ... other parameters
#' @return info data
James Lamb's avatar
James Lamb committed
1050
#'
Guolin Ke's avatar
Guolin Ke committed
1051
1052
#' @details
#' The \code{name} field can be one of the following:
James Lamb's avatar
James Lamb committed
1053
#'
Guolin Ke's avatar
Guolin Ke committed
1054
1055
1056
#' \itemize{
#'     \item \code{label}: label lightgbm learns from ;
#'     \item \code{weight}: to do a weight rescale ;
1057
1058
1059
1060
1061
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
Nikita Titov's avatar
Nikita Titov committed
1062
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
Guolin Ke's avatar
Guolin Ke committed
1063
#' }
James Lamb's avatar
James Lamb committed
1064
#'
Guolin Ke's avatar
Guolin Ke committed
1065
#' @examples
1066
#' \donttest{
1067
1068
1069
1070
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
James Lamb's avatar
James Lamb committed
1071
#'
1072
1073
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
James Lamb's avatar
James Lamb committed
1074
#'
1075
1076
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
1077
#' }
Guolin Ke's avatar
Guolin Ke committed
1078
#' @export
1079
1080
1081
getinfo <- function(dataset, ...) {
  # S3 generic: the method is chosen from the class of `dataset`
  UseMethod("getinfo")
}
Guolin Ke's avatar
Guolin Ke committed
1082
1083
1084
1085

#' @rdname getinfo
#' @export
getinfo.lgb.Dataset <- function(dataset, name, ...) {

  # Guard against direct calls with something that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("getinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Look the field up through the object's getinfo() method; arguments in
  # `...` are not forwarded
  return(dataset$getinfo(name = name))

}

1096
1097
1098
#' @name setinfo
#' @title Set information of an \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
Nikita Titov's avatar
Nikita Titov committed
1099
#' @param dataset Object of class \code{lgb.Dataset}
Guolin Ke's avatar
Guolin Ke committed
1100
1101
1102
#' @param name the name of the field to set
#' @param info the specific field of information to set
#' @param ... other parameters
1103
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1104
#'
Guolin Ke's avatar
Guolin Ke committed
1105
1106
#' @details
#' The \code{name} field can be one of the following:
James Lamb's avatar
James Lamb committed
1107
#'
Guolin Ke's avatar
Guolin Ke committed
1108
#' \itemize{
1109
1110
1111
1112
1113
#'     \item{\code{label}: vector of labels to use as the target variable}
#'     \item{\code{weight}: to do a weight rescale}
#'     \item{\code{init_score}: initial score is the base prediction lightgbm will boost from}
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
1114
1115
1116
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
Guolin Ke's avatar
Guolin Ke committed
1117
#' }
James Lamb's avatar
James Lamb committed
1118
#'
Guolin Ke's avatar
Guolin Ke committed
1119
#' @examples
1120
#' \donttest{
1121
1122
1123
1124
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
James Lamb's avatar
James Lamb committed
1125
#'
1126
1127
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
James Lamb's avatar
James Lamb committed
1128
#'
1129
1130
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
1131
#' }
Guolin Ke's avatar
Guolin Ke committed
1132
#' @export
1133
1134
1135
setinfo <- function(dataset, ...) {
  # S3 generic: the method is chosen from the class of `dataset`
  UseMethod("setinfo")
}
Guolin Ke's avatar
Guolin Ke committed
1136
1137
1138
1139

#' @rdname setinfo
#' @export
setinfo.lgb.Dataset <- function(dataset, name, info, ...) {

  # Guard against direct calls with something that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("setinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Store the field through the object's setinfo() method and return its
  # result invisibly; arguments in `...` are not forwarded
  updated <- dataset$setinfo(name = name, info = info)
  return(invisible(updated))
}

1149
1150
1151
1152
#' @name lgb.Dataset.set.categorical
#' @title Set categorical feature of \code{lgb.Dataset}
#' @description Set the categorical features of an \code{lgb.Dataset} object. Use this function
#'              to tell LightGBM which features should be treated as categorical.
1153
#' @param dataset object of class \code{lgb.Dataset}
1154
1155
1156
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
1157
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1158
#'
1159
#' @examples
1160
#' \donttest{
1161
1162
1163
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1164
1165
1166
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
1167
#' lgb.Dataset.set.categorical(dtrain, 1L:2L)
1168
#' }
1169
1170
1171
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  # Reject anything that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the object's set_categorical_feature() method and return
  # its result invisibly
  updated <- dataset$set_categorical_feature(
    categorical_feature = categorical_feature
  )
  return(invisible(updated))

}

1182
1183
1184
#' @name lgb.Dataset.set.reference
#' @title Set reference of \code{lgb.Dataset}
#' @description If you want to use validation data, you should set reference to training data
Guolin Ke's avatar
Guolin Ke committed
1185
1186
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
James Lamb's avatar
James Lamb committed
1187
#'
1188
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1189
#'
Guolin Ke's avatar
Guolin Ke committed
1190
#' @examples
1191
#' \donttest{
1192
1193
1194
1195
1196
1197
1198
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset(test$data, label = test$label)
#' lgb.Dataset.set.reference(dtest, dtrain)
1199
#' }
Guolin Ke's avatar
Guolin Ke committed
1200
1201
1202
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # Reject anything that is not an lgb.Dataset
  is_dataset <- lgb.is.Dataset(x = dataset)
  if (!is_dataset) {
    stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the object's set_reference() method and return its result
  # invisibly
  updated <- dataset$set_reference(reference = reference)
  return(invisible(updated))
}

1213
1214
1215
1216
#' @name lgb.Dataset.save
#' @title Save \code{lgb.Dataset} to a binary file
#' @description Please note that \code{init_score} is not saved in binary file.
#'              If you need it, please set it again after loading Dataset.
Guolin Ke's avatar
Guolin Ke committed
1217
1218
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname object filename of output file
James Lamb's avatar
James Lamb committed
1219
#'
1220
#' @return the dataset you passed in
James Lamb's avatar
James Lamb committed
1221
#'
Guolin Ke's avatar
Guolin Ke committed
1222
#' @examples
1223
#' \donttest{
1224
1225
1226
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
1227
#' lgb.Dataset.save(dtrain, tempfile(fileext = ".bin"))
1228
#' }
Guolin Ke's avatar
Guolin Ke committed
1229
1230
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # Only an lgb.Dataset can be written out
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  # The output location must be given as a character value.
  # NOTE(review): the message also mentions file connections, but a connection
  # does not pass is.character(); the wording is kept for compatibility
  if (!is.character(fname)) {
    stop("lgb.Dataset.save: fname should be a character or a file connection")
  }

  # Delegate to the object's save_binary() method and return its result
  # invisibly
  return(invisible(dataset$save_binary(fname = fname)))

}