lgb.Dataset.R 32.8 KB
Newer Older
James Lamb's avatar
James Lamb committed
1
#' @importFrom methods is
James Lamb's avatar
James Lamb committed
2
3
4
#' @importFrom R6 R6Class
Dataset <- R6::R6Class(

5
  classname = "lgb.Dataset",
6
  cloneable = FALSE,
Guolin Ke's avatar
Guolin Ke committed
7
  public = list(
James Lamb's avatar
James Lamb committed
8

9
    # Release the native Dataset handle, if one is currently held.
    # The stored handle is cleared afterwards so it cannot be freed twice.
    finalize = function() {

      # Nothing to release when no native handle is held
      if (lgb.is.null.handle(private$handle)) {
        return(invisible(NULL))
      }

      # Free the native Dataset, then forget the now-stale handle
      lgb.call(fun_name = "LGBM_DatasetFree_R", ret = NULL, private$handle)
      private$handle <- NULL
      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
22

23
    # Record the raw data and all options needed to build the Dataset later.
    # No native object is created here; construct() does that.
    #
    #   data                 raw input, stored in private$raw_data (matrices are coerced to "double")
    #   params               list of parameters
    #   reference            NULL or an lgb.Dataset
    #   colnames             feature names
    #   categorical_feature  categorical features
    #   predictor            NULL or an lgb.Predictor
    #   free_raw_data        stored flag; construct() drops the raw data when it is TRUE
    #   used_indices         row indices, stored sorted in increasing order
    #   info                 list of info fields
    #   ...                  "label", "weight", "init_score" and "group" are merged into info,
    #                        every other named argument is merged into params
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          info = list(),
                          ...) {

      # Reject a bad reference / predictor before doing any other work
      reference_is_invalid <- !is.null(reference) &&
        !lgb.check.r6.class(object = reference, name = "lgb.Dataset")
      if (reference_is_invalid) {
          stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset"))
      }
      predictor_is_invalid <- !is.null(predictor) &&
        !lgb.check.r6.class(object = predictor, name = "lgb.Predictor")
      if (predictor_is_invalid) {
          stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
      }

      # Route every extra named argument to either info or params
      extra_args <- list(...)
      info_fields <- c("label", "weight", "init_score", "group")
      for (arg_name in names(extra_args)) {
        arg_value <- extra_args[[arg_name]]
        if (arg_name %in% info_fields) {
          info[[arg_name]] <- arg_value
        } else {
          params[[arg_name]] <- arg_value
        }
      }

      # Dense matrices are always stored with "double" storage mode
      if (is.matrix(data) && storage.mode(data) != "double") {
        storage.mode(data) <- "double"
      }

      # Setup private attributes
      private$raw_data <- data
      private$params <- params
      private$reference <- reference
      private$colnames <- colnames
      private$categorical_feature <- categorical_feature
      private$predictor <- predictor
      private$free_raw_data <- free_raw_data
      private$used_indices <- sort(used_indices, decreasing = FALSE)
      private$info <- info
      private$version <- 0L

    },
James Lamb's avatar
James Lamb committed
89

90
91
92
    # Create a new Dataset for `data` that uses this Dataset as its reference and
    # inherits its params, column names, categorical features, predictor and
    # free_raw_data setting. `info` and `...` are forwarded to the new Dataset.
    # The new Dataset is returned invisibly.
    create_valid = function(data,
                            info = list(),
                            ...) {

      valid_set <- Dataset$new(
        data = data
        , params = private$params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = NULL
        , info = info
        , ...
      )

      return(invisible(valid_set))

    },
James Lamb's avatar
James Lamb committed
111

112
    # Build the native Dataset from the stored raw data (file path, dense matrix or
    # dgCMatrix), or as a row subset of the reference when used_indices is set.
    # Afterwards: pushes feature names and info fields to the native object, optionally
    # computes init_score from the predictor, and drops the raw data when free_raw_data
    # is TRUE. Does nothing when a handle already exists. Returns self invisibly.
    construct = function() {

      # Already constructed: keep the existing handle
      if (!lgb.is.null.handle(private$handle)) {
        return(invisible(self))
      }

      # Dense matrices and dgCMatrix objects carry their own column names and column count
      data_in_memory <- is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")

      # Fall back to the data's own column names when none were supplied
      if (is.null(private$colnames) && data_in_memory) {
        cnames <- colnames(private$raw_data)
        if (!is.null(cnames)) {
          private$colnames <- as.character(cnames)
        }
      }

      # Get categorical feature index
      if (!is.null(private$categorical_feature)) {

        if (is.character(private$categorical_feature)) {

          # Features given by name: translate to positions in the column names
          matched <- match(private$categorical_feature, private$colnames)
          unknown <- is.na(matched)
          if (any(unknown)) {
            stop(
              "lgb.self.get.handle: supplied an unknown feature in categorical_feature: "
              , paste0(sQuote(private$categorical_feature[unknown]), collapse = ", ")
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(matched - 1L)

        } else {

          # Features given by position: compare against the real number of columns.
          # The column names may be absent (e.g. an unnamed matrix), so prefer the data
          # itself; when the width is unknown here (file input without names) the bound
          # check is skipped.
          num_features <- NULL
          if (data_in_memory) {
            num_features <- ncol(private$raw_data)
          } else if (!is.null(private$colnames)) {
            num_features <- length(private$colnames)
          }

          if (!is.null(num_features) && max(private$categorical_feature) > num_features) {
            stop(
              "lgb.self.get.handle: supplied a too large value in categorical_feature: "
              , max(private$categorical_feature)
              , " but only "
              , num_features
              , " features"
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1L)

        }

        # Store indices for categorical features
        private$params$categorical_feature <- cate_indices

      }

      # Generate parameter str
      params_str <- lgb.params2str(params = private$params)

      # Get handle of reference dataset
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }
      handle <- lgb.null.handle()

      # Not subsetting
      if (is.null(private$used_indices)) {

        if (is.character(private$raw_data)) {

          # Raw data is a path to a data file
          handle <- lgb.call(
            fun_name = "LGBM_DatasetCreateFromFile_R"
            , ret = handle
            , lgb.c_str(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (is.matrix(private$raw_data)) {

          # Raw data is a dense matrix
          handle <- lgb.call(
            fun_name = "LGBM_DatasetCreateFromMat_R"
            , ret = handle
            , private$raw_data
            , nrow(private$raw_data)
            , ncol(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (methods::is(private$raw_data, "dgCMatrix")) {

          if (length(private$raw_data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }

          # Raw data is a dgCMatrix (sparse, column compressed)
          handle <- lgb.call(
            fun_name = "LGBM_DatasetCreateFromCSC_R"
            , ret = handle
            , private$raw_data@p
            , private$raw_data@i
            , private$raw_data@x
            , length(private$raw_data@p)
            , length(private$raw_data@x)
            , nrow(private$raw_data)
            , params_str
            , ref_handle
          )

        } else {

          # Unknown data type
          stop(
            "lgb.Dataset.construct: does not support constructing from "
            , paste0(sQuote(class(private$raw_data)), collapse = ", ")
          )

        }

      } else {

        # A subset can only be taken from a reference Dataset
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        # Construct subset
        handle <- lgb.call(
          fun_name = "LGBM_DatasetGetSubset_R"
          , ret = handle
          , ref_handle
          , c(private$used_indices) # Adding c() fixes issue in R v3.5
          , length(private$used_indices)
          , params_str
        )

      }
      if (lgb.is.null.handle(handle)) {
        stop("lgb.Dataset.construct: cannot create Dataset handle")
      }

      # Setup class and private type
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Set feature names
      if (!is.null(private$colnames)) {
        self$set_colnames(private$colnames)
      }

      # Load init score if requested (only when not subsetting)
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        # Setup initial scores
        init_score <- private$predictor$predict(
          data = private$raw_data
          , rawscore = TRUE
          , reshape = TRUE
        )

        # Flattened without transposing (the original note says the scores are column-major)
        init_score <- as.vector(init_score)
        private$info$init_score <- init_score

      }

      # Should we free raw data?
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Push every stored info field through setinfo()
      if (length(private$info) > 0L) {

        for (i in seq_along(private$info)) {

          p <- private$info[i]
          self$setinfo(name = names(p), info = p[[1L]])

        }

      }

      # A Dataset without a label is rejected
      if (is.null(self$getinfo("label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
317

318
    # Return the dimensions of the Dataset as c(rows, columns).
    # Uses the native object when constructed, otherwise the in-memory raw data;
    # raises an error when neither is available.
    dim = function() {

      # Constructed Dataset: query the native library for both counts
      if (!lgb.is.null.handle(private$handle)) {

        row_count <- 0L
        col_count <- 0L

        row_count <- lgb.call(
          fun_name = "LGBM_DatasetGetNumData_R"
          , ret = row_count
          , private$handle
        )
        col_count <- lgb.call(
          fun_name = "LGBM_DatasetGetNumFeature_R"
          , ret = col_count
          , private$handle
        )
        return(c(row_count, col_count))

      }

      # Not constructed, but a dense matrix or dgCMatrix is still held
      # NOTE: the dgCMatrix case requires the Matrix package
      has_matrix_data <- is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")
      if (has_matrix_data) {
        return(dim(private$raw_data))
      }

      # Trying to work with unknown dimensions is not possible
      stop(
        "dim: cannot get dimensions before dataset has been constructed, "
        , "please call lgb.Dataset.construct explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
358

359
    # Return the feature names of the Dataset.
    # Uses the native object when constructed (and refreshes the cached names),
    # otherwise the column names of the in-memory raw data; raises an error when
    # neither is available.
    get_colnames = function() {

      # Check for handle
      if (!lgb.is.null.handle(private$handle)) {

        # Fetch the names from the native library as a single string
        cnames <- lgb.call.return.str(
            fun_name = "LGBM_DatasetGetFeatureNames_R"
            , private$handle
        )

        # Split the tab-separated string and cache the result
        private$colnames <- as.character(base::strsplit(cnames, "\t")[[1L]])
        private$colnames

      } else if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {

        # Dense matrix or dgCMatrix (sparse matrix column compressed)
        colnames(private$raw_data)

      } else {

        # Column names are unknown until the Dataset has been constructed
        stop(
          "get_colnames: cannot get column names before dataset has been constructed, please call "
          , "lgb.Dataset.construct explicitly"
        )

      }

    },
James Lamb's avatar
James Lamb committed
389

390
    # Store new feature names and, when the Dataset is constructed, push them to the
    # native object. NULL or empty input leaves everything unchanged.
    # Returns self invisibly.
    set_colnames = function(colnames) {

      # as.character(NULL) is character(0), so this covers both NULL and empty input
      feature_names <- as.character(colnames)
      if (length(feature_names) == 0L) {
        return(invisible(self))
      }

      # Write column names
      private$colnames <- feature_names

      if (!lgb.is.null.handle(private$handle)) {

        # The native side receives all names as one tab-separated string
        joined_names <- paste0(private$colnames, collapse = "\t")
        lgb.call(
          fun_name = "LGBM_DatasetSetFeatureNames_R"
          , ret = NULL
          , private$handle
          , lgb.c_str(joined_names)
        )

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
422

423
    # Return one info field ("label", "weight", "init_score" or "group").
    # A value already held in private$info is returned directly; otherwise the field is
    # read from the native object (which must exist) and cached. Returns NULL when the
    # native field is empty.
    getinfo = function(name) {

      # Only a single, known field name is accepted
      valid_names <- c("label", "weight", "init_score", "group")
      name_is_valid <- is.character(name) && length(name) == 1L && name %in% valid_names
      if (!name_is_valid) {
        stop("getinfo: name must one of the following: ", paste0(sQuote(valid_names), collapse = ", "))
      }

      # Cached value available: no need to touch the native object
      cached_value <- private$info[[name]]
      if (!is.null(cached_value)) {
        return(cached_value)
      }

      if (lgb.is.null.handle(private$handle)) {
        stop("Cannot perform getinfo before constructing Dataset.")
      }

      # Get field size of info
      info_len <- 0L
      info_len <- lgb.call(
        fun_name = "LGBM_DatasetGetFieldSize_R"
        , ret = info_len
        , private$handle
        , lgb.c_str(name)
      )

      # Only fetch (and cache) a non-empty field
      if (info_len > 0L) {

        # "group" is read into an integer buffer, every other field into a numeric one
        if (name == "group") {
          field_buffer <- integer(info_len)
        } else {
          field_buffer <- numeric(info_len)
        }

        private$info[[name]] <- lgb.call(
          fun_name = "LGBM_DatasetGetField_R"
          , ret = field_buffer
          , private$handle
          , lgb.c_str(name)
        )

      }

      private$info[[name]]

    },
James Lamb's avatar
James Lamb committed
476

477
    # Store one info field ("label", "weight", "init_score" or "group").
    # The value is coerced (integer for "group", numeric otherwise) and kept in
    # private$info; when the Dataset is constructed and the value is non-empty it is
    # also pushed to the native object and the version counter is incremented.
    # Returns self invisibly.
    setinfo = function(name, info) {

      # Only a single, known field name is accepted
      valid_names <- c("label", "weight", "init_score", "group")
      name_is_valid <- is.character(name) && length(name) == 1L && name %in% valid_names
      if (!name_is_valid) {
        stop("setinfo: name must one of the following: ", paste0(sQuote(valid_names), collapse = ", "))
      }

      # Coerce to the storage type used for this field
      if (name == "group") {
        info <- as.integer(info)
      } else {
        info <- as.numeric(info)
      }

      # Store information privately
      private$info[[name]] <- info

      # After coercion info is never NULL, so a length check is sufficient
      is_constructed <- !lgb.is.null.handle(private$handle)
      if (is_constructed && length(info) > 0L) {

        lgb.call(
          fun_name = "LGBM_DatasetSetField_R"
          , ret = NULL
          , private$handle
          , lgb.c_str(name)
          , info
          , length(info)
        )

        private$version <- private$version + 1L

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
520

521
    # Create a new Dataset describing the rows `idxset` of this Dataset.
    # The new Dataset holds no raw data and no info; it references this Dataset,
    # inherits its settings and stores the sorted indices. `...` is forwarded to the
    # new Dataset.
    slice = function(idxset, ...) {

      subset_dataset <- Dataset$new(
        data = NULL
        , params = private$params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = sort(idxset, decreasing = FALSE)
        , info = NULL
        , ...
      )

      return(subset_dataset)

    },
James Lamb's avatar
James Lamb committed
539

540
    # Merge `params` into the stored parameters.
    #   * Not constructed yet: the parameters are simply merged.
    #   * Constructed and the native check accepts the change: the parameters are merged
    #     and the native object is kept.
    #   * Constructed and the native check rejects the change: lgb.last_error() is called
    #     when the raw data is gone; otherwise the parameters are merged and the native
    #     object is released via finalize().
    # Returns self invisibly.
    update_params = function(params) {

      # Nothing to update
      if (length(params) == 0L) {
        return(invisible(self))
      }

      needs_rebuild <- FALSE

      if (!lgb.is.null.handle(private$handle)) {

        # Ask the native library whether the new parameters are compatible with the
        # parameters the Dataset was built with; a non-zero state signals a conflict
        call_state <- 0L
        call_state <- .Call(
          "LGBM_DatasetUpdateParamChecking_R"
          , lgb.params2str(params = private$params)
          , lgb.params2str(params = params)
          , call_state
          , PACKAGE = "lib_lightgbm"
        )
        needs_rebuild <- as.integer(call_state) != 0L

        # raise error if raw data is freed
        if (needs_rebuild && is.null(private$raw_data)) {
          lgb.last_error()
        }

      }

      # Record the new parameters on every non-error path, including the one where the
      # native check passed, so that they are not silently dropped
      private$params <- modifyList(private$params, params)

      # Conflicting parameters: release the native object
      if (needs_rebuild) {
        self$finalize()
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
572

573
574
575
576
577
578
579
580
581
582
583
    # Return the stored parameters whose names appear among the values
    # returned by .DATASET_PARAMETERS(), in their stored order.
    get_params = function() {
      known_keys <- unname(unlist(.DATASET_PARAMETERS()))
      selected_keys <- intersect(names(private$params), known_keys)
      ret <- list()
      for (param_key in selected_keys) {
        ret[[param_key]] <- private$params[[param_key]]
      }
      return(ret)
    },

584
    # Replace the categorical feature specification.
    # Does nothing when the new value is identical to the stored one. Otherwise the raw
    # data must still be held; the value is stored and the native object is released
    # via finalize(). Returns self invisibly.
    set_categorical_feature = function(categorical_feature) {

      if (!identical(private$categorical_feature, categorical_feature)) {

        # Refuse the change once the raw data has been dropped
        if (is.null(private$raw_data)) {
          stop("set_categorical_feature: cannot set categorical feature after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
        }

        # Overwrite categorical features and release the native object
        private$categorical_feature <- categorical_feature
        self$finalize()

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
606

607
    # Set the reference Dataset.
    # `reference` must be NULL or an lgb.Dataset; this is checked before any state is
    # touched. For a non-NULL reference its categorical features, column names and
    # predictor are copied onto this Dataset. When the reference actually changes, the
    # raw data must still be held; the reference is stored and the native object is
    # released via finalize(). Returns self invisibly.
    set_reference = function(reference) {

      # Validate first so that invalid input fails with a clear message and without
      # side effects
      if (!is.null(reference) && !lgb.check.r6.class(object = reference, name = "lgb.Dataset")) {
        stop("set_reference: Can only use lgb.Dataset as a reference")
      }

      # Copy the settings of the reference; a NULL reference has none to copy
      if (!is.null(reference)) {
        self$set_categorical_feature(reference$.__enclos_env__$private$categorical_feature)
        self$set_colnames(reference$get_colnames())
        private$set_predictor(reference$.__enclos_env__$private$predictor)
      }

      # Check for identical references
      if (identical(private$reference, reference)) {
        return(invisible(self))
      }

      # Refuse the change once the raw data has been dropped
      if (is.null(private$raw_data)) {

        stop("set_reference: cannot set reference after freeing raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")

      }

      # Store reference
      private$reference <- reference

      # Finalize and return self
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
646

647
    # Save the Dataset to the binary file `fname`, constructing the Dataset first
    # when it has no native handle yet. Returns self invisibly.
    save_binary = function(fname) {

      # get_handle() constructs the Dataset when needed and returns its handle
      dataset_handle <- private$get_handle()

      lgb.call(
        fun_name = "LGBM_DatasetSaveBinary_R"
        , ret = NULL
        , dataset_handle
        , lgb.c_str(fname)
      )

      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
660

Guolin Ke's avatar
Guolin Ke committed
661
662
  ),
  private = list(
663
664
665
666
667
    # Native Dataset handle; set by construct(), cleared by finalize()
    handle = NULL,
    # Data passed to initialize(); construct() sets it to NULL when free_raw_data is TRUE
    raw_data = NULL,
    # List of parameters
    params = list(),
    # Reference Dataset (NULL or an lgb.Dataset)
    reference = NULL,
    # Feature names
    colnames = NULL,
    # Categorical features as supplied by the caller
    categorical_feature = NULL,
    # NULL or an lgb.Predictor; used by construct() to compute init_score
    predictor = NULL,
    # Whether construct() drops raw_data
    free_raw_data = TRUE,
    # Sorted row indices of a subset; NULL when not subsetting
    used_indices = NULL,
    # List of info fields
    info = NULL,
    # Incremented by setinfo() each time a field is pushed to the native object
    version = 0L,
James Lamb's avatar
James Lamb committed
674

675
676
    # Return the native Dataset handle, constructing the Dataset first when
    # no handle exists yet.
    get_handle = function() {

      handle_is_missing <- lgb.is.null.handle(private$handle)
      if (handle_is_missing) {
        self$construct()
      }

      return(private$handle)

    },
James Lamb's avatar
James Lamb committed
685

686
    # Replace the predictor.
    # Does nothing when the new value is identical to the stored one. Otherwise the raw
    # data must still be held and `predictor` must be NULL or an lgb.Predictor; the
    # value is stored and the native object is released via finalize().
    # Returns self invisibly.
    set_predictor = function(predictor) {

      if (!identical(private$predictor, predictor)) {

        # Refuse the change once the raw data has been dropped
        if (is.null(private$raw_data)) {
          stop("set_predictor: cannot set predictor after free raw data,
          please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset")
        }

        # Anything other than NULL must be an lgb.Predictor
        predictor_is_invalid <- !is.null(predictor) &&
          !lgb.check.r6.class(object = predictor, name = "lgb.Predictor")
        if (predictor_is_invalid) {
          stop("set_predictor: Can only use lgb.Predictor as predictor")
        }

        # Store predictor and release the native object
        private$predictor <- predictor
        self$finalize()

      }

      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
717

Guolin Ke's avatar
Guolin Ke committed
718
719
720
  )
)

721
722
723
#' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#'              or local file (that was created previously by saving an \code{lgb.Dataset}).
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
#' @param params a list of parameters
#' @param reference reference dataset
#' @param colnames names of columns
#' @param categorical_feature categorical features
#' @param free_raw_data TRUE for need to free raw data after construct
#' @param info a list of information of the \code{lgb.Dataset} object
#' @param ... other information to pass to \code{info} or parameters pass to \code{params}
#'
#' @return constructed dataset
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
#' lgb.Dataset.construct(dtrain)
#' }
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        info = list(),
                        ...) {

  # Create new dataset; predictor and used_indices are not exposed here
  # and are always passed as NULL
  invisible(Dataset$new(
    data = data
    , params = params
    , reference = reference
    , colnames = colnames
    , categorical_feature = categorical_feature
    , predictor = NULL
    , free_raw_data = free_raw_data
    , used_indices = NULL
    , info = info
    , ...
  ))

}

771
772
773
#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
#' @param dataset \code{lgb.Dataset} object, training data
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
#' @param info a list of information of the \code{lgb.Dataset} object
#' @param ... other information to pass to \code{info}.
#'
#' @return constructed dataset
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' }
#' @export
lgb.Dataset.create.valid <- function(dataset, data, info = list(), ...) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(dataset)) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # Create validation dataset by delegating to the training dataset
  invisible(dataset$create_valid(data = data, info = info, ...))

}
Guolin Ke's avatar
Guolin Ke committed
802

803
804
805
#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
#' @param dataset Object of class \code{lgb.Dataset}
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' }
#' @return constructed dataset
#' @export
lgb.Dataset.construct <- function(dataset) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(dataset)) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }

  # Construct the dataset
  invisible(dataset$construct())

}

829
830
#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
#' @param x Object of class \code{lgb.Dataset}
#' @param ... other parameters
#'
#' @return a vector of numbers of rows and of columns
#'
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
#' }
#' @rdname dim
#' @export
dim.lgb.Dataset <- function(x, ...) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(x)) {
    stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # Delegate to the object's own dim() method
  x$dim()

}

863
864
865
#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}, thus setting of
#'              row names would have no effect and returned row names would be NULL.
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one is ignored
#'              and the second one is column names
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
#' print(dtrain, verbose = TRUE)
#' }
#' @rdname dimnames.lgb.Dataset
#' @return A list with the dimension names of the dataset
#' @export
dimnames.lgb.Dataset <- function(x) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(x)) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # Return dimension names; the row-names slot is always NULL
  list(NULL, x$get_colnames())

}

#' @rdname dimnames.lgb.Dataset
#' @return For the assignment form, the object passed in as \code{x}
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # Check if invalid element list
  if (!identical(class(value), "list") || length(value) != 2L) {
    stop("invalid ", sQuote("value"), " given: must be a list of two elements")
  }

  # Check for unknown row names
  if (!is.null(value[[1L]])) {
    stop("lgb.Dataset does not have rownames")
  }

  # NULL column names are forwarded without any length check
  if (is.null(value[[2L]])) {

    x$set_colnames(NULL)
    return(x)

  }

  # Check for unmatching column size
  if (ncol(x) != length(value[[2L]])) {
    stop(
      "can't assign "
      , sQuote(length(value[[2L]]))
      , " colnames to an lgb.Dataset with "
      , sQuote(ncol(x))
      , " columns"
    )
  }

  # Set column names properly, and return
  x$set_colnames(value[[2L]])
  x

}

939
940
941
#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#'              original \code{lgb.Dataset} object
#' @param dataset Object of class \code{lgb.Dataset}
#' @param idxset an integer vector of indices of rows needed
#' @param ... other parameters (currently not used)
#' @return constructed sub dataset
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' dsub <- lightgbm::slice(dtrain, seq_len(42L))
#' lgb.Dataset.construct(dsub)
#' labels <- lightgbm::getinfo(dsub, "label")
#' }
#' @export
slice <- function(dataset, ...) {
  # S3 generic: dispatches on the class of `dataset`
  UseMethod("slice")
}
Guolin Ke's avatar
Guolin Ke committed
961
962

#' @rdname slice
#' @return constructed sub dataset
#' @export
slice.lgb.Dataset <- function(dataset, idxset, ...) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(dataset)) {
    stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Return sliced set
  invisible(dataset$slice(idxset = idxset, ...))

}

977
978
979
#' @name getinfo
#' @title Get information of an \code{lgb.Dataset} object
#' @description Get one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param name the name of the information field to get (see details)
#' @param ... other parameters
#' @return info data
#'
#' @details
#' The \code{name} field can be one of the following:
#'
#' \itemize{
#'     \item \code{label}: label lightgbm learns from;
#'     \item \code{weight}: to do a weight rescale;
#'     \item \code{group}: group size;
#'     \item \code{init_score}: initial score is the base prediction lightgbm will boost from.
#' }
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
#' }
#' @export
getinfo <- function(dataset, ...) {
  # S3 generic: dispatches on the class of `dataset`
  UseMethod("getinfo")
}
Guolin Ke's avatar
Guolin Ke committed
1012
1013

#' @rdname getinfo
#' @return info data
#' @export
getinfo.lgb.Dataset <- function(dataset, name, ...) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(dataset)) {
    stop("getinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Only `name` is forwarded; arguments in `...` are not used here
  dataset$getinfo(name)

}

1027
1028
1029
#' @name setinfo
#' @title Set information of an \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param name the name of the field to set
#' @param info the specific field of information to set
#' @param ... other parameters
#' @return the dataset you passed in
#'
#' @details
#' The \code{name} field can be one of the following:
#'
#' \itemize{
#'     \item{\code{label}: vector of labels to use as the target variable}
#'     \item{\code{weight}: to do a weight rescale}
#'     \item{\code{init_score}: initial score is the base prediction lightgbm will boost from}
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 1000-row dataset that contains 250 4-document query results,
#'         set this to \code{rep(4L, 250L)}}
#' }
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
#' }
#' @export
setinfo <- function(dataset, ...) {
  # S3 generic: dispatches on the class of `dataset`
  UseMethod("setinfo")
}
Guolin Ke's avatar
Guolin Ke committed
1066
1067

#' @rdname setinfo
#' @return the dataset you passed in
#' @export
setinfo.lgb.Dataset <- function(dataset, name, info, ...) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(dataset)) {
    stop("setinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Set information; only `name` and `info` are forwarded
  invisible(dataset$setinfo(name = name, info = info))

}

1080
1081
1082
1083
#' @name lgb.Dataset.set.categorical
#' @title Set categorical feature of \code{lgb.Dataset}
#' @description Set the categorical features of an \code{lgb.Dataset} object. Use this function
#'              to tell LightGBM which features should be treated as categorical.
#' @param dataset object of class \code{lgb.Dataset}
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @return the dataset you passed in
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
#' lgb.Dataset.set.categorical(dtrain, 1L:2L)
#' }
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(dataset)) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # Set categoricals
  invisible(dataset$set_categorical_feature(categorical_feature))

}

1113
1114
1115
#' @name lgb.Dataset.set.reference
#' @title Set reference of \code{lgb.Dataset}
#' @description If you want to use validation data, you should set reference to training data
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
#'
#' @return the dataset you passed in
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset(test$data, label = test$label)
#' lgb.Dataset.set.reference(dtest, dtrain)
#' }
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(dataset)) {
    stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")
  }

  # Set reference
  invisible(dataset$set_reference(reference))

}

1144
1145
1146
1147
#' @name lgb.Dataset.save
#' @title Save \code{lgb.Dataset} to a binary file
#' @description Please note that \code{init_score} is not saved in binary file.
#'              If you need it, please set it again after loading Dataset.
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname object filename of output file
#'
#' @return the dataset you passed in
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.save(dtrain, tempfile(fileext = ".bin"))
#' }
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # Check if dataset is not a dataset
  if (!lgb.is.Dataset(dataset)) {
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  # Only character file names pass this check; connections are rejected
  if (!is.character(fname)) {
    stop("lgb.Dataset.save: fname should be a character")
  }

  # Store binary
  invisible(dataset$save_binary(fname))
}