lgb.Dataset.R 33.8 KB
Newer Older
James Lamb's avatar
James Lamb committed
1
#' @importFrom methods is
James Lamb's avatar
James Lamb committed
2
3
4
#' @importFrom R6 R6Class
Dataset <- R6::R6Class(

5
  classname = "lgb.Dataset",
6
  cloneable = FALSE,
Guolin Ke's avatar
Guolin Ke committed
7
  public = list(
James Lamb's avatar
James Lamb committed
8

9
    # Release the native Dataset handle held by this object, if there is one
    finalize = function() {

      # No native handle: nothing to release
      if (lgb.is.null.handle(x = private$handle)) {
        return(invisible(NULL))
      }

      # Free the native object, then reset the field that pointed at it
      lgb.call(
        fun_name = "LGBM_DatasetFree_R"
        , ret = NULL
        , private$handle
      )
      private$handle <- NULL

      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
24

25
    # Record the constructor arguments on the object. No native Dataset is
    # created here; that happens in construct().
    #
    # Named arguments passed through `...` are routed by name: "label",
    # "weight", "init_score" and "group" are stored in `info`, everything
    # else is stored in `params`.
    initialize = function(data,
                          params = list(),
                          reference = NULL,
                          colnames = NULL,
                          categorical_feature = NULL,
                          predictor = NULL,
                          free_raw_data = TRUE,
                          used_indices = NULL,
                          info = list(),
                          ...) {

      # Reject arguments of the wrong class before doing any other work
      reference_ok <- is.null(reference) ||
        lgb.check.r6.class(object = reference, name = "lgb.Dataset")
      if (!reference_ok) {
          stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset"))
      }
      predictor_ok <- is.null(predictor) ||
        lgb.check.r6.class(object = predictor, name = "lgb.Predictor")
      if (!predictor_ok) {
          stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor"))
      }

      # Split the extra named arguments between `info` and `params`
      extra_args <- list(...)
      info_fields <- c("label", "weight", "init_score", "group")
      for (arg_name in names(extra_args)) {
        arg_value <- extra_args[[arg_name]]
        if (arg_name %in% info_fields) {
          info[[arg_name]] <- arg_value
        } else {
          params[[arg_name]] <- arg_value
        }
      }

      # Dense matrices are stored with "double" storage mode
      if (is.matrix(data) && storage.mode(data) != "double") {
        storage.mode(data) <- "double"
      }

      # Keep everything on the private side of the object
      private$raw_data <- data
      private$params <- params
      private$reference <- reference
      private$colnames <- colnames
      private$categorical_feature <- categorical_feature
      private$predictor <- predictor
      private$free_raw_data <- free_raw_data
      private$used_indices <- sort(used_indices, decreasing = FALSE)
      private$info <- info
      private$version <- 0L

      return(invisible(NULL))

    },
James Lamb's avatar
James Lamb committed
93

94
95
96
    # Create a new lgb.Dataset for `data` that uses this Dataset as its
    # reference and reuses its params, column names, categorical features,
    # predictor and free_raw_data setting. `info` and `...` are forwarded to
    # the new Dataset's constructor.
    create_valid = function(data,
                            info = list(),
                            ...) {

      validation_set <- Dataset$new(
        data = data
        , params = private$params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = NULL
        , info = info
        , ...
      )

      # Returned invisibly so the new object is not auto-printed
      return(invisible(validation_set))

    },
James Lamb's avatar
James Lamb committed
115

116
    # Build the native Dataset behind this object and store its handle.
    #
    # Does nothing when a handle already exists. Otherwise the Dataset is
    # created from a file path, a dense matrix or a dgCMatrix, or, when
    # `used_indices` is set, as a subset of the reference Dataset. Afterwards
    # feature names and the entries of `info` are pushed to the native object.
    #
    # Returns `self` invisibly. Raises an error for unsupported data types,
    # unknown or out-of-range categorical features, a missing reference when
    # subsetting, a null handle after creation, or a missing label.
    construct = function() {

      # Already constructed: nothing to do
      if (!lgb.is.null.handle(x = private$handle)) {
        return(invisible(self))
      }

      # Matrix-like raw data can supply both feature names and a feature count
      data_is_matrix_like <- is.matrix(private$raw_data) ||
        methods::is(private$raw_data, "dgCMatrix")

      # Fall back to the raw data's column names when none were supplied
      cnames <- NULL
      if (data_is_matrix_like) {
        cnames <- colnames(private$raw_data)
      }
      if (is.null(private$colnames) && !is.null(cnames)) {
        private$colnames <- as.character(cnames)
      }

      # Translate categorical_feature into 0-based indices
      if (!is.null(private$categorical_feature)) {

        if (is.character(private$categorical_feature)) {

          # Feature names are resolved against the known column names
          matched_positions <- match(private$categorical_feature, private$colnames)
          unknown_features <- private$categorical_feature[is.na(matched_positions)]
          if (length(unknown_features) > 0L) {
            stop(
              "lgb.Dataset.construct: supplied an unknown feature in categorical_feature: "
              , paste0(sQuote(unknown_features), collapse = ", ")
            )
          }
          cate_indices <- as.list(matched_positions - 1L)

        } else {

          # Positions are range-checked against the number of features when
          # that number is known: the column count of matrix-like raw data,
          # otherwise the number of column names. Data without column names
          # (e.g. an unnamed matrix) must not be treated as having 0 features.
          num_features <- NULL
          if (data_is_matrix_like) {
            num_features <- ncol(private$raw_data)
          } else if (!is.null(private$colnames)) {
            num_features <- length(private$colnames)
          }
          if (!is.null(num_features) && max(private$categorical_feature) > num_features) {
            stop(
              "lgb.Dataset.construct: supplied a too large value in categorical_feature: "
              , max(private$categorical_feature)
              , " but only "
              , num_features
              , " features"
            )
          }

          # Store indices as [0, n-1] indexed instead of [1, n] indexed
          cate_indices <- as.list(private$categorical_feature - 1L)

        }

        # Store indices for categorical features
        private$params$categorical_feature <- cate_indices

      }

      # Serialize the parameters for the native call
      params_str <- lgb.params2str(params = private$params)

      # Handle of the reference Dataset (get_handle() constructs it if needed)
      ref_handle <- NULL
      if (!is.null(private$reference)) {
        ref_handle <- private$reference$.__enclos_env__$private$get_handle()
      }
      handle <- lgb.null.handle()

      if (is.null(private$used_indices)) {

        # Not a subset: create the Dataset from the raw data
        if (is.character(private$raw_data)) {

          # Raw data is a file path
          handle <- lgb.call(
            fun_name = "LGBM_DatasetCreateFromFile_R"
            , ret = handle
            , lgb.c_str(x = private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (is.matrix(private$raw_data)) {

          # Raw data is a dense matrix
          handle <- lgb.call(
            fun_name = "LGBM_DatasetCreateFromMat_R"
            , ret = handle
            , private$raw_data
            , nrow(private$raw_data)
            , ncol(private$raw_data)
            , params_str
            , ref_handle
          )

        } else if (methods::is(private$raw_data, "dgCMatrix")) {

          # Raw data is a column-compressed sparse matrix; its column pointer
          # vector must not be longer than the largest 32-bit integer
          if (length(private$raw_data@p) > 2147483647L) {
            stop("Cannot support large CSC matrix")
          }
          handle <- lgb.call(
            fun_name = "LGBM_DatasetCreateFromCSC_R"
            , ret = handle
            , private$raw_data@p
            , private$raw_data@i
            , private$raw_data@x
            , length(private$raw_data@p)
            , length(private$raw_data@x)
            , nrow(private$raw_data)
            , params_str
            , ref_handle
          )

        } else {

          # Unknown data type
          stop(
            "lgb.Dataset.construct: does not support constructing from "
            , sQuote(class(private$raw_data))
          )

        }

      } else {

        # Subset: the rows are taken from the reference Dataset
        if (is.null(private$reference)) {
          stop("lgb.Dataset.construct: reference cannot be NULL for constructing data subset")
        }

        handle <- lgb.call(
          fun_name = "LGBM_DatasetGetSubset_R"
          , ret = handle
          , ref_handle
          , c(private$used_indices) # Adding c() fixes issue in R v3.5
          , length(private$used_indices)
          , params_str
        )

      }
      if (lgb.is.null.handle(x = handle)) {
        stop("lgb.Dataset.construct: cannot create Dataset handle")
      }

      # Tag the handle with its class and keep it
      class(handle) <- "lgb.Dataset.handle"
      private$handle <- handle

      # Push feature names to the native Dataset
      if (!is.null(private$colnames)) {
        self$set_colnames(colnames = private$colnames)
      }

      # Compute init_score from the predictor (not done for subsets)
      if (!is.null(private$predictor) && is.null(private$used_indices)) {

        init_score <- private$predictor$predict(
          data = private$raw_data
          , rawscore = TRUE
          , reshape = TRUE
        )

        # Flattened as-is; the original author notes no transpose is needed
        # because the result is column-major
        init_score <- as.vector(init_score)
        private$info$init_score <- init_score

      }

      # Drop the raw data if requested
      if (isTRUE(private$free_raw_data)) {
        private$raw_data <- NULL
      }

      # Push every stored info field to the native Dataset
      if (length(private$info) > 0L) {

        for (i in seq_along(private$info)) {

          p <- private$info[i]
          self$setinfo(name = names(p), info = p[[1L]])

        }

      }

      # A label is mandatory
      if (is.null(self$getinfo(name = "label"))) {
        stop("lgb.Dataset.construct: label should be set")
      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
321

322
    # Return c(number of rows, number of features) of the Dataset.
    #
    # A constructed Dataset is queried through its native handle. Before
    # construction the dimensions are taken from the raw data when it is a
    # matrix or dgCMatrix; any other case is an error.
    dim = function() {

      # Constructed: ask the native Dataset
      if (!lgb.is.null.handle(x = private$handle)) {

        num_row <- 0L
        num_col <- 0L

        row_count <- lgb.call(
          fun_name = "LGBM_DatasetGetNumData_R"
          , ret = num_row
          , private$handle
        )
        col_count <- lgb.call(
          fun_name = "LGBM_DatasetGetNumFeature_R"
          , ret = num_col
          , private$handle
        )
        return(c(row_count, col_count))

      }

      # Not constructed, but the raw data knows its own shape
      # NOTE: the dgCMatrix case requires the Matrix package
      if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {
        return(dim(private$raw_data))
      }

      # Nothing to derive the dimensions from
      stop(
        "dim: cannot get dimensions before dataset has been constructed, "
        , "please call lgb.Dataset.construct explicitly"
      )

    },
James Lamb's avatar
James Lamb committed
364

365
    # Return the feature (column) names of the Dataset.
    #
    # A constructed Dataset is queried through its native handle and the
    # result is cached in `private$colnames`. Before construction the names
    # are taken from the raw data when it is a matrix or dgCMatrix (which
    # may be NULL); any other case is an error.
    get_colnames = function() {

      # Constructed: ask the native Dataset
      if (!lgb.is.null.handle(x = private$handle)) {

        # The names come back as one tab-separated string
        cnames <- lgb.call.return.str(
            fun_name = "LGBM_DatasetGetFeatureNames_R"
            , private$handle
        )
        private$colnames <- as.character(base::strsplit(cnames, "\t")[[1L]])
        return(private$colnames)

      } else if (is.matrix(private$raw_data) || methods::is(private$raw_data, "dgCMatrix")) {

        # Not constructed, but the raw data carries its own column names
        return(colnames(private$raw_data))

      } else {

        # Nothing to derive the column names from
        stop(
          "get_colnames: cannot get column names before dataset has been constructed, please call "
          , "lgb.Dataset.construct explicitly"
        )

      }

    },
James Lamb's avatar
James Lamb committed
395

396
    # Set the feature (column) names. NULL or empty input leaves the object
    # untouched. When the Dataset is already constructed the names are also
    # pushed to the native object. Returns `self` invisibly.
    set_colnames = function(colnames) {

      # as.character(NULL) is character(0), so this single check covers both
      # the NULL case and the empty case
      new_names <- as.character(colnames)
      if (length(new_names) == 0L) {
        return(invisible(self))
      }

      private$colnames <- new_names

      # Without a native handle there is nothing more to update
      if (lgb.is.null.handle(x = private$handle)) {
        return(invisible(self))
      }

      # The native call takes all names as one tab-separated string
      merged_name <- paste0(private$colnames, collapse = "\t")
      lgb.call(
        fun_name = "LGBM_DatasetSetFeatureNames_R"
        , ret = NULL
        , private$handle
        , lgb.c_str(x = merged_name)
      )

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
428

429
    # Return one metadata field of the Dataset.
    #
    # `name` must be a single string, one of "label", "weight", "init_score"
    # or "group". A value already stored on the R side is returned directly;
    # otherwise the field is read from the native Dataset (which must have
    # been constructed) and cached. Returns NULL when the native field is
    # empty.
    getinfo = function(name) {

      # Validate the field name
      valid_names <- c("label", "weight", "init_score", "group")
      name_is_valid <- is.character(name) && length(name) == 1L && name %in% valid_names
      if (!name_is_valid) {
        stop("getinfo: name must one of the following: ", paste0(sQuote(valid_names), collapse = ", "))
      }

      # Serve from the R-side cache when possible
      if (!is.null(private$info[[name]])) {
        return(private$info[[name]])
      }

      if (lgb.is.null.handle(x = private$handle)) {
        stop("Cannot perform getinfo before constructing Dataset.")
      }

      # Ask the native Dataset how long the field is
      field_size <- 0L
      field_size <- lgb.call(
        fun_name = "LGBM_DatasetGetFieldSize_R"
        , ret = field_size
        , private$handle
        , lgb.c_str(x = name)
      )

      if (field_size > 0L) {

        # "group" is read into an integer vector, every other field into a
        # numeric one
        buffer <- if (name == "group") {
          integer(field_size)
        } else {
          numeric(field_size)
        }

        private$info[[name]] <- lgb.call(
          fun_name = "LGBM_DatasetGetField_R"
          , ret = buffer
          , private$handle
          , lgb.c_str(x = name)
        )

      }

      return(private$info[[name]])

    },
James Lamb's avatar
James Lamb committed
482

483
    # Store one metadata field of the Dataset.
    #
    # `name` must be a single string, one of "label", "weight", "init_score"
    # or "group". The value is coerced (integer for "group", numeric
    # otherwise) and kept on the R side; when the Dataset is already
    # constructed and the value is non-empty it is also pushed to the native
    # object and the version counter is incremented. Returns `self` invisibly.
    setinfo = function(name, info) {

      # Validate the field name
      valid_names <- c("label", "weight", "init_score", "group")
      name_is_valid <- is.character(name) && length(name) == 1L && name %in% valid_names
      if (!name_is_valid) {
        stop("setinfo: name must one of the following: ", paste0(sQuote(valid_names), collapse = ", "))
      }

      # Coerce to the type used for this field
      if (name == "group") {
        info <- as.integer(info)
      } else {
        info <- as.numeric(info)
      }

      # Always keep the value on the R side
      private$info[[name]] <- info

      # Only a constructed Dataset with a non-empty value is updated natively
      push_to_native <- !lgb.is.null.handle(x = private$handle) &&
        !is.null(info) &&
        length(info) > 0L
      if (push_to_native) {

        lgb.call(
          fun_name = "LGBM_DatasetSetField_R"
          , ret = NULL
          , private$handle
          , lgb.c_str(x = name)
          , info
          , length(info)
        )

        private$version <- private$version + 1L

      }

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
526

527
    # Create a new lgb.Dataset that is a subset of this one.
    #
    # The new object holds no raw data; it references this Dataset and the
    # sorted `idxset`, and reuses this Dataset's params, column names,
    # categorical features, predictor and free_raw_data setting. `...` is
    # forwarded to the new Dataset's constructor.
    slice = function(idxset, ...) {

      subset_dataset <- Dataset$new(
        data = NULL
        , params = private$params
        , reference = self
        , colnames = private$colnames
        , categorical_feature = private$categorical_feature
        , predictor = private$predictor
        , free_raw_data = private$free_raw_data
        , used_indices = sort(idxset, decreasing = FALSE)
        , info = NULL
        , ...
      )

      return(subset_dataset)

    },
James Lamb's avatar
James Lamb committed
547

548
    # Merge `params` into the Dataset's parameters. Returns `self` invisibly.
    #
    # Before construction the parameters are simply merged. After
    # construction the native library is asked to check the old and new
    # parameter strings; only on a nonzero status are the parameters merged
    # and the native handle released.
    update_params = function(params) {

      # Nothing to update
      if (length(params) == 0L) {
        return(invisible(self))
      }

      # Not constructed yet: just merge
      if (lgb.is.null.handle(x = private$handle)) {
        private$params <- modifyList(private$params, params)
        return(invisible(self))
      }

      check_status <- 0L
      check_status <- .Call(
        "LGBM_DatasetUpdateParamChecking_R"
        , lgb.params2str(params = private$params)
        , lgb.params2str(params = params)
        , check_status
        , PACKAGE = "lib_lightgbm"
      )
      check_status <- as.integer(check_status)

      # Status 0: nothing further is done here
      if (check_status == 0L) {
        return(invisible(self))
      }

      # NOTE(review): a nonzero status presumably means the new parameters
      # cannot be applied to the existing native Dataset -- confirm against
      # the C API. Without raw data the Dataset could not be constructed
      # again, so the last native error is surfaced instead.
      if (is.null(private$raw_data)) {
        lgb.last_error()
      }

      # Overwrite the parameters and release the native handle
      private$params <- modifyList(private$params, params)
      self$finalize()

      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
580

581
582
583
584
585
586
587
588
589
590
591
    # Return the subset of this Dataset's parameters whose names appear in
    # .DATASET_PARAMETERS(), as a list
    get_params = function() {
      known_keys <- unname(unlist(.DATASET_PARAMETERS()))
      selected <- list()
      # intersect() keeps the order in which the keys appear in private$params
      for (key in intersect(names(private$params), known_keys)) {
        selected[[key]] <- private$params[[key]]
      }
      return(selected)
    },

592
    # Replace the categorical feature specification. Returns `self` invisibly.
    #
    # Does nothing when the new value is identical to the current one.
    # Otherwise the raw data must still be available, because the native
    # handle is released afterwards.
    set_categorical_feature = function(categorical_feature) {

      # Identical input: nothing to change
      if (identical(private$categorical_feature, categorical_feature)) {
        return(invisible(self))
      }

      # The raw data must still be there
      if (is.null(private$raw_data)) {
        stop(
          "set_categorical_feature: cannot set categorical feature after freeing raw data, "
          , "please set "
          , sQuote("free_raw_data = FALSE")
          , " when you construct lgb.Dataset"
        )
      }

      # Overwrite categorical features
      private$categorical_feature <- categorical_feature

      # Release the native handle built with the old setting
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
614

615
    # Set the reference Dataset. Returns `self` invisibly.
    #
    # `reference` must be NULL or an lgb.Dataset. For a non-NULL reference
    # its categorical features, column names and predictor are copied onto
    # this Dataset first. If the reference differs from the current one, the
    # raw data must still be available; the new reference is stored and the
    # native handle is released.
    set_reference = function(reference) {

      # Validate before reading any field of `reference`
      if (!is.null(reference) && !lgb.check.r6.class(object = reference, name = "lgb.Dataset")) {
        stop("set_reference: Can only use lgb.Dataset as a reference")
      }

      # Copy settings from the reference (there is nothing to copy from NULL)
      if (!is.null(reference)) {
        self$set_categorical_feature(categorical_feature = reference$.__enclos_env__$private$categorical_feature)
        self$set_colnames(colnames = reference$get_colnames())
        private$set_predictor(predictor = reference$.__enclos_env__$private$predictor)
      }

      # Same reference as before: nothing else to do
      if (identical(private$reference, reference)) {
        return(invisible(self))
      }

      # The raw data must still be there
      if (is.null(private$raw_data)) {
        stop(
          "set_reference: cannot set reference after freeing raw data, "
          , "please set "
          , sQuote("free_raw_data = FALSE")
          , " when you construct lgb.Dataset"
        )
      }

      # Store reference
      private$reference <- reference

      # Release the native handle built with the old reference
      self$finalize()
      return(invisible(self))

    },
James Lamb's avatar
James Lamb committed
654

655
    # Save the Dataset to the file `fname` through LGBM_DatasetSaveBinary_R,
    # constructing the Dataset first. Returns `self` invisibly.
    save_binary = function(fname) {

      # construct() returns immediately when a handle already exists
      self$construct()

      dataset_handle <- private$handle
      lgb.call(
        fun_name = "LGBM_DatasetSaveBinary_R"
        , ret = NULL
        , dataset_handle
        , lgb.c_str(x = fname)
      )

      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
668

Guolin Ke's avatar
Guolin Ke committed
669
670
  ),
  private = list(
671
672
673
674
675
    # Native Dataset handle; set by construct(), reset to NULL by finalize()
    handle = NULL,
    # Data given to initialize(); construct() sets it to NULL when
    # free_raw_data is TRUE
    raw_data = NULL,
    # Parameter list; serialized with lgb.params2str() in construct()
    params = list(),
    # Optional lgb.Dataset whose handle is passed along when constructing
    reference = NULL,
    # Feature (column) names
    colnames = NULL,
676
    # Categorical features as given by the caller (names or 1-based
    # positions); construct() converts them to 0-based indices in params
    categorical_feature = NULL,
677
678
679
680
    # Optional lgb.Predictor; construct() uses it to compute init_score
    predictor = NULL,
    # Whether construct() drops raw_data once the native Dataset exists
    free_raw_data = TRUE,
    # Sorted indices selecting a subset of the reference Dataset; NULL when
    # this Dataset is not a subset
    used_indices = NULL,
    # R-side store for the "label", "weight", "init_score" and "group" fields
    info = NULL,
681
    # Counter incremented by setinfo() each time a field is pushed to the
    # native Dataset
    version = 0L,
James Lamb's avatar
James Lamb committed
682

683
684
    # Return the native Dataset handle, constructing the Dataset first when
    # no handle exists yet
    get_handle = function() {

      needs_construction <- lgb.is.null.handle(x = private$handle)
      if (needs_construction) {
        self$construct()
      }

      return(private$handle)

    },
James Lamb's avatar
James Lamb committed
693

694
    # Replace the predictor. Returns `self` invisibly.
    #
    # Does nothing when the new predictor is identical to the current one.
    # Otherwise the raw data must still be available and `predictor` must be
    # NULL or an lgb.Predictor; the predictor is stored and the native handle
    # is released.
    set_predictor = function(predictor) {

      # Identical input: nothing to change
      if (identical(private$predictor, predictor)) {
        return(invisible(self))
      }

      # The raw data must still be there
      if (is.null(private$raw_data)) {
        stop(
          "set_predictor: cannot set predictor after freeing raw data, "
          , "please set "
          , sQuote("free_raw_data = FALSE")
          , " when you construct lgb.Dataset"
        )
      }

      # A non-NULL predictor must have the expected class
      if (!is.null(predictor)) {

        if (!lgb.check.r6.class(object = predictor, name = "lgb.Predictor")) {
          stop("set_predictor: Can only use lgb.Predictor as predictor")
        }

      }

      # Store predictor
      private$predictor <- predictor

      # Release the native handle built with the old predictor
      self$finalize()
      return(invisible(self))

    }
James Lamb's avatar
James Lamb committed
725

Guolin Ke's avatar
Guolin Ke committed
726
727
728
  )
)

729
730
731
#' @title Construct \code{lgb.Dataset} object
#' @description Construct \code{lgb.Dataset} object from dense matrix, sparse matrix
#'              or local file (that was created previously by saving an \code{lgb.Dataset}).
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
#' @param params a list of parameters
#' @param reference reference dataset
#' @param colnames names of columns
#' @param categorical_feature categorical features
#' @param free_raw_data \code{TRUE} if the raw data should be freed after the Dataset is constructed
#' @param info a list of information of the \code{lgb.Dataset} object
#' @param ... other information to pass to \code{info} or parameters to pass to \code{params}
#'
#' @return constructed dataset
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
#' lgb.Dataset.construct(dtrain)
#' }
#' @export
lgb.Dataset <- function(data,
                        params = list(),
                        reference = NULL,
                        colnames = NULL,
                        categorical_feature = NULL,
                        free_raw_data = TRUE,
                        info = list(),
                        ...) {

  # Create new dataset; predictor and used_indices are not exposed to users
  # through this constructor and are always passed as NULL
  return(
    invisible(Dataset$new(
      data = data
      , params = params
      , reference = reference
      , colnames = colnames
      , categorical_feature = categorical_feature
      , predictor = NULL
      , free_raw_data = free_raw_data
      , used_indices = NULL
      , info = info
      , ...
    ))
  )

}

781
782
783
#' @name lgb.Dataset.create.valid
#' @title Construct validation data
#' @description Construct validation data according to training data
#' @param dataset \code{lgb.Dataset} object, training data
#' @param data a \code{matrix} object, a \code{dgCMatrix} object or a character representing a filename
#' @param info a list of information of the \code{lgb.Dataset} object
#' @param ... other information to pass to \code{info}.
#'
#' @return constructed dataset
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
#' }
#' @export
lgb.Dataset.create.valid <- function(dataset, data, info = list(), ...) {

  # The training data must be an lgb.Dataset
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object")
  }

  # Delegate to the training Dataset's create_valid() method
  return(invisible(dataset$create_valid(data = data, info = info, ...)))

}
Guolin Ke's avatar
Guolin Ke committed
812

813
814
815
#' @name lgb.Dataset.construct
#' @title Construct Dataset explicitly
#' @description Construct Dataset explicitly
#' @param dataset Object of class \code{lgb.Dataset}
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' }
#' @return constructed dataset
#' @export
lgb.Dataset.construct <- function(dataset) {

  # Only lgb.Dataset objects can be constructed
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.construct: input data should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's own construct() method
  return(invisible(dataset$construct()))

}

839
840
#' @title Dimensions of an \code{lgb.Dataset}
#' @description Returns a vector of numbers of rows and of columns in an \code{lgb.Dataset}.
#' @param x Object of class \code{lgb.Dataset}
#' @param ... other parameters
#'
#' @return a vector of numbers of rows and of columns
#'
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{lgb.Dataset} object.
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
#' }
#' @rdname dim
#' @export
dim.lgb.Dataset <- function(x, ...) {

  # Guard against being called directly on something that is not an lgb.Dataset
  if (!lgb.is.Dataset(x = x)) {
    stop("dim.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's own dim() method
  return(x$dim())

}

873
874
875
#' @title Handling of column names of \code{lgb.Dataset}
#' @description Only column names are supported for \code{lgb.Dataset}: row names cannot
#'              be set (attempting to do so raises an error) and returned row names
#'              are always NULL.
#' @param x object of class \code{lgb.Dataset}
#' @param value a list of two elements: the first one must be \code{NULL} (row names
#'              are not supported) and the second one is column names
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#' dimnames(dtrain)
#' colnames(dtrain)
#' colnames(dtrain) <- make.names(seq_len(ncol(train$data)))
#' print(dtrain, verbose = TRUE)
#' }
#' @rdname dimnames.lgb.Dataset
#' @return A list with the dimension names of the dataset
#' @export
dimnames.lgb.Dataset <- function(x) {

  # Guard against being called directly on something that is not an lgb.Dataset
  if (!lgb.is.Dataset(x = x)) {
    stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object")
  }

  # Row names are never available, so the first element is always NULL
  return(list(NULL, x$get_colnames()))

}

#' @rdname dimnames.lgb.Dataset
#' @return For the replacement form, the \code{lgb.Dataset} that was passed in
#' @export
`dimnames<-.lgb.Dataset` <- function(x, value) {

  # value must be a plain list of the form list(rownames, colnames)
  if (!identical(class(value), "list") || length(value) != 2L) {
    stop("invalid ", sQuote("value"), " given: must be a list of two elements")
  }

  # Row names are not supported, so the first element has to be NULL
  if (!is.null(value[[1L]])) {
    stop("lgb.Dataset does not have rownames")
  }

  # NULL column names are forwarded as-is, without any length check
  if (is.null(value[[2L]])) {

    x$set_colnames(colnames = NULL)
    return(x)

  }

  # The number of new names has to match the number of columns
  if (ncol(x) != length(value[[2L]])) {
    stop(
      "can't assign "
      , sQuote(length(value[[2L]]))
      , " colnames to an lgb.Dataset with "
      , sQuote(ncol(x))
      , " columns"
    )
  }

  # Set column names and return the dataset, as a replacement function must
  x$set_colnames(colnames = value[[2L]])
  return(x)

}

949
950
951
#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#'              original \code{lgb.Dataset} object
#' @param dataset Object of class \code{lgb.Dataset}
#' @param idxset an integer vector of indices of rows needed
#' @param ... other parameters (currently not used)
#' @return constructed sub dataset
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#'
#' dsub <- lightgbm::slice(dtrain, seq_len(42L))
#' lgb.Dataset.construct(dsub)
#' labels <- lightgbm::getinfo(dsub, "label")
#' }
#' @export
slice <- function(dataset, ...) {
  # S3 generic: dispatches on the class of `dataset`
  UseMethod("slice")
}
Guolin Ke's avatar
Guolin Ke committed
971
972

#' @rdname slice
#' @return constructed sub dataset
#' @export
slice.lgb.Dataset <- function(dataset, idxset, ...) {

  # Guard against being called directly on something that is not an lgb.Dataset
  if (!lgb.is.Dataset(x = dataset)) {
    stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's own slice() method
  return(invisible(dataset$slice(idxset = idxset, ...)))

}

987
988
989
#' @name getinfo
#' @title Get information of an \code{lgb.Dataset} object
#' @description Get one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param name the name of the information field to get (see details)
#' @param ... other parameters
#' @return info data
#'
#' @details
#' The \code{name} field can be one of the following:
#'
#' \itemize{
#'     \item{\code{label}: the label lightgbm learns from}
#'     \item{\code{weight}: to do a weight rescale}
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#'     \item{\code{init_score}: initial score is the base prediction lightgbm will boost from}
#' }
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all(labels2 == 1 - labels))
#' }
#' @export
getinfo <- function(dataset, ...) {
  # S3 generic: dispatches on the class of `dataset`
  UseMethod("getinfo")
}
Guolin Ke's avatar
Guolin Ke committed
1026
1027

#' @rdname getinfo
#' @return info data
#' @export
getinfo.lgb.Dataset <- function(dataset, name, ...) {

  # Guard against being called directly on something that is not an lgb.Dataset
  if (!lgb.is.Dataset(x = dataset)) {
    stop("getinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's own getinfo() method; `...` is not forwarded
  return(dataset$getinfo(name = name))

}

1041
1042
1043
#' @name setinfo
#' @title Set information of an \code{lgb.Dataset} object
#' @description Set one attribute of a \code{lgb.Dataset}
#' @param dataset Object of class \code{lgb.Dataset}
#' @param name the name of the field to set (see details)
#' @param info the specific field of information to set
#' @param ... other parameters
#' @return the dataset you passed in
#'
#' @details
#' The \code{name} field can be one of the following:
#'
#' \itemize{
#'     \item{\code{label}: vector of labels to use as the target variable}
#'     \item{\code{weight}: to do a weight rescale}
#'     \item{\code{init_score}: initial score is the base prediction lightgbm will boost from}
#'     \item{\code{group}: used for learning-to-rank tasks. An integer vector describing how to
#'         group rows together as ordered results from the same set of candidate results to be ranked.
#'         For example, if you have a 100-document dataset with \code{group = c(10, 20, 40, 10, 10, 10)},
#'         that means that you have 6 groups, where the first 10 records are in the first group,
#'         records 11-30 are in the second group, etc.}
#' }
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.construct(dtrain)
#'
#' labels <- lightgbm::getinfo(dtrain, "label")
#' lightgbm::setinfo(dtrain, "label", 1 - labels)
#'
#' labels2 <- lightgbm::getinfo(dtrain, "label")
#' stopifnot(all.equal(labels2, 1 - labels))
#' }
#' @export
setinfo <- function(dataset, ...) {
  # S3 generic: dispatches on the class of `dataset`
  UseMethod("setinfo")
}
Guolin Ke's avatar
Guolin Ke committed
1081
1082

#' @rdname setinfo
#' @return the dataset you passed in
#' @export
setinfo.lgb.Dataset <- function(dataset, name, info, ...) {

  # Guard against being called directly on something that is not an lgb.Dataset
  if (!lgb.is.Dataset(x = dataset)) {
    stop("setinfo.lgb.Dataset: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's own setinfo() method; `...` is not forwarded
  return(invisible(dataset$setinfo(name = name, info = info)))

}

1095
1096
1097
1098
#' @name lgb.Dataset.set.categorical
#' @title Set categorical feature of \code{lgb.Dataset}
#' @description Set the categorical features of an \code{lgb.Dataset} object. Use this function
#'              to tell LightGBM which features should be treated as categorical.
#' @param dataset object of class \code{lgb.Dataset}
#' @param categorical_feature categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g.
#'                            \code{c(1L, 10L)} to say "the first and tenth columns").
#' @return the dataset you passed in
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data_file <- tempfile(fileext = ".data")
#' lgb.Dataset.save(dtrain, data_file)
#' dtrain <- lgb.Dataset(data_file)
#' lgb.Dataset.set.categorical(dtrain, 1L:2L)
#' }
#' @rdname lgb.Dataset.set.categorical
#' @export
lgb.Dataset.set.categorical <- function(dataset, categorical_feature) {

  # Only lgb.Dataset objects carry categorical feature settings
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's own set_categorical_feature() method
  return(invisible(dataset$set_categorical_feature(categorical_feature = categorical_feature)))

}

1128
1129
1130
#' @name lgb.Dataset.set.reference
#' @title Set reference of \code{lgb.Dataset}
#' @description If you want to use validation data, you should set reference to training data
#' @param dataset object of class \code{lgb.Dataset}
#' @param reference object of class \code{lgb.Dataset}
#'
#' @return the dataset you passed in
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' data(agaricus.test, package = "lightgbm")
#' test <- agaricus.test
#' dtest <- lgb.Dataset(test$data, label = test$label)
#' lgb.Dataset.set.reference(dtest, dtrain)
#' }
#' @rdname lgb.Dataset.set.reference
#' @export
lgb.Dataset.set.reference <- function(dataset, reference) {

  # Only the target is checked here; `reference` is passed through unchecked
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object")
  }

  # Delegate to the Dataset's own set_reference() method
  return(invisible(dataset$set_reference(reference = reference)))

}

1159
1160
1161
1162
#' @name lgb.Dataset.save
#' @title Save \code{lgb.Dataset} to a binary file
#' @description Please note that \code{init_score} is not saved in binary file.
#'              If you need it, please set it again after loading Dataset.
#' @param dataset object of class \code{lgb.Dataset}
#' @param fname filename of the output file (a character value)
#'
#' @return the dataset you passed in
#'
#' @examples
#' \donttest{
#' data(agaricus.train, package = "lightgbm")
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label = train$label)
#' lgb.Dataset.save(dtrain, tempfile(fileext = ".bin"))
#' }
#' @export
lgb.Dataset.save <- function(dataset, fname) {

  # Only lgb.Dataset objects can be saved
  if (!lgb.is.Dataset(x = dataset)) {
    stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object")
  }

  # The output location has to be given as a character value
  if (!is.character(fname)) {
    stop("lgb.Dataset.save: fname should be a character or a file connection")
  }

  # Delegate to the Dataset's own save_binary() method
  return(invisible(dataset$save_binary(fname = fname)))
}