lightgbm.R 11.7 KB
Newer Older
James Lamb's avatar
James Lamb committed
1
2
3
#' @name lgb_shared_params
#' @title Shared parameter docs
#' @description Parameter docs shared by \code{lgb.train}, \code{lgb.cv}, and \code{lightgbm}
4
#' @param callbacks List of callback functions that are applied at each iteration.
5
6
7
#' @param data a \code{lgb.Dataset} object, used for training. Some functions, such as \code{\link{lgb.cv}},
#'             may allow you to pass other types of data like \code{matrix} and then separately supply
#'             \code{label} as a keyword argument.
8
9
10
11
12
#' @param early_stopping_rounds int. Activates early stopping. When this parameter is non-null,
#'                              training will stop if the evaluation of any metric on any validation set
#'                              fails to improve for \code{early_stopping_rounds} consecutive boosting rounds.
#'                              If training stops early, the returned model will have attribute \code{best_iter}
#'                              set to the iteration number of the best iteration.
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#' @param eval evaluation function(s). This can be a character vector, function, or list with a mixture of
#'             strings and functions.
#'
#'             \itemize{
#'                 \item{\bold{a. character vector}:
#'                     If you provide a character vector to this argument, it should contain strings with valid
#'                     evaluation metrics.
#'                     See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#metric}{
#'                     The "metric" section of the documentation}
#'                     for a list of valid metrics.
#'                 }
#'                 \item{\bold{b. function}:
#'                      You can provide a custom evaluation function. This
#'                      should accept the keyword arguments \code{preds} and \code{dtrain} and should return a named
#'                      list with three elements:
#'                      \itemize{
#'                          \item{\code{name}: A string with the name of the metric, used for printing
#'                              and storing results.
#'                          }
#'                          \item{\code{value}: A single number indicating the value of the metric for the
#'                              given predictions and true values
#'                          }
#'                          \item{
#'                              \code{higher_better}: A boolean indicating whether higher values indicate a better fit.
#'                              For example, this would be \code{FALSE} for metrics like MAE or RMSE.
#'                          }
#'                      }
#'                 }
#'                 \item{\bold{c. list}:
#'                     If a list is given, it should only contain character vectors and functions.
#'                     These should follow the requirements from the descriptions above.
#'                 }
#'             }
James Lamb's avatar
James Lamb committed
46
47
48
#' @param eval_freq evaluation output frequency, only takes effect when verbose > 0
#' @param init_model path of model file or \code{lgb.Booster} object, will continue training from this model
#' @param nrounds number of training rounds
49
50
51
#' @param obj objective function, can be character or custom objective function. Examples include
#'            \code{regression}, \code{regression_l1}, \code{huber},
#'            \code{binary}, \code{lambdarank}, \code{multiclass}
52
53
#' @param params a list of parameters. See \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html}{
#'               the "Parameters" section of the documentation} for a list of parameters and valid values.
James Lamb's avatar
James Lamb committed
54
#' @param verbose verbosity for output. If <= 0, this will also disable the printing of evaluation during training
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#' @section Early Stopping:
#'
#'          "Early stopping" refers to stopping the training process if the model's performance on a given
#'          validation set does not improve for several consecutive iterations.
#'
#'          If multiple arguments are given to \code{eval}, their order will be preserved. If you enable
#'          early stopping by setting \code{early_stopping_rounds} in \code{params}, by default all
#'          metrics will be considered for early stopping.
#'
#'          If you want to only consider the first metric for early stopping, pass
#'          \code{first_metric_only = TRUE} in \code{params}. Note that if you also specify \code{metric}
#'          in \code{params}, that metric will be considered the "first" one. If you omit \code{metric},
#'          a default metric will be used based on your choice for the parameter \code{obj} (keyword argument)
#'          or \code{objective} (passed into \code{params}).
69
#' @keywords internal
James Lamb's avatar
James Lamb committed
70
71
72
NULL

#' @name lightgbm
#' @title Train a LightGBM model
#' @description Simple interface for training a LightGBM model.
#' @inheritParams lgb_shared_params
#' @param label Vector of labels, used if \code{data} is not an \code{\link{lgb.Dataset}}
#' @param weight Vector of weights, passed to \code{\link{lgb.Dataset}} if \code{data} is not already
#'               an \code{lgb.Dataset}. If not NULL, it will be set on the dataset that is created.
#' @param save_name File name to use when writing the trained model to disk. Should end in ".model".
#'                  If \code{NULL}, the trained model is not written to disk.
#' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example
#'     \itemize{
#'        \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation}
#'        \item{\code{obj}: objective function, can be character or custom objective function. Examples include
#'                   \code{regression}, \code{regression_l1}, \code{huber},
#'                    \code{binary}, \code{lambdarank}, \code{multiclass}}
#'        \item{\code{eval}: evaluation function, can be (a list of) character or custom eval function}
#'        \item{\code{record}: Boolean, TRUE will record iteration message to \code{booster$record_evals}}
#'        \item{\code{colnames}: feature names, if not null, will use this to overwrite the names in dataset}
#'        \item{\code{categorical_feature}: categorical features. This can either be a character vector of feature
#'                            names or an integer vector with the indices of the features (e.g. \code{c(1L, 10L)} to
#'                            say "the first and tenth columns").}
#'        \item{\code{reset_data}: Boolean, setting it to TRUE (not the default value) will transform the booster model
#'                          into a predictor model which frees up memory and the original datasets}
#'         \item{\code{boosting}: Boosting type. \code{"gbdt"}, \code{"rf"}, \code{"dart"} or \code{"goss"}.}
#'         \item{\code{num_leaves}: Maximum number of leaves in one tree.}
#'         \item{\code{max_depth}: Limit the max depth for tree model. This is used to deal with
#'                          over-fitting when the amount of data is small. Trees still grow leaf-wise.}
#'          \item{\code{num_threads}: Number of threads for LightGBM. For the best speed, set this to
#'                             the number of real CPU cores (\code{parallel::detectCores(logical = FALSE)}),
#'                             not the number of threads (most CPUs use hyper-threading to generate 2 threads
#'                             per CPU core).}
#'     }
#' @inheritSection lgb_shared_params Early Stopping
#' @return a trained \code{lgb.Booster}
#' @export
lightgbm <- function(data,
                     label = NULL,
                     weight = NULL,
                     params = list(),
                     nrounds = 100L,
                     verbose = 1L,
                     eval_freq = 1L,
                     early_stopping_rounds = NULL,
                     save_name = "lightgbm.model",
                     init_model = NULL,
                     callbacks = list(),
                     ...) {

  # validate inputs early to avoid unnecessary computation
  if (nrounds <= 0L) {
    stop("nrounds should be greater than zero")
  }

  # Use "data" as-is when it is already an lgb.Dataset. Otherwise build one
  # from the raw data; "label" and "weight" are only used on this path.
  dtrain <- data
  if (!lgb.is.Dataset(x = dtrain)) {
    dtrain <- lgb.Dataset(data = data, label = label, weight = weight)
  }

  # Collect the explicitly-named arguments first, then anything supplied
  # through "...", so that everything is forwarded to lgb.train() in one call
  train_args <- list(
    "params" = params
    , "data" = dtrain
    , "nrounds" = nrounds
    , "verbose" = verbose
    , "eval_freq" = eval_freq
    , "early_stopping_rounds" = early_stopping_rounds
    , "init_model" = init_model
    , "callbacks" = callbacks
  )
  train_args <- append(train_args, list(...))

  # Make sure a "valids" entry exists before (possibly) adding to it below
  if (!("valids" %in% names(train_args))) {
    train_args[["valids"]] <- list()
  }

  # When output is verbose, add the training data to the validation sets
  # under the name "train"
  if (verbose > 0L) {
    train_args[["valids"]][["train"]] <- dtrain
  }

  # Train a model using the regular way
  bst <- do.call(
    what = lgb.train
    , args = train_args
  )

  # Store model under a specific name. Passing save_name = NULL opts out of
  # writing a file to disk.
  if (!is.null(save_name)) {
    bst$save_model(filename = save_name)
  }

  return(bst)
}

164
165
166
167
168
#' @name agaricus.train
#' @title Training part from Mushroom Data Set
#' @description This data set is originally from the Mushroom data set,
#'              UCI Machine Learning Repository.
#'              This data set includes the following fields:
169
#'
170
171
172
173
#'               \itemize{
#'                   \item{\code{label}: the label for each record}
#'                   \item{\code{data}: a sparse Matrix of \code{dgCMatrix} class, with 126 columns.}
#'                }
Guolin Ke's avatar
Guolin Ke committed
174
175
176
#'
#' @references
#' https://archive.ics.uci.edu/ml/datasets/Mushroom
177
178
179
#'
#' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
#' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
Guolin Ke's avatar
Guolin Ke committed
180
#' School of Information and Computer Science.
181
#'
Guolin Ke's avatar
Guolin Ke committed
182
183
184
#' @docType data
#' @keywords datasets
#' @usage data(agaricus.train)
185
#' @format A list containing a label vector, and a dgCMatrix object with 6513
Guolin Ke's avatar
Guolin Ke committed
186
187
188
#' rows and 126 variables
NULL

189
190
191
192
193
194
195
196
197
198
#' @name agaricus.test
#' @title Test part from Mushroom Data Set
#' @description This data set is originally from the Mushroom data set,
#'              UCI Machine Learning Repository.
#'              This data set includes the following fields:
#'
#'              \itemize{
#'                  \item{\code{label}: the label for each record}
#'                  \item{\code{data}: a sparse Matrix of \code{dgCMatrix} class, with 126 columns.}
#'              }
Guolin Ke's avatar
Guolin Ke committed
199
200
#' @references
#' https://archive.ics.uci.edu/ml/datasets/Mushroom
201
202
203
#'
#' Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository
#' [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
Guolin Ke's avatar
Guolin Ke committed
204
#' School of Information and Computer Science.
205
#'
Guolin Ke's avatar
Guolin Ke committed
206
207
208
#' @docType data
#' @keywords datasets
#' @usage data(agaricus.test)
209
#' @format A list containing a label vector, and a dgCMatrix object with 1611
Guolin Ke's avatar
Guolin Ke committed
210
211
212
#' rows and 126 variables
NULL

213
214
215
216
#' @name bank
#' @title Bank Marketing Data Set
#' @description This data set is originally from the Bank Marketing data set,
#'              UCI Machine Learning Repository.
217
#'
218
219
#'              It contains only the following: bank.csv, with 10% of the examples and 17 inputs,
#'              randomly selected from the full 17-input data set (bank-full.csv). This is an
#'              older version of the dataset with fewer inputs.
220
221
222
#'
#' @references
#' http://archive.ics.uci.edu/ml/datasets/Bank+Marketing
223
#'
224
225
226
227
228
229
230
231
232
#' S. Moro, P. Cortez and P. Rita. (2014)
#' A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems
#'
#' @docType data
#' @keywords datasets
#' @usage data(bank)
#' @format A data.table with 4521 rows and 17 variables
NULL

Guolin Ke's avatar
Guolin Ke committed
233
# Various imports
Guolin Ke's avatar
Guolin Ke committed
234
#' @import methods
235
#' @importFrom Matrix Matrix
Guolin Ke's avatar
Guolin Ke committed
236
#' @importFrom R6 R6Class
James Lamb's avatar
James Lamb committed
237
#' @useDynLib lib_lightgbm , .registration = TRUE
238
NULL
James Lamb's avatar
James Lamb committed
239
240
241
242
243
244
245

# Register names that R CMD CHECK would otherwise report as
# "unrecognized global variable" (false positives).
globalVariables(
    c(
        ".", ".N", ".SD",
        "abs_contribution", "bar_color",
        "Contribution", "Cover", "Feature", "Frequency", "Gain",
        "internal_count", "internal_value",
        "leaf_index", "leaf_parent", "leaf_value",
        "node_parent",
        "split_feature", "split_gain", "split_index",
        "tree_index"
    )
)