tianlh / LightGBM-DCU / Commit 88e6beb8

[R-package] add cv support

authored Jan 08, 2017 by Guolin Ke
parent 535cdc65

Showing 8 changed files with 420 additions and 9 deletions (+420, -9)
R-package/NAMESPACE                    +1   -0
R-package/R/callback.R                 +0   -4
R-package/R/lgb.cv.R                   +291 -0
R-package/demo/00Index                 +1   -0
R-package/demo/README.md               +1   -0
R-package/demo/cross_validation.R      +45  -0
R-package/man/lgb.train.Rd             +73  -5
R-package/tests/testthat/test_basic.R  +8   -0
R-package/NAMESPACE

@@ -14,6 +14,7 @@ export(lgb.Dataset.create.valid)
 export(lgb.Dataset.save)
 export(lgb.Dataset.set.categorical)
 export(lgb.Dataset.set.reference)
+export(lgb.cv)
 export(lgb.dump)
 export(lgb.get.eval.result)
 export(lgb.load)
R-package/R/callback.R

@@ -228,10 +228,6 @@ add.cb <- function(cb_list, cb) {
     # this removes only the first one
     cb_list['cb.early.stop'] <- NULL
   }
-  if ('cb.cv.predict' %in% names(cb_list)) {
-    cb_list <- c(cb_list, cb_list['cb.cv.predict'])
-    cb_list['cb.cv.predict'] <- NULL
-  }
   cb_list
 }
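The surviving context shows the idiom add.cb uses to push a special callback to the end of the list: append a copy of the named element at the tail, then NULL out the original slot, which drops only the first occurrence of that name. A minimal standalone sketch of that idiom, with toy values (not from the package):

# Minimal sketch of the move-to-end idiom used in add.cb (toy list).
cb_list <- list(a = 1, cb.cv.predict = 2, c = 3)
cb_list <- c(cb_list, cb_list['cb.cv.predict'])  # append a copy at the tail
cb_list['cb.cv.predict'] <- NULL                 # removes only the first occurrence
names(cb_list)  # "a" "c" "cb.cv.predict"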
R-package/R/lgb.cv.R
new file mode 100644

CVBooster <- R6Class(
  "lgb.CVBooster",
  cloneable = FALSE,
  public = list(
    best_iter = -1,
    record_evals = list(),
    boosters = list(),
    initialize = function(x) {
      self$boosters <- x
    }
  )
)
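CVBooster is a thin R6 container: one booster per fold plus shared bookkeeping. As a hedged usage sketch (not part of the commit), the returned object could be inspected like this, assuming the param and dtrain objects defined in demo/cross_validation.R added below:

# Hedged sketch: inspecting the lgb.CVBooster that lgb.cv() returns.
cv <- lgb.cv(param, dtrain, nrounds = 5, nfold = 3)
length(cv$boosters)   # one entry per fold, each wrapping a Booster
cv$best_iter          # stays -1 unless an early-stopping callback sets it
str(cv$record_evals, max.level = 2)   # per-iteration mean metrics recorded by cb.record.evaluation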
#' Main CV logic for LightGBM
#'
#' Main CV logic for LightGBM
#'
#' @param params List of parameters
#' @param data a \code{lgb.Dataset} object, used for CV
#' @param nrounds number of CV rounds
#' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples.
#' @param label vector of response values. Should be provided only when data is an R-matrix.
#' @param weight vector of weights. If not NULL, it will be set on the dataset
#' @param obj objective function, can be character or custom objective function
#' @param eval evaluation function, can be (list of) character or custom eval function
#' @param verbose verbosity for output;
#' if verbose > 0, iteration messages are also recorded to booster$record_evals
#' @param eval_freq evaluation output frequency
#' @param showsd \code{boolean}, whether to show standard deviation of cross validation
#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified
#' by the values of outcome labels.
#' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds
#' (each element must be a vector of test fold's indices). When folds are supplied,
#' the \code{nfold} and \code{stratified} parameters are ignored.
#' @param init_model path of model file or \code{lgb.Booster} object; training will continue from this model
#' @param colnames feature names; if not NULL, they overwrite the names in the dataset
#' @param categorical_feature list of str or int
#' type int represents index,
#' type str represents feature names
#' @param early_stopping_rounds int
#'     Activates early stopping.
#'     Requires at least one validation dataset and one metric.
#'     If there is more than one, all of them will be checked.
#'     The returned model will have been trained for (best_iter + early_stopping_rounds) iterations.
#'     If early stopping occurs, the model will have a 'best_iter' field.
#' @param callbacks list of callback functions that are applied at each iteration
#' @param ... other parameters, see parameters.md for more information
#' @return a trained model \code{lgb.CVBooster}.
#' @examples
#' library(lightgbm)
#' data(agaricus.train, package='lightgbm')
#' train <- agaricus.train
#' dtrain <- lgb.Dataset(train$data, label=train$label)
#' params <- list(objective="regression", metric="l2")
#' model <- lgb.cv(params, dtrain, 10, nfold=5, min_data=1, learning_rate=1, early_stopping_rounds=10)
#'
#' @rdname lgb.train
#' @export
lgb.cv <- function(params = list(),
                   data,
                   nrounds = 10,
                   nfold = 3,
                   label = NULL,
                   weight = NULL,
                   obj = NULL,
                   eval = NULL,
                   verbose = 1,
                   eval_freq = 1L,
                   showsd = TRUE,
                   stratified = TRUE,
                   folds = NULL,
                   init_model = NULL,
                   colnames = NULL,
                   categorical_feature = NULL,
                   early_stopping_rounds = NULL,
                   callbacks = list(),
                   ...) {
  # merge parameters passed through `...` into params
  additional_params <- list(...)
  params <- append(params, additional_params)
  params$verbose <- verbose
  params <- lgb.check.obj(params, obj)
  params <- lgb.check.eval(params, eval)
  fobj <- NULL
  feval <- NULL
  # a custom objective / eval function arrives as a closure
  if (typeof(params$objective) == "closure") {
    fobj <- params$objective
    params$objective <- "NONE"
  }
  if (typeof(eval) == "closure") {
    feval <- eval
  }
  lgb.check.params(params)
  # set up continued training from an existing model, if any
  predictor <- NULL
  if (is.character(init_model)) {
    predictor <- Predictor$new(init_model)
  } else if (lgb.is.Booster(init_model)) {
    predictor <- init_model$to_predictor()
  }
  begin_iteration <- 1
  if (!is.null(predictor)) {
    begin_iteration <- predictor$current_iter() + 1
  }
  end_iteration <- begin_iteration + nrounds - 1
  # check dataset
  if (!lgb.is.Dataset(data)) {
    if (is.null(label)) {
      stop("Labels must be provided for lgb.cv")
    }
    data <- lgb.Dataset(data, label = label)
  }
  if (!is.null(weight)) data$set_info("weight", weight)
  data$update_params(params)
  data$.__enclos_env__$private$set_predictor(predictor)
  if (!is.null(colnames)) {
    data$set_colnames(colnames)
  }
  data$set_categorical_feature(categorical_feature)
  data$construct()
  # CV folds
  if (!is.null(folds)) {
    if (class(folds) != "list" || length(folds) < 2)
      stop("'folds' must be a list with 2 or more elements that are vectors of indices for each CV-fold")
    nfold <- length(folds)
  } else {
    if (nfold <= 1)
      stop("'nfold' must be > 1")
    folds <- generate.cv.folds(nfold, nrow(data), stratified,
                               getinfo(data, 'label'), params)
  }
  # process callbacks
  if (eval_freq > 0) {
    callbacks <- add.cb(callbacks, cb.print.evaluation(eval_freq))
  }
  if (verbose > 0) {
    callbacks <- add.cb(callbacks, cb.record.evaluation())
  }
  # early stopping callback
  if (!is.null(early_stopping_rounds)) {
    if (early_stopping_rounds > 0) {
      callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, verbose = verbose))
    }
  }
  cb <- categorize.callbacks(callbacks)
# construct booster
bst_folds
<-
lapply
(
1
:
length
(
folds
),
function
(
k
)
{
dtest
<-
slice
(
data
,
folds
[[
k
]])
dtrain
<-
slice
(
data
,
unlist
(
folds
[
-
k
]))
booster
<-
Booster
$
new
(
params
,
dtrain
)
booster
$
add_valid
(
dtest
,
"valid"
)
list
(
booster
=
booster
)
})
cv_booster
<-
CVBooster
$
new
(
bst_folds
)
# callback env
env
<-
CB_ENV
$
new
()
env
$
model
<-
cv_booster
env
$
begin_iteration
<-
begin_iteration
env
$
end_iteration
<-
end_iteration
  # start training
  for (i in begin_iteration:end_iteration) {
    env$iteration <- i
    env$eval_list <- list()
    for (f in cb$pre_iter) f(env)
    # update one iteration on every fold's booster, then evaluate it
    msg <- lapply(cv_booster$boosters, function(fd) {
      fd$booster$update(fobj = fobj)
      fd$booster$eval_valid(feval = feval)
    })
    # average the per-fold evaluation results
    merged_msg <- lgb.merge.cv.result(msg)
    env$eval_list <- merged_msg$eval_list
    if (showsd)
      env$eval_err_list <- merged_msg$eval_err_list
    for (f in cb$post_iter) f(env)
    # stop when early stopping is met
    if (env$met_early_stop) break
  }
  return(cv_booster)
}
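Since folds= takes precedence over nfold and stratified, callers can pass hand-made splits; this is also the only route for lambdarank, where automatic fold generation raises an error (see below). A hedged sketch (not in the commit), reusing param and dtrain from the demo added later in this commit:

# Hedged sketch: supplying pre-defined CV folds, which bypasses
# nfold/stratified (each element is a vector of *test* indices).
n <- nrow(dtrain)
my_folds <- split(sample(n), rep(1:4, length.out = n))  # 4 manual folds
cv <- lgb.cv(param, dtrain, nrounds = 5, folds = my_folds)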
# Generates random (stratified if needed) CV folds
generate.cv.folds <- function(nfold, nrows, stratified, label, params) {
  # cannot do it for rank
  if (exists('objective', where = params)
      && is.character(params$objective)
      && params$objective == 'lambdarank') {
    stop("\n\tAutomatic generation of CV-folds is not implemented for lambdarank!\n",
         "\tConsider providing pre-computed CV-folds through the 'folds=' parameter.\n")
  }
  # shuffle
  rnd_idx <- sample(1:nrows)
  if (stratified && length(label) == length(rnd_idx)) {
    y <- label[rnd_idx]
    y <- factor(y)
    folds <- lgb.stratified.folds(y, nfold)
  } else {
    # make simple non-stratified folds
    kstep <- length(rnd_idx) %/% nfold
    folds <- list()
    for (i in 1:(nfold - 1)) {
      folds[[i]] <- rnd_idx[1:kstep]
      rnd_idx <- rnd_idx[-(1:kstep)]
    }
    folds[[nfold]] <- rnd_idx
  }
  return(folds)
}
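The non-stratified branch slices off floor(n / nfold) indices for each of the first nfold - 1 folds and gives whatever remains to the last fold. A quick hedged check of that arithmetic:

# Hedged check of the non-stratified split arithmetic above.
n <- 103; nfold <- 5
sizes <- c(rep(n %/% nfold, nfold - 1), n - (n %/% nfold) * (nfold - 1))
sizes             # 20 20 20 20 23
sum(sizes) == n   # TRUE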
# Creates CV folds stratified by the values of y.
# It was borrowed from caret::createFolds and simplified
# by always returning an unnamed list of fold indices.
lgb.stratified.folds <- function(y, k = 10) {
  if (is.numeric(y)) {
    ## Group the numeric data based on their magnitudes
    ## and sample within those groups.
    ## When the number of samples is low, we may have
    ## issues further slicing the numeric data into
    ## groups. The number of groups will depend on the
    ## ratio of the number of folds to the sample size.
    ## At most, we will use quantiles. If the sample
    ## is too small, we just do regular unstratified CV.
    cuts <- floor(length(y) / k)
    if (cuts < 2) cuts <- 2
    if (cuts > 5) cuts <- 5
    y <- cut(y,
             unique(stats::quantile(y, probs = seq(0, 1, length = cuts))),
             include.lowest = TRUE)
  }
  if (k < length(y)) {
    ## reset levels so that the possible levels and
    ## the levels in the vector are the same
    y <- factor(as.character(y))
    numInClass <- table(y)
    foldVector <- vector(mode = "integer", length(y))
    ## For each class, balance the fold allocation as far
    ## as possible, then resample the remainder.
    ## The final assignment of folds is also randomized.
    for (i in 1:length(numInClass)) {
      ## create a vector of integers from 1:k as many times as possible without
      ## going over the number of samples in the class. Note that if the number
      ## of samples in a class is less than k, nothing is produced here.
      seqVector <- rep(1:k, numInClass[i] %/% k)
      ## add enough random integers to get length(seqVector) == numInClass[i]
      if (numInClass[i] %% k > 0)
        seqVector <- c(seqVector, sample(1:k, numInClass[i] %% k))
      ## shuffle the integers for fold assignment and assign to this class's data
      foldVector[which(y == dimnames(numInClass)$y[i])] <- sample(seqVector)
    }
  } else {
    foldVector <- seq(along = y)
  }
  out <- split(seq(along = y), foldVector)
  names(out) <- NULL
  out
}
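For numeric targets the helper first bins y into at most five quantile groups; after that, fold assignment is balanced class by class: every class contributes floor(n_class / k) rows to each fold, and the n_class %% k leftovers go to randomly sampled folds. A hedged illustration of that per-class allocation:

# Hedged illustration of the per-class balancing used above.
y <- factor(rep(c("neg", "pos"), times = c(70, 30)))
k <- 5
n_class <- table(y)
n_class %/% k   # rows guaranteed per fold: neg 14, pos 6
n_class %%  k   # leftovers spread at random: neg 0, pos 0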
lgb.merge.cv.result <- function(msg, showsd = TRUE) {
  # msg[[i]][[j]] holds fold i's result for metric j
  eval_len <- length(msg[[1]])
  # collect each metric's values across folds
  eval_result <- lapply(1:eval_len, function(j) {
    as.numeric(lapply(1:length(msg), function(i) {
      msg[[i]][[j]]$value
    }))
  })
  # reuse the first fold's structure, replacing values by cross-fold means
  ret_eval <- msg[[1]]
  for (j in 1:eval_len) {
    ret_eval[[j]]$value <- mean(eval_result[[j]])
  }
  ret_eval_err <- NULL
  if (showsd) {
    # population standard deviation across folds: sqrt(E[x^2] - E[x]^2)
    for (j in 1:eval_len) {
      ret_eval_err <- c(ret_eval_err,
                        sqrt(mean(eval_result[[j]]^2) - mean(eval_result[[j]])^2))
    }
    ret_eval_err <- as.list(ret_eval_err)
  }
  return(list(eval_list = ret_eval, eval_err_list = ret_eval_err))
}
\ No newline at end of file
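Note that the spread lgb.merge.cv.result reports is the population standard deviation, sqrt(E[x^2] - E[x]^2), not R's sample sd(). A hedged numeric check (not in the commit):

# The formula above equals sd() shrunk by sqrt((n - 1) / n).
x <- c(0.21, 0.19, 0.23, 0.20, 0.22)   # e.g. one metric across 5 folds
pop_sd <- sqrt(mean(x^2) - mean(x)^2)
all.equal(pop_sd, sd(x) * sqrt((length(x) - 1) / length(x)))   # TRUE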
R-package/demo/00Index

 basic_walkthrough       Basic feature walkthrough
 boost_from_prediction   Boosting from existing prediction
 early_stopping          Early Stop in training
+cross_validation        Cross Validation
\ No newline at end of file
R-package/demo/README.md

@@ -3,5 +3,6 @@ LightGBM R examples
 * [Basic walkthrough of wrappers](basic_walkthrough.R)
 * [Boosting from existing prediction](boost_from_prediction.R)
 * [Early Stopping](early_stopping.R)
+* [Cross Validation](cross_validation.R)
R-package/demo/cross_validation.R
new file mode 100644

require(lightgbm)
# load in the agaricus dataset
data(agaricus.train, package = 'lightgbm')
data(agaricus.test, package = 'lightgbm')
dtrain <- lgb.Dataset(agaricus.train$data, label = agaricus.train$label)
dtest <- lgb.Dataset(agaricus.test$data, label = agaricus.test$label)

nround <- 2
param <- list(num_leaves = 4, learning_rate = 1, objective = 'binary')

cat('running cross validation\n')
# do cross validation; this will print the result as
# [iteration] metric_name:mean_value+std_value
# where std_value is the standard deviation of the metric across folds
lgb.cv(param, dtrain, nround, nfold = 5, eval = 'binary_error')

cat('running cross validation, disable standard deviation display\n')
# same as above but with showsd = FALSE, so the result prints as
# [iteration] metric_name:mean_value
lgb.cv(param, dtrain, nround, nfold = 5, eval = 'binary_error', showsd = FALSE)

###
# you can also do cross validation with a customized loss function
# see custom_objective.R
##
print('running cross validation, with customized loss function')

logregobj <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  preds <- 1 / (1 + exp(-preds))
  grad <- preds - labels
  hess <- preds * (1 - preds)
  return(list(grad = grad, hess = hess))
}

evalerror <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
  return(list(name = "error", value = err, higher_better = FALSE))
}

# train with customized objective
lgb.cv(params = param, data = dtrain, nrounds = nround,
       obj = logregobj, eval = evalerror, nfold = 5)
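As a hedged variant of the demo (not part of the commit), early stopping can pick the iteration count, with nrounds acting as an upper bound; per the documentation above, best_iter is set when early stopping triggers. Reuses param and dtrain from above:

# Hedged variant: let CV choose the iteration count via early stopping.
cv <- lgb.cv(param, dtrain, nrounds = 50, nfold = 5,
             eval = 'binary_error', early_stopping_rounds = 5)
cv$best_iter   # documented to be set when early stopping occurs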
R-package/man/lgb.train.Rd

 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/lgb.train.R, R/lightgbm.R
-\name{lgb.train}
+% Please edit documentation in R/lgb.cv.R, R/lgb.train.R, R/lightgbm.R
+\name{lgb.cv}
+\alias{lgb.cv}
 \alias{lgb.train}
 \alias{lightgbm}
-\title{Main training logic for LightGBM}
+\title{Main CV logic for LightGBM}
 \usage{
+lgb.cv(params = list(), data, nrounds = 10, nfold = 3, label = NULL,
+  weight = NULL, obj = NULL, eval = NULL, verbose = 1, eval_freq = 1L,
+  showsd = TRUE, stratified = TRUE, folds = NULL, init_model = NULL,
+  colnames = NULL, categorical_feature = NULL,
+  early_stopping_rounds = NULL, callbacks = list(), ...)
+
 lgb.train(params = list(), data, nrounds = 10, valids = list(),
   obj = NULL, eval = NULL, verbose = 1, eval_freq = 1L,
   init_model = NULL, colnames = NULL, categorical_feature = NULL,
   ...

@@ -18,12 +25,62 @@ lightgbm(data, label = NULL, weight = NULL, params = list(),
 \arguments{
 \item{params}{List of parameters}

-\item{data}{a \code{lgb.Dataset} object, used for training}
+\item{data}{a \code{lgb.Dataset} object, used for CV}

-\item{nrounds}{number of training rounds}
+\item{nrounds}{number of CV rounds}

+\item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.}
+
+\item{label}{vector of response values. Should be provided only when data is an R-matrix.}
+
+\item{weight}{vector of weights. If not NULL, it will be set on the dataset.}
+
 \item{obj}{objective function, can be character or custom objective function}

 \item{eval}{evaluation function, can be (list of) character or custom eval function}

 \item{verbose}{verbosity for output;
 if verbose > 0, iteration messages are also recorded to booster$record_evals}

+\item{eval_freq}{evaluation output frequency}
+
+\item{showsd}{\code{boolean}, whether to show standard deviation of cross validation}
+
+\item{stratified}{a \code{boolean} indicating whether sampling of folds should be stratified
+by the values of outcome labels.}
+
+\item{folds}{\code{list} provides a possibility to use a list of pre-defined CV folds
+(each element must be a vector of test fold's indices). When folds are supplied,
+the \code{nfold} and \code{stratified} parameters are ignored.}
+
 \item{init_model}{path of model file or \code{lgb.Booster} object; training will continue from this model}

 \item{colnames}{feature names; if not NULL, they overwrite the names in the dataset}

 \item{categorical_feature}{list of str or int
 type int represents index,
 type str represents feature names}

 \item{early_stopping_rounds}{int
 Activates early stopping.
 Requires at least one validation dataset and one metric.
 If there is more than one, all of them will be checked.
 The returned model will have been trained for (best_iter + early_stopping_rounds) iterations.
 If early stopping occurs, the model will have a 'best_iter' field.}

 \item{callbacks}{list of callback functions that are applied at each iteration.}

 \item{...}{other parameters, see parameters.md for more information}

+\item{valids}{a list of \code{lgb.Dataset} objects, used for validation}
+
+\item{params}{List of parameters}
+
+\item{data}{a \code{lgb.Dataset} object, used for training}
+
+\item{nrounds}{number of training rounds}
+
+\item{obj}{objective function, can be character or custom objective function}
+
+\item{eval}{evaluation function, can be (list of) character or custom eval function}

@@ -54,12 +111,23 @@ List of callback functions that are applied at each iteration.}
 \item{...}{other parameters, see parameters.md for more information}
 }
 \value{
+a trained model \code{lgb.CVBooster}.
+
 a trained booster model \code{lgb.Booster}.
 }
 \description{
+Main CV logic for LightGBM
+
 Main training logic for LightGBM
 }
 \examples{
+library(lightgbm)
+data(agaricus.train, package='lightgbm')
+train <- agaricus.train
+dtrain <- lgb.Dataset(train$data, label=train$label)
+params <- list(objective="regression", metric="l2")
+model <- lgb.cv(params, dtrain, 10, nfold=5, min_data=1, learning_rate=1, early_stopping_rounds=10)
+
 library(lightgbm)
 data(agaricus.train, package='lightgbm')
 train <- agaricus.train
 ...
R-package/tests/testthat/test_basic.R

@@ -75,3 +75,11 @@ test_that("training continuation works", {
   expect_lt(abs(err_bst - err_bst2), 0.01)
 })
+
+test_that("cv works", {
+  dtrain <- lgb.Dataset(train$data, label = train$label)
+  params <- list(objective = "regression", metric = "l2,l1")
+  bst <- lgb.cv(params, dtrain, 10, nfold = 5, min_data = 1,
+                learning_rate = 1, early_stopping_rounds = 10)
+  expect_false(is.null(bst$record_evals))
+})