Unverified Commit 33eb0376 authored by david-cortes's avatar david-cortes Committed by GitHub
Browse files

[R-package] Promote number of threads to top-level argument in `lightgbm()`...

[R-package] Promote number of threads to top-level argument in `lightgbm()` and change default to number of cores (#4972)
parent 4ae3d138
...@@ -105,13 +105,13 @@ if [[ $OS_NAME == "macos" ]]; then ...@@ -105,13 +105,13 @@ if [[ $OS_NAME == "macos" ]]; then
fi fi
fi fi
# Manually install Depends and Imports libraries + 'knitr', 'rmarkdown', 'testthat' # Manually install Depends and Imports libraries + 'knitr', 'RhpcBLASctl', 'rmarkdown', 'testthat'
# to avoid a CI-time dependency on devtools (for devtools::install_deps()) # to avoid a CI-time dependency on devtools (for devtools::install_deps())
# NOTE: testthat is not required when running rchk # NOTE: testthat is not required when running rchk
if [[ "${TASK}" == "r-rchk" ]]; then if [[ "${TASK}" == "r-rchk" ]]; then
packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'rmarkdown')" packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'RhpcBLASctl', 'rmarkdown')"
else else
packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'rmarkdown', 'testthat')" packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'RhpcBLASctl', 'rmarkdown', 'testthat')"
fi fi
compile_from_source="both" compile_from_source="both"
if [[ $OS_NAME == "macos" ]]; then if [[ $OS_NAME == "macos" ]]; then
......
...@@ -7,7 +7,7 @@ apt-get install --no-install-recommends -y \ ...@@ -7,7 +7,7 @@ apt-get install --no-install-recommends -y \
# installation of dependencies needs to happen before building the package, # installation of dependencies needs to happen before building the package,
# since `R CMD build` needs to install the package to build vignettes # since `R CMD build` needs to install the package to build vignettes
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'rhub', 'testthat'), dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1 Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'rhub', 'testthat'), dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
sh build-cran-package.sh || exit -1 sh build-cran-package.sh || exit -1
......
#!/bin/bash #!/bin/bash
RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1 RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
sh build-cran-package.sh \ sh build-cran-package.sh \
--r-executable=RDvalgrind \ --r-executable=RDvalgrind \
|| exit -1 || exit -1
......
...@@ -122,7 +122,7 @@ Start-Process -FilePath Rtools.exe -NoNewWindow -Wait -ArgumentList "/VERYSILENT ...@@ -122,7 +122,7 @@ Start-Process -FilePath Rtools.exe -NoNewWindow -Wait -ArgumentList "/VERYSILENT
Write-Output "Done installing Rtools" Write-Output "Done installing Rtools"
Write-Output "Installing dependencies" Write-Output "Installing dependencies"
$packages = "c('data.table', 'jsonlite', 'knitr', 'Matrix', 'processx', 'R6', 'rmarkdown', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')" $packages = "c('data.table', 'jsonlite', 'knitr', 'Matrix', 'processx', 'R6', 'RhpcBLASctl', 'rmarkdown', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')"
Run-R-Code-Redirect-Stderr "options(install.packages.check.source = 'no'); install.packages($packages, repos = '$env:CRAN_MIRROR', type = 'binary', lib = '$env:R_LIB_PATH', Ncpus = parallel::detectCores())" ; Check-Output $? Run-R-Code-Redirect-Stderr "options(install.packages.check.source = 'no'); install.packages($packages, repos = '$env:CRAN_MIRROR', type = 'binary', lib = '$env:R_LIB_PATH', Ncpus = parallel::detectCores())" ; Check-Output $?
# MiKTeX and pandoc can be skipped on non-MinGW builds, since we don't # MiKTeX and pandoc can be skipped on non-MinGW builds, since we don't
......
...@@ -188,7 +188,7 @@ jobs: ...@@ -188,7 +188,7 @@ jobs:
- name: Install packages - name: Install packages
shell: bash shell: bash
run: | run: |
RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
sh build-cran-package.sh --r-executable=RD${{ matrix.r_customization }} sh build-cran-package.sh --r-executable=RD${{ matrix.r_customization }}
RD${{ matrix.r_customization }} CMD INSTALL lightgbm_*.tar.gz || exit -1 RD${{ matrix.r_customization }} CMD INSTALL lightgbm_*.tar.gz || exit -1
- name: Run tests with sanitizers - name: Run tests with sanitizers
...@@ -219,7 +219,7 @@ jobs: ...@@ -219,7 +219,7 @@ jobs:
shell: bash shell: bash
run: | run: |
export PATH=/opt/R-devel/bin/:${PATH} export PATH=/opt/R-devel/bin/:${PATH}
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
sh build-cran-package.sh sh build-cran-package.sh
R CMD check --as-cran --run-donttest lightgbm_*.tar.gz || exit -1 R CMD check --as-cran --run-donttest lightgbm_*.tar.gz || exit -1
if grep -q -E "NOTE|WARNING|ERROR" lightgbm.Rcheck/00check.log; then if grep -q -E "NOTE|WARNING|ERROR" lightgbm.Rcheck/00check.log; then
......
...@@ -313,7 +313,7 @@ jobs: ...@@ -313,7 +313,7 @@ jobs:
R_LIB_PATH=~/Rlib R_LIB_PATH=~/Rlib
export R_LIBS=${R_LIB_PATH} export R_LIBS=${R_LIB_PATH}
mkdir -p ${R_LIB_PATH} mkdir -p ${R_LIB_PATH}
RDscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown'), lib = '${R_LIB_PATH}', dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1 RDscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown'), lib = '${R_LIB_PATH}', dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
sh build-cran-package.sh --r-executable=RD || exit -1 sh build-cran-package.sh --r-executable=RD || exit -1
mv lightgbm_${LGB_VER}.tar.gz $(Build.ArtifactStagingDirectory)/lightgbm-${LGB_VER}-r-cran.tar.gz mv lightgbm_${LGB_VER}.tar.gz $(Build.ArtifactStagingDirectory)/lightgbm-${LGB_VER}-r-cran.tar.gz
displayName: 'Build CRAN R-package' displayName: 'Build CRAN R-package'
......
...@@ -50,6 +50,7 @@ VignetteBuilder: knitr ...@@ -50,6 +50,7 @@ VignetteBuilder: knitr
Suggests: Suggests:
knitr, knitr,
processx, processx,
RhpcBLASctl,
rmarkdown, rmarkdown,
testthat testthat
Depends: Depends:
...@@ -61,6 +62,7 @@ Imports: ...@@ -61,6 +62,7 @@ Imports:
jsonlite (>= 1.0), jsonlite (>= 1.0),
Matrix (>= 1.1-0), Matrix (>= 1.1-0),
methods, methods,
parallel,
utils utils
SystemRequirements: SystemRequirements:
C++11 C++11
......
...@@ -52,6 +52,7 @@ importFrom(graphics,barplot) ...@@ -52,6 +52,7 @@ importFrom(graphics,barplot)
importFrom(graphics,par) importFrom(graphics,par)
importFrom(jsonlite,fromJSON) importFrom(jsonlite,fromJSON)
importFrom(methods,is) importFrom(methods,is)
importFrom(parallel,detectCores)
importFrom(stats,quantile) importFrom(stats,quantile)
importFrom(utils,modifyList) importFrom(utils,modifyList)
importFrom(utils,read.delim) importFrom(utils,read.delim)
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
#' model <- lightgbm( #' model <- lightgbm(
#' agaricus.train$data #' agaricus.train$data
#' , agaricus.train$label #' , agaricus.train$label
#' , params = list(objective = "binary", nthreads = 1L) #' , params = list(objective = "binary")
#' , nrounds = 5L #' , nrounds = 5L
#' , verbose = 0) #' , verbose = 0)
#' fname <- tempfile(fileext="rds") #' fname <- tempfile(fileext="rds")
......
...@@ -98,6 +98,22 @@ NULL ...@@ -98,6 +98,22 @@ NULL
#' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#objective}{ #' \href{https://lightgbm.readthedocs.io/en/latest/Parameters.html#objective}{
#' the "objective" item of the "Parameters" section of the documentation}. #' the "objective" item of the "Parameters" section of the documentation}.
#' @param init_score initial score is the base prediction lightgbm will boost from #' @param init_score initial score is the base prediction lightgbm will boost from
#' @param num_threads Number of parallel threads to use. For best speed, this should be set to the number of
#' physical cores in the CPU - in a typical x86-64 machine, this corresponds to half the
#' number of maximum threads.
#'
#' Be aware that using too many threads can result in speed degradation in smaller datasets
#' (see the parameters documentation for more details).
#'
#' If passing zero, will use the default number of threads configured for OpenMP
#' (typically controlled through an environment variable \code{OMP_NUM_THREADS}).
#'
#' If passing \code{NULL} (the default), will try to use the number of physical cores in the
#' system, but be aware that getting the number of cores detected correctly requires package
#' \code{RhpcBLASctl} to be installed.
#'
#' This parameter gets overridden by \code{num_threads} and its aliases under \code{params}
#' if passed there.
#' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example #' @param ... Additional arguments passed to \code{\link{lgb.train}}. For example
#' \itemize{ #' \itemize{
#' \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation} #' \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation}
...@@ -129,6 +145,7 @@ lightgbm <- function(data, ...@@ -129,6 +145,7 @@ lightgbm <- function(data,
serializable = TRUE, serializable = TRUE,
objective = "regression", objective = "regression",
init_score = NULL, init_score = NULL,
num_threads = NULL,
...) { ...) {
# validate inputs early to avoid unnecessary computation # validate inputs early to avoid unnecessary computation
...@@ -136,6 +153,15 @@ lightgbm <- function(data, ...@@ -136,6 +153,15 @@ lightgbm <- function(data,
stop("nrounds should be greater than zero") stop("nrounds should be greater than zero")
} }
if (is.null(num_threads)) {
num_threads <- lgb.get.default.num.threads()
}
params <- lgb.check.wrapper_param(
main_param_name = "num_threads"
, params = params
, alternative_kwarg_value = num_threads
)
# Set data to a temporary variable # Set data to a temporary variable
dtrain <- data dtrain <- data
......
...@@ -217,3 +217,26 @@ lgb.check.wrapper_param <- function(main_param_name, params, alternative_kwarg_v ...@@ -217,3 +217,26 @@ lgb.check.wrapper_param <- function(main_param_name, params, alternative_kwarg_v
params[[main_param_name]] <- alternative_kwarg_value params[[main_param_name]] <- alternative_kwarg_value
return(params) return(params)
} }
# [description]
#     Choose the number of threads to use when the caller did not supply one.
#
#     * If the optional package 'RhpcBLASctl' is installed, its
#       get_num_cores() result is returned as-is and no warning is raised.
#     * Otherwise a warning is raised and the function returns either the
#       value of parallel::detectCores(logical = FALSE) (never attempted on
#       Linux) or 0L when no usable count could be obtained.
#
# [returns]
#     A single number. 0L is the "could not detect" sentinel; the warning text
#     states that the default number of OpenMP threads will be used then.
#' @importFrom parallel detectCores
lgb.get.default.num.threads <- function() {
    if (requireNamespace("RhpcBLASctl", quietly = TRUE)) { # nolint
        return(RhpcBLASctl::get_num_cores())
    }

    msg <- "Optional package 'RhpcBLASctl' not found."
    cores <- 0L

    # Sys.info() returns NULL on platforms where it is not implemented.
    # Extracting with [[ and wrapping the comparison in isTRUE() treats that
    # case as "not Linux" instead of failing with "argument is of length zero".
    sysname <- Sys.info()[["sysname"]]
    on_linux <- isTRUE(sysname == "Linux")

    # NOTE(review): Linux is skipped, presumably because
    # detectCores(logical = FALSE) is not trusted there - confirm.
    if (!on_linux) {
        cores <- parallel::detectCores(logical = FALSE)
        # detectCores() yields NA when the count is unknown
        if (is.na(cores) || cores < 0L) {
            cores <- 0L
        }
    }

    if (cores == 0L) {
        msg <- paste(msg, "Will use default number of OpenMP threads.")
    } else {
        msg <- paste(msg, "Detection of CPU cores might not be accurate.")
    }
    warning(msg)
    return(cores)
}
...@@ -25,7 +25,7 @@ data("agaricus.train") ...@@ -25,7 +25,7 @@ data("agaricus.train")
model <- lightgbm( model <- lightgbm(
agaricus.train$data agaricus.train$data
, agaricus.train$label , agaricus.train$label
, params = list(objective = "binary", nthreads = 1L) , params = list(objective = "binary")
, nrounds = 5L , nrounds = 5L
, verbose = 0) , verbose = 0)
fname <- tempfile(fileext="rds") fname <- tempfile(fileext="rds")
......
...@@ -18,6 +18,7 @@ lightgbm( ...@@ -18,6 +18,7 @@ lightgbm(
serializable = TRUE, serializable = TRUE,
objective = "regression", objective = "regression",
init_score = NULL, init_score = NULL,
num_threads = NULL,
... ...
) )
} }
...@@ -60,6 +61,23 @@ the "objective" item of the "Parameters" section of the documentation}.} ...@@ -60,6 +61,23 @@ the "objective" item of the "Parameters" section of the documentation}.}
\item{init_score}{initial score is the base prediction lightgbm will boost from} \item{init_score}{initial score is the base prediction lightgbm will boost from}
\item{num_threads}{Number of parallel threads to use. For best speed, this should be set to the number of
physical cores in the CPU - in a typical x86-64 machine, this corresponds to half the
number of maximum threads.
Be aware that using too many threads can result in speed degradation in smaller datasets
(see the parameters documentation for more details).
If passing zero, will use the default number of threads configured for OpenMP
(typically controlled through an environment variable \code{OMP_NUM_THREADS}).
If passing \code{NULL} (the default), will try to use the number of physical cores in the
system, but be aware that getting the number of cores detected correctly requires package
\code{RhpcBLASctl} to be installed.
This parameter gets overridden by \code{num_threads} and its aliases under \code{params}
if passed there.}
\item{...}{Additional arguments passed to \code{\link{lgb.train}}. For example \item{...}{Additional arguments passed to \code{\link{lgb.train}}. For example
\itemize{ \itemize{
\item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation} \item{\code{valids}: a list of \code{lgb.Dataset} objects, used for validation}
......
...@@ -2928,6 +2928,51 @@ test_that("lightgbm() defaults to 'regression' objective if objective not otherw ...@@ -2928,6 +2928,51 @@ test_that("lightgbm() defaults to 'regression' objective if objective not otherw
expect_false(any(model_txt_lines == "objective=regression_l1")) expect_false(any(model_txt_lines == "objective=regression_l1"))
}) })
test_that("lightgbm() accepts 'num_threads' as either top-level argument or under params", {
    # Shared checks: the fitted booster must report exactly one thread, both in
    # its stored params and in the parameter dump of the serialized model.
    expect_one_thread <- function(bst) {
        expect_equal(bst$params$num_threads, 1L)
        model_txt_lines <- strsplit(
            x = bst$save_model_to_string()
            , split = "\n"
        )[[1L]]
        expect_true(any(grepl("\\[num_threads: 1\\]", model_txt_lines)))
    }

    # 'num_threads' given only as a top-level argument
    expect_one_thread(
        lightgbm(
            data = train$data
            , label = train$label
            , nrounds = 5L
            , verbose = VERBOSITY
            , num_threads = 1L
        )
    )

    # 'num_threads' given only inside 'params'
    expect_one_thread(
        lightgbm(
            data = train$data
            , label = train$label
            , nrounds = 5L
            , verbose = VERBOSITY
            , params = list(num_threads = 1L)
        )
    )

    # given in both places: the value under 'params' is the one expected to win
    expect_one_thread(
        lightgbm(
            data = train$data
            , label = train$label
            , nrounds = 5L
            , verbose = VERBOSITY
            , num_threads = 10L
            , params = list(num_threads = 1L)
        )
    )
})
test_that("lightgbm() accepts 'weight' and 'weights'", { test_that("lightgbm() accepts 'weight' and 'weights'", {
data(mtcars) data(mtcars)
X <- as.matrix(mtcars[, -1L]) X <- as.matrix(mtcars[, -1L])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment