Unverified Commit 5fa887bb authored by James Lamb's avatar James Lamb Committed by GitHub
Browse files

[R-package] [docs] add intro vignette (#3946) (#4775)



* [R-package] [docs] add intro vignette (#3946)

* add 10 test vignettes

* Revert "add 10 test vignettes"

This reverts commit 40fb2e2f1982402798776ee44e4ec82fc4644d3d.

* Apply suggestions from code review
Co-authored-by: default avatarNikita Titov <nekit94-08@mail.ru>
Co-authored-by: default avatarMichael Mayer <mayermichael79@gmail.com>
Co-authored-by: default avatarNikita Titov <nekit94-08@mail.ru>
parent 06e3c4a9
......@@ -8,7 +8,7 @@ SOURCE_DIR <- args[[1L]]
FILES_TO_LINT <- list.files(
path = SOURCE_DIR
, pattern = "\\.r$"
, pattern = "\\.r$|\\.rmd$"
, all.files = TRUE
, ignore.case = TRUE
, full.names = TRUE
......
......@@ -92,13 +92,13 @@ if [[ $OS_NAME == "macos" ]]; then
fi
fi
# Manually install Depends and Imports libraries + 'testthat'
# Manually install Depends and Imports libraries + 'knitr', 'rmarkdown', 'testthat'
# to avoid a CI-time dependency on devtools (for devtools::install_deps())
# NOTE: testthat is not required when running rchk
if [[ "${TASK}" == "r-rchk" ]]; then
packages="c('data.table', 'jsonlite', 'Matrix', 'R6')"
packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'rmarkdown')"
else
packages="c('data.table', 'jsonlite', 'Matrix', 'R6', 'testthat')"
packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'rmarkdown', 'testthat')"
fi
compile_from_source="both"
if [[ $OS_NAME == "macos" ]]; then
......
......@@ -5,7 +5,9 @@ apt-get install --no-install-recommends -y \
libxml2-dev \
libssl-dev
Rscript -e "install.packages('rhub', dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
# installation of dependencies needs to happen before building the package,
# since `R CMD build` needs to install the package to build vignettes
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'rhub', 'testthat'), dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
sh build-cran-package.sh || exit -1
......
#!/bin/bash
RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'Matrix', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
sh build-cran-package.sh \
--r-executable=RDvalgrind \
|| exit -1
......
......@@ -122,7 +122,7 @@ Start-Process -FilePath Rtools.exe -NoNewWindow -Wait -ArgumentList "/VERYSILENT
Write-Output "Done installing Rtools"
Write-Output "Installing dependencies"
$packages = "c('data.table', 'jsonlite', 'Matrix', 'processx', 'R6', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')"
$packages = "c('data.table', 'jsonlite', 'knitr', 'Matrix', 'processx', 'R6', 'rmarkdown', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')"
Run-R-Code-Redirect-Stderr "options(install.packages.check.source = 'no'); install.packages($packages, repos = '$env:CRAN_MIRROR', type = 'binary', lib = '$env:R_LIB_PATH', Ncpus = parallel::detectCores())" ; Check-Output $?
# MiKTeX and pandoc can be skipped on non-MinGW builds, since we don't
......@@ -165,7 +165,15 @@ if ($env:COMPILER -ne "MSVC") {
}
Run-R-Code-Redirect-Stderr "commandArgs <- function(...){$env:BUILD_R_FLAGS}; source('build_r.R')"; Check-Output $?
} elseif ($env:R_BUILD_TYPE -eq "cran") {
# NOTE: gzip and tar are needed to create a CRAN package on Windows, but
# some flavors of tar.exe can fail in some settings on Windows.
# Putting the msys64 utilities at the beginning of PATH temporarily to be
# sure they're used for that purpose.
if ($env:R_MAJOR_VERSION -eq "3") {
$env:PATH = "C:\msys64\usr\bin;" + $env:PATH
}
Run-R-Code-Redirect-Stderr "result <- processx::run(command = 'sh', args = 'build-cran-package.sh', echo = TRUE, windows_verbatim_args = FALSE, error_on_status = TRUE)" ; Check-Output $?
Remove-From-Path ".*msys64.*"
# Test CRAN source .tar.gz in a directory that is not this repo or below it.
# When people install.packages('lightgbm'), they won't have the LightGBM
# git repo around. This is to protect against the use of relative paths
......
......@@ -194,7 +194,7 @@ jobs:
- name: Install packages
shell: bash
run: |
RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'Matrix', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
sh build-cran-package.sh --r-executable=RD${{ matrix.r_customization }}
RD${{ matrix.r_customization }} CMD INSTALL lightgbm_*.tar.gz || exit -1
- name: Run tests with sanitizers
......@@ -225,7 +225,7 @@ jobs:
shell: bash
run: |
export PATH=/opt/R-devel/bin/:${PATH}
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'Matrix', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
sh build-cran-package.sh
R CMD check --as-cran --run-donttest lightgbm_*.tar.gz || exit -1
if grep -q -E "NOTE|WARNING|ERROR" lightgbm.Rcheck/00check.log; then
......
......@@ -57,7 +57,7 @@ jobs:
- name: Install packages
shell: bash
run: |
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'Matrix', 'roxygen2', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown', 'roxygen2', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())"
sh build-cran-package.sh || exit -1
R CMD INSTALL --with-keep.source lightgbm_*.tar.gz || exit -1
- name: Test documentation
......
......@@ -19,7 +19,7 @@ resources:
image: 'ubuntu:latest'
options: "--name ci-container -v /usr/bin/docker:/tmp/docker:ro"
- container: rbase
image: rocker/r-base
image: wch1/r-debug
jobs:
###########################################
- job: Linux
......@@ -300,6 +300,7 @@ jobs:
steps:
- script: |
LGB_VER=$(head -n 1 VERSION.txt | sed "s/rc/-/g")
Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'rmarkdown'), dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" || exit -1
sh build-cran-package.sh || exit -1
mv lightgbm_${LGB_VER}.tar.gz $(Build.ArtifactStagingDirectory)/lightgbm-${LGB_VER}-r-cran.tar.gz
displayName: 'Build CRAN R-package'
......
......@@ -45,8 +45,11 @@ URL: https://github.com/Microsoft/LightGBM
BugReports: https://github.com/Microsoft/LightGBM/issues
NeedsCompilation: yes
Biarch: true
VignetteBuilder: knitr
Suggests:
knitr,
processx,
rmarkdown,
testthat
Depends:
R (>= 3.5),
......
......@@ -152,7 +152,8 @@ Rscript build_r.R
The `build_r.R` script builds the package in a temporary directory called `lightgbm_r`. It will destroy and recreate that directory each time you run the script. That script supports the following command-line options:
- `-j[jobs]`: number of threads to use when compiling LightGBM. E.g., `-j4` will try to compile 4 objects at a time.
- `--no-build-vignettes`: Skip building vignettes.
- `-j[jobs]`: Number of threads to use when compiling LightGBM. E.g., `-j4` will try to compile 4 objects at a time.
- by default, this script uses single-thread compilation
- for best results, set `-j` to the number of physical CPUs
- `--skip-install`: Build the package tarball, but do not install it.
......@@ -269,6 +270,11 @@ sh build-cran-package.sh
This will create a file `lightgbm_${VERSION}.tar.gz`, where `VERSION` is the version of `LightGBM`.
That script supports the following command-line options:
- `--no-build-vignettes`: Skip building vignettes.
- `--r-executable=[path-to-executable]`: Use an alternative build of R.
Also, a CRAN package is generated for every commit to any branch of the repo and can be found in the "Artifacts" section of the associated Azure Pipelines run.
### Standard Installation from CRAN Package
......
......@@ -41,6 +41,8 @@ navbar:
href: ../
- icon: fa-home fa-lg
href: index.html
- text: Articles
href: articles/index.html
- text: Reference
href: reference/index.html
right:
......
---
title:
"Basic Walkthrough"
description: >
This vignette describes how to train a LightGBM model for binary classification.
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{Basic Walkthrough}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE
, comment = "#>"
, warning = FALSE
, message = FALSE
)
```
## Introduction
Welcome to the world of [LightGBM](https://lightgbm.readthedocs.io/en/latest/), a highly efficient gradient boosting implementation (Ke et al. 2017).
```{r setup}
library(lightgbm)
```
This vignette will guide you through its basic usage. It will show how to build a simple binary classification model based on a subset of the `bank` dataset (Moro, Cortez, and Rita 2014). You will use the two input features "age" and "balance" to predict whether a client has subscribed to a term deposit.
## The dataset
The dataset looks as follows.
```{r}
data(bank, package = "lightgbm")
bank[1L:5L, c("y", "age", "balance")]
# Distribution of the response
table(bank$y)
```
## Training the model
The R package of LightGBM offers two functions to train a model:
- `lgb.train()`: This is the main training logic. It offers full flexibility but requires a `Dataset` object created by the `lgb.Dataset()` function.
- `lightgbm()`: Simpler, but less flexible. Data can be passed without having to bother with `lgb.Dataset()`.
### Using the `lightgbm()` function
As a first step, you need to convert the data to numeric. Afterwards, you are ready to fit the model with the `lightgbm()` function.
```{r}
# Numeric response and feature matrix
# (y is 1 where the response is "yes" and 0 otherwise;
#  X holds the two features "age" and "balance" as a numeric matrix)
y <- as.numeric(bank$y == "yes")
X <- data.matrix(bank[, c("age", "balance")])
# Train
# (10 boosting rounds with the "binary" objective and at most 4 leaves per
#  tree; verbose = -1L is intended to silence LightGBM's training log)
fit <- lightgbm(
data = X
, label = y
, num_leaves = 4L
, learning_rate = 1.0
, nrounds = 10L
, objective = "binary"
, verbose = -1L
)
# Result
# (distribution of the predictions on the training data itself)
summary(predict(fit, X))
```
It seems to have worked! And the predictions are indeed probabilities between 0 and 1.
### Using the `lgb.train()` function
Alternatively, you can go for the more flexible interface `lgb.train()`. Here, as an additional step, you need to wrap `y` and `X` in a `Dataset` object using LightGBM's data API, `lgb.Dataset()`. Parameters are passed to `lgb.train()` as a named list.
```{r}
# Data interface
# (bundle the feature matrix X and the labels y into one Dataset object)
dtrain <- lgb.Dataset(X, label = y)
# Parameters
# (model settings are collected in a named list instead of being passed
#  as individual arguments)
params <- list(
objective = "binary"
, num_leaves = 4L
, learning_rate = 1.0
)
# Train
# (the parameter list is passed as the first, positional argument;
#  verbose = -1L is intended to silence LightGBM's training log)
fit <- lgb.train(
params
, data = dtrain
, nrounds = 10L
, verbose = -1L
)
```
Try it out! If you get stuck, visit LightGBM's [documentation](https://lightgbm.readthedocs.io/en/latest/R/index.html) for more details.
```{r, echo = FALSE, results = "hide"}
# Cleanup
if (file.exists("lightgbm.model")) {
file.remove("lightgbm.model")
}
```
## References
Ke, Guolin, Qi Meng, Thomas Finley, Taifeng Wang, Wei Chen, Weidong Ma, Qiwei Ye, and Tie-Yan Liu. 2017. "LightGBM: A Highly Efficient Gradient Boosting Decision Tree." In Advances in Neural Information Processing Systems 30 (NIPS 2017).
Moro, Sérgio, Paulo Cortez, and Paulo Rita. 2014. "A Data-Driven Approach to Predict the Success of Bank Telemarketing." Decision Support Systems 62: 22–31.
......@@ -11,6 +11,11 @@
# non-standard builds of R, such as those provided in
# https://github.com/wch/r-debug.
#
# --no-build-vignettes Pass this flag to skip creating vignettes.
# You might want to do this to avoid installing
# vignette-only dependencies, or to avoid
# portability issues.
#
# [usage]
#
# # default usage
......@@ -18,9 +23,14 @@
#
# # custom R build
# sh build-cran-package.sh --r-executable=RDvalgrind
#
# # skip vignette building
# sh build-cran-package.sh --no-build-vignettes
set -e
# Default values of arguments
BUILD_VIGNETTES=true
LGB_R_EXECUTABLE=R
while [ $# -gt 0 ]; do
......@@ -28,6 +38,9 @@ while [ $# -gt 0 ]; do
--r-executable=*)
LGB_R_EXECUTABLE="${1#*=}"
;;
# Boolean flag that takes no value. The trailing glob accepts the documented
# bare form `--no-build-vignettes` as well as a `--no-build-vignettes=...`
# form, so neither falls through to the "invalid argument" branch.
--no-build-vignettes*)
BUILD_VIGNETTES=false
;;
*)
echo "invalid argument '${1}'"
exit -1
......@@ -57,6 +70,10 @@ cp -R R-package/* "${TEMP_R_DIR}"
cp -R include "${TEMP_R_DIR}/src/"
cp -R src/* "${TEMP_R_DIR}/src/"
if ${BUILD_VIGNETTES} ; then
cp docs/logo/LightGBM_logo_black_text.svg "${TEMP_R_DIR}/vignettes/"
fi
cp \
external_libs/fast_double_parser/include/fast_double_parser.h \
"${TEMP_R_DIR}/src/include/LightGBM"
......@@ -169,8 +186,46 @@ cd "${TEMP_R_DIR}"
cd "${ORIG_WD}"
"${LGB_R_EXECUTABLE}" CMD build \
--keep-empty-dirs \
lightgbm_r
if ${BUILD_VIGNETTES} ; then
"${LGB_R_EXECUTABLE}" CMD build \
--keep-empty-dirs \
lightgbm_r
echo "removing object files created by vignettes"
rm -rf ./_tmp
mkdir _tmp
TARBALL_NAME="lightgbm_${LGB_VERSION}.tar.gz"
mv "${TARBALL_NAME}" _tmp/
echo "untarring ${TARBALL_NAME}"
cd _tmp
tar -xvf "${TARBALL_NAME}" > /dev/null 2>&1
rm -rf "${TARBALL_NAME}"
cd ..
echo "done untarring ${TARBALL_NAME}"
echo "re-tarring ${TARBALL_NAME}"
tar \
-czv \
-C ./_tmp \
--exclude=*.a \
--exclude=*.dll \
--exclude=*.o \
--exclude=*.so \
--exclude=*.tar.gz \
--exclude=**/conftest.c \
--exclude=**/conftest.exe \
-f "${TARBALL_NAME}" \
lightgbm \
> /dev/null 2>&1
echo "Done creating ${TARBALL_NAME}"
rm -rf ./_tmp
else
"${LGB_R_EXECUTABLE}" CMD build \
--keep-empty-dirs \
--no-build-vignettes \
lightgbm_r
fi
echo "Done building R package"
......@@ -39,6 +39,7 @@ TEMP_SOURCE_DIR <- file.path(TEMP_R_DIR, "src")
}
parsed_args <- .parse_args(args)
SKIP_VIGNETTES <- "--no-build-vignettes" %in% parsed_args[["flags"]]
USING_GPU <- "--use-gpu" %in% parsed_args[["flags"]]
USING_MINGW <- "--use-mingw" %in% parsed_args[["flags"]]
USING_MSYS2 <- "--use-msys2" %in% parsed_args[["flags"]]
......@@ -54,7 +55,8 @@ ARGS_TO_DEFINES <- c(
)
recognized_args <- c(
"--skip-install"
"--no-build-vignettes"
, "--skip-install"
, "--use-gpu"
, "--use-mingw"
, "--use-msys2"
......@@ -424,7 +426,11 @@ writeLines(namespace_contents, NAMESPACE_FILE)
# NOTE: --keep-empty-dirs is necessary to keep the deep paths expected
# by CMake while also meeting the CRAN req to create object files
# on demand
.run_shell_command("R", c("CMD", "build", TEMP_R_DIR, "--keep-empty-dirs"))
r_build_args <- c("CMD", "build", TEMP_R_DIR, "--keep-empty-dirs")
if (isTRUE(SKIP_VIGNETTES)) {
r_build_args <- c(r_build_args, "--no-build-vignettes")
}
.run_shell_command("R", r_build_args)
# Install the package
version <- gsub(
......
......@@ -273,8 +273,10 @@ def generate_r_docs(app: Sphinx) -> None:
r-base=4.1.0=hb67fd72_2 \
r-data.table=1.14.0=r41hcfec24a_0 \
r-jsonlite=1.7.2=r41hcfec24a_0 \
r-knitr=1.35=r41hc72bb7e_0 \
r-matrix=1.3_4=r41he454529_0 \
r-pkgdown=1.6.1=r41hc72bb7e_0 \
r-rmarkdown=2.11=r41hc72bb7e_0 \
r-roxygen2=7.1.1=r41h03ef668_0
source /home/docs/.conda/bin/activate r_env
export TAR=/bin/tar
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment