Unverified Commit 784f3841 authored by Oliver Borchert Committed by GitHub
Browse files

[ci] Introduce `typos` pre-commit hook (#6564)


Co-authored-by: Nikita Titov <nekit94-08@mail.ru>
parent 27b00d74
......@@ -171,7 +171,7 @@ Write-Output "Done installing Rtools"
Write-Output "Installing CMake"
Add-Type -AssemblyName System.IO.Compression.FileSystem
[System.IO.Compression.ZipFile]::ExtractToDirectory("$env:CMAKE_PATH/cmake.zip", "$env:CMAKE_PATH") ; Assert-Output $?
# Remove old CMake shiped with RTools
# Remove old CMake shipped with RTools
Remove-Item "$env:RTOOLS_MINGW_BIN/cmake.exe" -Force -ErrorAction Ignore
Write-Output "Done installing CMake"
......
......@@ -39,7 +39,7 @@ jobs:
This pull request has been automatically locked since there has not been any recent activity since it was closed.
To start a new related discussion, open a new issue at https://github.com/microsoft/LightGBM/issues
including a reference to this.
# what shoulld the locking status be?
# what should the locking status be?
issue-lock-reason: 'resolved'
pr-lock-reason: 'resolved'
process-only: 'issues, prs'
......@@ -39,3 +39,9 @@ repos:
rev: v0.10.0.1
hooks:
- id: shellcheck
- repo: https://github.com/crate-ci/typos
rev: v1.23.2
hooks:
- id: typos
args: ["--force-exclude"]
exclude: (\.gitignore$)|(^\.editorconfig$)
default.extend-ignore-re = [
"/Ot",
"mis-alignment",
"mis-spelled",
"posix-seh-rt",
]
[default.extend-words]
MAPE = "MAPE"
datas = "datas"
interprete = "interprete"
mape = "mape"
splitted = "splitted"
[default.extend-identifiers]
ERRORs = "ERRORs"
GAM = "GAM"
ND24s = "ND24s"
WARNINGs = "WARNINGs"
fullset = "fullset"
thess = "thess"
......@@ -5,7 +5,7 @@ option(USE_SWIG "Enable SWIG to generate Java API" OFF)
option(USE_TIMETAG "Set to ON to output time costs" OFF)
option(USE_CUDA "Enable CUDA-accelerated training " OFF)
option(USE_DEBUG "Set to ON for Debug mode" OFF)
option(USE_SANITIZER "Use santizer flags" OFF)
option(USE_SANITIZER "Use sanitizer flags" OFF)
set(
ENABLED_SANITIZERS
"address" "leak" "undefined"
......
......@@ -1114,7 +1114,7 @@ predict.lgb.Booster <- function(object,
#'
#' Requesting a different prediction type or passing parameters to \link{predict.lgb.Booster}
#' will cause it to ignore the fast-predict configuration and take the slow route instead
#' (but be aware that an existing configuration might not always be overriden by supplying
#' (but be aware that an existing configuration might not always be overridden by supplying
#' different parameters or prediction type, so make sure to check that the output is what
#' was expected when a prediction is to be made on a single row for something different than
#' what is configured).
......@@ -1128,7 +1128,7 @@ predict.lgb.Booster <- function(object,
#' and as such, this function will produce an error if passing \code{csr=TRUE} and
#' \code{type = "contrib"} together.
#' @inheritParams lgb_predict_shared_params
#' @param model LighGBM model object (class \code{lgb.Booster}).
#' @param model LightGBM model object (class \code{lgb.Booster}).
#'
#' \bold{The object will be modified in-place}.
#' @param csr Whether the prediction function is going to be called on sparse CSR inputs.
......
......@@ -9,7 +9,7 @@
#' \item{\code{Feature}: Feature names in the model.}
#' \item{\code{Gain}: The total gain of this feature's splits.}
#' \item{\code{Cover}: The number of observation related to this feature.}
#' \item{\code{Frequency}: The number of times a feature splited in trees.}
#' \item{\code{Frequency}: The number of times a feature split in trees.}
#' }
#'
#' @examples
......
......@@ -10,7 +10,7 @@
#' \emph{New in version 4.4.0}
#'
#' @return
#' A \code{data.table} with detailed information about model trees' nodes and leafs.
#' A \code{data.table} with detailed information about model trees' nodes and leaves.
#'
#' The columns of the \code{data.table} are:
#'
......
......@@ -139,7 +139,7 @@ NULL
#' system, but be aware that getting the number of cores detected correctly requires package
#' \code{RhpcBLASctl} to be installed.
#'
#' This parameter gets overriden by \code{num_threads} and its aliases under \code{params}
#' This parameter gets overridden by \code{num_threads} and its aliases under \code{params}
#' if passed there.
#'
#' \emph{New in version 4.0.0}
......
......@@ -51,7 +51,7 @@ logregobj <- function(preds, dtrain) {
# User-defined evaluation function returns a pair (metric_name, result, higher_better)
# NOTE: when you do customized loss function, the default prediction value is margin
# This may make built-in evalution metric calculate wrong results
# This may make built-in evaluation metric calculate wrong results
# For example, we are doing logistic loss, the prediction is score before logistic transformation
# Keep this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror <- function(preds, dtrain) {
......
......@@ -29,7 +29,7 @@ logregobj <- function(preds, dtrain) {
# User-defined evaluation function returns a pair (metric_name, result, higher_better)
# NOTE: when you do customized loss function, the default prediction value is margin
# This may make built-in evalution metric calculate wrong results
# This may make built-in evaluation metric calculate wrong results
# For example, we are doing logistic loss, the prediction is score before logistic transformation
# The built-in evaluation error assumes input is after logistic transformation
# Keep this in mind when you use the customization, and maybe you need write customized evaluation function
......
......@@ -14,7 +14,7 @@ lgb.configure_fast_predict(
)
}
\arguments{
\item{model}{LighGBM model object (class \code{lgb.Booster}).
\item{model}{LightGBM model object (class \code{lgb.Booster}).
\bold{The object will be modified in-place}.}
......@@ -98,7 +98,7 @@ Calling this function multiple times with different parameters might not overrid
Requesting a different prediction type or passing parameters to \link{predict.lgb.Booster}
will cause it to ignore the fast-predict configuration and take the slow route instead
(but be aware that an existing configuration might not always be overriden by supplying
(but be aware that an existing configuration might not always be overridden by supplying
different parameters or prediction type, so make sure to check that the output is what
was expected when a prediction is to be made on a single row for something different than
what is configured).
......
......@@ -17,7 +17,7 @@ For a tree model, a \code{data.table} with the following columns:
\item{\code{Feature}: Feature names in the model.}
\item{\code{Gain}: The total gain of this feature's splits.}
\item{\code{Cover}: The number of observation related to this feature.}
\item{\code{Frequency}: The number of times a feature splited in trees.}
\item{\code{Frequency}: The number of times a feature split in trees.}
}
}
\description{
......
......@@ -18,7 +18,7 @@ lgb.model.dt.tree(model, num_iteration = NULL, start_iteration = 1L)
\emph{New in version 4.4.0}}
}
\value{
A \code{data.table} with detailed information about model trees' nodes and leafs.
A \code{data.table} with detailed information about model trees' nodes and leaves.
The columns of the \code{data.table} are:
......
......@@ -93,7 +93,7 @@ set to the iteration number of the best iteration.}
system, but be aware that getting the number of cores detected correctly requires package
\code{RhpcBLASctl} to be installed.
This parameter gets overriden by \code{num_threads} and its aliases under \code{params}
This parameter gets overridden by \code{num_threads} and its aliases under \code{params}
if passed there.
\emph{New in version 4.0.0}}
......
......@@ -9,7 +9,7 @@ set.seed(708L)
# to an accumulator then returns the current value.
# This is used to mock the situation where an evaluation
# metric increases every iteration
ACCUMULATOR_NAME <- "INCREASING_METRIC_ACUMULATOR"
ACCUMULATOR_NAME <- "INCREASING_METRIC_ACCUMULATOR"
assign(x = ACCUMULATOR_NAME, value = 0.0, envir = .GlobalEnv)
.increasing_metric <- function(preds, dtrain) {
......@@ -1777,7 +1777,7 @@ test_that("lgb.train() works with early stopping for regression with a metric th
, early_stopping_rounds + 1L
)
# Booster should understand thatt all three of these metrics should be minimized
# Booster should understand that all three of these metrics should be minimized
eval_info <- bst$.__enclos_env__$private$get_eval_info()
expect_identical(eval_info, c("mape", "rmse", "l1"))
expect_identical(
......
......@@ -14,7 +14,7 @@ logregobj <- function(preds, dtrain) {
# User-defined evaluation function returns a pair (metric_name, result, higher_better)
# NOTE: when you do customized loss function, the default prediction value is margin
# This may make built-in evalution metric calculate wrong results
# This may make built-in evaluation metric calculate wrong results
# Keep this in mind when you use the customization, and maybe you need write customized evaluation function
evalerror <- function(preds, dtrain) {
labels <- get_field(dtrain, "label")
......
......@@ -5,7 +5,7 @@
log(x / (1.0 - x))
}
test_that("lgb.intereprete works as expected for binary classification", {
test_that("lgb.interprete works as expected for binary classification", {
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
......
......@@ -5,7 +5,7 @@
log(x / (1.0 - x))
}
test_that("lgb.plot.interepretation works as expected for binary classification", {
test_that("lgb.plot.interpretation works as expected for binary classification", {
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)
......@@ -57,7 +57,7 @@ test_that("lgb.plot.interepretation works as expected for binary classification"
expect_null(plot_res)
})
test_that("lgb.plot.interepretation works as expected for multiclass classification", {
test_that("lgb.plot.interpretation works as expected for multiclass classification", {
data(iris)
# We must convert factors to numeric
......
......@@ -18,7 +18,7 @@ macro(enable_sanitizer sanitizer)
set(SAN_COMPILE_FLAGS "${SAN_COMPILE_FLAGS} -fsanitize=undefined -fno-sanitize-recover=undefined")
else()
message(FATAL_ERROR "Santizer ${sanitizer} not supported.")
message(FATAL_ERROR "Sanitizer ${sanitizer} not supported.")
endif()
endmacro()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment