Enable crate-ci/typos lint; fix typos (#1005)

Co-authored-by: Titus von Koeller <titus@vonkoeller.com> fix erroneous correction

Enable crate-ci/typos lint; fix typos (#1005)
Co-authored-by: Titus von Koeller <titus@vonkoeller.com> fix erroneous correction
8c507d92 · Aarni Koskela · GitHub · 8a14c638 · 8c507d92 · 8c507d92
Unverified Commit 8c507d92 authored Feb 05, 2024 by Aarni Koskela Committed by GitHub Feb 05, 2024
14 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,3 +17,7 @@ repos:
      - id: mixed-line-ending
        args:
          - --fix=lf
+  - repo: https://github.com/crate-ci/typos
+    rev: v1.17.2
+    hooks:
+      - id: typos
--- a/_typos.toml
+++ b/_typos.toml
+[files]
+[default.extend-identifiers]
+[type.py.extend-words]
+"BA" = "BA"  # used as a commented-out variable in tests
+[type.cuda.extend-words]
+"subtile" = "subtile"
+"subtiles" = "subtiles"
+"transation" = "transation"  # TODO: is this transition, transaction, translation..?
--- a/benchmarking/switchback/make_plot_with_jsonl.py
+++ b/benchmarking/switchback/make_plot_with_jsonl.py
@@ -36,8 +36,8 @@ if __name__ == '__main__':
        ('x_quantize_rowwise', 'P', '--', 'C4', 'Quantize rowwise X (switchback)'),
        ('g_quantize_rowwise', 'P', '-.', 'C4', 'Quantize rowwise G (switchback)'),
-        ('w_quantize_global', '.', '--', 'C4', 'Quatnize global W (switchback)'),
+        ('w_quantize_global', '.', '--', 'C4', 'Quantize global W (switchback)'),
-        ('w_quantize_global_transpose', '.', '-.', 'C4', 'Quantize gloabl and\ntranspose W (switchback)'),
+        ('w_quantize_global_transpose', '.', '-.', 'C4', 'Quantize global and\ntranspose W (switchback)'),
    ]:
        xs = []
        ys = []

--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -4,7 +4,7 @@ extract factors the build is dependent on:
    [ ] TODO: Q - What if we have multiple GPUs of different makes?
 - CUDA version
 - Software:
-    - CPU-only: only CPU quantization functions (no optimizer, no matrix multipl)
+    - CPU-only: only CPU quantization functions (no optimizer, no matrix multiply)
    - CuBLAS-LT: full-build 8-bit optimizer
    - no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`)
@@ -263,7 +263,7 @@ def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None:
        warning_msg = (
            f"Found duplicate {CUDA_RUNTIME_LIBS} files: {results_paths}.. "
            "We select the PyTorch default libcudart.so, which is {torch.version.cuda},"
-            "but this might missmatch with the CUDA version that is needed for bitsandbytes."
+            "but this might mismatch with the CUDA version that is needed for bitsandbytes."
            "To override this behavior set the BNB_CUDA_VERSION=<version string, e.g. 122> environmental variable"
            "For example, if you want to use the CUDA version 122"
            "BNB_CUDA_VERSION=122 python ..."

--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -120,7 +120,7 @@ class GlobalPageManager:
        return cls._instance
    def prefetch_all(self, to_cpu=False):
-        # assume the first added, will be hte
+        # assume the first added, will be the
        # ones that are used first, so swap them in last
        # in the case they are evicted again
        for t in self.paged_tensors[::-1]:
@@ -219,7 +219,7 @@ def elementwise_func(func_name, A, B, value, prefetch=True):
        # paged function are fully asynchronous
        # if we return from this function, we want to the tensor
        # to be in the correct state, that is the final state after the
-        # operation occured. So we synchronize.
+        # operation occurred. So we synchronize.
        torch.cuda.synchronize()
 def fill(A, value, device=None, prefetch=True): elementwise_func('fill', A, None, value)
@@ -589,7 +589,7 @@ def estimate_quantiles(A: Tensor, out: Optional[torch.Tensor] = None, offset: fl
 class QuantState:
-    """container for quantization state components to work with Params4bit and similar clases"""
+    """container for quantization state components to work with Params4bit and similar classes"""
    valid_quant_types = ('fp4', 'nf4')
    valid_qs_type_keys = [f"bitsandbytes__{x}" for x in valid_quant_types]
    valid_qs_keys = ['absmax', 'quant_map', 'nested_absmax', 'nested_quant_map', 'quant_state', 'quant_type',

--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -134,10 +134,10 @@ __device__ unsigned char dQuantizeFP4(float x)
  // we do a binary search
  // the pivots are divided by 12 (the FP4 absmax)
-  // since we assum input data is in [-1.0, 1.0]
+  // since we assume input data is in [-1.0, 1.0]
  // !be careful here, its easy to make a mistake
-  // that is difficult to noice if you add an extra
+  // that is difficult to notice if you add an extra
  // zero somewhere!
  int sign = x < 0 ? 0b1000 : 0b0000;
@@ -2259,8 +2259,8 @@ template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_
  }
  // 4. store data via atomicMax
-  // to store col data efficienctly we need to rewrite the smem blocked data [0, 1, 2, 3...] for t0
+  // to store col data efficiently we need to rewrite the smem blocked data [0, 1, 2, 3...] for t0
-  // into a striped arangement: [0, 8, 16, 24, ..] for t0
+  // into a striped arrangement: [0, 8, 16, 24, ..] for t0
  __syncthreads();
  BlockExchange(temp_storage.exchange).BlockedToStriped(local_col_absmax_values);
@@ -2310,7 +2310,7 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
  // data is in 32 column-tile major with tile width 32 columns and numRows rows
  // L1. Load sub-tile row/col statistics. Each thread only holds 1 col, load rows into shared memory.
-  // L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
+  // L2. Load data in warp-striped arrangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
  // C1. Compute val(row_stat*col_stat)/(127*127) (load 1/(127*127 into register))
  // C2. Compute normalization values and store col values in register
  // S1. Store C1 into 16-bit output
@@ -2383,7 +2383,7 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
    if(valid_items <= 0) // the sub-tile might have more elements than the tile itself
      break;
-    // L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
+    // L2. Load data in warp-striped arrangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
    LoadInt32(loadint32).Load(&(A[subtile_idx]), local_values, valid_items, 0);
    ExchangeInt32(exchangeint32).BlockedToWarpStriped(local_values, local_values);
@@ -2650,7 +2650,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
                  // row1 [col0 col1 ... col31]
                  // ...
                  //
-                  // As such we read consequtive entries with 256 threads (8rows x 32 columns)
+                  // As such we read consecutive entries with 256 threads (8rows x 32 columns)
                  // as j increase, the row increase by a factor of 8
                  // We load 8 rows per subrow loop, and subrow increase by 8 per loop
                  // so we have an offset of 8 rows every loop or (subrow/warps)*8 = (subrow/8)*8
@@ -2747,7 +2747,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
                    // each of these has 32 values in total for 32*4 = 128 as offset if odd
                    // every set of 4 columns increases the total offset by 16
                    // each even row increase the offset by 4, for example row 2 is offset by 4, 4 by 6 etc so: subrow/2*4 = subrow*2
-                    // this happends every 8 rows anew (subrow % 8)
+                    // this happens every 8 rows anew (subrow % 8)
                    // one writes 4 columns at once that is (col % 4) for the particular index in the subtile
                    int subcol = warp_lane;
@@ -3073,7 +3073,7 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
 //// 4. do dequantization from register of B into second pair of registers
 //// 5. store (4) into fragment
 //// 6. matmul aggregate into fragment C
-//// 7. aggreecate files of C into shared memroy block C
+//// 7. aggregate files of C into shared memory block C
 //// 8. sum (7)
 //// 9. write outputs to matmul output matrix
 //}

--- a/deploy.sh
+++ b/deploy.sh
@@ -5,7 +5,7 @@ echo "MAKE SURE LD_LIBRARY_PATH IS EMPTY!"
 echo $LD_LIBRARY_PATH
 if [[ ! -z "${LD_LIBRARY_PATH}" ]]; then
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -24,7 +24,7 @@ make cpuonly CUDA_VERSION="CPU"
 if [ ! -f "./bitsandbytes/libbitsandbytes_cpu.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -34,7 +34,7 @@ make cuda110 CUDA_VERSION=110
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -44,7 +44,7 @@ make cuda11x CUDA_VERSION=111
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -54,7 +54,7 @@ make cuda11x CUDA_VERSION=114
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -64,7 +64,7 @@ make cuda11x CUDA_VERSION=115
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -74,7 +74,7 @@ make cuda11x CUDA_VERSION=117
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -84,7 +84,7 @@ make cuda118 CUDA_VERSION=118
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda118.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -94,7 +94,7 @@ make cuda12x CUDA_VERSION=120
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda120.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -104,7 +104,7 @@ make cuda12x CUDA_VERSION=121
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -114,7 +114,7 @@ make cuda12x CUDA_VERSION=122
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -124,7 +124,7 @@ make cuda12x CUDA_VERSION=123
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda123.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -138,7 +138,7 @@ make cuda110_nomatmul CUDA_VERSION=110
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110_nocublaslt.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -149,7 +149,7 @@ make cuda11x_nomatmul CUDA_VERSION=111
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111_nocublaslt.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -159,7 +159,7 @@ make cuda11x_nomatmul CUDA_VERSION=114
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114_nocublaslt.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -169,7 +169,7 @@ make cuda11x_nomatmul CUDA_VERSION=115
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115_nocublaslt.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -179,7 +179,7 @@ make cuda11x_nomatmul CUDA_VERSION=117
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -189,7 +189,7 @@ make cuda118_nomatmul CUDA_VERSION=118
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda118_nocublaslt.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -199,7 +199,7 @@ make cuda12x_nomatmul CUDA_VERSION=120
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda120_nocublaslt.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -209,7 +209,7 @@ make cuda12x_nomatmul CUDA_VERSION=121
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121_nocublaslt.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -219,7 +219,7 @@ make cuda12x_nomatmul CUDA_VERSION=122
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122_nocublaslt.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi
@@ -229,7 +229,7 @@ make cuda12x_nomatmul CUDA_VERSION=123
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda123_nocublaslt.so" ]; then
  # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
  exit 64
 fi

--- a/docs/source/contributing.mdx
+++ b/docs/source/contributing.mdx
 # Contributors guidelines
-... stil under construction ... (feel free to propose materials, `bitsandbytes` is a community project)
+... still under construction ... (feel free to propose materials, `bitsandbytes` is a community project)
 ## Setup pre-commit hooks
 - Install pre-commit hooks with `pip install pre-commit`.

--- a/docs/source/integrations.mdx
+++ b/docs/source/integrations.mdx
@@ -29,7 +29,7 @@ Please review the [bitsandbytes section in the Accelerate docs](https://huggingf
 # Trainer for the optimizers
-You can use any of the 8-bit and/or paged optimizers by simple passing them to the `transformers.Trainer` class on intialization.All bnb optimizers are supported by passing the correct string in `TrainingArguments`'s `optim` attribute - e.g. (`paged_adamw_32bit`).
+You can use any of the 8-bit and/or paged optimizers by simply passing them to the `transformers.Trainer` class on initialization. All bnb optimizers are supported by passing the correct string in `TrainingArguments`'s `optim` attribute - e.g. (`paged_adamw_32bit`).
 See the [official API docs for reference](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer).

--- a/docs/source/optimizers.mdx
+++ b/docs/source/optimizers.mdx
@@ -168,9 +168,9 @@ Possible options for the config override are: `betas, eps, weight_decay, lr, opt
 For overrides for particular layers, we recommend overriding locally in each module. You can do this by passing the module, the parameter, and its attribute name to the GlobalOptimManager:
 ```py
 class MyModule(torch.nn.Module):
-  def __init__(din, dout):
+  def __init__(self, d_in, d_out):
    super(MyModule, self).__init__()
-    self.linear = torch.nn.Linear(din, dout)
+    self.linear = torch.nn.Linear(d_in, d_out)
    # optimization will happen in 32-bit and
    # learning rate will be set to 0.0001 independent of the main learning rate
    config = {'optim_bits': 32, 'lr' : 0.0001}

--- a/include/Algo-Direct2.h
+++ b/include/Algo-Direct2.h
@@ -157,7 +157,7 @@ private:
        FVec<AVX, float> vxp = _mm256_i32gather_ps(xi, idxp, sizeof(float));
        IVec<AVX, float> ip = idxm;
-#else // do not use gather instrucions
+#else // do not use gather instructions
        union U {
            __m256i vec;

--- a/include/Portable.h
+++ b/include/Portable.h
@@ -147,5 +147,5 @@ inline T prev(T x)
    return x;
 }
-} // namepsace Details
+} // namespace Details
 } // namespace BinSearch
--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -568,5 +568,5 @@ FORCE_INLINE FVec<AVX, double> mulSub(const FVec<AVX, double>& a, const FVec<AVX
 #endif
-} // namepsace Details
+} // namespace Details
 } // namespace BinSearch
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -483,7 +483,14 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
        assert (idx == 0).sum().item() <= b1.numel() * 0.005
-@pytest.mark.parametrize("module", [lambda nin, nout, bias=True: bnb.nn.Linear8bitLt(nin, nout, bias=bias, has_fp16_weights=False), bnb.nn.LinearFP4], ids=['Int8Lt', 'FP4'])
+@pytest.mark.parametrize(
+    "module",
+    [
+        lambda n_in, n_out, bias=True: bnb.nn.Linear8bitLt(n_in, n_out, bias=bias, has_fp16_weights=False),
+        bnb.nn.LinearFP4,
+    ],
+    ids=['Int8Lt', 'FP4'],
+)
 def test_linear_kbit_fp32_bias(module):
    # casts model to fp16 -> int8 automatically
    l1 = module(32, 64).cuda()